No OneTemporary
Actions

Size

7 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/Analysis/AliasAnalysis.h (revision 351303)
	@@ -1,1223 +1,1223 @@
	//===- llvm/Analysis/AliasAnalysis.h - Alias Analysis Interface -- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the generic AliasAnalysis interface, which is used as the
	// common interface used by all clients of alias analysis information, and
	// implemented by all alias analysis implementations. Mod/Ref information is
	// also captured by this interface.
	//
	// Implementations of this interface must implement the various virtual methods,
	// which automatically provides functionality for the entire suite of client
	// APIs.
	//
	// This API identifies memory regions with the MemoryLocation class. The pointer
	// component specifies the base memory address of the region. The Size specifies
	// the maximum size (in address units) of the memory region, or
	// MemoryLocation::UnknownSize if the size is not known. The TBAA tag
	// identifies the "type" of the memory reference; see the
	// TypeBasedAliasAnalysis class for details.
	//
	// Some non-obvious details include:
	// - Pointers that point to two completely different objects in memory never
	// alias, regardless of the value of the Size component.
	// - NoAlias doesn't imply inequal pointers. The most obvious example of this
	// is two pointers to constant memory. Even if they are equal, constant
	// memory is never stored to, so there will never be any dependencies.
	// In this and other situations, the pointers may be both NoAlias and
	// MustAlias at the same time. The current API can only return one result,
	// though this is rarely a problem in practice.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_ANALYSIS_ALIASANALYSIS_H
	#define LLVM_ANALYSIS_ALIASANALYSIS_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/PassManager.h"
	#include "llvm/Pass.h"
	#include <cstdint>
	#include <functional>
	#include <memory>
	#include <vector>

	namespace llvm {

	class AnalysisUsage;
	class BasicAAResult;
	class BasicBlock;
	class DominatorTree;
	class OrderedBasicBlock;
	class Value;

	/// Outcome of an alias query between two MemoryLocation objects.
	///
	/// This is intentionally an unscoped enumeration: clients test a result for
	/// "any possible aliasing" through an implicit conversion to bool, which an
	/// "enum class" would not allow. The enumerator names are the canonical,
	/// suffixed names from the literature, so they act as global constants in
	/// LLVM.
	///
	/// See docs/AliasAnalysis.html for the precise meaning of each value.
	enum AliasResult : uint8_t {
	  /// The two locations do not alias at all.
	  ///
	  /// This is the only value that converts to false; every other value
	  /// converts to true, so a boolean context yields "may there be aliasing?".
	  NoAlias = 0,
	  /// The two locations may or may not alias. This is the least precise
	  /// result.
	  MayAlias = 1,
	  /// The two locations alias, but only due to a partial overlap.
	  PartialAlias = 2,
	  /// The two locations precisely alias each other.
	  MustAlias = 3,
	};

	/// << operator for AliasResult.
	raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);

	/// Flags indicating whether a memory access modifies or references memory.
	///
	/// This is no access at all, a modification, a reference, or both
	/// a modification and a reference. These are specifically structured such that
	/// they form a three bit matrix and bit-tests for 'mod' or 'ref' or 'must'
	/// work with any of the possible values.
	///
	/// Bit layout: bit 0 is Ref, bit 1 is Mod, and bit 2 is the (inverted) Must
	/// bit, i.e. Must is considered set when bit 2 is clear.
	enum class ModRefInfo : uint8_t {
	  /// Must is provided for completeness, but no routines will return only
	  /// Must today. See definition of Must below.
	  Must = 0,
	  /// The access may reference the value stored in memory,
	  /// a mustAlias relation was found, and no mayAlias or partialAlias found.
	  MustRef = 1,
	  /// The access may modify the value stored in memory,
	  /// a mustAlias relation was found, and no mayAlias or partialAlias found.
	  MustMod = 2,
	  /// The access may reference, modify or both the value stored in memory,
	  /// a mustAlias relation was found, and no mayAlias or partialAlias found.
	  MustModRef = MustRef | MustMod,
	  /// The access neither references nor modifies the value stored in memory.
	  NoModRef = 4,
	  /// The access may reference the value stored in memory.
	  Ref = NoModRef | MustRef,
	  /// The access may modify the value stored in memory.
	  Mod = NoModRef | MustMod,
	  /// The access may reference and may modify the value stored in memory.
	  ModRef = Ref | Mod,

	  /// About Must:
	  /// Must is set in a best effort manner.
	  /// We usually do not try our best to infer Must, instead it is merely
	  /// another piece of "free" information that is presented when available.
	  /// Must set means there was certainly a MustAlias found. For calls,
	  /// where multiple arguments are checked (argmemonly), this translates to
	  /// only MustAlias or NoAlias was found.
	  /// Must is not set for RAR accesses, even if the two locations must
	  /// alias. The reason is that two read accesses translate to an early return
	  /// of NoModRef. An additional alias check to set Must may be
	  /// expensive. Other cases may also not set Must (e.g. callCapturesBefore).
	  /// We refer to Must being set when the most significant bit is cleared.
	  /// Conversely we clear Must information by setting the Must bit to 1.
	};

	/// Returns true when neither the Mod bit nor the Ref bit of \p MRI is set.
	/// The Must bit is masked out and does not influence the answer.
	LLVM_NODISCARD inline bool isNoModRef(const ModRefInfo MRI) {
	  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
	  const int AccessBits = static_cast<int>(MRI) & ModRefMask;
	  return AccessBits == static_cast<int>(ModRefInfo::Must);
	}
	/// Returns true when at least one of the Mod and Ref bits of \p MRI is set.
	LLVM_NODISCARD inline bool isModOrRefSet(const ModRefInfo MRI) {
	  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
	  return (static_cast<int>(MRI) & ModRefMask) != 0;
	}
	/// Returns true when both the Mod bit and the Ref bit of \p MRI are set.
	LLVM_NODISCARD inline bool isModAndRefSet(const ModRefInfo MRI) {
	  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
	  const int AccessBits = static_cast<int>(MRI) & ModRefMask;
	  return AccessBits == ModRefMask;
	}
	/// Returns true when the Mod bit of \p MRI is set.
	LLVM_NODISCARD inline bool isModSet(const ModRefInfo MRI) {
	  const int ModBit = static_cast<int>(ModRefInfo::MustMod);
	  return (static_cast<int>(MRI) & ModBit) != 0;
	}
	/// Returns true when the Ref bit of \p MRI is set.
	LLVM_NODISCARD inline bool isRefSet(const ModRefInfo MRI) {
	  const int RefBit = static_cast<int>(ModRefInfo::MustRef);
	  return (static_cast<int>(MRI) & RefBit) != 0;
	}
	/// Returns true when Must is set on \p MRI. Must is encoded inverted: it is
	/// considered set when the NoModRef bit is clear.
	LLVM_NODISCARD inline bool isMustSet(const ModRefInfo MRI) {
	  const int NoMustBit = static_cast<int>(ModRefInfo::NoModRef);
	  return (static_cast<int>(MRI) & NoMustBit) == 0;
	}

	/// Returns \p MRI with the Mod bit set; all other bits are preserved.
	LLVM_NODISCARD inline ModRefInfo setMod(const ModRefInfo MRI) {
	  return ModRefInfo(static_cast<int>(MRI) |
	                    static_cast<int>(ModRefInfo::MustMod));
	}
	/// Returns \p MRI with the Ref bit set; all other bits are preserved.
	LLVM_NODISCARD inline ModRefInfo setRef(const ModRefInfo MRI) {
	  return ModRefInfo(static_cast<int>(MRI) |
	                    static_cast<int>(ModRefInfo::MustRef));
	}
	/// Returns \p MRI with Must set. Because Must is encoded inverted, this
	/// keeps only the Mod and Ref bits and clears the NoModRef bit.
	LLVM_NODISCARD inline ModRefInfo setMust(const ModRefInfo MRI) {
	  const int ModRefMask = static_cast<int>(ModRefInfo::MustModRef);
	  const int Bits = static_cast<int>(MRI) & ModRefMask;
	  return ModRefInfo(Bits);
	}
	/// Returns \p MRI with both the Mod and Ref bits set; the Must bit is
	/// preserved.
	LLVM_NODISCARD inline ModRefInfo setModAndRef(const ModRefInfo MRI) {
	  return ModRefInfo(static_cast<int>(MRI) |
	                    static_cast<int>(ModRefInfo::MustModRef));
	}
	/// Returns \p MRI with the Mod bit cleared. Masking with Ref keeps the Ref
	/// bit and the Must (NoModRef) bit as they were.
	LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
	  const int KeepMask = static_cast<int>(ModRefInfo::Ref);
	  const int Bits = static_cast<int>(MRI) & KeepMask;
	  return ModRefInfo(Bits);
	}
	/// Returns \p MRI with the Ref bit cleared. Masking with Mod keeps the Mod
	/// bit and the Must (NoModRef) bit as they were.
	LLVM_NODISCARD inline ModRefInfo clearRef(const ModRefInfo MRI) {
	  const int KeepMask = static_cast<int>(ModRefInfo::Mod);
	  const int Bits = static_cast<int>(MRI) & KeepMask;
	  return ModRefInfo(Bits);
	}
	/// Returns \p MRI with Must cleared. Because Must is encoded inverted, this
	/// sets the NoModRef bit and leaves the Mod and Ref bits untouched.
	LLVM_NODISCARD inline ModRefInfo clearMust(const ModRefInfo MRI) {
	  return ModRefInfo(static_cast<int>(MRI) |
	                    static_cast<int>(ModRefInfo::NoModRef));
	}
	/// Returns the bitwise OR of \p MRI1 and \p MRI2.
	LLVM_NODISCARD inline ModRefInfo unionModRef(const ModRefInfo MRI1,
	                                             const ModRefInfo MRI2) {
	  return ModRefInfo(static_cast<int>(MRI1) | static_cast<int>(MRI2));
	}
	/// Returns the bitwise AND of \p MRI1 and \p MRI2.
	LLVM_NODISCARD inline ModRefInfo intersectModRef(const ModRefInfo MRI1,
	                                                 const ModRefInfo MRI2) {
	  const int Lhs = static_cast<int>(MRI1);
	  const int Rhs = static_cast<int>(MRI2);
	  return ModRefInfo(Lhs & Rhs);
	}

	/// The locations at which a function might access memory.
	///
	/// These are primarily used in conjunction with the \c AccessKind bits to
	/// describe both the nature of access and the locations of access for a
	/// function call.
	///
	/// The values start at 8 so that they occupy bits above the three low bits
	/// used by ModRefInfo; FunctionModRefBehavior ORs both kinds of flag together.
	enum FunctionModRefLocation {
	  /// Base case is no access to memory.
	  FMRL_Nowhere = 0,
	  /// Access to memory via argument pointers.
	  FMRL_ArgumentPointees = 8,
	  /// Memory that is inaccessible via LLVM IR.
	  FMRL_InaccessibleMem = 16,
	  /// Access to any memory.
	  FMRL_Anywhere = 32 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
	};

	/// Summary of how a function affects memory in the program.
	///
	/// Loads from constant globals are not considered memory accesses for this
	/// interface. Also, functions may freely modify stack space local to their
	/// invocation without having to report it through these interfaces.
	///
	/// Each value is the bitwise OR of a FunctionModRefLocation (where memory may
	/// be accessed) and a ModRefInfo (how it may be accessed).
	enum FunctionModRefBehavior {
	  /// This function does not perform any non-local loads or stores to memory.
	  ///
	  /// This property corresponds to the GCC 'const' attribute.
	  /// This property corresponds to the LLVM IR 'readnone' attribute.
	  /// This property corresponds to the IntrNoMem LLVM intrinsic flag.
	  FMRB_DoesNotAccessMemory =
	      FMRL_Nowhere | static_cast<int>(ModRefInfo::NoModRef),

	  /// The only memory references in this function (if it has any) are
	  /// non-volatile loads from objects pointed to by its pointer-typed
	  /// arguments, with arbitrary offsets.
	  ///
	  /// This property corresponds to the IntrReadArgMem LLVM intrinsic flag.
	  FMRB_OnlyReadsArgumentPointees =
	      FMRL_ArgumentPointees | static_cast<int>(ModRefInfo::Ref),

	  /// The only memory references in this function (if it has any) are
	  /// non-volatile loads and stores from objects pointed to by its
	  /// pointer-typed arguments, with arbitrary offsets.
	  ///
	  /// This property corresponds to the IntrArgMemOnly LLVM intrinsic flag.
	  FMRB_OnlyAccessesArgumentPointees =
	      FMRL_ArgumentPointees | static_cast<int>(ModRefInfo::ModRef),

	  /// The only memory references in this function (if it has any) are
	  /// references of memory that is otherwise inaccessible via LLVM IR.
	  ///
	  /// This property corresponds to the LLVM IR inaccessiblememonly attribute.
	  FMRB_OnlyAccessesInaccessibleMem =
	      FMRL_InaccessibleMem | static_cast<int>(ModRefInfo::ModRef),

	  /// The function may perform non-volatile loads and stores of objects
	  /// pointed to by its pointer-typed arguments, with arbitrary offsets, and
	  /// it may also perform loads and stores of memory that is otherwise
	  /// inaccessible via LLVM IR.
	  ///
	  /// This property corresponds to the LLVM IR
	  /// inaccessiblemem_or_argmemonly attribute.
	  FMRB_OnlyAccessesInaccessibleOrArgMem = FMRL_InaccessibleMem |
	                                          FMRL_ArgumentPointees |
	                                          static_cast<int>(ModRefInfo::ModRef),

	  /// This function does not perform any non-local stores or volatile loads,
	  /// but may read from any memory location.
	  ///
	  /// This property corresponds to the GCC 'pure' attribute.
	  /// This property corresponds to the LLVM IR 'readonly' attribute.
	  /// This property corresponds to the IntrReadMem LLVM intrinsic flag.
	  FMRB_OnlyReadsMemory = FMRL_Anywhere | static_cast<int>(ModRefInfo::Ref),

	  /// This function does not read from memory anywhere, but may write to any
	  /// memory location.
	  ///
	  /// This property corresponds to the LLVM IR 'writeonly' attribute.
	  /// This property corresponds to the IntrWriteMem LLVM intrinsic flag.
	  FMRB_DoesNotReadMemory = FMRL_Anywhere | static_cast<int>(ModRefInfo::Mod),

	  /// This indicates that the function could not be classified into one of the
	  /// behaviors above.
	  FMRB_UnknownModRefBehavior =
	      FMRL_Anywhere | static_cast<int>(ModRefInfo::ModRef)
	};

	// Converts a FunctionModRefBehavior into a valid ModRefInfo by masking away
	// the bits that are only meaningful in FunctionModRefBehavior (the location
	// flags). Going through this wrapper means that if the ModRefInfo enum ever
	// changes, only the mask here has to be updated to the new all-bits-set
	// entry.
	LLVM_NODISCARD inline ModRefInfo
	createModRefInfo(const FunctionModRefBehavior FMRB) {
	  const int ModRefInfoMask = static_cast<int>(ModRefInfo::ModRef);
	  const int ModRefBits = FMRB & ModRefInfoMask;
	  return ModRefInfo(ModRefBits);
	}

	/// Per-query state that is handed to, or retained within, an alias query.
	///
	/// By default the root query is stateless and starts with a freshly
	/// constructed info object. Specific alias analyses can use this object to
	/// keep state that recursive or nested queries would otherwise recompute. To
	/// preserve this state across multiple queries where that is safe (because
	/// the IR does not change), use a `BatchAAResults` wrapper.
	///
	/// The information stored here is currently limited to the caches used by
	/// BasicAA, but can be extended to fit the needs of other AAs.
	class AAQueryInfo {
	public:
	  /// Key type of the alias cache: an ordered pair of memory locations.
	  using LocPair = std::pair<MemoryLocation, MemoryLocation>;
	  /// Maps a pair of locations to an AliasResult.
	  using AliasCacheT = SmallDenseMap<LocPair, AliasResult, 8>;
	  /// Maps a value to a boolean "is captured" flag.
	  using IsCapturedCacheT = SmallDenseMap<const Value *, bool, 8>;

	  AliasCacheT AliasCache;
	  IsCapturedCacheT IsCapturedCache;

	  /// Starts with both caches empty.
	  AAQueryInfo() : AliasCache(), IsCapturedCache() {}
	};

	class BatchAAResults;

	class AAResults {
	public:
	// Make these results default constructable and movable. We have to spell
	// these out because MSVC won't synthesize them.
	AAResults(const TargetLibraryInfo &TLI) : TLI(TLI) {}
	AAResults(AAResults &&Arg);
	~AAResults();

	/// Register a specific AA result.
	template <typename AAResultT> void addAAResult(AAResultT &AAResult) {
	// FIXME: We should use a much lighter weight system than the usual
	// polymorphic pattern because we don't own AAResult. It should
	// ideally involve two pointers and no separate allocation.
	AAs.emplace_back(new Model<AAResultT>(AAResult, *this));
	}

	/// Register a function analysis ID that the results aggregation depends on.
	///
	/// This is used in the new pass manager to implement the invalidation logic
	/// where we must invalidate the results aggregation if any of our component
	/// analyses become invalid.
	void addAADependencyID(AnalysisKey *ID) { AADeps.push_back(ID); }

	/// Handle invalidation events in the new pass manager.
	///
	/// The aggregation is invalidated if any of the underlying analyses is
	/// invalidated.
	bool invalidate(Function &F, const PreservedAnalyses &PA,
	FunctionAnalysisManager::Invalidator &Inv);

	//===--------------------------------------------------------------------===//
	/// \name Alias Queries
	/// @{

	/// The main low level interface to the alias analysis implementation.
	/// Returns an AliasResult indicating whether the two pointers are aliased to
	/// each other. This is the interface that must be implemented by specific
	/// alias analysis implementations.
	AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);

	/// A convenience wrapper around the primary \c alias interface.
	AliasResult alias(const Value V1, LocationSize V1Size, const Value V2,
	LocationSize V2Size) {
	return alias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
	}

	/// A convenience wrapper around the primary \c alias interface.
	AliasResult alias(const Value V1, const Value V2) {
	return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown());
	}

	/// A trivial helper function to check to see if the specified pointers are
	/// no-alias.
	bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
	return alias(LocA, LocB) == NoAlias;
	}

	/// A convenience wrapper around the \c isNoAlias helper interface.
	bool isNoAlias(const Value V1, LocationSize V1Size, const Value V2,
	LocationSize V2Size) {
	return isNoAlias(MemoryLocation(V1, V1Size), MemoryLocation(V2, V2Size));
	}

	/// A convenience wrapper around the \c isNoAlias helper interface.
	bool isNoAlias(const Value V1, const Value V2) {
	return isNoAlias(MemoryLocation(V1), MemoryLocation(V2));
	}

	/// A trivial helper function to check to see if the specified pointers are
	/// must-alias.
	bool isMustAlias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
	return alias(LocA, LocB) == MustAlias;
	}

	/// A convenience wrapper around the \c isMustAlias helper interface.
	bool isMustAlias(const Value V1, const Value V2) {
	return alias(V1, LocationSize::precise(1), V2, LocationSize::precise(1)) ==
	MustAlias;
	}

	/// Checks whether the given location points to constant memory, or if
	/// \p OrLocal is true whether it points to a local alloca.
	bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal = false);

	/// A convenience wrapper around the primary \c pointsToConstantMemory
	/// interface.
	bool pointsToConstantMemory(const Value *P, bool OrLocal = false) {
	return pointsToConstantMemory(MemoryLocation(P), OrLocal);
	}

	/// @}
	//===--------------------------------------------------------------------===//
	/// \name Simple mod/ref information
	/// @{

	/// Get the ModRef info associated with a pointer argument of a call. The
	/// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
	/// that these bits do not necessarily account for the overall behavior of
	/// the function, but rather only provide additional per-argument
	/// information. This never sets ModRefInfo::Must.
	ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx);

	/// Return the behavior of the given call site.
	FunctionModRefBehavior getModRefBehavior(const CallBase *Call);

	/// Return the behavior when calling the given function.
	FunctionModRefBehavior getModRefBehavior(const Function *F);

	/// Checks if the specified call is known to never read or write memory.
	///
	/// Note that if the call only reads from known-constant memory, it is also
	/// legal to return true. Also, calls that unwind the stack are legal for
	/// this predicate.
	///
	/// Many optimizations (such as CSE and LICM) can be performed on such calls
	/// without worrying about aliasing properties, and many calls have this
	/// property (e.g. calls to 'sin' and 'cos').
	///
	/// This property corresponds to the GCC 'const' attribute.
	bool doesNotAccessMemory(const CallBase *Call) {
	return getModRefBehavior(Call) == FMRB_DoesNotAccessMemory;
	}

	/// Checks if the specified function is known to never read or write memory.
	///
	/// Note that if the function only reads from known-constant memory, it is
	/// also legal to return true. Also, function that unwind the stack are legal
	/// for this predicate.
	///
	/// Many optimizations (such as CSE and LICM) can be performed on such calls
	/// to such functions without worrying about aliasing properties, and many
	/// functions have this property (e.g. 'sin' and 'cos').
	///
	/// This property corresponds to the GCC 'const' attribute.
	bool doesNotAccessMemory(const Function *F) {
	return getModRefBehavior(F) == FMRB_DoesNotAccessMemory;
	}

	/// Checks if the specified call is known to only read from non-volatile
	/// memory (or not access memory at all).
	///
	/// Calls that unwind the stack are legal for this predicate.
	///
	/// This property allows many common optimizations to be performed in the
	/// absence of interfering store instructions, such as CSE of strlen calls.
	///
	/// This property corresponds to the GCC 'pure' attribute.
	bool onlyReadsMemory(const CallBase *Call) {
	return onlyReadsMemory(getModRefBehavior(Call));
	}

	/// Checks if the specified function is known to only read from non-volatile
	/// memory (or not access memory at all).
	///
	/// Functions that unwind the stack are legal for this predicate.
	///
	/// This property allows many common optimizations to be performed in the
	/// absence of interfering store instructions, such as CSE of strlen calls.
	///
	/// This property corresponds to the GCC 'pure' attribute.
	bool onlyReadsMemory(const Function *F) {
	return onlyReadsMemory(getModRefBehavior(F));
	}

	/// Checks if functions with the specified behavior are known to only read
	/// from non-volatile memory (or not access memory at all).
	static bool onlyReadsMemory(FunctionModRefBehavior MRB) {
	return !isModSet(createModRefInfo(MRB));
	}

	/// Checks if functions with the specified behavior are known to only write
	/// memory (or not access memory at all).
	static bool doesNotReadMemory(FunctionModRefBehavior MRB) {
	return !isRefSet(createModRefInfo(MRB));
	}

	/// Checks if functions with the specified behavior are known to read and
	/// write at most from objects pointed to by their pointer-typed arguments
	/// (with arbitrary offsets).
	static bool onlyAccessesArgPointees(FunctionModRefBehavior MRB) {
	return !(MRB & FMRL_Anywhere & ~FMRL_ArgumentPointees);
	}

	/// Checks if functions with the specified behavior are known to potentially
	/// read or write from objects pointed to be their pointer-typed arguments
	/// (with arbitrary offsets).
	static bool doesAccessArgPointees(FunctionModRefBehavior MRB) {
	return isModOrRefSet(createModRefInfo(MRB)) &&
	(MRB & FMRL_ArgumentPointees);
	}

	/// Checks if functions with the specified behavior are known to read and
	/// write at most from memory that is inaccessible from LLVM IR.
	static bool onlyAccessesInaccessibleMem(FunctionModRefBehavior MRB) {
	return !(MRB & FMRL_Anywhere & ~FMRL_InaccessibleMem);
	}

	/// Checks if functions with the specified behavior are known to potentially
	/// read or write from memory that is inaccessible from LLVM IR.
	static bool doesAccessInaccessibleMem(FunctionModRefBehavior MRB) {
	return isModOrRefSet(createModRefInfo(MRB)) && (MRB & FMRL_InaccessibleMem);
	}

	/// Checks if functions with the specified behavior are known to read and
	/// write at most from memory that is inaccessible from LLVM IR or objects
	/// pointed to by their pointer-typed arguments (with arbitrary offsets).
	static bool onlyAccessesInaccessibleOrArgMem(FunctionModRefBehavior MRB) {
	return !(MRB & FMRL_Anywhere &
	~(FMRL_InaccessibleMem \| FMRL_ArgumentPointees));
	}

	/// getModRefInfo (for call sites) - Return information about whether
	/// a particular call site modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc);

	/// getModRefInfo (for call sites) - A convenience wrapper.
	ModRefInfo getModRefInfo(const CallBase Call, const Value P,
	LocationSize Size) {
	return getModRefInfo(Call, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for loads) - Return information about whether
	/// a particular load modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const LoadInst *L, const MemoryLocation &Loc);

	/// getModRefInfo (for loads) - A convenience wrapper.
	ModRefInfo getModRefInfo(const LoadInst L, const Value P,
	LocationSize Size) {
	return getModRefInfo(L, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for stores) - Return information about whether
	/// a particular store modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const StoreInst *S, const MemoryLocation &Loc);

	/// getModRefInfo (for stores) - A convenience wrapper.
	ModRefInfo getModRefInfo(const StoreInst S, const Value P,
	LocationSize Size) {
	return getModRefInfo(S, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for fences) - Return information about whether
	/// a particular store modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const FenceInst *S, const MemoryLocation &Loc);

	/// getModRefInfo (for fences) - A convenience wrapper.
	ModRefInfo getModRefInfo(const FenceInst S, const Value P,
	LocationSize Size) {
	return getModRefInfo(S, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for cmpxchges) - Return information about whether
	/// a particular cmpxchg modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const AtomicCmpXchgInst *CX,
	const MemoryLocation &Loc);

	/// getModRefInfo (for cmpxchges) - A convenience wrapper.
	ModRefInfo getModRefInfo(const AtomicCmpXchgInst CX, const Value P,
	LocationSize Size) {
	return getModRefInfo(CX, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for atomicrmws) - Return information about whether
	/// a particular atomicrmw modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const AtomicRMWInst *RMW, const MemoryLocation &Loc);

	/// getModRefInfo (for atomicrmws) - A convenience wrapper.
	ModRefInfo getModRefInfo(const AtomicRMWInst RMW, const Value P,
	LocationSize Size) {
	return getModRefInfo(RMW, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for va_args) - Return information about whether
	/// a particular va_arg modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const VAArgInst *I, const MemoryLocation &Loc);

	/// getModRefInfo (for va_args) - A convenience wrapper.
	ModRefInfo getModRefInfo(const VAArgInst I, const Value P,
	LocationSize Size) {
	return getModRefInfo(I, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for catchpads) - Return information about whether
	/// a particular catchpad modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const CatchPadInst *I, const MemoryLocation &Loc);

	/// getModRefInfo (for catchpads) - A convenience wrapper.
	ModRefInfo getModRefInfo(const CatchPadInst I, const Value P,
	LocationSize Size) {
	return getModRefInfo(I, MemoryLocation(P, Size));
	}

	/// getModRefInfo (for catchrets) - Return information about whether
	/// a particular catchret modifies or reads the specified memory location.
	ModRefInfo getModRefInfo(const CatchReturnInst *I, const MemoryLocation &Loc);

	/// getModRefInfo (for catchrets) - A convenience wrapper.
	ModRefInfo getModRefInfo(const CatchReturnInst I, const Value P,
	LocationSize Size) {
	return getModRefInfo(I, MemoryLocation(P, Size));
	}

	/// Check whether or not an instruction may read or write the optionally
	/// specified memory location.
	///
	///
	/// An instruction that doesn't read or write memory may be trivially LICM'd
	/// for example.
	///
	/// For function calls, this delegates to the alias-analysis specific
	/// call-site mod-ref behavior queries. Otherwise it delegates to the specific
	/// helpers above.
	ModRefInfo getModRefInfo(const Instruction *I,
	const Optional<MemoryLocation> &OptLoc) {
	AAQueryInfo AAQIP;
	return getModRefInfo(I, OptLoc, AAQIP);
	}

	/// A convenience wrapper for constructing the memory location.
	ModRefInfo getModRefInfo(const Instruction I, const Value P,
	LocationSize Size) {
	return getModRefInfo(I, MemoryLocation(P, Size));
	}

	/// Return information about whether a call and an instruction may refer to
	/// the same memory locations.
	ModRefInfo getModRefInfo(Instruction I, const CallBase Call);

	/// Return information about whether two call sites may refer to the same set
	/// of memory locations. See the AA documentation for details:
	/// http://llvm.org/docs/AliasAnalysis.html#ModRefInfo
	ModRefInfo getModRefInfo(const CallBase Call1, const CallBase Call2);

	/// Return information about whether a particular call site modifies
	/// or reads the specified memory location \p MemLoc before instruction \p I
	/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
	/// instruction ordering queries inside the BasicBlock containing \p I.
	/// Early exits in callCapturesBefore may lead to ModRefInfo::Must not being
	/// set.
	ModRefInfo callCapturesBefore(const Instruction *I,
	const MemoryLocation &MemLoc, DominatorTree *DT,
	OrderedBasicBlock *OBB = nullptr);

	/// A convenience wrapper to synthesize a memory location.
	ModRefInfo callCapturesBefore(const Instruction I, const Value P,
	LocationSize Size, DominatorTree *DT,
	OrderedBasicBlock *OBB = nullptr) {
	return callCapturesBefore(I, MemoryLocation(P, Size), DT, OBB);
	}

	/// @}
	//===--------------------------------------------------------------------===//
	/// \name Higher level methods for querying mod/ref information.
	/// @{

	/// Check if it is possible for execution of the specified basic block to
	/// modify the location Loc.
	bool canBasicBlockModify(const BasicBlock &BB, const MemoryLocation &Loc);

	/// A convenience wrapper synthesizing a memory location.
	bool canBasicBlockModify(const BasicBlock &BB, const Value *P,
	LocationSize Size) {
	return canBasicBlockModify(BB, MemoryLocation(P, Size));
	}

	/// Check if it is possible for the execution of the specified instructions
	/// to mod\ref (according to the mode) the location Loc.
	///
	/// The instructions to consider are all of the instructions in the range of
	/// [I1,I2] INCLUSIVE. I1 and I2 must be in the same basic block.
	bool canInstructionRangeModRef(const Instruction &I1, const Instruction &I2,
	const MemoryLocation &Loc,
	const ModRefInfo Mode);

	/// A convenience wrapper synthesizing a memory location.
	bool canInstructionRangeModRef(const Instruction &I1, const Instruction &I2,
	const Value *Ptr, LocationSize Size,
	const ModRefInfo Mode) {
	return canInstructionRangeModRef(I1, I2, MemoryLocation(Ptr, Size), Mode);
	}

	private:
	AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
	AAQueryInfo &AAQI);
	bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
	bool OrLocal = false);
	ModRefInfo getModRefInfo(Instruction I, const CallBase Call2,
	AAQueryInfo &AAQIP);
	ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const CallBase Call1, const CallBase Call2,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const VAArgInst *V, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const LoadInst *L, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const StoreInst *S, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const FenceInst *S, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const AtomicCmpXchgInst *CX,
	const MemoryLocation &Loc, AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const AtomicRMWInst *RMW, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const CatchPadInst *I, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	ModRefInfo getModRefInfo(const CatchReturnInst *I, const MemoryLocation &Loc,
	AAQueryInfo &AAQI);
	/// Route a mod/ref query on an arbitrary instruction to the overload that
	/// handles its concrete instruction kind.
	///
	/// If no location is given and \p I is a call site, the answer is derived
	/// from the call's overall mod/ref behavior. Otherwise a missing location is
	/// replaced by a default-constructed MemoryLocation. Opcodes that have no
	/// dedicated overload report ModRefInfo::NoModRef.
	ModRefInfo getModRefInfo(const Instruction *I,
	                         const Optional<MemoryLocation> &OptLoc,
	                         AAQueryInfo &AAQIP) {
	  // Location-less query on a call: summarize the whole call.
	  if (OptLoc == None)
	    if (const auto *CB = dyn_cast<CallBase>(I))
	      return createModRefInfo(getModRefBehavior(CB));

	  const MemoryLocation &QueryLoc = OptLoc.getValueOr(MemoryLocation());

	  switch (I->getOpcode()) {
	  // Call sites.
	  case Instruction::Call:
	    return getModRefInfo((const CallInst *)I, QueryLoc, AAQIP);
	  case Instruction::Invoke:
	    return getModRefInfo((const InvokeInst *)I, QueryLoc, AAQIP);

	  // Plain and atomic memory accesses.
	  case Instruction::Load:
	    return getModRefInfo((const LoadInst *)I, QueryLoc, AAQIP);
	  case Instruction::Store:
	    return getModRefInfo((const StoreInst *)I, QueryLoc, AAQIP);
	  case Instruction::AtomicCmpXchg:
	    return getModRefInfo((const AtomicCmpXchgInst *)I, QueryLoc, AAQIP);
	  case Instruction::AtomicRMW:
	    return getModRefInfo((const AtomicRMWInst *)I, QueryLoc, AAQIP);
	  case Instruction::Fence:
	    return getModRefInfo((const FenceInst *)I, QueryLoc, AAQIP);

	  // Remaining kinds that have a dedicated overload.
	  case Instruction::VAArg:
	    return getModRefInfo((const VAArgInst *)I, QueryLoc, AAQIP);
	  case Instruction::CatchPad:
	    return getModRefInfo((const CatchPadInst *)I, QueryLoc, AAQIP);
	  case Instruction::CatchRet:
	    return getModRefInfo((const CatchReturnInst *)I, QueryLoc, AAQIP);

	  default:
	    return ModRefInfo::NoModRef;
	  }
	}

	// Abstract per-analysis interface; defined out of line as
	// AAResults::Concept.
	class Concept;

	// Adapter implementing Concept on top of a concrete result type T; defined
	// out of line as AAResults::Model.
	template <typename T> class Model;

	// Every instantiation of the AAResultBase mixin is a friend.
	template <typename T> friend class AAResultBase;

	const TargetLibraryInfo &TLI;

	// Owned, type-erased alias analysis implementations.
	std::vector<std::unique_ptr<Concept>> AAs;

	// Analysis keys recorded alongside the results. Presumably the analyses this
	// aggregation depends on (see addAADependencyID) -- confirm against the
	// implementation file.
	std::vector<AnalysisKey *> AADeps;

	// BatchAAResults forwards its queries to this class's AAQueryInfo-taking
	// overloads.
	friend class BatchAAResults;
	};

	/// This class is a wrapper over an AAResults, and it is intended to be used
	/// only when there are no IR changes in between queries. BatchAAResults
	/// reuses the same `AAQueryInfo` to preserve the state across queries,
	/// essentially making AA work in "batch mode". The internal state cannot be
	/// cleared, so to go "out-of-batch-mode", the user must either use AAResults,
	/// or create a new BatchAAResults.
	class BatchAAResults {
	  /// The aggregation every query is forwarded to.
	  AAResults &AA;
	  /// Query state shared by all queries issued through this object.
	  AAQueryInfo AAQI;

	public:
	  BatchAAResults(AAResults &AAR) : AA(AAR), AAQI() {}

	  /// Forward an alias query, reusing the shared query state.
	  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB) {
	    return AA.alias(LocA, LocB, AAQI);
	  }

	  /// Forward a constant-memory query, reusing the shared query state.
	  bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal = false) {
	    return AA.pointsToConstantMemory(Loc, AAQI, OrLocal);
	  }

	  /// Forward a call-versus-location mod/ref query.
	  ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc) {
	    return AA.getModRefInfo(Call, Loc, AAQI);
	  }

	  /// Forward a call-versus-call mod/ref query.
	  ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2) {
	    return AA.getModRefInfo(Call1, Call2, AAQI);
	  }

	  /// Forward an instruction-versus-optional-location mod/ref query.
	  ModRefInfo getModRefInfo(const Instruction *I,
	                           const Optional<MemoryLocation> &OptLoc) {
	    return AA.getModRefInfo(I, OptLoc, AAQI);
	  }

	  /// Forward an instruction-versus-call mod/ref query.
	  ModRefInfo getModRefInfo(Instruction *I, const CallBase *Call2) {
	    return AA.getModRefInfo(I, Call2, AAQI);
	  }

	  /// Forwarded without the shared query state.
	  ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
	    return AA.getArgModRefInfo(Call, ArgIdx);
	  }

	  /// Forwarded without the shared query state.
	  FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
	    return AA.getModRefBehavior(Call);
	  }
	};

	/// Temporary alias for legacy code that uses a generic \c AliasAnalysis
	/// pointer or reference; it names the same type as \c AAResults.
	using AliasAnalysis = AAResults;

	/// A private abstract base class describing the concept of an individual alias
	/// analysis implementation.
	///
	/// This interface is implemented by any \c Model instantiation. It is also the
	/// interface which a type used to instantiate the model must provide.
	///
	/// All of these methods model methods by the same name in the \c
	/// AAResults class. Only differences and specifics to how the
	/// implementations are called are documented here.
	class AAResults::Concept {
	public:
	  virtual ~Concept() = 0;

	  /// An update API used internally by the AAResults to provide
	  /// a handle back to the top level aggregation.
	  virtual void setAAResults(AAResults *NewAAR) = 0;

	  //===--------------------------------------------------------------------===//
	  /// \name Alias Queries
	  /// @{

	  /// The main low level interface to the alias analysis implementation.
	  /// Returns an AliasResult indicating whether the two pointers are aliased to
	  /// each other. This is the interface that must be implemented by specific
	  /// alias analysis implementations.
	  virtual AliasResult alias(const MemoryLocation &LocA,
	                            const MemoryLocation &LocB, AAQueryInfo &AAQI) = 0;

	  /// Checks whether the given location points to constant memory, or if
	  /// \p OrLocal is true whether it points to a local alloca.
	  virtual bool pointsToConstantMemory(const MemoryLocation &Loc,
	                                      AAQueryInfo &AAQI, bool OrLocal) = 0;

	  /// @}
	  //===--------------------------------------------------------------------===//
	  /// \name Simple mod/ref information
	  /// @{

	  /// Get the ModRef info associated with a pointer argument of a callsite. The
	  /// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
	  /// that these bits do not necessarily account for the overall behavior of
	  /// the function, but rather only provide additional per-argument
	  /// information.
	  virtual ModRefInfo getArgModRefInfo(const CallBase *Call,
	                                      unsigned ArgIdx) = 0;

	  /// Return the behavior of the given call site.
	  virtual FunctionModRefBehavior getModRefBehavior(const CallBase *Call) = 0;

	  /// Return the behavior when calling the given function.
	  virtual FunctionModRefBehavior getModRefBehavior(const Function *F) = 0;

	  /// getModRefInfo (for call sites) - Return information about whether
	  /// a particular call site modifies or reads the specified memory location.
	  virtual ModRefInfo getModRefInfo(const CallBase *Call,
	                                   const MemoryLocation &Loc,
	                                   AAQueryInfo &AAQI) = 0;

	  /// Return information about whether two call sites may refer to the same set
	  /// of memory locations. See the AA documentation for details:
	  /// http://llvm.org/docs/AliasAnalysis.html#ModRefInfo
	  virtual ModRefInfo getModRefInfo(const CallBase *Call1,
	                                   const CallBase *Call2,
	                                   AAQueryInfo &AAQI) = 0;

	  /// @}
	};

	/// A private class template which derives from \c Concept and wraps some other
	/// type.
	///
	/// This models the concept by directly forwarding each interface point to the
	/// wrapped type which must implement a compatible interface. This provides
	/// a type erased binding.
	template <typename AAResultT> class AAResults::Model final : public Concept {
	  /// The wrapped result; held by reference, not owned by the model.
	  AAResultT &Result;

	public:
	  /// Bind to \p Result and hand it a pointer back to the aggregation \p AAR.
	  explicit Model(AAResultT &Result, AAResults &AAR) : Result(Result) {
	    Result.setAAResults(&AAR);
	  }
	  ~Model() override = default;

	  void setAAResults(AAResults *NewAAR) override { Result.setAAResults(NewAAR); }

	  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
	                    AAQueryInfo &AAQI) override {
	    return Result.alias(LocA, LocB, AAQI);
	  }

	  bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
	                              bool OrLocal) override {
	    return Result.pointsToConstantMemory(Loc, AAQI, OrLocal);
	  }

	  ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) override {
	    return Result.getArgModRefInfo(Call, ArgIdx);
	  }

	  FunctionModRefBehavior getModRefBehavior(const CallBase *Call) override {
	    return Result.getModRefBehavior(Call);
	  }

	  FunctionModRefBehavior getModRefBehavior(const Function *F) override {
	    return Result.getModRefBehavior(F);
	  }

	  ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
	                           AAQueryInfo &AAQI) override {
	    return Result.getModRefInfo(Call, Loc, AAQI);
	  }

	  ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
	                           AAQueryInfo &AAQI) override {
	    return Result.getModRefInfo(Call1, Call2, AAQI);
	  }
	};

	/// A CRTP-driven "mixin" base class to help implement the function alias
	/// analysis results concept.
	///
	/// Because of the nature of many alias analysis implementations, they often
	/// only implement a subset of the interface. This base class will attempt to
	/// implement the remaining portions of the interface in terms of simpler forms
	/// of the interface where possible, and otherwise provide conservatively
	/// correct fallback implementations.
	///
	/// Implementors of an alias analysis should derive from this CRTP, and then
	/// override specific methods that they wish to customize. There is no need to
	/// use virtual anywhere, the CRTP base class does static dispatch to the
	/// derived type passed into it.
	template <typename DerivedT> class AAResultBase {
	  // Expose some parts of the interface only to the AAResults::Model
	  // for wrapping. Specifically, this allows the model to call our
	  // setAAResults method without exposing it as a fully public API.
	  friend class AAResults::Model<DerivedT>;

	  /// A pointer to the AAResults object that this AAResult is
	  /// aggregated within. May be null if not aggregated.
	- AAResults *AAR;
	+ AAResults *AAR = nullptr;

	  /// Helper to dispatch calls back through the derived type.
	  DerivedT &derived() { return static_cast<DerivedT &>(*this); }

	  /// A setter for the AAResults pointer, which is used to satisfy the
	  /// AAResults::Model contract.
	  void setAAResults(AAResults *NewAAR) { AAR = NewAAR; }

	protected:
	  /// This proxy class models a common pattern where we delegate to either the
	  /// top-level \c AAResults aggregation if one is registered, or to the
	  /// current result if none are registered.
	  class AAResultsProxy {
	    AAResults *AAR;
	    DerivedT &CurrentResult;

	  public:
	    AAResultsProxy(AAResults *AAR, DerivedT &CurrentResult)
	        : AAR(AAR), CurrentResult(CurrentResult) {}

	    AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
	                      AAQueryInfo &AAQI) {
	      return AAR ? AAR->alias(LocA, LocB, AAQI)
	                 : CurrentResult.alias(LocA, LocB, AAQI);
	    }

	    bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
	                                bool OrLocal) {
	      return AAR ? AAR->pointsToConstantMemory(Loc, AAQI, OrLocal)
	                 : CurrentResult.pointsToConstantMemory(Loc, AAQI, OrLocal);
	    }

	    ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
	      return AAR ? AAR->getArgModRefInfo(Call, ArgIdx)
	                 : CurrentResult.getArgModRefInfo(Call, ArgIdx);
	    }

	    FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
	      return AAR ? AAR->getModRefBehavior(Call)
	                 : CurrentResult.getModRefBehavior(Call);
	    }

	    FunctionModRefBehavior getModRefBehavior(const Function *F) {
	      return AAR ? AAR->getModRefBehavior(F) : CurrentResult.getModRefBehavior(F);
	    }

	    ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
	                             AAQueryInfo &AAQI) {
	      return AAR ? AAR->getModRefInfo(Call, Loc, AAQI)
	                 : CurrentResult.getModRefInfo(Call, Loc, AAQI);
	    }

	    ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
	                             AAQueryInfo &AAQI) {
	      return AAR ? AAR->getModRefInfo(Call1, Call2, AAQI)
	                 : CurrentResult.getModRefInfo(Call1, Call2, AAQI);
	    }
	  };

	  explicit AAResultBase() = default;

	  // Provide all the copy and move constructors so that derived types aren't
	  // constrained. Neither constructor copies the AAR pointer from its
	  // argument.
	  AAResultBase(const AAResultBase &Arg) {}
	  AAResultBase(AAResultBase &&Arg) {}

	  /// Get a proxy for the best AA result set to query at this time.
	  ///
	  /// When this result is part of a larger aggregation, this will proxy to that
	  /// aggregation. When this result is used in isolation, it will just delegate
	  /// back to the derived class's implementation.
	  ///
	  /// Note that callers of this need to take considerable care to not cause
	  /// performance problems when they use this routine, in the case of a large
	  /// number of alias analyses being aggregated, it can be expensive to walk
	  /// back across the chain.
	  AAResultsProxy getBestAAResults() { return AAResultsProxy(AAR, derived()); }

	public:
	  // The defaults below are the fallbacks a derived analysis inherits for any
	  // query it does not implement itself.

	  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB,
	                    AAQueryInfo &AAQI) {
	    return MayAlias;
	  }

	  bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
	                              bool OrLocal) {
	    return false;
	  }

	  ModRefInfo getArgModRefInfo(const CallBase *Call, unsigned ArgIdx) {
	    return ModRefInfo::ModRef;
	  }

	  FunctionModRefBehavior getModRefBehavior(const CallBase *Call) {
	    return FMRB_UnknownModRefBehavior;
	  }

	  FunctionModRefBehavior getModRefBehavior(const Function *F) {
	    return FMRB_UnknownModRefBehavior;
	  }

	  ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc,
	                           AAQueryInfo &AAQI) {
	    return ModRefInfo::ModRef;
	  }

	  ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
	                           AAQueryInfo &AAQI) {
	    return ModRefInfo::ModRef;
	  }
	};

	/// Return true if this pointer is returned by a noalias function.
	bool isNoAliasCall(const Value *V);

	/// Return true if this is an argument with the noalias attribute.
	bool isNoAliasArgument(const Value *V);

	/// Return true if this pointer refers to a distinct and identifiable object.
	/// This returns true for:
	/// Global Variables and Functions (but not Global Aliases)
	/// Allocas
	/// ByVal and NoAlias Arguments
	/// NoAlias returns (e.g. calls to malloc)
	///
	bool isIdentifiedObject(const Value *V);

	/// Return true if V is unambiguously identified at the function-level.
	/// Different IdentifiedFunctionLocals can't alias.
	/// Further, an IdentifiedFunctionLocal can not alias with any function
	/// arguments other than itself, which is not necessarily true for
	/// IdentifiedObjects.
	bool isIdentifiedFunctionLocal(const Value *V);

	/// A manager for alias analyses.
	///
	/// This class can have analyses registered with it and when run, it will run
	/// all of them and aggregate their results into single AA results interface
	/// that dispatches across all of the alias analysis results available.
	///
	/// Note that the order in which analyses are registered is very significant.
	/// That is the order in which the results will be aggregated and queried.
	///
	/// This manager effectively wraps the AnalysisManager for registering alias
	/// analyses. When you register your alias analysis with this manager, it will
	/// ensure the analysis itself is registered with its AnalysisManager.
	///
	/// The result of this analysis is only invalidated if one of the particular
	/// aggregated AA results end up being invalidated. This removes the need to
	/// explicitly preserve the results of `AAManager`. Note that analyses should no
	/// longer be registered once the `AAManager` is run.
	class AAManager : public AnalysisInfoMixin<AAManager> {
	public:
	  using Result = AAResults;

	  /// Register a specific AA result produced by a function analysis.
	  template <typename AnalysisT> void registerFunctionAnalysis() {
	    ResultGetters.push_back(&getFunctionAAResultImpl<AnalysisT>);
	  }

	  /// Register a specific AA result produced by a module analysis.
	  template <typename AnalysisT> void registerModuleAnalysis() {
	    ResultGetters.push_back(&getModuleAAResultImpl<AnalysisT>);
	  }

	  /// Build the aggregated result for \p F by invoking every registered
	  /// getter, in registration order.
	  Result run(Function &F, FunctionAnalysisManager &AM) {
	    Result R(AM.getResult<TargetLibraryAnalysis>(F));
	    for (auto &Getter : ResultGetters)
	      (*Getter)(F, AM, R);
	    return R;
	  }

	private:
	  friend AnalysisInfoMixin<AAManager>;

	  static AnalysisKey Key;

	  /// One callback per registered analysis; each adds its result to the
	  /// aggregation passed as the last argument.
	  SmallVector<void (*)(Function &F, FunctionAnalysisManager &AM,
	                       AAResults &AAResults),
	              4> ResultGetters;

	  /// Fetch the function analysis result for \p F, add it to the
	  /// aggregation, and record the analysis ID as a dependency.
	  template <typename AnalysisT>
	  static void getFunctionAAResultImpl(Function &F,
	                                      FunctionAnalysisManager &AM,
	                                      AAResults &AAResults) {
	    AAResults.addAAResult(AM.template getResult<AnalysisT>(F));
	    AAResults.addAADependencyID(AnalysisT::ID());
	  }

	  /// Add the module analysis result only if it is already cached for the
	  /// module containing \p F; otherwise do nothing.
	  template <typename AnalysisT>
	  static void getModuleAAResultImpl(Function &F, FunctionAnalysisManager &AM,
	                                    AAResults &AAResults) {
	    auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
	    auto &MAM = MAMProxy.getManager();
	    // The cached result comes back as a pointer that is null when absent,
	    // and the lookup is keyed on the module itself rather than a pointer.
	    if (auto *R = MAM.template getCachedResult<AnalysisT>(*F.getParent())) {
	      AAResults.addAAResult(*R);
	      MAMProxy
	          .template registerOuterAnalysisInvalidation<AnalysisT, AAManager>();
	    }
	  }
	};

	/// A wrapper pass to provide the legacy pass manager access to a suitably
	/// prepared AAResults object.
	class AAResultsWrapperPass : public FunctionPass {
	// The wrapped aggregation. It is dereferenced unconditionally by
	// getAAResults(), so it must be non-null by then; presumably it is populated
	// in runOnFunction -- confirm in the implementation file.
	std::unique_ptr<AAResults> AAR;

	public:
	static char ID;

	AAResultsWrapperPass();

	/// Access the wrapped AAResults object.
	AAResults &getAAResults() { return *AAR; }
	const AAResults &getAAResults() const { return *AAR; }

	bool runOnFunction(Function &F) override;

	void getAnalysisUsage(AnalysisUsage &AU) const override;
	};

	/// A wrapper pass for external alias analyses. This just squirrels away the
	/// callback used to run any analyses and register their results.
	struct ExternalAAWrapperPass : ImmutablePass {
	using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;

	CallbackT CB;

	static char ID;

	ExternalAAWrapperPass() : ImmutablePass(ID) {
	initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
	}

	explicit ExternalAAWrapperPass(CallbackT CB)
	: ImmutablePass(ID), CB(std::move(CB)) {
	initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesAll();
	}
	};

	/// Create an instance of the AAResults wrapper pass as a \c FunctionPass.
	FunctionPass *createAAResultsWrapperPass();

	/// A wrapper pass around a callback which can be used to populate the
	/// AAResults in the AAResultsWrapperPass from an external AA.
	///
	/// The callback provided here will be used each time we prepare an AAResults
	/// object, and will receive a reference to the function wrapper pass, the
	/// function, and the AAResults object to populate. This should be used when
	/// setting up a custom pass pipeline to inject a hook into the AA results.
	ImmutablePass *createExternalAAWrapperPass(
	std::function<void(Pass &, Function &, AAResults &)> Callback);

	/// A helper for the legacy pass manager to create a \c AAResults
	/// object populated to the best of our ability for a particular function when
	/// inside of a \c ModulePass or a \c CallGraphSCCPass.
	///
	/// If a \c ModulePass or a \c CallGraphSCCPass calls \p
	/// createLegacyPMAAResults, it also needs to call \p
	/// getAAResultsAnalysisUsage in \p getAnalysisUsage.
	AAResults createLegacyPMAAResults(Pass &P, Function &F, BasicAAResult &BAR);

	/// A helper for the legacy pass manager to populate \p AU to add uses to make
	/// sure the analyses required by \p createLegacyPMAAResults are available.
	void getAAResultsAnalysisUsage(AnalysisUsage &AU);

	} // end namespace llvm

	#endif // LLVM_ANALYSIS_ALIASANALYSIS_H
	Index: vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/CodeGen/SelectionDAG.h (revision 351303)
	@@ -1,1768 +1,1786 @@
	//===- llvm/CodeGen/SelectionDAG.h - InstSelection DAG ----------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares the SelectionDAG class, and transitively defines the
	// SDNode class and subclasses.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_SELECTIONDAG_H
	#define LLVM_CODEGEN_SELECTIONDAG_H

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/ilist.h"
	#include "llvm/ADT/iterator.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/FunctionLoweringInfo.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/ArrayRecycler.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/RecyclingAllocator.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <functional>
	#include <map>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	namespace llvm {

	class BlockAddress;
	class Constant;
	class ConstantFP;
	class ConstantInt;
	class DataLayout;
	struct fltSemantics;
	class GlobalValue;
	struct KnownBits;
	class LLVMContext;
	class MachineBasicBlock;
	class MachineConstantPoolValue;
	class MCSymbol;
	class OptimizationRemarkEmitter;
	class SDDbgValue;
	class SDDbgLabel;
	class SelectionDAG;
	class SelectionDAGTargetInfo;
	class TargetLibraryInfo;
	class TargetLowering;
	class TargetMachine;
	class TargetSubtargetInfo;
	class Value;

	class SDVTListNode : public FoldingSetNode {
	  friend struct FoldingSetTrait<SDVTListNode>;

	  /// Reference to the interned FoldingSetNodeID of this node; the backing
	  /// data lives in the SelectionDAG allocator. SDVTList holds the types that
	  /// SelectionDAG accesses frequently, and the list is expected to stay small
	  /// enough that keeping it costs little memory.
	  FoldingSetNodeIDRef FastID;
	  const EVT *VTs;
	  unsigned int NumVTs;
	  /// Hash of the ID, computed once at construction because it never changes.
	  unsigned HashValue;

	public:
	  /// Record the ID and the type array, and cache the hash of \p ID.
	  SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num)
	      : FastID(ID), VTs(VT), NumVTs(Num), HashValue(ID.ComputeHash()) {}

	  /// Return the type array and its length packaged as an SDVTList.
	  SDVTList getSDVTList() { return {VTs, NumVTs}; }
	};

	/// Specialize FoldingSetTrait for SDVTListNode
	/// to avoid computing temp FoldingSetNodeID and hash value.
	template <>
	struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTListNode> {
	  /// The profile of a node is its stored ID.
	  static void Profile(const SDVTListNode &Node, FoldingSetNodeID &ID) {
	    ID = Node.FastID;
	  }

	  /// Compare the cached hash first; the IDs are compared only when the
	  /// hashes agree.
	  static bool Equals(const SDVTListNode &Node, const FoldingSetNodeID &ID,
	                     unsigned IDHash, FoldingSetNodeID &TempID) {
	    return Node.HashValue == IDHash && ID == Node.FastID;
	  }

	  /// Return the hash cached in the node instead of recomputing it.
	  static unsigned ComputeHash(const SDVTListNode &Node,
	                              FoldingSetNodeID &TempID) {
	    return Node.HashValue;
	  }
	};

	/// Allocation traits for lists of SDNode: deleteNode is never expected to be
	/// called, and reaching it is reported as unreachable.
	template <> struct ilist_alloc_traits<SDNode> {
	static void deleteNode(SDNode *) {
	llvm_unreachable("ilist_traits<SDNode> shouldn't see a deleteNode call!");
	}
	};

	/// Keeps track of dbg_value information through SDISel. We do
	/// not build SDNodes for these so as not to perturb the generated code;
	/// instead the info is kept off to the side in this structure. Each SDNode may
	/// have one or more associated dbg_value entries. This information is kept in
	/// DbgValMap.
	/// Byval parameters are handled separately because they don't use alloca's,
	/// which busts the normal mechanism. There is good reason for handling all
	/// parameters separately: they may not have code generated for them, they
	/// should always go at the beginning of the function regardless of other code
	/// motion, and debug info for them is potentially useful even if the parameter
	/// is unused. Right now only byval parameters are handled separately.
	class SDDbgInfo {
	  BumpPtrAllocator Alloc;
	  SmallVector<SDDbgValue*, 32> DbgValues;
	  SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
	  SmallVector<SDDbgLabel*, 4> DbgLabels;
	  /// Maps a node to the debug values attached to it.
	  using DbgValMapType = DenseMap<const SDNode *, SmallVector<SDDbgValue *, 2>>;
	  DbgValMapType DbgValMap;

	public:
	  SDDbgInfo() = default;
	  SDDbgInfo(const SDDbgInfo &) = delete;
	  SDDbgInfo &operator=(const SDDbgInfo &) = delete;

	  /// Record the debug value \p V. It goes into the by-value parameter list
	  /// when \p isParameter is set and into the general list otherwise. When
	  /// \p Node is non-null, \p V is also attached to that node in the map.
	  void add(SDDbgValue *V, const SDNode *Node, bool isParameter) {
	    if (isParameter) {
	      ByvalParmDbgValues.push_back(V);
	    } else
	      DbgValues.push_back(V);
	    if (Node)
	      DbgValMap[Node].push_back(V);
	  }

	  /// Record the debug label \p L.
	  void add(SDDbgLabel *L) {
	    DbgLabels.push_back(L);
	  }

	  /// Invalidate all DbgValues attached to the node and remove
	  /// it from the Node-to-DbgValues map.
	  void erase(const SDNode *Node);

	  /// Drop every recorded value and label, then reset the allocator.
	  void clear() {
	    DbgValMap.clear();
	    DbgValues.clear();
	    ByvalParmDbgValues.clear();
	    DbgLabels.clear();
	    Alloc.Reset();
	  }

	  BumpPtrAllocator &getAlloc() { return Alloc; }

	  /// True when no debug values (of either kind) and no labels are recorded.
	  bool empty() const {
	    return DbgValues.empty() && ByvalParmDbgValues.empty() && DbgLabels.empty();
	  }

	  /// Return the debug values attached to \p Node, or an empty array when the
	  /// node has no entry in the map.
	  ArrayRef<SDDbgValue*> getSDDbgValues(const SDNode *Node) const {
	    auto I = DbgValMap.find(Node);
	    if (I != DbgValMap.end())
	      return I->second;
	    return ArrayRef<SDDbgValue*>();
	  }

	  using DbgIterator = SmallVectorImpl<SDDbgValue*>::iterator;
	  using DbgLabelIterator = SmallVectorImpl<SDDbgLabel*>::iterator;

	  DbgIterator DbgBegin() { return DbgValues.begin(); }
	  DbgIterator DbgEnd() { return DbgValues.end(); }
	  DbgIterator ByvalParmDbgBegin() { return ByvalParmDbgValues.begin(); }
	  DbgIterator ByvalParmDbgEnd() { return ByvalParmDbgValues.end(); }
	  DbgLabelIterator DbgLabelBegin() { return DbgLabels.begin(); }
	  DbgLabelIterator DbgLabelEnd() { return DbgLabels.end(); }
	};

	void checkForCycles(const SelectionDAG *DAG, bool force = false);

	/// This is used to represent a portion of an LLVM function in a low-level
	/// Data Dependence DAG representation suitable for instruction selection.
	/// This DAG is constructed as the first step of instruction selection in order
	/// to allow implementation of machine specific optimizations
	/// and code simplifications.
	///
	/// The representation used by the SelectionDAG is a target-independent
	/// representation, which has some similarities to the GCC RTL representation,
	/// but is significantly more simple, powerful, and is a graph form instead of a
	/// linear form.
	///
	class SelectionDAG {
	const TargetMachine &TM;
	const SelectionDAGTargetInfo *TSI = nullptr;
	const TargetLowering *TLI = nullptr;
	const TargetLibraryInfo *LibInfo = nullptr;
	MachineFunction *MF;
	Pass *SDAGISelPass = nullptr;
	LLVMContext *Context;
	CodeGenOpt::Level OptLevel;

	LegacyDivergenceAnalysis * DA = nullptr;
	FunctionLoweringInfo * FLI = nullptr;

	/// The function-level optimization remark emitter. Used to emit remarks
	/// whenever manipulating the DAG.
	OptimizationRemarkEmitter *ORE;

	/// The starting token.
	SDNode EntryNode;

	/// The root of the entire DAG.
	SDValue Root;

	/// A linked list of nodes in the current DAG.
	ilist<SDNode> AllNodes;

	/// The AllocatorType for allocating SDNodes. We use
	/// pool allocation with recycling.
	using NodeAllocatorType = RecyclingAllocator<BumpPtrAllocator, SDNode,
	sizeof(LargestSDNode),
	alignof(MostAlignedSDNode)>;

	/// Pool allocation for nodes.
	NodeAllocatorType NodeAllocator;

	/// This structure is used to memoize nodes, automatically performing
	/// CSE with existing nodes when a duplicate is requested.
	FoldingSet<SDNode> CSEMap;

	/// Pool allocation for machine-opcode SDNode operands.
	BumpPtrAllocator OperandAllocator;
	ArrayRecycler<SDUse> OperandRecycler;

	/// Pool allocation for misc. objects that are created once per SelectionDAG.
	BumpPtrAllocator Allocator;

	/// Tracks dbg_value and dbg_label information through SDISel.
	SDDbgInfo *DbgInfo;

	using CallSiteInfo = MachineFunction::CallSiteInfo;
	using CallSiteInfoImpl = MachineFunction::CallSiteInfoImpl;
	- DenseMap<const SDNode *, CallSiteInfo> SDCallSiteInfo;

	+ struct CallSiteDbgInfo {
	+ CallSiteInfo CSInfo;
	+ MDNode *HeapAllocSite = nullptr;
	+ };
	+
	+ DenseMap<const SDNode *, CallSiteDbgInfo> SDCallSiteDbgInfo;
	+
	uint16_t NextPersistentId = 0;

	public:
	/// Clients of various APIs that cause global effects on
	/// the DAG can optionally implement this interface. This allows the clients
	/// to handle the various sorts of updates that happen.
	///
	/// A DAGUpdateListener automatically registers itself with DAG when it is
	/// constructed, and removes itself when destroyed in RAII fashion.
	struct DAGUpdateListener {
	  /// The listener that was on top of the stack when this one registered.
	  DAGUpdateListener *const Next;
	  SelectionDAG &DAG;

	  /// Push this listener onto the listener stack of \p D.
	  explicit DAGUpdateListener(SelectionDAG &D)
	      : Next(D.UpdateListeners), DAG(D) {
	    DAG.UpdateListeners = this;
	  }

	  /// Pop this listener; it must be the one on top of the stack.
	  virtual ~DAGUpdateListener() {
	    assert(DAG.UpdateListeners == this &&
	           "DAGUpdateListeners must be destroyed in LIFO order");
	    DAG.UpdateListeners = Next;
	  }

	  /// The node N that was deleted and, if E is not null, an
	  /// equivalent node E that replaced it.
	  virtual void NodeDeleted(SDNode *N, SDNode *E);

	  /// The node N that was updated.
	  virtual void NodeUpdated(SDNode *N);

	  /// The node N that was inserted.
	  virtual void NodeInserted(SDNode *N);
	};

	/// A listener that forwards node-deletion notifications to a callback.
	struct DAGNodeDeletedListener : public DAGUpdateListener {
	  std::function<void(SDNode *, SDNode *)> Callback;

	  DAGNodeDeletedListener(SelectionDAG &DAG,
	                         std::function<void(SDNode *, SDNode *)> Callback)
	      : DAGUpdateListener(DAG), Callback(std::move(Callback)) {}

	  void NodeDeleted(SDNode *N, SDNode *E) override { Callback(N, E); }

	private:
	  virtual void anchor();
	};

	/// When true, additional steps are taken to
	/// ensure that getConstant() and similar functions return DAG nodes that
	/// have legal types. This is important after type legalization since
	/// any illegally typed nodes generated after this point will not experience
	/// type legalization.
	bool NewNodesMustHaveLegalTypes = false;

	private:
	/// DAGUpdateListener is a friend so it can manipulate the listener stack.
	friend struct DAGUpdateListener;

	/// Linked list of registered DAGUpdateListener instances.
	/// This stack is maintained by DAGUpdateListener RAII.
	DAGUpdateListener *UpdateListeners = nullptr;

	/// Implementation of setSubgraphColor.
	/// Return whether we had to truncate the search.
	bool setSubgraphColorHelper(SDNode N, const char Color,
	DenseSet<SDNode *> &visited,
	int level, bool &printed);

	/// Construct an \c SDNodeT in storage obtained from the node allocator,
	/// forwarding \p Args to its constructor.
	template <typename SDNodeT, typename... ArgTypes>
	SDNodeT *newSDNode(ArgTypes &&... Args) {
	  auto *Storage = NodeAllocator.template Allocate<SDNodeT>();
	  return new (Storage) SDNodeT(std::forward<ArgTypes>(Args)...);
	}

	/// Construct a throwaway SDNodeT from the given arguments and return its raw
	/// subclass data as an integer (e.g. for use in a folding set).
	///
	/// The arguments match those of SDNodeT's constructor, except that the
	/// second one (assumed to be a const DebugLoc&) is left out.
	template <typename SDNodeT, typename... ArgTypes>
	static uint16_t getSyntheticNodeSubclassData(unsigned IROrder,
	                                             ArgTypes &&... Args) {
	  // An empty DebugLoc is passed because the compiler can only reduce this to
	  // a constant in that case; the debug location has no bearing on the
	  // subclass data anyway.
	  SDNodeT Synthetic(IROrder, DebugLoc(), std::forward<ArgTypes>(Args)...);
	  return Synthetic.getRawSubclassData();
	}

	/// Overload for node types constructed from an opcode, order, value-type
	/// list, memory type and memory operand: builds a throwaway node with an
	/// empty DebugLoc and returns its raw subclass data.
	template <typename SDNodeTy>
	static uint16_t getSyntheticNodeSubclassData(unsigned Opc, unsigned Order,
	                                             SDVTList VTs, EVT MemoryVT,
	                                             MachineMemOperand *MMO) {
	  SDNodeTy Synthetic(Opc, Order, DebugLoc(), VTs, MemoryVT, MMO);
	  return Synthetic.getRawSubclassData();
	}

	void createOperands(SDNode *Node, ArrayRef<SDValue> Vals);

	/// Hand the operand array of \p Node back to the operand recycler and leave
	/// the node with no operands. Does nothing if the node has no operand list.
	void removeOperands(SDNode *Node) {
	  if (Node->OperandList) {
	    const auto Cap = ArrayRecycler<SDUse>::Capacity::get(Node->NumOperands);
	    OperandRecycler.deallocate(Cap, Node->OperandList);
	    Node->NumOperands = 0;
	    Node->OperandList = nullptr;
	  }
	}
	void CreateTopologicalOrder(std::vector<SDNode*>& Order);
	public:
	explicit SelectionDAG(const TargetMachine &TM, CodeGenOpt::Level);
	SelectionDAG(const SelectionDAG &) = delete;
	SelectionDAG &operator=(const SelectionDAG &) = delete;
	~SelectionDAG();

	/// Prepare this SelectionDAG to process code in the given MachineFunction.
	void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
	Pass PassPtr, const TargetLibraryInfo LibraryInfo,
	LegacyDivergenceAnalysis * Divergence);

	/// Store \p FuncInfo as the function lowering info used by this DAG. The
	/// pointer is stored as given.
	void setFunctionLoweringInfo(FunctionLoweringInfo * FuncInfo) {
	FLI = FuncInfo;
	}

	/// Clear state and free memory necessary to make this
	/// SelectionDAG ready to process a new block.
	void clear();

	// Simple accessors. Those returning references dereference the stored
	// pointer unconditionally, so the pointer must have been set beforehand
	// (presumably by init() -- confirm in the implementation file).

	/// The machine function currently being processed.
	MachineFunction &getMachineFunction() const { return *MF; }
	/// The pass pointer stored in this DAG; may be null.
	const Pass *getPass() const { return SDAGISelPass; }

	/// Data layout of the current machine function.
	const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
	/// The target machine this DAG was constructed with.
	const TargetMachine &getTarget() const { return TM; }
	/// Subtarget of the current machine function.
	const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
	const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
	const TargetLibraryInfo &getLibInfo() const { return *LibInfo; }
	const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
	/// The divergence analysis pointer stored in this DAG; may be null.
	const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; }
	LLVMContext *getContext() const {return Context; }
	OptimizationRemarkEmitter &getORE() const { return *ORE; }

	/// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
	/// \p Title is the caption to use; the overload without arguments is
	/// declared alongside it and defined out of line.
	void viewGraph(const std::string &Title);
	void viewGraph();

	// Per-node attribute strings, keyed by node. Only present in builds
	// without NDEBUG. Presumably the storage behind the graph-attribute
	// setters/getters declared below — confirm against their definitions.
	#ifndef NDEBUG
	std::map<const SDNode *, std::string> NodeGraphAttrs;
	#endif

	/// Clear all previously defined node graph attributes.
	/// Intended to be used from a debugging tool (e.g. gdb).
	void clearGraphAttrs();

	/// Set graph attributes for a node. (eg. "color=red".)
	void setGraphAttrs(const SDNode N, const char Attrs);

	/// Get graph attributes for a node. (eg. "color=red".)
	/// Used from getNodeAttributes.
	const std::string getGraphAttrs(const SDNode *N) const;

	/// Convenience for setting node color attribute.
	void setGraphColor(const SDNode N, const char Color);

	/// Convenience for setting subgraph color attribute.
	void setSubgraphColor(SDNode N, const char Color);

	// Iterator types over the DAG's node list (AllNodes).
	using allnodes_iterator = ilist<SDNode>::iterator;
	using allnodes_const_iterator = ilist<SDNode>::const_iterator;

	/// Read-only iteration bounds over all nodes.
	allnodes_const_iterator allnodes_begin() const {
	  return AllNodes.begin();
	}
	allnodes_const_iterator allnodes_end() const {
	  return AllNodes.end();
	}

	/// Mutable iteration bounds over all nodes.
	allnodes_iterator allnodes_begin() {
	  return AllNodes.begin();
	}
	allnodes_iterator allnodes_end() {
	  return AllNodes.end();
	}

	/// Number of nodes currently in AllNodes.
	ilist<SDNode>::size_type allnodes_size() const { return AllNodes.size(); }

	/// Range form of allnodes_begin()/allnodes_end(), for range-based for loops.
	iterator_range<allnodes_iterator> allnodes() {
	  allnodes_iterator First = allnodes_begin();
	  allnodes_iterator Last = allnodes_end();
	  return make_range(First, Last);
	}
	iterator_range<allnodes_const_iterator> allnodes() const {
	  allnodes_const_iterator First = allnodes_begin();
	  allnodes_const_iterator Last = allnodes_end();
	  return make_range(First, Last);
	}

	/// Return the current root of the SelectionDAG.
	const SDValue &getRoot() const {
	  return Root;
	}

	/// Return the token chain corresponding to the entry of the function,
	/// i.e. result 0 of EntryNode.
	SDValue getEntryNode() const {
	  // EntryNode is a member; cast away the constness this method adds.
	  SDNode *Entry = const_cast<SDNode *>(&EntryNode);
	  return SDValue(Entry, 0);
	}

	/// Set the current root of the SelectionDAG to \p N and return a reference
	/// to the stored root.
	///
	/// \p N must either be null or have type MVT::Other (i.e. be a chain);
	/// this is asserted. For a non-null root, cycle checks run on the new root
	/// node before the assignment and on the whole DAG after it.
	const SDValue &setRoot(SDValue N) {
	  SDNode *const NewRoot = N.getNode();
	  assert((!NewRoot || N.getValueType() == MVT::Other) &&
	         "DAG root value is not a chain!");
	  if (NewRoot)
	    checkForCycles(NewRoot, this);
	  Root = N;
	  if (NewRoot)
	    checkForCycles(this);
	  return Root;
	}

	// Declared only in builds without NDEBUG. Judging by the name this checks
	// the DAG's divergence information; the identifier is spelled "Diverence"
	// and must stay that way to match its definition and callers.
	#ifndef NDEBUG
	void VerifyDAGDiverence();
	#endif

	/// Iterate over the nodes in the SelectionDAG, folding certain types of
	/// nodes together or eliminating superfluous nodes. \p Level controls
	/// whether Combine is allowed to produce nodes and types that are illegal
	/// on the target.
	void Combine(CombineLevel Level, AliasAnalysis *AA,
	CodeGenOpt::Level OptLevel);

	/// Transform the SelectionDAG into a SelectionDAG that only uses types
	/// natively supported by the target.
	///
	/// \returns true if it made any changes.
	///
	/// Note that this is an involved process that may invalidate pointers into
	/// the graph.
	bool LegalizeTypes();

	/// Transform the SelectionDAG into a SelectionDAG that is compatible with
	/// the target instruction selector, as indicated by the TargetLowering
	/// object.
	///
	/// Note that this is an involved process that may invalidate pointers into
	/// the graph.
	void Legalize();

	/// Transforms a SelectionDAG node and any operands to it into a node
	/// that is compatible with the target instruction selector, as indicated by
	/// the TargetLowering object.
	///
	/// \returns true if \c N is a valid, legal node after calling this.
	///
	/// This essentially runs a single recursive walk of the \c Legalize process
	/// over the given node (and its operands). This can be used to incrementally
	/// legalize the DAG. All of the nodes which are directly replaced,
	/// potentially including N, are added to the output parameter \c
	/// UpdatedNodes so that the delta to the DAG can be understood by the
	/// caller.
	///
	/// When this returns false, N has been legalized in a way that make the
	/// pointer passed in no longer valid. It may have even been deleted from the
	/// DAG, and so it shouldn't be used further. When this returns true, the
	/// N passed in is a legal node, and can be immediately processed as such.
	/// This may still have done some work on the DAG, and will still populate
	/// UpdatedNodes with any new nodes replacing those originally in the DAG.
	bool LegalizeOp(SDNode N, SmallSetVector<SDNode , 16> &UpdatedNodes);

	/// Transform the SelectionDAG into a SelectionDAG that only uses vector
	/// math operations supported by the target. This is necessary as a
	/// separate step from Legalize because unrolling a vector operation can
	/// introduce illegal types, which requires running LegalizeTypes again.
	///
	/// \returns true if it made any changes; in that case, LegalizeTypes
	/// is called again before Legalize.
	///
	/// Note that this is an involved process that may invalidate pointers into
	/// the graph.
	bool LegalizeVectors();

	/// Delete all unreachable nodes in the SelectionDAG.
	void RemoveDeadNodes();

	/// Remove the specified node from the system. This node must
	/// have no referrers.
	void DeleteNode(SDNode *N);

	/// Return an SDVTList that represents the list of value types specified,
	/// given either as one to four individual EVTs or as an array.
	SDVTList getVTList(EVT VT);
	SDVTList getVTList(EVT VT1, EVT VT2);
	SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3);
	SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4);
	SDVTList getVTList(ArrayRef<EVT> VTs);

	//===--------------------------------------------------------------------===//
	// Node creation methods.

	/// Create a ConstantSDNode wrapping a constant value.
	/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
	///
	/// If only legal types can be produced, this does the necessary
	/// transformations (e.g., if the vector element type is illegal).
	/// @{
	SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
	                    bool isTarget = false, bool isOpaque = false);
	SDValue getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
	                    bool isTarget = false, bool isOpaque = false);

	/// Constant of type \p VT with every bit of each scalar element set.
	SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
	                           bool IsOpaque = false) {
	  // The APInt is as wide as VT's scalar type; it goes through the
	  // APInt overload of getConstant.
	  const APInt AllOnes = APInt::getAllOnesValue(VT.getScalarSizeInBits());
	  return getConstant(AllOnes, DL, VT, IsTarget, IsOpaque);
	}

	SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
	                    bool isTarget = false, bool isOpaque = false);
	SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
	                          bool isTarget = false);
	SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL,
	                               bool LegalTypes = true);

	// Target-constant conveniences: each forwards to the getConstant overload
	// for the same value type with isTarget set to true.
	SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT,
	                          bool isOpaque = false) {
	  return getConstant(Val, DL, VT, /*isTarget=*/true, isOpaque);
	}
	SDValue getTargetConstant(const APInt &Val, const SDLoc &DL, EVT VT,
	                          bool isOpaque = false) {
	  return getConstant(Val, DL, VT, /*isTarget=*/true, isOpaque);
	}
	SDValue getTargetConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
	                          bool isOpaque = false) {
	  return getConstant(Val, DL, VT, /*isTarget=*/true, isOpaque);
	}

	/// Create a true (\p V == true) or false constant of type \p VT using the
	/// target's BooleanContent for type \p OpVT.
	SDValue getBoolConstant(bool V, const SDLoc &DL, EVT VT, EVT OpVT);
	/// @}

	/// Create a ConstantFPSDNode wrapping a constant value.
	/// If VT is a vector type, the constant is splatted into a BUILD_VECTOR.
	///
	/// If only legal types can be produced, this does the necessary
	/// transformations (e.g., if the vector element type is illegal).
	/// The forms that take a double should only be used for simple constants
	/// that can be exactly represented in VT. No checks are made.
	/// @{
	SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT,
	                      bool isTarget = false);
	SDValue getConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT,
	                      bool isTarget = false);
	SDValue getConstantFP(const ConstantFP &V, const SDLoc &DL, EVT VT,
	                      bool isTarget = false);

	// Target-constant conveniences: each forwards to the getConstantFP
	// overload for the same value type with isTarget set to true.
	SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT) {
	  return getConstantFP(Val, DL, VT, /*isTarget=*/true);
	}
	SDValue getTargetConstantFP(const APFloat &Val, const SDLoc &DL, EVT VT) {
	  return getConstantFP(Val, DL, VT, /*isTarget=*/true);
	}
	SDValue getTargetConstantFP(const ConstantFP &Val, const SDLoc &DL, EVT VT) {
	  return getConstantFP(Val, DL, VT, /*isTarget=*/true);
	}
	/// @}

	/// Node creation for a reference to global value \p GV (defined out of
	/// line).
	SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
	                         int64_t offset = 0, bool isTargetGA = false,
	                         unsigned char TargetFlags = 0);

	/// Same as getGlobalAddress() with isTargetGA fixed to true.
	SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT,
	                               int64_t offset = 0,
	                               unsigned char TargetFlags = 0) {
	  return getGlobalAddress(GV, DL, VT, offset, /*isTargetGA=*/true,
	                          TargetFlags);
	}
	/// Node creation for frame index \p FI (defined out of line).
	SDValue getFrameIndex(int FI, EVT VT, bool isTarget = false);

	/// Same as getFrameIndex() with isTarget fixed to true.
	SDValue getTargetFrameIndex(int FI, EVT VT) {
	  return getFrameIndex(FI, VT, /*isTarget=*/true);
	}
	/// Node creation for jump table index \p JTI (defined out of line).
	SDValue getJumpTable(int JTI, EVT VT, bool isTarget = false,
	                     unsigned char TargetFlags = 0);

	/// Same as getJumpTable() with isTarget fixed to true.
	SDValue getTargetJumpTable(int JTI, EVT VT, unsigned char TargetFlags = 0) {
	  return getJumpTable(JTI, VT, /*isTarget=*/true, TargetFlags);
	}
	/// Node creation for a constant-pool entry holding IR constant \p C
	/// (defined out of line).
	SDValue getConstantPool(const Constant *C, EVT VT,
	                        unsigned Align = 0, int Offs = 0, bool isT=false,
	                        unsigned char TargetFlags = 0);

	/// Same as the Constant overload of getConstantPool() with isT fixed to
	/// true.
	SDValue getTargetConstantPool(const Constant *C, EVT VT,
	                              unsigned Align = 0, int Offset = 0,
	                              unsigned char TargetFlags = 0) {
	  return getConstantPool(C, VT, Align, Offset, /*isT=*/true, TargetFlags);
	}

	/// Node creation for a constant-pool entry holding machine constant-pool
	/// value \p C (defined out of line).
	SDValue getConstantPool(MachineConstantPoolValue *C, EVT VT,
	                        unsigned Align = 0, int Offs = 0, bool isT=false,
	                        unsigned char TargetFlags = 0);

	/// Same as the MachineConstantPoolValue overload of getConstantPool() with
	/// isT fixed to true.
	SDValue getTargetConstantPool(MachineConstantPoolValue *C,
	                              EVT VT, unsigned Align = 0,
	                              int Offset = 0, unsigned char TargetFlags=0) {
	  return getConstantPool(C, VT, Align, Offset, /*isT=*/true, TargetFlags);
	}
	// Node creation for target indices, basic blocks and symbols. All of these
	// are declarations only; the definitions live out of line.
	SDValue getTargetIndex(int Index, EVT VT, int64_t Offset = 0,
	unsigned char TargetFlags = 0);
	// When generating a branch to a BB, we don't in general know enough
	// to provide debug info for the BB at that time, so the overload that
	// takes no SDLoc is kept around.
	SDValue getBasicBlock(MachineBasicBlock *MBB);
	SDValue getBasicBlock(MachineBasicBlock *MBB, SDLoc dl);
	SDValue getExternalSymbol(const char *Sym, EVT VT);
	SDValue getExternalSymbol(const char *Sym, const SDLoc &dl, EVT VT);
	SDValue getTargetExternalSymbol(const char *Sym, EVT VT,
	unsigned char TargetFlags = 0);
	SDValue getMCSymbol(MCSymbol *Sym, EVT VT);

	// Node creation for value types, registers, register masks and labels.
	// Declarations only; see the out-of-line definitions for exact semantics.
	SDValue getValueType(EVT);
	SDValue getRegister(unsigned Reg, EVT VT);
	SDValue getRegisterMask(const uint32_t *RegMask);
	SDValue getEHLabel(const SDLoc &dl, SDValue Root, MCSymbol *Label);
	SDValue getLabelNode(unsigned Opcode, const SDLoc &dl, SDValue Root,
	MCSymbol *Label);
	/// Node creation for block address \p BA (defined out of line).
	SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
	                        int64_t Offset = 0, bool isTarget = false,
	                        unsigned char TargetFlags = 0);

	/// Same as getBlockAddress() with isTarget fixed to true.
	SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
	                              int64_t Offset = 0,
	                              unsigned char TargetFlags = 0) {
	  return getBlockAddress(BA, VT, Offset, /*isTarget=*/true, TargetFlags);
	}

	/// Build an ISD::CopyToReg node with result type MVT::Other and operands
	/// (Chain, register node for \p Reg typed like \p N, N).
	SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg,
	                     SDValue N) {
	  SDValue RegNode = getRegister(Reg, N.getValueType());
	  return getNode(ISD::CopyToReg, dl, MVT::Other, Chain, RegNode, N);
	}

	// This flavour of getCopyToReg takes an extra Glue operand: the node
	// always produces a glue result in addition to the chain, and Glue is
	// appended as an incoming operand only when it is non-null.
	SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N,
	                     SDValue Glue) {
	  SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
	  SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue };
	  // Drop the trailing slot when there is no incoming glue.
	  const size_t NumOps = Glue.getNode() ? 4 : 3;
	  return getNode(ISD::CopyToReg, dl, VTs, makeArrayRef(Ops, NumOps));
	}

	// Same as the previous getCopyToReg(), except that the register is
	// supplied as an already-built SDValue instead of a register number.
	SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, SDValue Reg, SDValue N,
	                     SDValue Glue) {
	  SDVTList VTs = getVTList(MVT::Other, MVT::Glue);
	  SDValue Ops[] = { Chain, Reg, N, Glue };
	  // Drop the trailing slot when there is no incoming glue.
	  const size_t NumOps = Glue.getNode() ? 4 : 3;
	  return getNode(ISD::CopyToReg, dl, VTs, makeArrayRef(Ops, NumOps));
	}

	/// Build an ISD::CopyFromReg node producing (VT, MVT::Other) from the
	/// operands (Chain, register node for \p Reg of type \p VT).
	SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT) {
	  SDVTList ResultTys = getVTList(VT, MVT::Other);
	  SDValue RegNode = getRegister(Reg, VT);
	  SDValue Ops[] = { Chain, RegNode };
	  return getNode(ISD::CopyFromReg, dl, ResultTys, Ops);
	}

	// This flavour of getCopyFromReg takes an extra Glue operand: the node
	// always produces a glue result in addition to the value and chain, and
	// Glue is appended as an incoming operand only when it is non-null.
	SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT,
	                       SDValue Glue) {
	  SDVTList ResultTys = getVTList(VT, MVT::Other, MVT::Glue);
	  SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue };
	  // Drop the trailing slot when there is no incoming glue.
	  const size_t NumOps = Glue.getNode() ? 3 : 2;
	  return getNode(ISD::CopyFromReg, dl, ResultTys, makeArrayRef(Ops, NumOps));
	}

	/// Node creation for condition code \p Cond (defined out of line).
	SDValue getCondCode(ISD::CondCode Cond);

	/// Return an ISD::VECTOR_SHUFFLE node. The number of elements in VT,
	/// which must be a vector type, must match the number of mask elements
	/// (presumably the size of \p Mask — confirm against the definition).
	/// An integer mask element equal to -1 is treated as undefined.
	SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
	ArrayRef<int> Mask);

	/// Return an ISD::BUILD_VECTOR node built from SDValue operands. VT must
	/// be a vector type whose element count equals the number of operands in
	/// Ops, and each operand must have VT's element type (or, for integers, a
	/// wider type).
	SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDValue> Ops) {
	  // No checks here: VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
	  const unsigned Opc = ISD::BUILD_VECTOR;
	  return getNode(Opc, DL, VT, Ops);
	}

	/// Return an ISD::BUILD_VECTOR node built from SDUse operands. VT must be
	/// a vector type whose element count equals the number of operands in Ops,
	/// and each operand must have VT's element type (or, for integers, a wider
	/// type).
	SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef<SDUse> Ops) {
	  // No checks here: VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
	  const unsigned Opc = ISD::BUILD_VECTOR;
	  return getNode(Opc, DL, VT, Ops);
	}

	/// Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all
	/// elements. VT must be a vector type. Op's type must be the same as (or,
	/// for integers, a type wider than) VT's element type.
	///
	/// When Op is an UNDEF node, an UNDEF of type VT is returned instead of a
	/// BUILD_VECTOR; that UNDEF is created with a default SDLoc rather than
	/// \p DL.
	SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) {
	  // VerifySDNode (via InsertNode) checks BUILD_VECTOR later.
	  if (Op.getOpcode() == ISD::UNDEF) {
	    // The UNDEF shortcut bypasses the BUILD_VECTOR verifier, so the
	    // element-type requirement is asserted here.
	    assert((VT.getVectorElementType() == Op.getValueType() ||
	            (VT.isInteger() &&
	             VT.getVectorElementType().bitsLE(Op.getValueType()))) &&
	           "A splatted value must have a width equal or (for integers) "
	           "greater than the vector element type!");
	    return getNode(ISD::UNDEF, SDLoc(), VT);
	  }

	  // One copy of Op per vector element.
	  SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Op);
	  return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
	}

	/// Return an ISD::VECTOR_SHUFFLE node semantically equivalent to the
	/// input shuffle node \p SV but with its two operands swapped (the mask is
	/// adjusted accordingly, as in the example).
	///
	/// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
	SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);

	/// Convert Op, which must be of float type, to the
	/// float type VT, by either extending or rounding (by truncation).
	SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);

	/// Convert Op, which must be of integer type, to the
	/// integer type VT, by either any-extending or truncating it.
	SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);

	/// Convert Op, which must be of integer type, to the
	/// integer type VT, by either sign-extending or truncating it.
	SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);

	/// Convert Op, which must be of integer type, to the
	/// integer type VT, by either zero-extending or truncating it.
	SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);

	/// Return the expression required to zero extend the Op
	/// value assuming it was the smaller SrcTy value. (The comment's "SrcTy"
	/// presumably corresponds to the \p VT parameter — confirm against the
	/// definition.)
	SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);

	/// Convert Op, which must be of integer type, to the integer type VT, by
	/// either truncating it or extending it with whichever of zero or sign
	/// extension is appropriate for the pointer's semantics.
	SDValue getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);

	/// Return the expression required to extend the Op as a pointer value
	/// assuming it was the smaller SrcTy value. This may be either a zero extend
	/// or a sign extend.
	SDValue getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);

	/// Convert Op, which must be of integer type, to the integer type VT,
	/// by either truncating it or using an extension appropriate for the
	/// target's BooleanContent for type OpVT.
	SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT);

	/// Create a bitwise NOT operation as (XOR Val, -1).
	SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT);

	/// Create a logical NOT operation as (XOR Val, BooleanOne).
	SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT);

	/// Create an add with appropriate flags for addressing some offset into
	/// an object, e.g. when a load is split into multiple components: the
	/// result is an `add nuw` of the base pointer \p Op and \p Offset. This
	/// overload first materialises the constant offset in Op's value type.
	SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, int64_t Offset) {
	  SDValue OffsetNode = getConstant(Offset, SL, Op.getValueType());
	  return getObjectPtrOffset(SL, Op, OffsetNode);
	}

	/// As above, with the offset already available as an SDValue.
	SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Op, SDValue Offset) {
	  // The object itself can't wrap around the address space, so adding the
	  // offsets of its split parts should not be able to overflow; hence nuw.
	  SDNodeFlags NoWrap;
	  NoWrap.setNoUnsignedWrap(true);
	  return getNode(ISD::ADD, SL, Op.getValueType(), Op, Offset, NoWrap);
	}

	/// Return a new CALLSEQ_START node, which starts a new call frame in which
	/// InSize bytes are set up inside the CALLSEQ_START..CALLSEQ_END sequence
	/// and OutSize specifies the part of the frame set up prior to the
	/// sequence. The node produces a chain and a glue result.
	SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize,
	                         const SDLoc &DL) {
	  SDVTList ResultTys = getVTList(MVT::Other, MVT::Glue);
	  // Both sizes become target intptr constants.
	  SDValue InSizeNode = getIntPtrConstant(InSize, DL, /*isTarget=*/true);
	  SDValue OutSizeNode = getIntPtrConstant(OutSize, DL, /*isTarget=*/true);
	  SDValue Ops[] = { Chain, InSizeNode, OutSizeNode };
	  return getNode(ISD::CALLSEQ_START, DL, ResultTys, Ops);
	}

	/// Return a new CALLSEQ_END node, which always must have a
	/// glue result (to ensure it's not CSE'd).
	/// CALLSEQ_END does not have a useful SDLoc.
	/// \p InGlue is appended as a fourth operand only when it is non-null.
	SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2,
	                       SDValue InGlue, const SDLoc &DL) {
	  SDVTList NodeTys = getVTList(MVT::Other, MVT::Glue);
	  SDValue Ops[] = { Chain, Op1, Op2, InGlue };
	  // Drop the trailing slot when there is no incoming glue.
	  const size_t NumOps = InGlue.getNode() ? 4 : 3;
	  return getNode(ISD::CALLSEQ_END, DL, NodeTys, makeArrayRef(Ops, NumOps));
	}

	/// Return true if the result of the operation \p Opcode applied to
	/// operands \p Ops is always undefined.
	bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);

	/// Return an UNDEF node of type \p VT. UNDEF does not have a useful SDLoc,
	/// so a default-constructed one is used.
	SDValue getUNDEF(EVT VT) {
	  SDLoc NoLoc;
	  return getNode(ISD::UNDEF, NoLoc, VT);
	}

	/// Return a GLOBAL_OFFSET_TABLE node of type \p VT. This does not have a
	/// useful SDLoc, so a default-constructed one is used.
	SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
	  SDLoc NoLoc;
	  return getNode(ISD::GLOBAL_OFFSET_TABLE, NoLoc, VT);
	}

	/// Gets or creates the specified node.
	///
	/// The overloads below differ in how the result type(s) are given (a
	/// single EVT, an array of EVTs, or an SDVTList) and in how the operands
	/// are given (an array, or zero to five individual SDValues). Some of the
	/// single-EVT forms additionally accept SDNodeFlags.
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	ArrayRef<SDUse> Ops);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	ArrayRef<SDValue> Ops, const SDNodeFlags Flags = SDNodeFlags());
	SDValue getNode(unsigned Opcode, const SDLoc &DL, ArrayRef<EVT> ResultTys,
	ArrayRef<SDValue> Ops);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	ArrayRef<SDValue> Ops);

	// Specialize based on number of operands.
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
	const SDNodeFlags Flags = SDNodeFlags());
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
	SDValue N2, const SDNodeFlags Flags = SDNodeFlags());
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
	SDValue N2, SDValue N3,
	const SDNodeFlags Flags = SDNodeFlags());
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
	SDValue N2, SDValue N3, SDValue N4);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
	SDValue N2, SDValue N3, SDValue N4, SDValue N5);

	// Specialize again based on number of operands for nodes with a VTList
	// rather than a single VT.
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
	SDValue N2);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
	SDValue N2, SDValue N3);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
	SDValue N2, SDValue N3, SDValue N4);
	SDValue getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, SDValue N1,
	SDValue N2, SDValue N3, SDValue N4, SDValue N5);

	/// Compute a TokenFactor to force all the incoming stack arguments to be
	/// loaded from the stack. This is used in tail call lowering to protect
	/// stack arguments from being clobbered.
	SDValue getStackArgumentTokenFactor(SDValue Chain);

	// Builders for memcpy / memmove / memset operations on \p Chain. These are
	// declarations only; how each one is lowered is decided in the
	// out-of-line definitions.
	SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
	SDValue Size, unsigned Align, bool isVol, bool AlwaysInline,
	bool isTailCall, MachinePointerInfo DstPtrInfo,
	MachinePointerInfo SrcPtrInfo);

	SDValue getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
	SDValue Size, unsigned Align, bool isVol, bool isTailCall,
	MachinePointerInfo DstPtrInfo,
	MachinePointerInfo SrcPtrInfo);

	SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
	SDValue Size, unsigned Align, bool isVol, bool isTailCall,
	MachinePointerInfo DstPtrInfo);

	// Builders for the atomic variants. Unlike the plain forms they take
	// per-pointer alignments plus a size type (SizeTy) and an element size
	// (ElemSz).
	SDValue getAtomicMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
	unsigned DstAlign, SDValue Src, unsigned SrcAlign,
	SDValue Size, Type *SizeTy, unsigned ElemSz,
	bool isTailCall, MachinePointerInfo DstPtrInfo,
	MachinePointerInfo SrcPtrInfo);

	SDValue getAtomicMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
	unsigned DstAlign, SDValue Src, unsigned SrcAlign,
	SDValue Size, Type *SizeTy, unsigned ElemSz,
	bool isTailCall, MachinePointerInfo DstPtrInfo,
	MachinePointerInfo SrcPtrInfo);

	SDValue getAtomicMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
	unsigned DstAlign, SDValue Value, SDValue Size,
	Type *SizeTy, unsigned ElemSz, bool isTailCall,
	MachinePointerInfo DstPtrInfo);

	/// Convenience wrapper for building an ISD::SETCC node when the condition
	/// is available as an ISD::CondCode rather than as an SDValue.
	///
	/// Asserts that LHS, RHS and VT are either all vectors or all scalars, and
	/// that \p Cond is not ISD::SETCC_INVALID.
	SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS,
	                 ISD::CondCode Cond) {
	  assert(LHS.getValueType().isVector() == RHS.getValueType().isVector() &&
	         "Cannot compare scalars to vectors");
	  assert(LHS.getValueType().isVector() == VT.isVector() &&
	         "Cannot compare scalars to vectors");
	  assert(Cond != ISD::SETCC_INVALID &&
	         "Cannot create a setCC of an invalid node.");
	  // Wrap the condition code in a node so it can be the third operand.
	  SDValue CondNode = getCondCode(Cond);
	  return getNode(ISD::SETCC, DL, VT, LHS, RHS, CondNode);
	}

	/// Convenience wrapper for building a select without checking for vectors
	/// yourself: emits ISD::VSELECT when the condition has a vector type and
	/// ISD::SELECT otherwise.
	///
	/// Asserts that LHS and RHS have the same type and that VT is a vector
	/// exactly when LHS is.
	SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
	                  SDValue RHS) {
	  assert(LHS.getValueType() == RHS.getValueType() &&
	         "Cannot use select on differing types");
	  assert(VT.isVector() == LHS.getValueType().isVector() &&
	         "Cannot mix vectors and scalars");
	  unsigned Opcode = ISD::SELECT;
	  if (Cond.getValueType().isVector())
	    Opcode = ISD::VSELECT;
	  return getNode(Opcode, DL, VT, Cond, LHS, RHS);
	}

	/// Convenience wrapper for building an ISD::SELECT_CC node when the
	/// condition is available as an ISD::CondCode rather than as an SDValue.
	/// The result type is the type of \p True.
	SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
	                    SDValue False, ISD::CondCode Cond) {
	  const EVT ResultVT = True.getValueType();
	  SDValue CondNode = getCondCode(Cond);
	  return getNode(ISD::SELECT_CC, DL, ResultVT, LHS, RHS, True, False,
	                 CondNode);
	}

	/// Try to simplify a select/vselect with condition \p Cond, true value
	/// \p TVal and false value \p FVal into one of its operands or a constant.
	SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal);

	/// Try to simplify a shift of \p X by \p Y into one of its operands or a
	/// constant.
	SDValue simplifyShift(SDValue X, SDValue Y);

	/// Try to simplify a floating-point binary operation into one of its
	/// operands or a constant.
	SDValue simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y);

	/// VAArg produces a result and token chain, and takes a pointer
	/// and a source value as input.
	SDValue getVAArg(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
	SDValue SV, unsigned Align);

	/// Gets a node for an atomic cmpxchg op. There are two
	/// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
	/// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
	/// a success flag (initially i1), and a chain.
	SDValue getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	SDVTList VTs, SDValue Chain, SDValue Ptr,
	SDValue Cmp, SDValue Swp, MachineMemOperand *MMO);

	/// Gets a node for an atomic op, produces result (if relevant)
	/// and chain and takes 2 operands.
	SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain,
	SDValue Ptr, SDValue Val, MachineMemOperand *MMO);

	/// Gets a node for an atomic op, produces result and chain and
	/// takes 1 operand.
	SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, EVT VT,
	SDValue Chain, SDValue Ptr, MachineMemOperand *MMO);

	/// Gets a node for an atomic op, produces result and chain and takes N
	/// operands.
	SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	SDVTList VTList, ArrayRef<SDValue> Ops,
	MachineMemOperand *MMO);

	/// Creates a MemIntrinsicNode that may produce a
	/// result and takes a list of operands. Opcode may be INTRINSIC_VOID,
	/// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not
	/// less than FIRST_TARGET_MEMORY_OPCODE.
	///
	/// This overload describes the memory access through \p PtrInfo, \p Align,
	/// \p Flags, \p Size and \p AAInfo; by default the access is flagged as
	/// both a load and a store.
	SDValue getMemIntrinsicNode(
	    unsigned Opcode, const SDLoc &dl, SDVTList VTList,
	    ArrayRef<SDValue> Ops, EVT MemVT,
	    MachinePointerInfo PtrInfo,
	    unsigned Align = 0,
	    MachineMemOperand::Flags Flags
	    = MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
	    unsigned Size = 0,
	    const AAMDNodes &AAInfo = AAMDNodes());

	/// As above, with the memory access described by an existing
	/// MachineMemOperand.
	SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList,
	                            ArrayRef<SDValue> Ops, EVT MemVT,
	                            MachineMemOperand *MMO);

	/// Creates a LifetimeSDNode that starts (`IsStart==true`) or ends
	/// (`IsStart==false`) the lifetime of the portion of `FrameIndex` between
	/// offsets `Offset` and `Offset + Size`. `Offset` defaults to -1.
	SDValue getLifetimeNode(bool IsStart, const SDLoc &dl, SDValue Chain,
	int FrameIndex, int64_t Size, int64_t Offset = -1);

	/// Create a MERGE_VALUES node from the given operands.
	SDValue getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl);

	/// Loads are not normal binary operators: their result type is not
	/// determined by their operands, and they produce a value AND a token chain.
	///
	/// This function will set the MOLoad flag on MMOFlags, so callers need not
	/// set it themselves (though they may). The MOStore flag must not be set.
	///
	/// Each builder below comes in two flavours: one describing the memory
	/// access via MachinePointerInfo / alignment / flags, and one taking an
	/// existing MachineMemOperand.
	SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
	MachinePointerInfo PtrInfo, unsigned Alignment = 0,
	MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
	const AAMDNodes &AAInfo = AAMDNodes(),
	const MDNode *Ranges = nullptr);
	SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
	MachineMemOperand *MMO);
	SDValue
	getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain,
	SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT,
	unsigned Alignment = 0,
	MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
	const AAMDNodes &AAInfo = AAMDNodes());
	SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT,
	SDValue Chain, SDValue Ptr, EVT MemVT,
	MachineMemOperand *MMO);
	SDValue getIndexedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
	SDValue Offset, ISD::MemIndexedMode AM);
	SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
	const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
	MachinePointerInfo PtrInfo, EVT MemVT, unsigned Alignment = 0,
	MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
	const AAMDNodes &AAInfo = AAMDNodes(),
	const MDNode *Ranges = nullptr);
	SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT,
	const SDLoc &dl, SDValue Chain, SDValue Ptr, SDValue Offset,
	EVT MemVT, MachineMemOperand *MMO);

	/// Helper function to build ISD::STORE nodes.
	///
	/// This function will set the MOStore flag on MMOFlags, so callers need
	/// not set it themselves (though they may). The MOLoad and MOInvariant
	/// flags must not be set.
	///
	/// Each builder below comes in two flavours: one describing the memory
	/// access via MachinePointerInfo / alignment / flags, and one taking an
	/// existing MachineMemOperand.
	SDValue
	getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
	MachinePointerInfo PtrInfo, unsigned Alignment = 0,
	MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
	const AAMDNodes &AAInfo = AAMDNodes());
	SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
	MachineMemOperand *MMO);
	SDValue
	getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr,
	MachinePointerInfo PtrInfo, EVT SVT, unsigned Alignment = 0,
	MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone,
	const AAMDNodes &AAInfo = AAMDNodes());
	SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	SDValue Ptr, EVT SVT, MachineMemOperand *MMO);
	SDValue getIndexedStore(SDValue OrigStore, const SDLoc &dl, SDValue Base,
	SDValue Offset, ISD::MemIndexedMode AM);

	/// Returns the sum of the base pointer \p Base and \p Offset.
	SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL);

	// Builders for masked load/store and gather/scatter nodes. Declarations
	// only; operand conventions are set by the out-of-line definitions.
	SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
	SDValue Mask, SDValue Src0, EVT MemVT,
	MachineMemOperand *MMO, ISD::LoadExtType,
	bool IsExpanding = false);
	SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	SDValue Ptr, SDValue Mask, EVT MemVT,
	MachineMemOperand *MMO, bool IsTruncating = false,
	bool IsCompressing = false);
	SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
	ArrayRef<SDValue> Ops, MachineMemOperand *MMO);
	SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
	ArrayRef<SDValue> Ops, MachineMemOperand *MMO);

	/// Return a target-specific node, either newly created or an existing
	/// equivalent one.
	/// \tparam TargetMemSDNode the node class; it should be derived from
	/// MemSDNode.
	template <class TargetMemSDNode>
	SDValue getTargetMemSDNode(SDVTList VTs, ArrayRef<SDValue> Ops,
	const SDLoc &dl, EVT MemVT,
	MachineMemOperand *MMO);

	/// Construct a node to track a Value* through the backend.
	SDValue getSrcValue(const Value *v);

	/// Return an MDNodeSDNode which holds an MDNode.
	SDValue getMDNode(const MDNode *MD);

	/// Return a bitcast using the SDLoc of the value operand, and casting to the
	/// provided type. Use getNode to set a custom SDLoc.
	SDValue getBitcast(EVT VT, SDValue V);

	/// Return an AddrSpaceCastSDNode.
	SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS,
	unsigned DestAS);

	/// Return the specified value cast to
	/// the target's desired shift amount type.
	SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);

	/// Expand the specified \c ISD::VAARG node as the Legalize pass would.
	SDValue expandVAArg(SDNode *Node);

	/// Expand the specified \c ISD::VACOPY node as the Legalize pass would.
	SDValue expandVACopy(SDNode *Node);

	/// Returns a GlobalAddress of the function from the current module with
	/// name matching the given ExternalSymbol. Additionally can provide the
	/// matched function through \p TargetFunction.
	/// Panics if the function doesn't exist.
	SDValue getSymbolFunctionGlobalAddress(SDValue Op,
	Function **TargetFunction = nullptr);

	/// Mutate the specified node in-place to have the
	/// specified operands. If the resultant node already exists in the DAG,
	/// this does not modify the specified node, instead it returns the node that
	/// already exists. If the resultant node does not exist in the DAG, the
	/// input node is returned. As a degenerate case, if you specify the same
	/// input operands as the node already has, the input node is returned.
	///
	/// The node is passed and returned by pointer: in-place mutation and
	/// "returns the node that already exists" both require node identity,
	/// which a by-value SDNode could not provide.
	SDNode *UpdateNodeOperands(SDNode *N, SDValue Op);
	SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2);
	SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                           SDValue Op3);
	SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                           SDValue Op3, SDValue Op4);
	SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                           SDValue Op3, SDValue Op4, SDValue Op5);
	SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops);

	/// Creates a new TokenFactor containing \p Vals. If \p Vals contains 64k
	/// values or more, move values into new TokenFactors in 64k-1 blocks, until
	/// the final TokenFactor has less than 64k operands.
	SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl<SDValue> &Vals);

	/// Mutate the specified machine node's memory references to the provided
	/// list.
	void setNodeMemRefs(MachineSDNode *N,
	ArrayRef<MachineMemOperand *> NewMemRefs);

	// Propagates the change in divergence to users
	void updateDivergence(SDNode * N);

	/// These are used for target selectors to mutate the
	/// specified node to have the specified return type, Target opcode, and
	/// operands. Note that target opcodes are stored as
	/// ~TargetOpcode in the node opcode field. The resultant node is returned.
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
	                     SDValue Op1, SDValue Op2);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
	                     SDValue Op1, SDValue Op2, SDValue Op3);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT,
	                     ArrayRef<SDValue> Ops);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
	                     EVT VT2, ArrayRef<SDValue> Ops);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
	                     EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
	SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1,
	                     EVT VT2, SDValue Op1);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1,
	                     EVT VT2, SDValue Op1, SDValue Op2);
	SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, SDVTList VTs,
	                     ArrayRef<SDValue> Ops);

	/// This mutates the specified node to have the specified
	/// return type, opcode, and operands.
	SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs,
	                    ArrayRef<SDValue> Ops);

	/// Mutate the specified strict FP node to its non-strict equivalent,
	/// unlinking the node from its chain and dropping the metadata arguments.
	/// The node must be a strict FP node.
	SDNode *mutateStrictFPToFP(SDNode *Node);

	/// These are used for target selectors to create a new node
	/// with specified return type(s), MachineInstr opcode, and operands.
	///
	/// Note that getMachineNode returns the resultant node. If there is already
	/// a node of the specified opcode and operands, it returns that node instead
	/// of the current one.
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
	SDValue Op1);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
	SDValue Op1, SDValue Op2);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
	SDValue Op1, SDValue Op2, SDValue Op3);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT,
	ArrayRef<SDValue> Ops);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, SDValue Op1, SDValue Op2);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, SDValue Op1, SDValue Op2, SDValue Op3);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, ArrayRef<SDValue> Ops);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, EVT VT3, SDValue Op1, SDValue Op2);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, EVT VT3, SDValue Op1, SDValue Op2,
	SDValue Op3);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT1,
	EVT VT2, EVT VT3, ArrayRef<SDValue> Ops);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl,
	ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops);
	MachineSDNode *getMachineNode(unsigned Opcode, const SDLoc &dl, SDVTList VTs,
	ArrayRef<SDValue> Ops);

	/// A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
	SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	SDValue Operand);

	/// A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
	SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	SDValue Operand, SDValue Subreg);

	/// Get the specified node if it's already available, or else return NULL.
	SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef<SDValue> Ops,
	const SDNodeFlags Flags = SDNodeFlags());

	/// Creates a SDDbgValue node.
	SDDbgValue *getDbgValue(DIVariable *Var, DIExpression *Expr, SDNode *N,
	                        unsigned R, bool IsIndirect, const DebugLoc &DL,
	                        unsigned O);

	/// Creates a constant SDDbgValue node.
	SDDbgValue *getConstantDbgValue(DIVariable *Var, DIExpression *Expr,
	                                const Value *C, const DebugLoc &DL,
	                                unsigned O);

	/// Creates a FrameIndex SDDbgValue node.
	SDDbgValue *getFrameIndexDbgValue(DIVariable *Var, DIExpression *Expr,
	                                  unsigned FI, bool IsIndirect,
	                                  const DebugLoc &DL, unsigned O);

	/// Creates a VReg SDDbgValue node.
	SDDbgValue *getVRegDbgValue(DIVariable *Var, DIExpression *Expr,
	                            unsigned VReg, bool IsIndirect,
	                            const DebugLoc &DL, unsigned O);

	/// Creates a SDDbgLabel node.
	SDDbgLabel *getDbgLabel(DILabel *Label, const DebugLoc &DL, unsigned O);

	/// Transfer debug values from one node to another, while optionally
	/// generating fragment expressions for split-up values. If \p InvalidateDbg
	/// is set, debug values are invalidated after they are transferred.
	void transferDbgValues(SDValue From, SDValue To, unsigned OffsetInBits = 0,
	unsigned SizeInBits = 0, bool InvalidateDbg = true);

	/// Remove the specified node from the system. If any of its
	/// operands then becomes dead, remove them as well. Inform UpdateListener
	/// for each node deleted.
	void RemoveDeadNode(SDNode *N);

	/// This method deletes the unreachable nodes in the
	/// given list, and any nodes that become unreachable as a result.
	void RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes);

	/// Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG. Use the first
	/// version if 'From' is known to have a single result, use the second
	/// if you have two nodes with identical results (or if 'To' has a superset
	/// of the results of 'From'), use the third otherwise.
	///
	/// These methods all take an optional UpdateListener, which (if not null) is
	/// informed about nodes that are deleted and modified due to recursive
	/// changes in the dag.
	///
	/// These functions only replace all existing uses. It's possible that as
	/// these replacements are being performed, CSE may cause the From node
	/// to be given new uses. These new uses of From are left in place, and
	/// not automatically transferred to To.
	///
	void ReplaceAllUsesWith(SDValue From, SDValue To);
	void ReplaceAllUsesWith(SDNode From, SDNode To);
	void ReplaceAllUsesWith(SDNode From, const SDValue To);

	/// Replace any uses of From with To, leaving
	/// uses of other values produced by From.getNode() alone.
	void ReplaceAllUsesOfValueWith(SDValue From, SDValue To);

	/// Like ReplaceAllUsesOfValueWith, but for multiple values at once.
	/// This correctly handles the case where
	/// there is an overlap between the From values and the To values.
	/// \p From and \p To are arrays; \p Num is passed alongside them
	/// (presumably the element count of each array -- confirm in the .cpp).
	void ReplaceAllUsesOfValuesWith(const SDValue *From, const SDValue *To,
	                                unsigned Num);

	/// If an existing load has uses of its chain, create a token factor node with
	/// that chain and the new memory node's chain and update users of the old
	/// chain to the token factor. This ensures that the new memory node will have
	/// the same relative memory dependency position as the old load. Returns the
	/// new merged load chain.
	SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);

	/// Topologically sort the AllNodes list and
	/// assign a unique node id to each node in the DAG based on its
	/// topological order. Returns the number of nodes.
	unsigned AssignTopologicalOrder();

	/// Relocate node N within the AllNodes list so that it sits immediately
	/// before the iterator Position. Useful for keeping the topological
	/// ordering up to date after the node list has been modified.
	void RepositionNode(allnodes_iterator Position, SDNode *N) {
	  // Unlink N from the list first, then splice it back in at Position.
	  auto *Unlinked = AllNodes.remove(N);
	  AllNodes.insert(Position, Unlinked);
	}

	/// Map a value type onto the APFloat semantics tag that describes it. When
	/// VT is a vector type, the semantics of its scalar element are returned.
	/// Any scalar type not listed below hits llvm_unreachable.
	static const fltSemantics &EVTToAPFloatSemantics(EVT VT) {
	  const auto ScalarTy = VT.getScalarType().getSimpleVT().SimpleTy;
	  switch (ScalarTy) {
	  case MVT::f16:
	    return APFloat::IEEEhalf();
	  case MVT::f32:
	    return APFloat::IEEEsingle();
	  case MVT::f64:
	    return APFloat::IEEEdouble();
	  case MVT::f80:
	    return APFloat::x87DoubleExtended();
	  case MVT::f128:
	    return APFloat::IEEEquad();
	  case MVT::ppcf128:
	    return APFloat::PPCDoubleDouble();
	  default:
	    llvm_unreachable("Unknown FP format");
	  }
	}

	/// Add a dbg_value SDNode. If SD is non-null that means the
	/// value is produced by SD.
	void AddDbgValue(SDDbgValue DB, SDNode SD, bool isParameter);

	/// Add a dbg_label SDNode.
	void AddDbgLabel(SDDbgLabel *DB);

	/// Get the debug values which reference the given SDNode.
	/// Forwards the lookup to DbgInfo->getSDDbgValues(SD).
	ArrayRef<SDDbgValue*> GetDbgValues(const SDNode* SD) const {
	  return DbgInfo->getSDDbgValues(SD);
	}

	public:
	/// Report whether any SDDbgValue nodes are associated with this
	/// SelectionDAG, i.e. whether DbgInfo is non-empty.
	bool hasDebugValues() const {
	  const bool NoDbgValues = DbgInfo->empty();
	  return !NoDbgValues;
	}

	/// Begin iterator over the debug values; forwards to DbgInfo->DbgBegin().
	SDDbgInfo::DbgIterator DbgBegin() const {
	  return DbgInfo->DbgBegin();
	}
	/// End iterator over the debug values; forwards to DbgInfo->DbgEnd().
	SDDbgInfo::DbgIterator DbgEnd() const {
	  return DbgInfo->DbgEnd();
	}

	/// Begin iterator over the by-value parameter debug values; forwards to
	/// DbgInfo->ByvalParmDbgBegin().
	SDDbgInfo::DbgIterator ByvalParmDbgBegin() const {
	  SDDbgInfo::DbgIterator First = DbgInfo->ByvalParmDbgBegin();
	  return First;
	}
	/// End iterator over the by-value parameter debug values; forwards to
	/// DbgInfo->ByvalParmDbgEnd().
	SDDbgInfo::DbgIterator ByvalParmDbgEnd() const {
	  SDDbgInfo::DbgIterator Last = DbgInfo->ByvalParmDbgEnd();
	  return Last;
	}

	/// Begin iterator over the debug labels; forwards to
	/// DbgInfo->DbgLabelBegin().
	SDDbgInfo::DbgLabelIterator DbgLabelBegin() const {
	  SDDbgInfo::DbgLabelIterator First = DbgInfo->DbgLabelBegin();
	  return First;
	}
	/// End iterator over the debug labels; forwards to
	/// DbgInfo->DbgLabelEnd().
	SDDbgInfo::DbgLabelIterator DbgLabelEnd() const {
	  SDDbgInfo::DbgLabelIterator Last = DbgInfo->DbgLabelEnd();
	  return Last;
	}

	/// To be invoked on an SDNode that is slated to be erased. This
	/// function mirrors \c llvm::salvageDebugInfo.
	void salvageDebugInfo(SDNode &N);

	void dump() const;

	/// Create a stack temporary, suitable for holding the specified value type.
	/// If minAlign is specified, the slot size will have at least that alignment.
	SDValue CreateStackTemporary(EVT VT, unsigned minAlign = 1);

	/// Create a stack temporary suitable for holding either of the specified
	/// value types.
	SDValue CreateStackTemporary(EVT VT1, EVT VT2);

	SDValue FoldSymbolOffset(unsigned Opcode, EVT VT,
	const GlobalAddressSDNode *GA,
	const SDNode *N2);

	/// Constant-folding entry point that takes the two operands as nodes.
	/// NOTE(review): presumably returns a null SDValue when nothing can be
	/// folded -- confirm against the implementation.
	SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
	                               SDNode *N1, SDNode *N2);

	SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
	const ConstantSDNode *C1,
	const ConstantSDNode *C2);

	SDValue FoldConstantVectorArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT,
	ArrayRef<SDValue> Ops,
	const SDNodeFlags Flags = SDNodeFlags());

	/// Fold floating-point operations with 2 operands when both operands are
	/// constants and/or undefined.
	SDValue foldConstantFPMath(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue N1, SDValue N2);

	/// Constant fold a setcc to true or false.
	SDValue FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond,
	const SDLoc &dl);

	/// See if the specified operand can be simplified with the knowledge that
	/// only the bits specified by DemandedBits are used. If so, return the
	/// simpler operand, otherwise return a null SDValue.
	///
	/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
	/// simplify nodes with multiple uses more aggressively.)
	SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits);

	/// See if the specified operand can be simplified with the knowledge that
	/// only the bits specified by DemandedBits are used in the elements specified
	/// by DemandedElts. If so, return the simpler operand, otherwise return a
	/// null SDValue.
	///
	/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
	/// simplify nodes with multiple uses more aggressively.)
	SDValue GetDemandedBits(SDValue V, const APInt &DemandedBits,
	const APInt &DemandedElts);

	/// Return true if the sign bit of Op is known to be zero.
	/// We use this predicate to simplify operations downstream.
	bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;

	/// Return true if 'Op & Mask' is known to be zero. We
	/// use this predicate to simplify operations downstream. Op and Mask are
	/// known to be the same type.
	bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
	unsigned Depth = 0) const;

	/// Return true if 'Op & Mask' is known to be zero in DemandedElts. We
	/// use this predicate to simplify operations downstream. Op and Mask are
	/// known to be the same type.
	bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
	const APInt &DemandedElts, unsigned Depth = 0) const;

	/// Return true if '(Op & Mask) == Mask'.
	/// Op and Mask are known to be the same type.
	bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask,
	unsigned Depth = 0) const;

	/// Determine which bits of Op are known to be either zero or one and return
	/// them in Known. For vectors, the known bits are those that are shared by
	/// every vector element.
	/// Targets can implement the computeKnownBitsForTargetNode method in the
	/// TargetLowering class to allow target nodes to be understood.
	KnownBits computeKnownBits(SDValue Op, unsigned Depth = 0) const;

	/// Determine which bits of Op are known to be either zero or one and return
	/// them in Known. The DemandedElts argument allows us to only collect the
	/// known bits that are shared by the requested vector elements.
	/// Targets can implement the computeKnownBitsForTargetNode method in the
	/// TargetLowering class to allow target nodes to be understood.
	KnownBits computeKnownBits(SDValue Op, const APInt &DemandedElts,
	unsigned Depth = 0) const;

	/// Used to represent the possible overflow behavior of an operation.
	/// Never: the operation cannot overflow.
	/// Always: the operation will always overflow.
	/// Sometime: the operation may or may not overflow.
	enum OverflowKind {
	OFK_Never,
	OFK_Sometime,
	OFK_Always,
	};

	/// Determine if the result of the addition of 2 node can overflow.
	OverflowKind computeOverflowKind(SDValue N0, SDValue N1) const;

	/// Test if the given value is known to have exactly one bit set. This differs
	/// from computeKnownBits in that it doesn't necessarily determine which bit
	/// is set.
	bool isKnownToBeAPowerOfTwo(SDValue Val) const;

	/// Return the number of times the sign bit of the register is replicated into
	/// the other bits. We know that at least 1 bit is always equal to the sign
	/// bit (itself), but other cases can give us information. For example,
	/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
	/// to each other, so we return 3. Targets can implement the
	/// ComputeNumSignBitsForTarget method in the TargetLowering class to allow
	/// target nodes to be understood.
	unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;

	/// Return the number of times the sign bit of the register is replicated into
	/// the other bits. We know that at least 1 bit is always equal to the sign
	/// bit (itself), but other cases can give us information. For example,
	/// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
	/// to each other, so we return 3. The DemandedElts argument allows
	/// us to only collect the minimum sign bits of the requested vector elements.
	/// Targets can implement the ComputeNumSignBitsForTarget method in the
	/// TargetLowering class to allow target nodes to be understood.
	unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
	unsigned Depth = 0) const;

	/// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
	/// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
	/// is guaranteed to have the same semantics as an ADD. This handles the
	/// equivalence:
	/// X|Cst == X+Cst iff X&Cst = 0.
	bool isBaseWithConstantOffset(SDValue Op) const;

	/// Test whether the given SDValue is known to never be NaN. If \p SNaN is
	/// true, returns if \p Op is known to never be a signaling NaN (it may still
	/// be a qNaN).
	bool isKnownNeverNaN(SDValue Op, bool SNaN = false, unsigned Depth = 0) const;

	/// \returns true if \p Op is known to never be a signaling NaN. Thin wrapper
	/// over isKnownNeverNaN with its SNaN flag set.
	bool isKnownNeverSNaN(SDValue Op, unsigned Depth = 0) const {
	  const bool OnlySignaling = true;
	  return isKnownNeverNaN(Op, OnlySignaling, Depth);
	}

	/// Test whether the given floating point SDValue is known to never be
	/// positive or negative zero.
	bool isKnownNeverZeroFloat(SDValue Op) const;

	/// Test whether the given SDValue is known to contain non-zero value(s).
	bool isKnownNeverZero(SDValue Op) const;

	/// Test whether two SDValues are known to compare equal. This
	/// is true if they are the same value, or if one is negative zero and the
	/// other positive zero.
	bool isEqualTo(SDValue A, SDValue B) const;

	/// Return true if A and B have no common bits set. As an example, this can
	/// allow an 'add' to be transformed into an 'or'.
	bool haveNoCommonBitsSet(SDValue A, SDValue B) const;

	/// Test whether \p V has a splatted value for all the demanded elements.
	///
	/// On success \p UndefElts will indicate the elements that have UNDEF
	/// values instead of the splat value, this is only guaranteed to be correct
	/// for \p DemandedElts.
	///
	/// NOTE: The function will return true for a demanded splat of UNDEF values.
	bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts);

	/// Test whether \p V has a splatted value.
	bool isSplatValue(SDValue V, bool AllowUndefs = false);

	/// If V is a splatted value, return the source vector and its splat index.
	SDValue getSplatSourceVector(SDValue V, int &SplatIndex);

	/// If V is a splat vector, return its scalar source operand by extracting
	/// that element from the source vector.
	SDValue getSplatValue(SDValue V);

	/// Match a binop + shuffle pyramid that represents a horizontal reduction
	/// over the elements of a vector starting from the EXTRACT_VECTOR_ELT node \p
	/// Extract. The reduction must use one of the opcodes listed in \p
	/// CandidateBinOps and on success \p BinOp will contain the matching opcode.
	/// Returns the vector that is being reduced on, or SDValue() if a reduction
	/// was not matched.
	SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
	ArrayRef<ISD::NodeType> CandidateBinOps);

	/// Utility function used by legalize and lowering to
	/// "unroll" a vector operation by splitting out the scalars and operating
	/// on each element individually. If the ResNE is 0, fully unroll the vector
	/// op. If ResNE is less than the width of the vector op, unroll up to ResNE.
	/// If the ResNE is greater than the width of the vector op, unroll the
	/// vector op and fill the end of the resulting vector with UNDEFS.
	SDValue UnrollVectorOp(SDNode *N, unsigned ResNE = 0);

	/// Like UnrollVectorOp(), but for the [US](ADD|SUB|MUL)O family of opcodes.
	/// This is a separate function because those opcodes have two results.
	std::pair<SDValue, SDValue> UnrollVectorOverflowOp(SDNode *N,
	unsigned ResNE = 0);

	/// Return true if loads are next to each other and can be
	/// merged. Check that both are nonvolatile and if LD is loading
	/// 'Bytes' bytes from a location that is 'Dist' units away from the
	/// location that the 'Base' load is loading from.
	bool areNonVolatileConsecutiveLoads(LoadSDNode LD, LoadSDNode Base,
	unsigned Bytes, int Dist) const;

	/// Infer alignment of a load / store address. Return 0 if
	/// it cannot be inferred.
	unsigned InferPtrAlignment(SDValue Ptr) const;

	/// Compute the VTs needed for the low/hi parts of a type
	/// which is split (or expanded) into two not necessarily identical pieces.
	std::pair<EVT, EVT> GetSplitDestVTs(const EVT &VT) const;

	/// Split the vector with EXTRACT_SUBVECTOR using the provided
	/// VTs and return the low/high part.
	std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
	const EVT &LoVT, const EVT &HiVT);

	/// Split the vector with EXTRACT_SUBVECTOR and return the low/high part.
	std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL) {
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType());
	return SplitVector(N, DL, LoVT, HiVT);
	}

	/// Split the node's operand with EXTRACT_SUBVECTOR and
	/// return the low/high part.
	std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
	{
	return SplitVector(N->getOperand(OpNo), SDLoc(N));
	}

	/// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
	SDValue WidenVector(const SDValue &N, const SDLoc &DL);

	/// Append the extracted elements from Start to Count out of the vector Op
	/// in Args. If Count is 0, all of the elements will be extracted.
	void ExtractVectorElements(SDValue Op, SmallVectorImpl<SDValue> &Args,
	unsigned Start = 0, unsigned Count = 0);

	/// Compute the default alignment value for the given type.
	unsigned getEVTAlignment(EVT MemoryVT) const;

	/// Test whether the given value is a constant int or similar node.
	SDNode *isConstantIntBuildVectorOrConstantInt(SDValue N);

	/// Test whether the given value is a constant FP or similar node.
	SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N);

	/// \returns true if \p N is any kind of constant or build_vector of
	/// constants, int or float. If a vector, it may not necessarily be a splat.
	inline bool isConstantValueOfAnyType(SDValue N) {
	  // Either helper returns a non-null node on a match; combine them with a
	  // logical OR.
	  return isConstantIntBuildVectorOrConstantInt(N) ||
	         isConstantFPBuildVectorOrConstantFP(N);
	}

	/// Move \p CallInfo into the per-node call-site record keyed by \p CallNode.
	void addCallSiteInfo(const SDNode *CallNode, CallSiteInfoImpl &&CallInfo) {
	- SDCallSiteInfo[CallNode] = std::move(CallInfo);
	+ SDCallSiteDbgInfo[CallNode].CSInfo = std::move(CallInfo);
	}

	/// Look up the call-site info recorded for \p CallNode. If an entry exists
	/// its info is moved out to the caller; otherwise a default-constructed
	/// CallSiteInfo is returned.
	CallSiteInfo getSDCallSiteInfo(const SDNode *CallNode) {
	- auto I = SDCallSiteInfo.find(CallNode);
	- if (I != SDCallSiteInfo.end())
	- return std::move(I->second);
	+ auto I = SDCallSiteDbgInfo.find(CallNode);
	+ if (I != SDCallSiteDbgInfo.end())
	+ return std::move(I->second).CSInfo;
	return CallSiteInfo();
	+ }
	+
	+ /// Record \p MD as the HeapAllocSite of the SDCallSiteDbgInfo entry for
	+ /// \p Node.
	+ void addHeapAllocSite(const SDNode *Node, MDNode *MD) {
	+ SDCallSiteDbgInfo[Node].HeapAllocSite = MD;
	+ }
	+
	+ /// Return the HeapAllocSite type associated with the SDNode, if it exists.
	+ /// Returns nullptr when no entry is recorded for \p Node.
	+ MDNode *getHeapAllocSite(const SDNode *Node) {
	+ auto It = SDCallSiteDbgInfo.find(Node);
	+ if (It == SDCallSiteDbgInfo.end())
	+ return nullptr;
	+ return It->second.HeapAllocSite;
	}

	private:
	void InsertNode(SDNode *N);
	bool RemoveNodeFromCSEMaps(SDNode *N);
	void AddModifiedNodeToCSEMaps(SDNode *N);
	// NOTE(review): presumably these look up the CSE slot node N would occupy
	// once given the new operand(s), returning an existing equivalent node or
	// setting InsertPos -- confirm against the implementation.
	SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);
	SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2,
	                             void *&InsertPos);
	SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
	                             void *&InsertPos);
	// NOTE(review): judging by the name, updates N's SDLoc when nodes are
	// merged and returns the node -- confirm against the implementation.
	SDNode *UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &loc);

	void DeleteNodeNotInCSEMaps(SDNode *N);
	void DeallocateNode(SDNode *N);

	void allnodes_clear();

	/// Look up the node specified by ID in CSEMap. If it exists, return it. If
	/// not, return the insertion token that will make insertion faster. This
	/// overload is for nodes other than Constant or ConstantFP, use the other one
	/// for those.
	SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos);

	/// Look up the node specified by ID in CSEMap. If it exists, return it. If
	/// not, return the insertion token that will make insertion faster. Performs
	/// additional processing for constant nodes.
	SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL,
	void *&InsertPos);

	/// List of non-single value types.
	FoldingSet<SDVTListNode> VTListMap;

	/// Maps to auto-CSE operations.
	std::vector<CondCodeSDNode*> CondCodeNodes;

	std::vector<SDNode*> ValueTypeNodes;
	std::map<EVT, SDNode*, EVT::compareRawBits> ExtendedValueTypeNodes;
	StringMap<SDNode*> ExternalSymbols;

	std::map<std::pair<std::string, unsigned char>,SDNode*> TargetExternalSymbols;
	// Keyed by MCSymbol pointer. NOTE(review): presumably caches the node
	// created for each symbol, like the sibling maps above -- confirm.
	DenseMap<MCSymbol *, SDNode *> MCSymbols;
	};

	/// GraphTraits specialization for a SelectionDAG pointer. It inherits the
	/// per-node traits from GraphTraits<SDNode*> and adds whole-graph node
	/// iteration over the DAG's allnodes range.
	template <> struct GraphTraits<SelectionDAG*> : public GraphTraits<SDNode*> {
	  using nodes_iterator = pointer_iterator<SelectionDAG::allnodes_iterator>;

	  /// First node of \p G's allnodes list, wrapped as a nodes_iterator.
	  static nodes_iterator nodes_begin(SelectionDAG *G) {
	    return nodes_iterator(G->allnodes_begin());
	  }

	  /// Past-the-end position of \p G's allnodes list.
	  static nodes_iterator nodes_end(SelectionDAG *G) {
	    return nodes_iterator(G->allnodes_end());
	  }
	};

	/// Get or create a node of type TargetMemSDNode with the given value types,
	/// operands, memory VT and memory operand. The node is looked up through
	/// FindNodeOrInsertPos first; on a hit its alignment is refined with \p MMO
	/// and the existing node is returned, otherwise a new node is created,
	/// registered in CSEMap and inserted into the DAG.
	template <class TargetMemSDNode>
	SDValue SelectionDAG::getTargetMemSDNode(SDVTList VTs,
	                                         ArrayRef<SDValue> Ops,
	                                         const SDLoc &dl, EVT MemVT,
	                                         MachineMemOperand *MMO) {
	  // Build the profile that identifies this node for CSE. The opcode is read
	  // off a temporary TargetMemSDNode constructed with the same arguments.
	  const unsigned IROrder = dl.getIROrder();
	  FoldingSetNodeID NodeID;
	  const unsigned Opc =
	      TargetMemSDNode(IROrder, DebugLoc(), VTs, MemVT, MMO).getOpcode();
	  NodeID.AddInteger(Opc);
	  NodeID.AddPointer(VTs.VTs);
	  for (const SDValue &Operand : Ops) {
	    NodeID.AddPointer(Operand.getNode());
	    NodeID.AddInteger(Operand.getResNo());
	  }
	  NodeID.AddInteger(MemVT.getRawBits());
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<TargetMemSDNode>(
	      IROrder, VTs, MemVT, MMO));

	  // Reuse an already-existing equivalent node when there is one.
	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    cast<TargetMemSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  // No match: allocate the node, attach operands, and register it.
	  auto *Created = newSDNode<TargetMemSDNode>(IROrder, dl.getDebugLoc(), VTs,
	                                             MemVT, MMO);
	  createOperands(Created, Ops);
	  CSEMap.InsertNode(Created, InsertPos);
	  InsertNode(Created);
	  return SDValue(Created, 0);
	}

	} // end namespace llvm

	#endif // LLVM_CODEGEN_SELECTIONDAG_H
	Index: vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/CodeGen/TargetLowering.h (revision 351303)
	@@ -1,4091 +1,4092 @@
	//===- llvm/CodeGen/TargetLowering.h - Target Lowering Info ------ C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file describes how to lower LLVM code to machine code. This has three
	/// main components:
	///
	/// 1. Which ValueTypes are natively supported by the target.
	/// 2. Which operations are supported for supported ValueTypes.
	/// 3. Cost thresholds for alternative implementations of certain operations.
	///
	/// In addition it has a few other components, like information about FP
	/// immediates.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_TARGETLOWERING_H
	#define LLVM_CODEGEN_TARGETLOWERING_H

	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetCallingConv.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Type.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Target/TargetMachine.h"
	#include <algorithm>
	#include <cassert>
	#include <climits>
	#include <cstdint>
	#include <iterator>
	#include <map>
	#include <string>
	#include <utility>
	#include <vector>

	namespace llvm {

	class BranchProbability;
	class CCState;
	class CCValAssign;
	class Constant;
	class FastISel;
	class FunctionLoweringInfo;
	class GlobalValue;
	class IntrinsicInst;
	struct KnownBits;
	class LLVMContext;
	class MachineBasicBlock;
	class MachineFunction;
	class MachineInstr;
	class MachineJumpTableInfo;
	class MachineLoop;
	class MachineRegisterInfo;
	class MCContext;
	class MCExpr;
	class Module;
	class TargetRegisterClass;
	class TargetLibraryInfo;
	class TargetRegisterInfo;
	class Value;

	namespace Sched {

	/// Scheduling preference selectable by a target.
	enum Preference {
	  /// No preference.
	  None,
	  /// Follow source order.
	  Source,
	  /// Scheduling for lowest register pressure.
	  RegPressure,
	  /// Scheduling for both latency and register pressure.
	  Hybrid,
	  /// Scheduling for ILP in low register pressure mode.
	  ILP,
	  /// Scheduling for VLIW targets.
	  VLIW
	};

	} // end namespace Sched

	/// This base class for TargetLowering contains the SelectionDAG-independent
	/// parts that can be used from the rest of CodeGen.
	class TargetLoweringBase {
	public:
	/// Describes whether an operation is valid for a target and, when it is
	/// not, which action should be taken to make it valid.
	enum LegalizeAction : uint8_t {
	  /// The target natively supports this operation.
	  Legal,
	  /// This operation should be executed in a larger type.
	  Promote,
	  /// Try to expand this to other ops, otherwise use a libcall.
	  Expand,
	  /// Don't try to expand this to other ops, always use a libcall.
	  LibCall,
	  /// Use the LowerOperation hook to implement custom lowering.
	  Custom
	};

	/// Describes whether a type is legal for a target and, when it is not,
	/// which action should be taken to make it valid.
	enum LegalizeTypeAction : uint8_t {
	  /// The target natively supports this type.
	  TypeLegal,
	  /// Replace this integer with a larger one.
	  TypePromoteInteger,
	  /// Split this integer into two of half the size.
	  TypeExpandInteger,
	  /// Convert this float to a same size integer type,
	  /// if an operation is not supported in target HW.
	  TypeSoftenFloat,
	  /// Split this float into two of half the size.
	  TypeExpandFloat,
	  /// Replace this one-element vector with its element.
	  TypeScalarizeVector,
	  /// Split this vector into two of half the size.
	  TypeSplitVector,
	  /// This vector should be widened into a larger vector.
	  TypeWidenVector,
	  /// Replace this float with a larger one.
	  TypePromoteFloat
	};

	/// LegalizeKind holds the legalization kind that needs to happen to EVT
	/// in order to type-legalize it.
	using LegalizeKind = std::pair<LegalizeTypeAction, EVT>;

	/// How the target represents true/false values.
	enum BooleanContent {
	  /// Only bit 0 counts, the rest can hold garbage.
	  UndefinedBooleanContent,
	  /// All bits zero except for bit 0.
	  ZeroOrOneBooleanContent,
	  /// All bits equal to bit 0.
	  ZeroOrNegativeOneBooleanContent
	};

	/// The kinds of select support a target may have.
	enum SelectSupportKind {
	  /// The target supports scalar selects (ex: cmov).
	  ScalarValSelect,
	  /// The target supports selects with a scalar condition
	  /// and vector values (ex: cmov).
	  ScalarCondVectorVal,
	  /// The target supports vector selects with a vector
	  /// mask (ex: x86 blends).
	  VectorMaskSelect
	};

	/// What an atomic load/AtomicRMWInst is expanded to, if at all. This exists
	/// because targets differ in their level of support for these atomic
	/// instructions and in what they should be expanded to.
	enum class AtomicExpansionKind {
	  /// Don't expand the instruction.
	  None,
	  /// Expand the instruction into loadlinked/storeconditional; used
	  /// by ARM/AArch64.
	  LLSC,
	  /// Expand the (load) instruction into just a load-linked, which has
	  /// greater atomic guarantees than a normal load.
	  LLOnly,
	  /// Expand the instruction into cmpxchg; used by at least X86.
	  CmpXChg,
	  /// Use a target-specific intrinsic for the LL/SC loop.
	  MaskedIntrinsic,
	};

	/// Enum that specifies when a multiplication should be expanded: either
	/// unconditionally, or only when the expansion yields legal/custom nodes.
	enum class MulExpansionKind {
	Always, // Always expand the instruction.
	OnlyLegalOrCustom, // Only expand when the resulting instructions are legal
	// or custom.
	};

	/// Per-argument record: an IR Value, an SDValue, a Type, ten one-bit
	/// attribute flags, an alignment and a by-value type.
	class ArgListEntry {
	public:
	  Value *Val = nullptr;
	  SDValue Node = SDValue();
	  Type *Ty = nullptr;

	  // One-bit attribute flags. Bit-fields cannot take in-class initializers
	  // here, so the default constructor clears each of them explicitly.
	  bool IsSExt : 1;
	  bool IsZExt : 1;
	  bool IsInReg : 1;
	  bool IsSRet : 1;
	  bool IsNest : 1;
	  bool IsByVal : 1;
	  bool IsInAlloca : 1;
	  bool IsReturned : 1;
	  bool IsSwiftSelf : 1;
	  bool IsSwiftError : 1;

	  uint16_t Alignment = 0;
	  Type *ByValType = nullptr;

	  /// Build an entry with every attribute flag cleared.
	  ArgListEntry()
	      : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false),
	        IsNest(false), IsByVal(false), IsInAlloca(false),
	        IsReturned(false), IsSwiftSelf(false), IsSwiftError(false) {}

	  /// Defined out of line; takes the call and the index of the argument.
	  void setAttributes(const CallBase *Call, unsigned ArgIdx);

	  /// Overload for an ImmutableCallSite: forwards its instruction, cast to
	  /// CallBase, to the overload above.
	  void setAttributes(ImmutableCallSite *CS, unsigned ArgIdx) {
	    setAttributes(cast<CallBase>(CS->getInstruction()), ArgIdx);
	  }
	};
	/// A sequence of ArgListEntry records.
	using ArgListTy = std::vector<ArgListEntry>;

	/// Overridable hook receiving a MachineFunction, a calling convention and
	/// an argument list; the default implementation does nothing.
	virtual void markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                   ArgListTy &Args) const {}

	/// Map a BooleanContent kind to the matching ISD extension opcode.
	static ISD::NodeType getExtendForContent(BooleanContent Content) {
	  switch (Content) {
	  case ZeroOrNegativeOneBooleanContent:
	    return ISD::SIGN_EXTEND; // Extend by copying the sign bit.
	  case ZeroOrOneBooleanContent:
	    return ISD::ZERO_EXTEND; // Extend by adding zero bits.
	  case UndefinedBooleanContent:
	    return ISD::ANY_EXTEND; // Extend by adding rubbish bits.
	  }
	  llvm_unreachable("Invalid content kind");
	}

	/// NOTE: The TargetMachine owns TLOF.
	/// Construction requires a TargetMachine. The class is neither copy
	/// constructible nor copy assignable, and its destructor is virtual.
	explicit TargetLoweringBase(const TargetMachine &TM);
	TargetLoweringBase(const TargetLoweringBase &) = delete;
	TargetLoweringBase &operator=(const TargetLoweringBase &) = delete;
	virtual ~TargetLoweringBase() = default;

	protected:
	/// Initialize all of the actions to default values. Declared here and
	/// defined out of line.
	void initActions();

	public:
	/// Return the TargetMachine reference held in TM.
	const TargetMachine &getTargetMachine() const {
	  return TM;
	}

	/// Overridable soft-float query; the base implementation returns false.
	virtual bool useSoftFloat() const {
	  return false;
	}

	/// Return the pointer type for address space \p AS. The base implementation
	/// builds an integer MVT as wide as the data layout's pointer size for that
	/// address space.
	/// FIXME: The default needs to be removed once all the code is updated.
	virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const {
	  const auto PtrBits = DL.getPointerSizeInBits(AS);
	  return MVT::getIntegerVT(PtrBits);
	}

	/// Return the in-memory pointer type for address space \p AS: an integer
	/// MVT as wide as the data layout's pointer size for that address space.
	/// FIXME: The default needs to be removed once all the code is updated.
	MVT getPointerMemTy(const DataLayout &DL, uint32_t AS = 0) const {
	  const auto PtrBits = DL.getPointerSizeInBits(AS);
	  return MVT::getIntegerVT(PtrBits);
	}

	/// Return the type for a frame index: the pointer type of the alloca
	/// address space reported by the data layout.
	MVT getFrameIndexTy(const DataLayout &DL) const {
	  const auto AllocaAS = DL.getAllocaAddrSpace();
	  return getPointerTy(DL, AllocaAS);
	}

	/// Return the type for operands of fence; by default the pointer type of
	/// the default address space.
	/// TODO: Let fence operands be of i32 type and remove this.
	virtual MVT getFenceOperandTy(const DataLayout &DL) const { return getPointerTy(DL); }

	/// Overridable, defined out of line; takes a DataLayout and an EVT.
	/// The EVT parameter is not used in-tree, but is used by out-of-tree targets.
	/// A documentation for this function would be nice...
	virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;

	/// Defined out of line. Takes the type of the shifted (left-hand) operand,
	/// the data layout, and a LegalTypes flag that defaults to true.
	EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
	bool LegalTypes = true) const;

	/// Returns the type to be used for the index operand of:
	/// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
	/// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR.
	/// The base implementation uses the default-address-space pointer type.
	virtual MVT getVectorIdxTy(const DataLayout &DL) const { return getPointerTy(DL); }

	/// Return true if the target supports the given kind of select. The base
	/// implementation ignores the kind and reports every kind as supported.
	virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
	  return true;
	}

	/// Return true if it is profitable to convert a select of FP constants into
	/// a constant pool load whose address depends on the select condition.
	/// \p IsFPSetCC lets an override tell a select fed by an FP compare from
	/// one fed by an integer compare; the base implementation ignores it and
	/// returns true.
	virtual bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { return true; }

	/// Return true if multiple condition registers are available (the value of
	/// HasMultipleConditionRegisters).
	bool hasMultipleConditionRegisters() const { return HasMultipleConditionRegisters; }

	/// Return true if the target has BitExtract instructions (the value of
	/// HasExtractBitsInsn).
	bool hasExtractBitsInsn() const {
	  return HasExtractBitsInsn;
	}

	/// Return the preferred vector type legalization action. Defaults:
	/// scalarize one-element vectors, widen vectors that are not a power-of-2
	/// vector type, and promote everything else.
	virtual TargetLoweringBase::LegalizeTypeAction
	getPreferredVectorAction(MVT VT) const {
	  // A single-element vector is replaced by its element.
	  if (VT.getVectorNumElements() == 1)
	    return TypeScalarizeVector;
	  // Power-of-2 vectors are promoted; odd-width vectors are widened.
	  return VT.isPow2VectorType() ? TypePromoteInteger : TypeWidenVector;
	}

	// A BUILD_VECTOR node can be expanded in two general ways:
	// 1. Apply SCALAR_TO_VECTOR to the defined scalar values and shuffle the
	// results together.
	// 2. Build the vector on the stack and then load it.
	// Returning true selects method (1), provided every shuffle it needs is
	// legal (as determined by isShuffleMaskLegal); returning false always
	// selects method (2). The vector type and the number of defined values are
	// provided. The base implementation picks (1) for at most two defined
	// values.
	virtual bool
	shouldExpandBuildVectorWithShuffles(EVT /* VT */,
	                                    unsigned DefinedValues) const {
	  return DefinedValues <= 2;
	}

	/// Return true if integer divide is usually cheaper than a sequence of
	/// several shifts, adds, and multiplies for this target. What counts as
	/// "cheaper" may depend on whether we optimize for speed or for size.
	/// The base implementation returns false.
	virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const {
	  return false;
	}

	/// Return true if the target can handle a standalone remainder operation;
	/// true by default.
	virtual bool hasStandaloneRem(EVT VT) const { return true; }

	/// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X). The base
	/// implementation returns false, i.e. the replacement is the default.
	virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const { return false; }

	/// Reciprocal estimate status values used by the functions below. The
	/// underlying type is int, matching the int return type of those functions.
	enum ReciprocalEstimate : int {
	Unspecified = -1,
	Disabled = 0,
	Enabled = 1
	};

	/// Return a ReciprocalEstimate enum value (as an int) for a square root of
	/// the given type based on the function's attributes. If the operation is
	/// not overridden by the function's attributes, "Unspecified" is returned
	/// and target defaults are expected to be used for instruction selection.
	int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const;

	/// Return a ReciprocalEstimate enum value (as an int) for a division of the
	/// given type based on the function's attributes. If the operation is not
	/// overridden by the function's attributes, "Unspecified" is returned and
	/// target defaults are expected to be used for instruction selection.
	int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const;

	/// Return the refinement step count for a square root of the given type
	/// based on the function's attributes. If the operation is not overridden
	/// by the function's attributes, "Unspecified" (see ReciprocalEstimate) is
	/// returned and target defaults are expected to be used for instruction
	/// selection.
	int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const;

	/// Return the refinement step count for a division of the given type based
	/// on the function's attributes. If the operation is not overridden by
	/// the function's attributes, "Unspecified" (see ReciprocalEstimate) is
	/// returned and target defaults are expected to be used for instruction
	/// selection.
	int getDivRefinementSteps(EVT VT, MachineFunction &MF) const;

	/// Returns true if target has indicated at least one type should be
	/// bypassed, i.e. BypassSlowDivWidths is non-empty.
	bool isSlowDivBypassed() const {
	  return !BypassSlowDivWidths.empty();
	}

	/// Returns the map from slow types for division or remainder to the
	/// corresponding fast types.
	const DenseMap<unsigned int, unsigned int> &getBypassSlowDivWidths() const { return BypassSlowDivWidths; }

	/// Return true if flow control is an expensive operation that should be
	/// avoided (the value of JumpIsExpensive).
	bool isJumpExpensive() const {
	  return JumpIsExpensive;
	}

	/// Return true if selects are only cheaper than branches if the branch is
	/// unlikely to be predicted right (the value of
	/// PredictableSelectIsExpensive).
	bool isPredictableSelectExpensive() const { return PredictableSelectIsExpensive; }

	/// If a branch or a select condition is skewed in one direction by more than
	/// this factor, it is very likely to be predicted correctly. Overridable;
	/// the base version is defined out of line.
	virtual BranchProbability getPredictableBranchThreshold() const;

	/// Return true if the following transform is beneficial:
	/// fold (conv (load x)) -> (load (conv*)x)
	/// On architectures that don't natively support some vector loads
	/// efficiently, casting the load to a smaller vector of larger types and
	/// loading is more efficient, however, this can be undone by optimizations in
	/// dag combiner.
	///
	/// The base implementation answers true when either type is not simple,
	/// false when the load would just be promoted to \p BitcastVT anyway, and
	/// otherwise true only if a memory access of \p BitcastVT described by
	/// \p MMO is both allowed and fast.
	virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
	                                     const SelectionDAG &DAG,
	                                     const MachineMemOperand &MMO) const {
	  // Don't do if we could do an indexed load on the original type, but not on
	  // the new one.
	  if (!LoadVT.isSimple() || !BitcastVT.isSimple())
	    return true;

	  MVT LoadMVT = LoadVT.getSimpleVT();

	  // Don't bother doing this if it's just going to be promoted again later, as
	  // doing so might interfere with other combines.
	  if (getOperationAction(ISD::LOAD, LoadMVT) == Promote &&
	      getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
	    return false;

	  // Fast is an out-parameter filled in by allowsMemoryAccess.
	  bool Fast = false;
	  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
	                            MMO, &Fast) && Fast;
	}

	/// Return true if the following transform is beneficial:
	/// (store (y (conv x)), y*)) -> (store x, (x*))
	/// The base implementation reuses the load heuristic unchanged.
	virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
	                                      const SelectionDAG &DAG,
	                                      const MachineMemOperand &MMO) const {
	  const bool SameAsLoad = isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
	  return SameAsLoad;
	}

	/// Return true if it is expected to be cheaper to do a store of a non-zero
	/// vector constant with the given size and type for the address space than to
	/// store the individual scalar element constants. False by default.
	virtual bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
	                                          unsigned AddrSpace) const { return false; }

	/// Allow store merging for the specified type after legalization in addition
	/// to before legalization. This may transform stores that do not exist
	/// earlier (for example, stores created from intrinsics). True by default.
	virtual bool mergeStoresAfterLegalization(EVT MemVT) const { return true; }

	/// Returns true if it's reasonable to merge stores to MemVT size. The base
	/// implementation always allows it.
	virtual bool canMergeStoresTo(unsigned AS, EVT MemVT,
	                              const SelectionDAG &DAG) const { return true; }

	/// Return true if it is cheap to speculate a call to intrinsic cttz.
	/// False by default.
	virtual bool isCheapToSpeculateCttz() const { return false; }

	/// Return true if it is cheap to speculate a call to intrinsic ctlz.
	/// False by default.
	virtual bool isCheapToSpeculateCtlz() const { return false; }

	/// Return true if ctlz instruction is fast. False by default.
	virtual bool isCtlzFast() const { return false; }

	/// Return true if it is safe to transform an integer-domain bitwise operation
	/// into the equivalent floating-point operation. This should be set to true
	/// if the target has IEEE-754-compliant fabs/fneg operations for the input
	/// type. False by default.
	virtual bool hasBitPreservingFPLogic(EVT VT) const { return false; }

	/// Return true if it is cheaper to split the store of a merged int val
	/// from a pair of smaller values into multiple stores. False by default.
	virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const { return false; }

	/// Return true if the target supports combining a chain like:
	/// \code
	/// %andResult = and %val1, #mask
	/// %icmpResult = icmp %andResult, 0
	/// \endcode
	/// into a single machine instruction of a form like:
	/// \code
	/// cc = test %register, #mask
	/// \endcode
	/// False by default.
	virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { return false; }

	/// Use bitwise logic to make pairs of compares more efficient. For example:
	/// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
	/// This should be true when it takes more than one instruction to lower
	/// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on
	/// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win.
	/// False by default.
	virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const { return false; }

	/// Return the preferred operand type if the target has a quick way to compare
	/// integer values of the given size. Assume that any legal integer type can
	/// be compared efficiently. Targets may override this to allow illegal wide
	/// types to return a vector type if there is support to compare that type.
	/// The base implementation yields the integer type of \p NumBits bits when
	/// it is legal and MVT::INVALID_SIMPLE_VALUE_TYPE otherwise.
	virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;
	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Return true if the target should transform:
	/// (X & Y) == Y ---> (~X & Y) == 0
	/// (X & Y) != Y ---> (~X & Y) != 0
	///
	/// This may be profitable if the target has a bitwise and-not operation that
	/// sets comparison flags. A target may want to limit the transformation based
	/// on the type of Y or if Y is a constant.
	///
	/// Note that the transform will not occur if Y is known to be a power-of-2
	/// because a mask and compare of a single bit can be handled by inverting the
	/// predicate, for example:
	/// (X & 8) == 8 ---> (X & 8) != 0
	///
	/// False by default.
	virtual bool hasAndNotCompare(SDValue Y) const { return false; }

	/// Return true if the target has a bitwise and-not operation:
	/// X = ~A & B
	/// This can be used to simplify select or other instructions.
	/// By default a target that has the flag-setting and-not compare is assumed
	/// to have the plain operation too.
	virtual bool hasAndNot(SDValue X) const {
	  const bool HasCompareForm = hasAndNotCompare(X);
	  return HasCompareForm;
	}

	/// There are two ways to clear extreme bits (either low or high):
	/// Mask: x & (-1 << y) (the instcombine canonical form)
	/// Shifts: x >> y << y
	/// Return true if the variant with 2 variable shifts is preferred.
	/// Return false if there is no preference. The base implementation assumes
	/// that no one prefers shifts.
	virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const { return false; }

	/// Return true if it is profitable to fold a pair of shifts into a mask.
	/// This is usually true on most targets. But some targets, like Thumb1,
	/// have immediate shift instructions, but no immediate "and" instruction;
	/// this makes the fold unprofitable. True by default.
	virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N,
	                                               CombineLevel Level) const { return true; }

	/// Should we transform the IR-optimal check for whether a given truncation
	/// down into KeptBits would be truncating or not:
	/// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
	/// Into its more traditional form:
	/// ((%x << C) a>> C) dstcond %x
	/// Return true if we should transform.
	/// Return false if there is no preference. The base implementation assumes
	/// that no one prefers shifts.
	virtual bool shouldTransformSignedTruncationCheck(EVT XVT,
	                                                  unsigned KeptBits) const { return false; }

	/// These two forms are equivalent:
	/// sub %y, (xor %x, -1)
	/// add (add %x, 1), %y
	/// The variant with two add's is IR-canonical.
	/// Some targets may prefer one to the other. The base implementation
	/// assumes that everyone prefers the form with two add's.
	virtual bool preferIncOfAddToSubOfNot(EVT VT) const { return true; }

	/// Return true if the target wants to use the optimization that
	/// turns ext(promotableInst1(...(promotableInstN(load)))) into
	/// promotedInst1(...(promotedInstN(ext(load)))). Reports the value of
	/// EnableExtLdPromotion.
	bool enableExtLdPromotion() const {
	  return EnableExtLdPromotion;
	}

	/// Return true if the target can combine store(extractelement VectorTy,
	/// Idx).
	/// \p Cost[out] gives the cost of that transformation when this is true.
	virtual bool canCombineStoreAndExtract(Type VectorTy, Value Idx,
	unsigned &Cost) const {
	return false;
	}

	/// Return true if inserting a scalar into a variable element of an undef
	/// vector is more efficiently handled by splatting the scalar instead.
	/// False by default.
	virtual bool shouldSplatInsEltVarIndex(EVT) const { return false; }

	/// Return true if the target always benefits from combining into FMA for a
	/// given value type. This must typically return false on targets where FMA
	/// takes more cycles to execute than FADD. False by default.
	virtual bool enableAggressiveFMAFusion(EVT VT) const { return false; }

	/// Return the ValueType of the result of SETCC operations. Overridable;
	/// the base version is defined out of line.
	virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
	EVT VT) const;

	/// Return the ValueType for comparison libcalls. Comparison libcalls include
	/// floating point comparison calls, and Ordered/Unordered check calls on
	/// floating point numbers.
	virtual
	MVT::SimpleValueType getCmpLibcallReturnType() const;

	/// For targets without i1 registers, this gives the nature of the high-bits
	/// of boolean values held in types wider than i1.
	///
	/// "Boolean values" are special true/false values produced by nodes like
	/// SETCC and consumed (as the condition) by nodes like SELECT and BRCOND.
	/// Not to be confused with general values promoted from i1. Some cpus
	/// distinguish between vectors of boolean and scalars; the isVec parameter
	/// selects between the two kinds. For example on X86 a scalar boolean should
	/// be zero extended from i1, while the elements of a vector of booleans
	/// should be sign extended from i1.
	///
	/// Some cpus also treat floating point types the same way as they treat
	/// vectors instead of the way they treat scalars; isFloat selects the
	/// floating-point setting and is only consulted when isVec is false.
	BooleanContent getBooleanContents(bool isVec, bool isFloat) const {
	  if (isVec)
	    return BooleanVectorContents;
	  if (isFloat)
	    return BooleanFloatContents;
	  return BooleanContents;
	}

	/// Convenience overload: classify \p Type as vector and/or floating point
	/// and defer to the two-flag overload.
	BooleanContent getBooleanContents(EVT Type) const {
	  const bool IsVec = Type.isVector();
	  const bool IsFloat = Type.isFloatingPoint();
	  return getBooleanContents(IsVec, IsFloat);
	}

	/// Return target scheduling preference (the value of SchedPreferenceInfo).
	Sched::Preference getSchedulingPreference() const { return SchedPreferenceInfo; }

	/// Some schedulers, e.g. hybrid, can switch to different scheduling
	/// heuristics for different nodes. This function returns the preference (or
	/// none) for the given node; the base implementation returns Sched::None.
	virtual Sched::Preference getSchedulingPreference(SDNode *) const { return Sched::None; }

	/// Return the register class that should be used for the specified value
	/// type. The base implementation looks it up in RegClassForVT, asserts
	/// that an entry exists, and ignores \p isDivergent.
	virtual const TargetRegisterClass *
	getRegClassFor(MVT VT, bool isDivergent = false) const {
	  (void)isDivergent;
	  const TargetRegisterClass *const RegClass = RegClassForVT[VT.SimpleTy];
	  assert(RegClass && "This value type is not natively supported!");
	  return RegClass;
	}

	/// Allows target to decide about the register class of the
	/// specific value that is live outside the defining block.
	/// Returns true if the value needs uniform register class; false by
	/// default.
	virtual bool requiresUniformRegister(MachineFunction &MF,
	                                     const Value *) const { return false; }

	/// Return the 'representative' register class for the specified value
	/// type.
	///
	/// The 'representative' register class is the largest legal super-reg
	/// register class for the register class of the value type. For example, on
	/// i386 the rep register class for i8, i16, and i32 are GR32; while the rep
	/// register class is GR64 on x86_64. The base implementation reads the
	/// RepRegClassForVT table.
	virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
	  return RepRegClassForVT[VT.SimpleTy];
	}

	/// Return the cost of the 'representative' register class for the specified
	/// value type, as recorded in RepRegClassCostForVT.
	virtual uint8_t getRepRegClassCostFor(MVT VT) const {
	  const uint8_t Cost = RepRegClassCostForVT[VT.SimpleTy];
	  return Cost;
	}

	/// Return true if SHIFT instructions should be expanded to SHIFT_PARTS
	/// instructions, and false if a library call is preferred (e.g for code-size
	/// reasons). True by default.
	virtual bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { return true; }

	/// Return true if the target has native support for the specified value type.
	/// This means that it has a register that directly holds it without
	/// promotions or expansions. Concretely: \p VT is a simple type with a
	/// non-null entry in RegClassForVT.
	bool isTypeLegal(EVT VT) const {
	  // A simple type must index inside the RegClassForVT table.
	  assert(!VT.isSimple() ||
	         (unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegClassForVT));
	  return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr;
	}

	class ValueTypeActionImpl {
	/// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
	/// that indicates how instruction selection should deal with the type.
	LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE];

	public:
	ValueTypeActionImpl() {
	std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions),
	TypeLegal);
	}

	LegalizeTypeAction getTypeAction(MVT VT) const {
	return ValueTypeActions[VT.SimpleTy];
	}

	void setTypeAction(MVT VT, LegalizeTypeAction Action) {
	ValueTypeActions[VT.SimpleTy] = Action;
	}
	};

	/// Read-only access to the per-type legalization action table.
	const ValueTypeActionImpl &getValueTypeActions() const { return ValueTypeActions; }

	/// Return how we should legalize values of this type, either it is already
	/// legal (return 'Legal') or we need to promote it to a larger type (return
	/// 'Promote'), or we need to expand it into multiple registers of smaller
	/// integer type (return 'Expand'). 'Custom' is not an option.
	/// This is the action half of getTypeConversion()'s result.
	LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
	  const auto Conversion = getTypeConversion(Context, VT);
	  return Conversion.first;
	}
	/// Overload for simple types: read the action straight from the
	/// ValueTypeActions table.
	LegalizeTypeAction getTypeAction(MVT VT) const { return ValueTypeActions.getTypeAction(VT); }

	/// For types supported by the target, this is an identity function. For
	/// types that must be promoted to larger types, this returns the larger type
	/// to promote to. For integer types that are larger than the largest integer
	/// register, this contains one step in the expansion to get to the smaller
	/// register. For illegal floating point types, this returns the integer type
	/// to transform to.
	/// This is the type half of getTypeConversion()'s result.
	EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
	  const auto Conversion = getTypeConversion(Context, VT);
	  return Conversion.second;
	}

	/// For types supported by the target, this is an identity function. For
	/// types that must be expanded (i.e. integer types that are larger than the
	/// largest integer register or illegal floating point types), this returns
	/// the largest legal type it will be expanded to.
	/// \p VT must not be a vector. Any action other than TypeLegal or
	/// TypeExpandInteger met along the way is treated as unreachable.
	EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
	  assert(!VT.isVector());
	  // Apply one expansion step at a time until a legal type is reached.
	  for (;;) {
	    const LegalizeTypeAction Step = getTypeAction(Context, VT);
	    if (Step == TypeLegal)
	      return VT;
	    if (Step != TypeExpandInteger)
	      llvm_unreachable("Type is not legal nor is it to be expanded!");
	    VT = getTypeToTransformTo(Context, VT);
	  }
	}

	/// Vector types are broken down into some number of legal first class types.
	/// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8
	/// promoted EVT::f64 values with the X86 FP stack. Similarly, EVT::v2i64
	/// turns into 4 EVT::i32 values with both PPC and X86.
	///
	/// This method returns the number of registers needed, and the VT for each
	/// register (via \p RegisterVT). It also returns the VT and quantity of the
	/// intermediate values before they are promoted/expanded (via
	/// \p IntermediateVT and \p NumIntermediates).
	unsigned getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
	EVT &IntermediateVT,
	unsigned &NumIntermediates,
	MVT &RegisterVT) const;

	/// Certain targets such as MIPS require that some types such as vectors are
	/// always broken down into scalars in some contexts. This occurs even if the
	/// vector type is legal. The base implementation ignores \p CC and defers
	/// to getVectorTypeBreakdown().
	virtual unsigned getVectorTypeBreakdownForCallingConv(
	    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
	    unsigned &NumIntermediates, MVT &RegisterVT) const {
	  const unsigned NumRegs = getVectorTypeBreakdown(
	      Context, VT, IntermediateVT, NumIntermediates, RegisterVT);
	  return NumRegs;
	}

	/// Description of the memory access performed by an intrinsic; filled in
	/// through the IntrinsicInfo reference taken by getTgtMemIntrinsic().
	struct IntrinsicInfo {
	  unsigned opc = 0; // target opcode
	  EVT memVT; // memory VT

	  // value representing memory location
	  PointerUnion<const Value *, const PseudoSourceValue *> ptrVal;

	  int offset = 0; // offset off of ptrVal
	  unsigned size = 0; // the size of the memory location
	  // (taken from memVT if zero)
	  unsigned align = 1; // alignment

	  MachineMemOperand::Flags flags = MachineMemOperand::MONone;
	  IntrinsicInfo() = default;
	};

	/// Given an intrinsic, checks if on the target the intrinsic will need to map
	/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
	/// true and stores the intrinsic information into the IntrinsicInfo that was
	/// passed to the function. The base implementation returns false and leaves
	/// the IntrinsicInfo untouched.
	virtual bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
	                                MachineFunction &,
	                                unsigned /*Intrinsic*/) const {
	  return false;
	}

	/// Returns true if the target can instruction select the specified FP
	/// immediate natively. If false, the legalizer will materialize the FP
	/// immediate as a load from a constant pool.
	virtual bool isFPImmLegal(const APFloat & /Imm/, EVT /VT/,
	bool ForCodeSize = false) const {
	return false;
	}

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
	/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to be
	/// legal.
	virtual bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const {
	  return true;
	}

	/// Returns true if the operation can trap for the value type.
	///
	/// VT must be a legal type. By default, we optimistically assume most
	/// operations don't trap except for integer divide and remainder.
	/// Overridable; the base version is defined out of line.
	virtual bool canOpTrap(unsigned Op, EVT VT) const;

	/// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
	/// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
	/// constant pool entry. False by default.
	virtual bool isVectorClearMaskLegal(ArrayRef<int> /*Mask*/,
	                                    EVT /*VT*/) const {
	  return false;
	}

	/// Return how this operation should be treated: either it is legal, needs to
	/// be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	/// Extended types always yield Expand; opcodes beyond the OpActions table
	/// yield Custom.
	LegalizeAction getOperationAction(unsigned Op, EVT VT) const {
	  if (VT.isExtended())
	    return Expand;
	  // If a target-specific SDNode requires legalization, require the target
	  // to provide custom legalization for it.
	  if (Op >= array_lengthof(OpActions[0]))
	    return Custom;
	  const unsigned TypeIdx = (unsigned)VT.getSimpleVT().SimpleTy;
	  return OpActions[TypeIdx][Op];
	}

	/// Custom method defined by each target to indicate if an operation which
	/// may require a scale is supported natively by the target.
	/// If not, the operation is illegal. The base implementation reports no
	/// support.
	virtual bool isSupportedFixedPointOperation(unsigned Op, EVT VT,
	                                            unsigned Scale) const { return false; }

	/// Some fixed point operations may be natively supported by the target but
	/// only for specific scales. This method allows for checking
	/// if the width is supported by the target for a given operation that may
	/// depend on scale.
	/// A non-Legal action from getOperationAction() is returned as is; a Legal
	/// one is downgraded to Expand when the scale is not supported. Only
	/// SMULFIX, SMULFIXSAT and UMULFIX are expected once the action is Legal.
	LegalizeAction getFixedPointOperationAction(unsigned Op, EVT VT,
	                                            unsigned Scale) const {
	  const auto Action = getOperationAction(Op, VT);
	  if (Action != Legal)
	    return Action;

	  switch (Op) {
	  case ISD::SMULFIX:
	  case ISD::SMULFIXSAT:
	  case ISD::UMULFIX:
	    // This operation is supported in this type but may only work on
	    // specific scales.
	    return isSupportedFixedPointOperation(Op, VT, Scale) ? Action : Expand;
	  default:
	    llvm_unreachable("Unexpected fixed point operation.");
	  }
	}

	/// Return the action for a STRICT_* FP pseudo-opcode: map it to its
	/// non-strict equivalent and look that up with getOperationAction(). Any
	/// result other than Legal is reported as Expand.
	LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const {
	  unsigned EqOpc;
	  switch (Op) {
	  case ISD::STRICT_FADD: EqOpc = ISD::FADD; break;
	  case ISD::STRICT_FSUB: EqOpc = ISD::FSUB; break;
	  case ISD::STRICT_FMUL: EqOpc = ISD::FMUL; break;
	  case ISD::STRICT_FDIV: EqOpc = ISD::FDIV; break;
	  case ISD::STRICT_FREM: EqOpc = ISD::FREM; break;
	  case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
	  case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
	  case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
	  case ISD::STRICT_FMA: EqOpc = ISD::FMA; break;
	  case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
	  case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
	  case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
	  case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break;
	  case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
	  case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
	  case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
	  case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
	  case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
	  case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break;
	  case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break;
	  case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break;
	  case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break;
	  case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break;
	  case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break;
	  case ISD::STRICT_FP_ROUND: EqOpc = ISD::FP_ROUND; break;
	  case ISD::STRICT_FP_EXTEND: EqOpc = ISD::FP_EXTEND; break;
	  default: llvm_unreachable("Unexpected FP pseudo-opcode");
	  }

	  // We don't currently handle Custom or Promote for strict FP pseudo-ops.
	  // For now, we just expand for those cases.
	  return getOperationAction(EqOpc, VT) == Legal ? Legal : Expand;
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal with custom lowering. This is used to help guide high-level
	/// lowering decisions.
	bool isOperationLegalOrCustom(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Custom);
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal using promotion. This is used to help guide high-level lowering
	/// decisions.
	bool isOperationLegalOrPromote(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Promote);
	}

	/// Return true if the specified operation is legal on this target or can be
	/// made legal with custom lowering or using promotion. This is used to help
	/// guide high-level lowering decisions.
	bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const {
	return (VT == MVT::Other \|\| isTypeLegal(VT)) &&
	(getOperationAction(Op, VT) == Legal \|\|
	getOperationAction(Op, VT) == Custom \|\|
	getOperationAction(Op, VT) == Promote);
	}

	/// Return true if the operation uses custom lowering, regardless of whether
	/// the type is legal or not.
	bool isOperationCustom(unsigned Op, EVT VT) const {
	return getOperationAction(Op, VT) == Custom;
	}

	/// Return true if lowering to a jump table is allowed. The base
	/// implementation refuses when \p Fn carries the "no-jump-tables"
	/// attribute with value "true", and otherwise requires BR_JT or BRIND to
	/// be legal or custom for MVT::Other.
	virtual bool areJTsAllowed(const Function *Fn) const {
	  if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
	    return false;

	  return isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
	         isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
	}

	/// Check whether the range [Low,High] fits in a machine word.
	bool rangeFitsInWord(const APInt &Low, const APInt &High,
	const DataLayout &DL) const {
	// FIXME: Using the pointer type doesn't seem ideal.
	uint64_t BW = DL.getIndexSizeInBits(0u);
	uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
	return Range <= BW;
	}

	/// Return true if lowering to a jump table is suitable for a set of case
	/// clusters which may contain \p NumCases cases, \p Range range of values.
	virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
	                                    uint64_t Range) const {
	  // FIXME: This function checks the maximum table size and density, but the
	  // minimum size is not checked. It would be nice if the minimum size is
	  // also combined within this function. Currently, the minimum size check is
	  // performed in findJumpTable() in SelectionDAGBuilder and
	  // getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
	  const bool OptForSize = SI->getParent()->getParent()->hasOptSize();
	  const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
	  const unsigned MaxJumpTableSize = getMaximumJumpTableSize();

	  // Check whether the number of cases is small enough and
	  // the range is dense enough for a jump table. The size limit is not
	  // applied when optimizing for size.
	  return (OptForSize || Range <= MaxJumpTableSize) &&
	         (NumCases * 100 >= Range * MinDensity);
	}

	/// Return true if lowering to a bit test is suitable for a set of case
	/// clusters which contains \p NumDests unique destinations, \p Low and
	/// \p High as its lowest and highest case values, and expects \p NumCmps
	/// case value comparisons. Check if the number of destinations, comparison
	/// metric, and range are all suitable.
	bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
	                           const APInt &Low, const APInt &High,
	                           const DataLayout &DL) const {
	  // FIXME: I don't think NumCmps is the correct metric: a single case and a
	  // range of cases both require only one branch to lower. Just looking at the
	  // number of clusters and destinations should be enough to decide whether to
	  // build bit tests.
	  // To lower a range with bit tests, the range must fit the bitwidth of a
	  // machine word.
	  if (!rangeFitsInWord(Low, High, DL))
	    return false;
	  // Decide whether it's profitable to lower this range with bit tests. Each
	  // destination requires a bit test and branch, and there is an overall range
	  // check branch. For a small number of clusters, separate comparisons might
	  // be cheaper, and for many destinations, splitting the range might be
	  // better.
	  switch (NumDests) {
	  case 1:
	    return NumCmps >= 3;
	  case 2:
	    return NumCmps >= 5;
	  case 3:
	    return NumCmps >= 6;
	  default:
	    // Zero or more than three destinations: never use bit tests.
	    return false;
	  }
	}

	/// Return true if the specified operation is illegal on this target or
	/// unlikely to be made legal with custom lowering. This is used to help guide
	/// high-level lowering decisions.
	bool isOperationExpand(unsigned Op, EVT VT) const {
	  // An illegal type counts as "expand" without consulting the action.
	  if (!isTypeLegal(VT))
	    return true;
	  return getOperationAction(Op, VT) == Expand;
	}

	/// Return true if the specified operation is legal on this target.
	bool isOperationLegal(unsigned Op, EVT VT) const {
	  // MVT::Other bypasses the type-legality check; other types must be legal.
	  if (!(VT == MVT::Other || isTypeLegal(VT)))
	    return false;
	  return getOperationAction(Op, VT) == Legal;
	}

	/// Return how this load with extension should be treated: either it is legal,
	/// needs to be promoted to a larger size, needs to be expanded to some other
	/// code sequence, or the target has a custom expander for it.
	LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT,
	                                EVT MemVT) const {
	  // The table is indexed by simple value types; extended types are expanded.
	  if (ValVT.isExtended() || MemVT.isExtended())
	    return Expand;
	  const unsigned ValI = (unsigned)ValVT.getSimpleVT().SimpleTy;
	  const unsigned MemI = (unsigned)MemVT.getSimpleVT().SimpleTy;
	  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE &&
	         MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!");
	  // Each table entry holds one 4-bit action per extension type.
	  const unsigned Shift = 4 * ExtType;
	  return (LegalizeAction)((LoadExtActions[ValI][MemI] >> Shift) & 0xf);
	}

	/// Return true if the specified load with extension is legal on this target.
	bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
	  const auto Action = getLoadExtAction(ExtType, ValVT, MemVT);
	  return Action == Legal;
	}

	/// Return true if the specified load with extension is legal or custom
	/// on this target.
	bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
	  // Query the table once and test the result against both accepted actions.
	  const auto Action = getLoadExtAction(ExtType, ValVT, MemVT);
	  return Action == Legal || Action == Custom;
	}

	/// Return how this store with truncation should be treated: either it is
	/// legal, needs to be promoted to a larger size, needs to be expanded to some
	/// other code sequence, or the target has a custom expander for it.
	LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const {
	  // The table is indexed by simple value types; extended types are expanded.
	  if (ValVT.isExtended() || MemVT.isExtended())
	    return Expand;
	  const unsigned ValI = (unsigned)ValVT.getSimpleVT().SimpleTy;
	  const unsigned MemI = (unsigned)MemVT.getSimpleVT().SimpleTy;
	  assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE &&
	         "Table isn't big enough!");
	  return TruncStoreActions[ValI][MemI];
	}

	/// Return true if the specified store with truncation is legal on this
	/// target.
	bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const {
	  // The stored value's type must be legal before the action is consulted.
	  if (!isTypeLegal(ValVT))
	    return false;
	  return getTruncStoreAction(ValVT, MemVT) == Legal;
	}

	/// Return true if the specified store with truncation is legal or has a
	/// custom lowering on this target.
	bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
	  // The stored value's type must be legal before the action is consulted.
	  if (!isTypeLegal(ValVT))
	    return false;
	  // Query the table once and test against both accepted actions.
	  const auto Action = getTruncStoreAction(ValVT, MemVT);
	  return Action == Legal || Action == Custom;
	}

	/// Return how the indexed load should be treated: either it is legal, needs
	/// to be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	LegalizeAction
	getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
	  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
	         "Table isn't big enough!");
	  // The load action is read from the high nibble of the table entry.
	  const unsigned TypeIdx = (unsigned)VT.SimpleTy;
	  const auto Packed = IndexedModeActions[TypeIdx][IdxMode];
	  return (LegalizeAction)((Packed & 0xf0) >> 4);
	}

	/// Return true if the specified indexed load is legal or custom on this
	/// target.
	bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const {
	  // getIndexedLoadAction() takes an MVT, so non-simple types are rejected.
	  if (!VT.isSimple())
	    return false;
	  // Both Legal and Custom count as "legal" here; query the table once.
	  const auto Action = getIndexedLoadAction(IdxMode, VT.getSimpleVT());
	  return Action == Legal || Action == Custom;
	}

	/// Return how the indexed store should be treated: either it is legal, needs
	/// to be promoted to a larger size, needs to be expanded to some other code
	/// sequence, or the target has a custom expander for it.
	LegalizeAction
	getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
	  assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
	         "Table isn't big enough!");
	  // The store action is read from the low nibble of the table entry.
	  const unsigned TypeIdx = (unsigned)VT.SimpleTy;
	  const auto Packed = IndexedModeActions[TypeIdx][IdxMode];
	  return (LegalizeAction)(Packed & 0x0f);
	}

	/// Return true if the specified indexed store is legal or custom on this
	/// target.
	bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const {
	  // getIndexedStoreAction() takes an MVT, so non-simple types are rejected.
	  if (!VT.isSimple())
	    return false;
	  // Both Legal and Custom count as "legal" here; query the table once.
	  const auto Action = getIndexedStoreAction(IdxMode, VT.getSimpleVT());
	  return Action == Legal || Action == Custom;
	}

	/// Return how the condition code should be treated: either it is legal, needs
	/// to be expanded to some other code sequence, or the target has a custom
	/// expander for it.
	LegalizeAction
	getCondCodeAction(ISD::CondCode CC, MVT VT) const {
	  assert((unsigned)CC < array_lengthof(CondCodeActions) &&
	         ((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) &&
	         "Table isn't big enough!");
	  // See setCondCodeAction for how this is encoded. SimpleTy >> 3 selects the
	  // 32-bit word and the low three bits of SimpleTy select the nibble in it.
	  const uint32_t Word = CondCodeActions[CC][VT.SimpleTy >> 3];
	  const uint32_t NibbleShift = 4 * (VT.SimpleTy & 0x7);
	  const LegalizeAction Result = (LegalizeAction)((Word >> NibbleShift) & 0xF);
	  assert(Result != Promote && "Can't promote condition code!");
	  return Result;
	}

	/// Return true if the specified condition code is legal on this target.
	bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const {
	  const auto Action = getCondCodeAction(CC, VT);
	  return Action == Legal;
	}

	/// Return true if the specified condition code is legal or custom on this
	/// target.
	bool isCondCodeLegalOrCustom(ISD::CondCode CC, MVT VT) const {
	  // Query the table once and test the result against both accepted actions.
	  const auto Action = getCondCodeAction(CC, VT);
	  return Action == Legal || Action == Custom;
	}

	/// If the action for this operation is to promote, this method returns the
	/// ValueType to promote to.
	MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
	  assert(getOperationAction(Op, VT) == Promote &&
	         "This operation isn't promoted!");

	  // An explicitly registered promotion target takes precedence.
	  const auto PTTI = PromoteToType.find(std::make_pair(Op, VT.SimpleTy));
	  if (PTTI != PromoteToType.end())
	    return PTTI->second;

	  assert((VT.isInteger() || VT.isFloatingPoint()) &&
	         "Cannot autopromote this type, add it with AddPromotedToType.");

	  // Otherwise step through the following simple value types until one is
	  // legal and its own action for Op is not Promote.
	  MVT NVT = VT;
	  do {
	    NVT = (MVT::SimpleValueType)(NVT.SimpleTy + 1);
	    assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid &&
	           "Didn't find type to promote to!");
	  } while (!isTypeLegal(NVT) || getOperationAction(Op, NVT) == Promote);
	  return NVT;
	}

	/// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM
	/// operations except for the pointer size. If AllowUnknown is true, this
	/// will return MVT::Other for types with no EVT counterpart (e.g. structs),
	/// otherwise it will assert.
	EVT getValueType(const DataLayout &DL, Type *Ty,
	                 bool AllowUnknown = false) const {
	  // Scalar pointers map to the native pointer type of their address space.
	  if (auto *ScalarPtrTy = dyn_cast<PointerType>(Ty))
	    return getPointerTy(DL, ScalarPtrTy->getAddressSpace());

	  // Everything that is not a vector goes straight to EVT::getEVT.
	  auto *VecTy = dyn_cast<VectorType>(Ty);
	  if (!VecTy)
	    return EVT::getEVT(Ty, AllowUnknown);

	  // Vectors of pointers become vectors of the native pointer type.
	  Type *ElemTy = VecTy->getElementType();
	  if (auto *ElemPtrTy = dyn_cast<PointerType>(ElemTy)) {
	    EVT NativePtrVT(getPointerTy(DL, ElemPtrTy->getAddressSpace()));
	    ElemTy = NativePtrVT.getTypeForEVT(Ty->getContext());
	  }
	  return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(ElemTy, false),
	                          VecTy->getNumElements());
	}

	EVT getMemValueType(const DataLayout &DL, Type *Ty,
	                    bool AllowUnknown = false) const {
	  // Scalar pointers map to the in-memory pointer type of their address space.
	  if (auto *ScalarPtrTy = dyn_cast<PointerType>(Ty))
	    return getPointerMemTy(DL, ScalarPtrTy->getAddressSpace());

	  // Non-vector types are handled exactly like getValueType.
	  auto *VecTy = dyn_cast<VectorType>(Ty);
	  if (!VecTy)
	    return getValueType(DL, Ty, AllowUnknown);

	  // Vectors of pointers become vectors of the in-memory pointer type.
	  Type *ElemTy = VecTy->getElementType();
	  if (auto *ElemPtrTy = dyn_cast<PointerType>(ElemTy)) {
	    EVT MemPtrVT(getPointerMemTy(DL, ElemPtrTy->getAddressSpace()));
	    ElemTy = MemPtrVT.getTypeForEVT(Ty->getContext());
	  }
	  return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(ElemTy, false),
	                          VecTy->getNumElements());
	}


	/// Return the MVT corresponding to this LLVM type. See getValueType.
	MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
	                       bool AllowUnknown = false) const {
	  // Compute the EVT first, then take its simple (MVT) form.
	  const EVT VT = getValueType(DL, Ty, AllowUnknown);
	  return VT.getSimpleVT();
	}

	/// Return the desired alignment for ByVal or InAlloca aggregate function
	/// arguments in the caller parameter area. This is the actual alignment, not
	/// its logarithm.
	virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;

	/// Return the type of registers that this ValueType will eventually require.
	MVT getRegisterType(MVT VT) const {
	  // Direct table lookup keyed by the simple value type.
	  const unsigned TypeIdx = (unsigned)VT.SimpleTy;
	  assert(TypeIdx < array_lengthof(RegisterTypeForVT));
	  return RegisterTypeForVT[TypeIdx];
	}

	/// Return the type of registers that this ValueType will eventually require.
	MVT getRegisterType(LLVMContext &Context, EVT VT) const {
	  // Simple types are answered straight from the table.
	  if (VT.isSimple()) {
	    const MVT Simple = VT.getSimpleVT();
	    assert((unsigned)Simple.SimpleTy < array_lengthof(RegisterTypeForVT));
	    return RegisterTypeForVT[Simple.SimpleTy];
	  }
	  // Extended vectors: use the register type reported by the breakdown.
	  if (VT.isVector()) {
	    EVT IntermediateVT;
	    MVT RegVT;
	    unsigned NumIntermediates;
	    (void)getVectorTypeBreakdown(Context, VT, IntermediateVT,
	                                 NumIntermediates, RegVT);
	    return RegVT;
	  }
	  // Extended integers: recurse on the type this one is transformed to.
	  if (VT.isInteger())
	    return getRegisterType(Context, getTypeToTransformTo(Context, VT));
	  llvm_unreachable("Unsupported extended type!");
	}

	/// Return the number of registers that this ValueType will eventually
	/// require.
	///
	/// This is one for any types promoted to live in larger registers, but may be
	/// more than one for types (like i64) that are split into pieces. For types
	/// like i140, which are first promoted then expanded, it is the number of
	/// registers needed to hold all the bits of the original type. For an i140
	/// on a 32 bit machine this means 5 registers.
	unsigned getNumRegisters(LLVMContext &Context, EVT VT) const {
	  // Simple types are answered straight from the table.
	  if (VT.isSimple()) {
	    const MVT Simple = VT.getSimpleVT();
	    assert((unsigned)Simple.SimpleTy < array_lengthof(NumRegistersForVT));
	    return NumRegistersForVT[Simple.SimpleTy];
	  }
	  // Extended vectors: the breakdown's return value is the register count.
	  if (VT.isVector()) {
	    EVT IntermediateVT;
	    MVT RegVT;
	    unsigned NumIntermediates;
	    return getVectorTypeBreakdown(Context, VT, IntermediateVT,
	                                  NumIntermediates, RegVT);
	  }
	  // Extended integers: divide the bit width by the register width,
	  // rounding up.
	  if (VT.isInteger()) {
	    const unsigned TotalBits = VT.getSizeInBits();
	    const unsigned BitsPerReg = getRegisterType(Context, VT).getSizeInBits();
	    return (TotalBits + BitsPerReg - 1) / BitsPerReg;
	  }
	  llvm_unreachable("Unsupported extended type!");
	}

	/// Certain combinations of ABIs, Targets and features require that types
	/// are legal for some operations and not for other operations.
	/// For MIPS all vector types must be passed through the integer register set.
	virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
	                                          CallingConv::ID CC, EVT VT) const {
	  // Default: CC is ignored and the ordinary register type is returned.
	  const MVT RegVT = getRegisterType(Context, VT);
	  return RegVT;
	}

	/// Certain targets require unusual breakdowns of certain types. For MIPS,
	/// this occurs when a vector type is used, as vectors are passed through the
	/// integer register set.
	virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
	                                               CallingConv::ID CC,
	                                               EVT VT) const {
	  // Default: CC is ignored and the ordinary register count is returned.
	  const unsigned NumRegs = getNumRegisters(Context, VT);
	  return NumRegs;
	}

	/// Certain targets have context-sensitive alignment requirements, where one
	/// type has the alignment requirement of another type.
	virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy,
	                                               DataLayout DL) const {
	  // Default: the ABI alignment the data layout reports for ArgTy.
	  const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
	  return ABIAlign;
	}

	/// If true, then instruction selection should seek to shrink the FP constant
	/// of the specified type to a smaller type in order to save space and / or
	/// reduce runtime.
	virtual bool ShouldShrinkFPConstant(EVT) const {
	  // Default: shrink for every type (the argument is ignored).
	  return true;
	}

	/// Return true if it is profitable to reduce a load to a smaller type.
	/// Example: (i16 (trunc (i32 (load x)))) -> (i16 (load x))
	virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
	                                   EVT NewVT) const {
	  // By default, assume that it is cheaper to extract a subvector from a wide
	  // vector load rather than creating multiple narrow vector loads. So the
	  // only rejected case is a vector result whose load has more than one use.
	  return !NewVT.isVector() || Load->hasOneUse();
	}

	/// When splitting a value of the specified type into parts, does the Lo
	/// or Hi part come first? This usually follows the endianness, except
	/// for ppcf128, where the Hi part always comes first.
	bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const {
	  // Big-endian layouts always qualify.
	  if (DL.isBigEndian())
	    return true;
	  // ppcf128 qualifies regardless of the data layout's endianness.
	  return VT == MVT::ppcf128;
	}

	/// If true, the target has custom DAG combine transformations that it can
	/// perform for the specified node.
	bool hasTargetDAGCombine(ISD::NodeType NT) const {
	  // One bit per node type: NT >> 3 picks the array element and NT & 7 picks
	  // the bit inside it.
	  const unsigned Slot = unsigned(NT >> 3);
	  assert(Slot < array_lengthof(TargetDAGCombineArray));
	  return TargetDAGCombineArray[Slot] & (1 << (NT & 7));
	}

	/// Return the stored GatherAllAliasesMaxDepth value.
	unsigned getGatherAllAliasesMaxDepth() const {
	  const unsigned Depth = GatherAllAliasesMaxDepth;
	  return Depth;
	}

	/// Returns the size of the platform's va_list object.
	virtual unsigned getVaListSizeInBits(const DataLayout &DL) const {
	  // Default: the bit width of the target's pointer type.
	  const auto PtrVT = getPointerTy(DL);
	  return PtrVT.getSizeInBits();
	}

	/// Get maximum # of store operations permitted for llvm.memset
	///
	/// This function returns the maximum number of store operations permitted
	/// to replace a call to llvm.memset. The value is set by the target at the
	/// performance threshold for such a replacement. If OptSize is true,
	/// return the limit for functions that have OptSize attribute.
	unsigned getMaxStoresPerMemset(bool OptSize) const {
	  // Size-optimized functions use the dedicated OptSize limit.
	  if (OptSize)
	    return MaxStoresPerMemsetOptSize;
	  return MaxStoresPerMemset;
	}

	/// Get maximum # of store operations permitted for llvm.memcpy
	///
	/// This function returns the maximum number of store operations permitted
	/// to replace a call to llvm.memcpy. The value is set by the target at the
	/// performance threshold for such a replacement. If OptSize is true,
	/// return the limit for functions that have OptSize attribute.
	unsigned getMaxStoresPerMemcpy(bool OptSize) const {
	  // Size-optimized functions use the dedicated OptSize limit.
	  if (OptSize)
	    return MaxStoresPerMemcpyOptSize;
	  return MaxStoresPerMemcpy;
	}

	/// \brief Get maximum # of store operations to be glued together
	///
	/// This function returns the maximum number of store operations permitted
	/// to glue together during lowering of llvm.memcpy. The value is set by
	/// the target at the performance threshold for such a replacement.
	virtual unsigned getMaxGluedStoresPerMemcpy() const {
	  // Default: return the stored MaxGluedStoresPerMemcpy value.
	  const unsigned Limit = MaxGluedStoresPerMemcpy;
	  return Limit;
	}

	/// Get maximum # of load operations permitted for memcmp
	///
	/// This function returns the maximum number of load operations permitted
	/// to replace a call to memcmp. The value is set by the target at the
	/// performance threshold for such a replacement. If OptSize is true,
	/// return the limit for functions that have OptSize attribute.
	unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
	  // Size-optimized functions use the dedicated OptSize limit.
	  if (OptSize)
	    return MaxLoadsPerMemcmpOptSize;
	  return MaxLoadsPerMemcmp;
	}

	/// Get maximum # of store operations permitted for llvm.memmove
	///
	/// This function returns the maximum number of store operations permitted
	/// to replace a call to llvm.memmove. The value is set by the target at the
	/// performance threshold for such a replacement. If OptSize is true,
	/// return the limit for functions that have OptSize attribute.
	unsigned getMaxStoresPerMemmove(bool OptSize) const {
	  // Size-optimized functions use the dedicated OptSize limit.
	  if (OptSize)
	    return MaxStoresPerMemmoveOptSize;
	  return MaxStoresPerMemmove;
	}

	/// Determine if the target supports unaligned memory accesses.
	///
	/// This function returns true if the target allows unaligned memory accesses
	/// of the specified type in the given address space. If true, it also returns
	/// whether the unaligned memory access is "fast" in the last argument by
	/// reference. This is used, for example, in situations where an array
	/// copy/move/set is converted to a sequence of store operations. Its use
	/// helps to ensure that such replacements don't generate code that causes an
	/// alignment error (trap) on the target machine.
	virtual bool allowsMisalignedMemoryAccesses(
	    EVT, unsigned AddrSpace = 0, unsigned Align = 1,
	    MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
	    bool * /*Fast*/ = nullptr) const {
	  // Default: misaligned accesses are not allowed; the Fast out-parameter is
	  // left untouched.
	  return false;
	}

	/// Return true if the target supports a memory access of this type for the
	/// given address space and alignment. If the access is allowed, the optional
	/// final parameter returns if the access is also fast (as defined by the
	/// target).
	bool
	allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
	unsigned AddrSpace = 0, unsigned Alignment = 1,
	MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
	bool *Fast = nullptr) const;

	/// Return true if the target supports a memory access of this type for the
	/// given MachineMemOperand. If the access is allowed, the optional
	/// final parameter returns if the access is also fast (as defined by the
	/// target).
	bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
	const MachineMemOperand &MMO,
	bool *Fast = nullptr) const;

	/// Returns the target specific optimal type for load and store operations as
	/// a result of memset, memcpy, and memmove lowering.
	///
	/// If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero, there isn't a need to check
	/// it against an alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	/// not need to be loaded. It returns MVT::Other if the type should be
	/// determined using generic target-independent logic.
	virtual EVT
	getOptimalMemOpType(uint64_t /*Size*/, unsigned /*DstAlign*/,
	                    unsigned /*SrcAlign*/, bool /*IsMemset*/,
	                    bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/,
	                    const AttributeList & /*FuncAttributes*/) const {
	  // Default: express no preference; every argument is ignored.
	  return MVT::Other;
	}

	/// Returns true if it's safe to use load / store of the specified type to
	/// expand memcpy / memset inline.
	///
	/// This is mostly true for all types except for some special cases. For
	/// example, on X86 targets without SSE2 f64 load / store are done with fldl /
	/// fstpl which also does type conversion. Note the specified type doesn't
	/// have to be legal as the hook is used before type legalization.
	virtual bool isSafeMemOpType(MVT /*VT*/) const {
	  // Default: every type is considered safe (the argument is ignored).
	  return true;
	}

	/// Determine if we should use _setjmp or setjmp to implement llvm.setjmp.
	bool usesUnderscoreSetJmp() const {
	  const bool UseUnderscore = UseUnderscoreSetJmp;
	  return UseUnderscore;
	}

	/// Determine if we should use _longjmp or longjmp to implement llvm.longjmp.
	bool usesUnderscoreLongJmp() const {
	  const bool UseUnderscore = UseUnderscoreLongJmp;
	  return UseUnderscore;
	}

	/// Return lower limit for number of blocks in a jump table.
	virtual unsigned getMinimumJumpTableEntries() const;

	/// Return lower limit of the density in a jump table.
	unsigned getMinimumJumpTableDensity(bool OptForSize) const;

	/// Return upper limit for number of entries in a jump table.
	/// Zero if no limit.
	unsigned getMaximumJumpTableSize() const;

	/// Default: true exactly when TM reports position-independent code.
	virtual bool isJumpTableRelative() const {
	  const bool IsPIC = TM.isPositionIndependent();
	  return IsPIC;
	}

	/// If a physical register, this specifies the register that
	/// llvm.savestack/llvm.restorestack should save and restore.
	unsigned getStackPointerRegisterToSaveRestore() const {
	  const unsigned Reg = StackPointerRegisterToSaveRestore;
	  return Reg;
	}

	/// If a physical register, this returns the register that receives the
	/// exception address on entry to an EH pad.
	virtual unsigned
	getExceptionPointerRegister(const Constant *PersonalityFn) const {
	  // 0 is guaranteed to be the NoRegister value on all targets
	  const unsigned NoRegister = 0;
	  return NoRegister;
	}

	/// If a physical register, this returns the register that receives the
	/// exception typeid on entry to a landing pad.
	virtual unsigned
	getExceptionSelectorRegister(const Constant *PersonalityFn) const {
	  // 0 is guaranteed to be the NoRegister value on all targets
	  const unsigned NoRegister = 0;
	  return NoRegister;
	}

	/// Default implementation: reports a fatal error stating that funclet EH is
	/// not implemented for this target. There is no return statement in this
	/// body.
	virtual bool needsFixedCatchObjects() const {
	report_fatal_error("Funclet EH is not implemented for this target");
	}

	/// Returns the target's jmp_buf size in bytes (if never set, the default is
	/// 200)
	unsigned getJumpBufSize() const {
	  const unsigned Size = JumpBufSize;
	  return Size;
	}

	/// Returns the target's jmp_buf alignment in bytes (if never set, the default
	/// is 0)
	unsigned getJumpBufAlignment() const {
	  const unsigned Alignment = JumpBufAlignment;
	  return Alignment;
	}

	/// Return the minimum stack alignment of an argument.
	unsigned getMinStackArgumentAlignment() const {
	  const unsigned Alignment = MinStackArgumentAlignment;
	  return Alignment;
	}

	/// Return the minimum function alignment.
	unsigned getMinFunctionAlignment() const {
	  const unsigned Alignment = MinFunctionAlignment;
	  return Alignment;
	}

	/// Return the preferred function alignment.
	unsigned getPrefFunctionAlignment() const {
	  const unsigned Alignment = PrefFunctionAlignment;
	  return Alignment;
	}

	/// Return the preferred loop alignment.
	virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const {
	  // Default: the same stored value for every loop; ML is ignored.
	  const unsigned Alignment = PrefLoopAlignment;
	  return Alignment;
	}

	/// Should loops be aligned even when the function is marked OptSize (but not
	/// MinSize).
	virtual bool alignLoopsWithOptSize() const { return false; }

	/// If the target has a standard location for the stack protector guard,
	/// returns the address of that location. Otherwise, returns nullptr.
	/// DEPRECATED: please override useLoadStackGuardNode and customize
	/// LOAD_STACK_GUARD, or customize \@llvm.stackguard().
	virtual Value *getIRStackGuard(IRBuilder<> &IRB) const;

	/// Inserts necessary declarations for SSP (stack protection) purpose.
	/// Should be used only when getIRStackGuard returns nullptr.
	virtual void insertSSPDeclarations(Module &M) const;

	/// Return the variable that's previously inserted by insertSSPDeclarations,
	/// if any, otherwise return nullptr. Should be used only when
	/// getIRStackGuard returns nullptr.
	virtual Value *getSDagStackGuard(const Module &M) const;

	/// If this function returns true, stack protection checks should XOR the
	/// frame pointer (or whichever pointer is used to address locals) into the
	/// stack guard value before checking it. getIRStackGuard must return nullptr
	/// if this returns true.
	virtual bool useStackGuardXorFP() const {
	  // Default: no XOR with the frame pointer.
	  return false;
	}

	/// If the target has a standard stack protection check function that
	/// performs validation and error handling, returns the function. Otherwise,
	/// returns nullptr. Must be previously inserted by insertSSPDeclarations.
	/// Should be used only when getIRStackGuard returns nullptr.
	virtual Function *getSSPStackGuardCheck(const Module &M) const;

	protected:
	Value *getDefaultSafeStackPointerLocation(IRBuilder<> &IRB,
	bool UseTLS) const;

	public:
	/// Returns the target-specific address of the unsafe stack pointer.
	virtual Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const;

	/// Returns the name of the symbol used to emit stack probes or the empty
	/// string if not applicable.
	virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const {
	  // Default: the empty string; MF is ignored.
	  return "";
	}

	/// Returns true if a cast between SrcAS and DestAS is a noop.
	virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
	  // Default: no address-space cast is treated as a no-op.
	  return false;
	}

	/// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we
	/// are happy to sink it into basic blocks. A cast may be free, but not
	/// necessarily a no-op. e.g. a free truncate from a 64-bit to 32-bit pointer.
	virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
	  // Default: a cast is free exactly when it is a no-op.
	  const bool IsNoop = isNoopAddrSpaceCast(SrcAS, DestAS);
	  return IsNoop;
	}

	/// Return true if the pointer arguments to CI should be aligned by aligning
	/// the object whose address is being passed. If so then MinSize is set to the
	/// minimum size the object must be to be aligned and PrefAlign is set to the
	/// preferred alignment.
	virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/,
	                                    unsigned & /*PrefAlign*/) const {
	  // Default: never request alignment; neither out-parameter is written.
	  return false;
	}

	//===--------------------------------------------------------------------===//
	/// \name Helpers for TargetTransformInfo implementations
	/// @{

	/// Get the ISD node that corresponds to the Instruction class opcode.
	int InstructionOpcodeToISD(unsigned Opcode) const;

	/// Estimate the cost of type-legalization and the legalized type.
	std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
	Type *Ty) const;

	/// @}

	//===--------------------------------------------------------------------===//
	/// \name Helpers for atomic expansion.
	/// @{

	/// Returns the maximum atomic operation size (in bits) supported by
	/// the backend. Atomic operations greater than this size (as well
	/// as ones that are not naturally aligned), will be expanded by
	/// AtomicExpandPass into an __atomic_* library call.
	unsigned getMaxAtomicSizeInBitsSupported() const {
	  const unsigned MaxBits = MaxAtomicSizeInBitsSupported;
	  return MaxBits;
	}

	/// Returns the size of the smallest cmpxchg or ll/sc instruction
	/// the backend supports. Any smaller operations are widened in
	/// AtomicExpandPass.
	///
	/// Note that unlike operations above the maximum size, atomic ops
	/// are still natively supported below the minimum; they just
	/// require a more complex expansion.
	unsigned getMinCmpXchgSizeInBits() const {
	  return MinCmpXchgSizeInBits;
	}

	/// Whether the target supports unaligned atomic operations.
	bool supportsUnalignedAtomics() const {
	  return SupportsUnalignedAtomics;
	}

	/// Whether AtomicExpandPass should automatically insert fences and reduce
	/// ordering for this atomic. This should be true for most architectures with
	/// weak memory ordering. Defaults to false.
	virtual bool shouldInsertFencesForAtomic(const Instruction *I) const {
	  // Default: no automatic fence insertion; I is ignored.
	  return false;
	}

	/// Perform a load-linked operation on Addr, returning a "Value *" with the
	/// corresponding pointee type. This may entail some non-trivial operations to
	/// truncate or reconstruct types that will be illegal in the backend. See
	/// ARMISelLowering for an example implementation.
	virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                              AtomicOrdering Ord) const {
	  // Default: unimplemented; reaching this is treated as unreachable.
	  llvm_unreachable("Load linked unimplemented on this target");
	}

	/// Perform a store-conditional operation to Addr. Return the status of the
	/// store. This should be 0 if the store succeeded, non-zero otherwise.
	virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
	                                    Value *Addr, AtomicOrdering Ord) const {
	  // Default: unimplemented; reaching this is treated as unreachable.
	  llvm_unreachable("Store conditional unimplemented on this target");
	}

	/// Perform a masked atomicrmw using a target-specific intrinsic. This
	/// represents the core LL/SC loop which will be lowered at a late stage by
	/// the backend.
	virtual Value *emitMaskedAtomicRMWIntrinsic(IRBuilder<> &Builder,
	                                            AtomicRMWInst *AI,
	                                            Value *AlignedAddr, Value *Incr,
	                                            Value *Mask, Value *ShiftAmt,
	                                            AtomicOrdering Ord) const {
	  // Default: unimplemented; reaching this is treated as unreachable.
	  llvm_unreachable("Masked atomicrmw expansion unimplemented on this target");
	}

	/// Perform a masked cmpxchg using a target-specific intrinsic. This
	/// represents the core LL/SC loop which will be lowered at a late stage by
	/// the backend.
	virtual Value *emitMaskedAtomicCmpXchgIntrinsic(
	    IRBuilder<> &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
	    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
	  // Default: unimplemented; reaching this is treated as unreachable.
	  llvm_unreachable("Masked cmpxchg expansion unimplemented on this target");
	}

	/// Inserts in the IR a target-specific intrinsic specifying a fence.
	/// It is called by AtomicExpandPass before expanding an
	/// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad
	/// if shouldInsertFencesForAtomic returns true.
	///
	/// Inst is the original atomic instruction, prior to other expansions that
	/// may be performed.
	///
	/// This function should either return a nullptr, or a pointer to an IR-level
	/// Instruction*. Even complex fence sequences can be represented by a
	/// single Instruction* through an intrinsic to be lowered later.
	/// Backends should override this method to produce target-specific intrinsic
	/// for their fences.
	/// FIXME: Please note that the default implementation here in terms of
	/// IR-level fences exists for historical/compatibility reasons and is
	/// unsound ! Fences cannot, in general, be used to restore sequential
	/// consistency. For example, consider the following example:
	/// atomic<int> x = y = 0;
	/// int r1, r2, r3, r4;
	/// Thread 0:
	/// x.store(1);
	/// Thread 1:
	/// y.store(1);
	/// Thread 2:
	/// r1 = x.load();
	/// r2 = y.load();
	/// Thread 3:
	/// r3 = y.load();
	/// r4 = x.load();
	/// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
	/// seq_cst. But if they are lowered to monotonic accesses, no amount of
	/// IR-level fences can prevent it.
	/// @{
	virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
	                                      AtomicOrdering Ord) const {
	  // A leading fence is emitted only for release-or-stronger orderings on an
	  // instruction that reports an atomic store; otherwise there is no fence.
	  if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
	    return Builder.CreateFence(Ord);
	  return nullptr;
	}

	virtual Instruction *emitTrailingFence(IRBuilder<> &Builder,
	                                       Instruction *Inst,
	                                       AtomicOrdering Ord) const {
	  // Orderings weaker than acquire get no trailing fence.
	  if (!isAcquireOrStronger(Ord))
	    return nullptr;
	  return Builder.CreateFence(Ord);
	}
	/// @}

	// Emits code that executes when the comparison result in the ll/sc
	// expansion of a cmpxchg instruction is such that the store-conditional will
	// not execute. This makes it possible to balance out the load-linked with
	// a dedicated instruction, if desired.
	// E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would
	// be unnecessarily held, except if clrex, inserted by this hook, is executed.
	virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {
	  // Default: emit nothing.
	}

	/// Returns true if the given (atomic) store should be expanded by the
	/// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
	virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  // Default: atomic stores are not expanded; SI is ignored.
	  return false;
	}

	/// Returns true if arguments should be sign-extended in lib calls.
	virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
	  // Default: follow the signedness flag; Type is ignored.
	  const bool SignExtend = IsSigned;
	  return SignExtend;
	}

	/// Returns how the given (atomic) load should be expanded by the
	/// IR-level AtomicExpand pass.
	virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  // Default: no expansion; LI is ignored.
	  return AtomicExpansionKind::None;
	}

	/// Returns how the given atomic cmpxchg should be expanded by the IR-level
	/// AtomicExpand pass.
	virtual AtomicExpansionKind
	shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
	  // Default: no expansion; AI is ignored.
	  return AtomicExpansionKind::None;
	}

	/// Selects the strategy the IR-level AtomicExpand pass uses for the
	/// AtomicRMW \p RMW, if any. By default floating-point operations are
	/// expanded to a cmpxchg loop (AtomicExpansionKind::CmpXChg) and everything
	/// else is left alone (AtomicExpansionKind::None).
	virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
	  if (RMW->isFloatingPointOperation())
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Tries to turn an idempotent AtomicRMW into a fence followed by an atomic
	/// load.
	///
	/// On some platforms an AtomicRMW that never actually modifies the value
	/// (such as fetch_add of 0) can be lowered this way. It may sound useless,
	/// but it lets the processor keep the cacheline shared, which dramatically
	/// improves performance. Such idempotent RMWs are useful for implementing
	/// some kinds of locks; for justification and benchmarks see
	/// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
	///
	/// \returns the atomic load if the transformation succeeds, and nullptr
	/// otherwise. If shouldExpandAtomicLoadInIR requests an expansion for that
	/// load, it will undergo another round of expansion. The default
	/// implementation never performs the transformation.
	virtual LoadInst *
	lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
	  LoadInst *const NoFencedLoad = nullptr;
	  return NoFencedLoad;
	}

	/// Reports the kind of extension the platform's atomic operations perform
	/// (ZERO_EXTEND, SIGN_EXTEND, or ANY_EXTEND). Defaults to ISD::ZERO_EXTEND.
	virtual ISD::NodeType getExtendForAtomicOps() const {
	  const ISD::NodeType Extension = ISD::ZERO_EXTEND;
	  return Extension;
	}

	/// @}

	/// Returns true if we should normalize
	///   select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and
	///   select(N0|N1, X, Y) => select(N0, X, select(N1, X, Y))
	/// because doing so is likely to save us from materializing N0 and N1 in an
	/// integer register. Targets that are able to perform and/or on flags should
	/// return false here.
	virtual bool shouldNormalizeToSelectSequence(LLVMContext &Context,
	                                             EVT VT) const {
	  // A target with multiple condition registers likely has logical
	  // operations on those registers, so the normalization would not help.
	  if (hasMultipleConditionRegisters())
	    return false;

	  // Only do the transform if the value won't be split into multiple
	  // registers.
	  const LegalizeTypeAction TypeAction = getTypeAction(Context, VT);
	  const bool IsSplit = TypeAction == TypeExpandInteger ||
	                       TypeAction == TypeExpandFloat ||
	                       TypeAction == TypeSplitVector;
	  return !IsSplit;
	}

	/// Default implementation answers true for every \p VT.
	/// NOTE(review): presumably gates combines that form min/max-num nodes for
	/// \p VT; confirm against the callers.
	virtual bool isProfitableToCombineMinNumMaxNum(EVT VT) const {
	  return true;
	}

	/// Tells whether a select of constants (select Cond, C1, C2) should be
	/// rewritten as simple math on the condition value, for example:
	///   select Cond, C1, C1-1 --> add (zext Cond), C1-1
	/// The default is not to rewrite.
	virtual bool convertSelectOfConstantsToMath(EVT VT) const { return false; }

	/// Tells whether an integer multiplication by the constant \p C should be
	/// decomposed into simpler operations such as shifts and adds. That can be
	/// profitable when the target has no direct multiply for the given type or
	/// when the simpler sequence is faster than the multiply. The default is
	/// not to decompose.
	virtual bool decomposeMulByConstant(EVT VT, SDValue C) const { return false; }

	/// Tells whether it is more correct/profitable to use strict FP_TO_INT
	/// conversion operations, i.e. to canonicalize the FP source value rather
	/// than convert all cases and select based on the value afterwards.
	/// This may be the case if the target throws exceptions for out-of-bounds
	/// conversions or has a fast FP CMOV. The default answer is false.
	virtual bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
	                                      bool IsSigned) const {
	  const bool UseStrict = false;
	  return UseStrict;
	}

	//===--------------------------------------------------------------------===//
	// TargetLowering Configuration Methods - These methods should be invoked by
	// the derived class constructor to configure this object for the target.
	//
	protected:
	/// Use \p Ty as the way both integer and floating-point boolean results are
	/// extended from i1 to a wider type. See getBooleanContents.
	void setBooleanContents(BooleanContent Ty) {
	  BooleanFloatContents = Ty;
	  BooleanContents = Ty;
	}

	/// Set separately how integer (\p IntTy) and floating-point (\p FloatTy)
	/// boolean results are extended from i1 to a wider type.
	/// See getBooleanContents.
	void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) {
	  BooleanFloatContents = FloatTy;
	  BooleanContents = IntTy;
	}

	/// Set how a vector boolean value is extended from a vector of i1 to a wider
	/// type. See getBooleanContents.
	void setBooleanVectorContents(BooleanContent Ty) { BooleanVectorContents = Ty; }

	/// Record \p Pref as the target's scheduling preference.
	void setSchedulingPreference(Sched::Preference Pref) { SchedPreferenceInfo = Pref; }

	/// Choose between _setjmp (\p Val true) and the variant without the leading
	/// underscore (\p Val false) for implementing llvm.setjmp.
	/// Defaults to false.
	void setUseUnderscoreSetJmp(bool Val) { UseUnderscoreSetJmp = Val; }

	/// Choose between _longjmp (\p Val true) and the variant without the leading
	/// underscore (\p Val false) for implementing llvm.longjmp.
	/// Defaults to false.
	void setUseUnderscoreLongJmp(bool Val) { UseUnderscoreLongJmp = Val; }

	/// Indicate the minimum number of blocks to generate jump tables.
	///
	/// \param Val the minimum count to use. Declared here only; the definition
	/// lives outside this header.
	/// NOTE(review): the name says "entries" while the description says
	/// "blocks"; confirm the unit against the definition.
	void setMinimumJumpTableEntries(unsigned Val);

	/// Indicate the maximum number of entries in jump tables.
	/// Set to zero to generate unlimited jump tables.
	///
	/// Declared here only; the definition lives outside this header.
	void setMaximumJumpTableSize(unsigned);

	/// Name the physical register \p R that llvm.stacksave/llvm.stackrestore
	/// should save and restore.
	void setStackPointerRegisterToSaveRestore(unsigned R) { StackPointerRegisterToSaveRestore = R; }

	/// Declare whether the target has multiple (allocatable) condition registers
	/// that can hold comparison results for use by selects and conditional
	/// branches. When it does, the code generator will not aggressively sink
	/// comparisons into the blocks of their users.
	void setHasMultipleConditionRegisters(bool hasManyRegs = true) { HasMultipleConditionRegisters = hasManyRegs; }

	/// Declare whether the target has BitExtract instructions. When it does, the
	/// code generator aggressively sinks "shift"s into the blocks of their users
	/// if those users generate "and" instructions that can be combined with the
	/// "shift" into a BitExtract instruction.
	void setHasExtractBitsInsn(bool hasExtractInsn = true) { HasExtractBitsInsn = hasExtractInsn; }

	/// Tells the code generator not to expand logic operations on comparison
	/// predicates into separate sequences that increase the amount of flow
	/// control.
	///
	/// \param isExpensive defaults to true. Declared here only; the definition
	/// lives outside this header.
	void setJumpIsExpensive(bool isExpensive = true);

	/// Register a div/rem bypass: records \p FastBitWidth as the bit width to use
	/// in place of \p SlowBitWidth. A later call with the same \p SlowBitWidth
	/// overwrites the earlier entry.
	void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) { BypassSlowDivWidths[SlowBitWidth] = FastBitWidth; }

	/// Make \p RC the register class available for value type \p VT, which
	/// indicates that the selector can handle values of that class natively.
	/// \p VT must index within the RegClassForVT table (asserted).
	void addRegisterClass(MVT VT, const TargetRegisterClass *RC) {
	  assert((unsigned)VT.SimpleTy < array_lengthof(RegClassForVT));
	  const auto Slot = VT.SimpleTy;
	  RegClassForVT[Slot] = RC;
	}

	/// Return the largest legal super-reg register class of the register class
	/// for the specified type and its associated "cost".
	///
	/// \returns a pair of (representative register class, cost). Virtual, so
	/// targets may override it; the default definition lives outside this
	/// header.
	virtual std::pair<const TargetRegisterClass *, uint8_t>
	findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const;

	/// Once all of the register classes are added (see addRegisterClass), this
	/// allows us to compute derived properties we expose.
	///
	/// Declared here only; the definition lives outside this header.
	void computeRegisterProperties(const TargetRegisterInfo *TRI);

	/// Record that operation \p Op does not work with type \p VT and that
	/// \p Action is what to do about it. Note that \p VT may refer either to the
	/// type of a result or to that of an operand of \p Op.
	/// \p Op must fit in the OpActions table (asserted).
	void setOperationAction(unsigned Op, MVT VT,
	                        LegalizeAction Action) {
	  assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
	  const unsigned TypeIdx = (unsigned)VT.SimpleTy;
	  OpActions[TypeIdx][Op] = Action;
	}

	/// Indicate that the specified load with extension does not work with the
	/// specified type and indicate what to do about it.
	///
	/// Each (ValVT, MemVT) table entry packs one 4-bit action per extension
	/// type, at bit offset 4 * \p ExtType.
	void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT,
	                      LegalizeAction Action) {
	  assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() &&
	         MemVT.isValid() && "Table isn't big enough!");
	  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
	  unsigned Shift = 4 * ExtType;
	  // Clear the old 4-bit field for this extension type, then store the new
	  // action in its place.
	  LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] &= ~((uint16_t)0xF << Shift);
	  LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] |= (uint16_t)Action << Shift;
	}

	/// Record that a truncating store of a \p ValVT value to \p MemVT memory
	/// does not work as is and that \p Action is what to do about it.
	/// Both types must be valid (asserted).
	void setTruncStoreAction(MVT ValVT, MVT MemVT,
	                         LegalizeAction Action) {
	  assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
	  const unsigned ValIdx = (unsigned)ValVT.SimpleTy;
	  TruncStoreActions[ValIdx][MemVT.SimpleTy] = Action;
	}

	/// Indicate that the specified indexed load does or does not work with the
	/// specified type and indicate what to do about it.
	///
	/// NOTE: All indexed mode loads are initialized to Expand in
	/// TargetLowering.cpp
	void setIndexedLoadAction(unsigned IdxMode, MVT VT,
	                          LegalizeAction Action) {
	  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
	         (unsigned)Action < 0xf && "Table isn't big enough!");
	  // Load actions are kept in the upper four bits of each entry: clear them,
	  // then store the new action there.
	  IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0;
	  IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action) <<4;
	}

	/// Indicate that the specified indexed store does or does not work with the
	/// specified type and indicate what to do about it.
	///
	/// NOTE: All indexed mode stores are initialized to Expand in
	/// TargetLowering.cpp
	void setIndexedStoreAction(unsigned IdxMode, MVT VT,
	                           LegalizeAction Action) {
	  assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
	         (unsigned)Action < 0xf && "Table isn't big enough!");
	  // Store actions are kept in the lower four bits of each entry: clear them,
	  // then store the new action there.
	  IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f;
	  IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action);
	}

	/// Indicate that the specified condition code is or isn't supported on the
	/// target and indicate what to do about it.
	void setCondCodeAction(ISD::CondCode CC, MVT VT,
	                       LegalizeAction Action) {
	  assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) &&
	         "Table isn't big enough!");
	  assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
	  // The lower 3 bits of the SimpleTy select the Nth 4-bit field within a
	  // 32-bit value, and the remaining upper bits index the second dimension of
	  // the array to select which 32-bit value to use.
	  uint32_t Shift = 4 * (VT.SimpleTy & 0x7);
	  CondCodeActions[CC][VT.SimpleTy >> 3] &= ~((uint32_t)0xF << Shift);
	  CondCodeActions[CC][VT.SimpleTy >> 3] |= (uint32_t)Action << Shift;
	}

	/// Override the type that \p Opc on \p OrigVT is promoted to.
	///
	/// When Opc/OrigVT is marked as promoted, the promotion code by default
	/// tries successively larger integer/fp types until it finds one that
	/// works. Targets for which that default is insufficient call this method
	/// to name \p DestVT explicitly.
	void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
	  const auto Key = std::make_pair(Opc, OrigVT.SimpleTy);
	  PromoteToType[Key] = DestVT.SimpleTy;
	}

	/// Shorthand that marks \p Opc on \p OrigVT as Promote and names \p DestVT
	/// as the promoted-to type, in a single call.
	void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
	  // First the action, then the destination type for that promotion.
	  setOperationAction(Opc, OrigVT, Promote);
	  AddPromotedToType(Opc, OrigVT, DestVT);
	}

	/// Targets should invoke this method for each target independent node that
	/// they want to provide a custom DAG combiner for by implementing the
	/// PerformDAGCombine virtual method.
	void setTargetDAGCombine(ISD::NodeType NT) {
	  assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
	  // One bit per node type: NT >> 3 selects the array element and NT & 7 the
	  // bit within it.
	  TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7);
	}

	/// Record \p Size as the target's required jmp_buf buffer size, in bytes.
	/// The default is 200.
	void setJumpBufSize(unsigned Size) { JumpBufSize = Size; }

	/// Record \p Align as the target's required jmp_buf buffer alignment, in
	/// bytes. The default is 0.
	void setJumpBufAlignment(unsigned Align) { JumpBufAlignment = Align; }

	/// Record \p Align as the target's minimum function alignment, expressed in
	/// log2(bytes).
	void setMinFunctionAlignment(unsigned Align) { MinFunctionAlignment = Align; }

	/// Record \p Align, in log2(bytes), as the target's preferred function
	/// alignment. Set this when a higher-than-minimum alignment brings a
	/// performance benefit.
	void setPrefFunctionAlignment(unsigned Align) { PrefFunctionAlignment = Align; }

	/// Record \p Align, in log2(bytes), as the target's preferred loop
	/// alignment. The default of zero means the target does not care about loop
	/// alignment. Targets may also override getPrefLoopAlignment to provide
	/// per-loop values.
	void setPrefLoopAlignment(unsigned Align) { PrefLoopAlignment = Align; }

	/// Record \p Align, in log2(bytes), as the minimum stack alignment of an
	/// argument.
	void setMinStackArgumentAlignment(unsigned Align) { MinStackArgumentAlignment = Align; }

	/// Record \p SizeInBits as the largest atomic operation size the backend
	/// supports. Atomic operations wider than this, as well as ones that are
	/// not naturally aligned, are expanded by AtomicExpandPass into an
	/// __atomic_* library call.
	void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits) { MaxAtomicSizeInBitsSupported = SizeInBits; }

	/// Record \p SizeInBits as the smallest cmpxchg or ll/sc size the backend
	/// supports.
	void setMinCmpXchgSizeInBits(unsigned SizeInBits) { MinCmpXchgSizeInBits = SizeInBits; }

	/// Record whether the backend supports unaligned atomic operations.
	void setSupportsUnalignedAtomics(bool UnalignedSupported) { SupportsUnalignedAtomics = UnalignedSupported; }

	public:
	//===--------------------------------------------------------------------===//
	// Addressing mode description hooks (used by LSR etc).
	//

	/// CodeGenPrepare sinks address calculations into the same BB as Load/Store
	/// instructions reading the address. This allows as much computation as
	/// possible to be done in the address mode for that operand. This hook lets
	/// targets also pass back when this should be done on intrinsics which
	/// load/store.
	virtual bool getAddrModeArguments(IntrinsicInst * /I/,
	SmallVectorImpl<Value> &/Ops*/,
	Type &/AccessTy*/) const {
	return false;
	}

	/// Describes an addressing mode of the form
	///    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
	/// A default-constructed AddrMode has none of the components present.
	struct AddrMode {
	  /// Global value component; null means there is no BaseGV.
	  GlobalValue *BaseGV = nullptr;
	  /// Constant offset component; zero means there is no base offset.
	  int64_t      BaseOffs = 0;
	  /// False means there is no base register.
	  bool         HasBaseReg = false;
	  /// Multiplier of ScaleReg; zero means there is no ScaleReg, and 1
	  /// indicates a register with no scale.
	  int64_t      Scale = 0;

	  AddrMode() = default;
	};

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	///
	/// The type may be VoidTy, in which case only return true if the addressing
	/// mode is legal for a load/store of any legal type. TODO: Handle
	/// pre/postinc as well.
	///
	/// If the address space cannot be determined, it will be -1.
	///
	/// \param I optional instruction, nullptr by default.
	/// Declared here only; the default definition lives outside this header.
	///
	/// TODO: Remove default argument
	virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
	Type *Ty, unsigned AddrSpace,
	Instruction *I = nullptr) const;

	/// Computes the cost of the scaling factor used in the addressing mode
	/// \p AM for this target, for a load/store of type \p Ty.
	///
	/// \returns a value >= 0 if the AM is supported and a negative value if it
	/// is not. The default treats any scaling factor used in a legal AM as free.
	/// TODO: Handle pre/postinc as well.
	/// TODO: Remove default argument
	virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
	                                 Type *Ty, unsigned AS = 0) const {
	  const bool IsLegal = isLegalAddressingMode(DL, AM, Ty, AS);
	  return IsLegal ? 0 : -1;
	}

	/// Tells whether the given immediate is a legal icmp immediate, i.e. whether
	/// the target has icmp instructions that compare a register against the
	/// immediate without first materializing it into a register.
	/// The default accepts every immediate.
	virtual bool isLegalICmpImmediate(int64_t) const { return true; }

	/// Tells whether the given immediate is a legal add immediate, i.e. whether
	/// the target has add instructions that add a register with the immediate
	/// without first materializing it into a register.
	/// The default accepts every immediate.
	virtual bool isLegalAddImmediate(int64_t) const { return true; }

	/// Tells whether \p Value is a legal immediate for the value input of a
	/// store instruction.
	virtual bool isLegalStoreImmediate(int64_t Value) const {
	  // By default assume that at least 0 works, since a zero register likely
	  // exists or a zero immediate is likely allowed.
	  const bool IsZero = Value == 0;
	  return IsZero;
	}

	/// Tells whether shifting a vector by a uniform scalar is significantly
	/// cheaper than shifting it by an amount that varies across lanes. On x86,
	/// for example, there is a "psllw" instruction for the former case, but no
	/// simple instruction for a general "a << b" operation on vectors.
	/// The default answer is false.
	virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; }

	/// Tells whether \p Opcode denotes a commutative binary operation.
	virtual bool isCommutativeBinOp(unsigned Opcode) const {
	  // FIXME: This should get its info from the td file.
	  switch (Opcode) {
	  default:
	    return false;

	  // Integer add/multiply family.
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::MULHU:
	  case ISD::MULHS:
	  case ISD::SMUL_LOHI:
	  case ISD::UMUL_LOHI:
	  // Adds with overflow, carry or saturation.
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::ADDC:
	  case ISD::ADDE:
	  case ISD::SADDSAT:
	  case ISD::UADDSAT:
	  // Integer min/max.
	  case ISD::SMIN:
	  case ISD::SMAX:
	  case ISD::UMIN:
	  case ISD::UMAX:
	  // Bitwise logic.
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  // Floating point.
	  case ISD::FADD:
	  case ISD::FMUL:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FMINNUM_IEEE:
	  case ISD::FMAXNUM_IEEE:
	  case ISD::FMINIMUM:
	  case ISD::FMAXIMUM:
	    return true;
	  }
	}

	/// Tells whether \p Opcode denotes a math/logic binary operator, commutative
	/// or not.
	virtual bool isBinOp(unsigned Opcode) const {
	  // Anything outside the non-commutative list below is a binop exactly when
	  // it is a commutative binop.
	  switch (Opcode) {
	  case ISD::SUB:
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SRA:
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::SREM:
	  case ISD::UREM:
	  case ISD::FSUB:
	  case ISD::FDIV:
	  case ISD::FREM:
	    // Even for these the commutativity hook is consulted, exactly as the
	    // check-first formulation does; either way the answer is true.
	    isCommutativeBinOp(Opcode);
	    return true;
	  default:
	    return isCommutativeBinOp(Opcode);
	  }
	}

	/// Return true if it's free to truncate a value of type FromTy to type
	/// ToTy. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
	/// by referencing its sub-register AX.
	/// Targets must return false when FromTy <= ToTy.
	virtual bool isTruncateFree(Type FromTy, Type ToTy) const {
	return false;
	}

	/// Return true if a truncation from FromTy to ToTy is permitted when deciding
	/// whether a call is in tail position. Typically this means that both results
	/// would be assigned to the same register or stack slot, but it could mean
	/// the target performs adequate checks of its own before proceeding with the
	/// tail call. Targets must return false when FromTy <= ToTy.
	virtual bool allowTruncateForTailCall(Type FromTy, Type ToTy) const {
	return false;
	}

	/// EVT flavour of the truncate-is-free query: tells whether truncating a
	/// \p FromVT value to \p ToVT is free. The default answer is false.
	virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; }

	/// Default implementation answers true for every instruction \p I.
	/// NOTE(review): presumably consulted before hoisting \p I; confirm against
	/// the callers.
	virtual bool isProfitableToHoist(Instruction *I) const {
	  return true;
	}

	/// Return true if the extension represented by \p I is free.
	/// Unlike the is[Z|FP]ExtFree family, which is based on types only, this
	/// method can use the context provided by \p I to decide whether or not
	/// \p I is free.
	/// It extends the behavior of the is[Z|FP]ExtFree family: whenever
	/// is[Z|FP]ExtFree returns true, this method returns true as well. The
	/// converse is not true.
	/// The target can perform the adequate checks by overriding isExtFreeImpl.
	/// \pre \p I must be a sign, zero, or fp extension.
	bool isExtFree(const Instruction *I) const {
	  const unsigned Opc = I->getOpcode();
	  if (Opc == Instruction::FPExt) {
	    // Note the argument order: destination type first, then source type.
	    if (isFPExtFree(EVT::getEVT(I->getType()),
	                    EVT::getEVT(I->getOperand(0)->getType())))
	      return true;
	  } else if (Opc == Instruction::ZExt) {
	    if (isZExtFree(I->getOperand(0)->getType(), I->getType()))
	      return true;
	  } else if (Opc != Instruction::SExt) {
	    // Sign extensions have no type-based query and go straight to the
	    // target hook; any other opcode violates the precondition.
	    llvm_unreachable("Instruction is not an extension");
	  }
	  return isExtFreeImpl(I);
	}

	/// Return true if \p Load and \p Ext can form an ExtLoad.
	/// For example, in AArch64
	/// %L = load i8, i8* %ptr
	/// %E = zext i8 %L to i32
	/// can be lowered into one load instruction
	/// ldrb w0, [x0]
	bool isExtLoad(const LoadInst Load, const Instruction Ext,
	const DataLayout &DL) const {
	EVT VT = getValueType(DL, Ext->getType());
	EVT LoadVT = getValueType(DL, Load->getType());

	// If the load has other users and the truncate is not free, the ext
	// probably isn't free.
	if (!Load->hasOneUse() && (isTypeLegal(LoadVT) \|\| !isTypeLegal(VT)) &&
	!isTruncateFree(Ext->getType(), Load->getType()))
	return false;

	// Check whether the target supports casts folded into loads.
	unsigned LType;
	if (isa<ZExtInst>(Ext))
	LType = ISD::ZEXTLOAD;
	else {
	assert(isa<SExtInst>(Ext) && "Unexpected ext type!");
	LType = ISD::SEXTLOAD;
	}

	return isLoadExtLegal(LType, VT, LoadVT);
	}

	/// Return true if any actual instruction that defines a value of type FromTy
	/// implicitly zero-extends the value to ToTy in the result register.
	///
	/// The function should return true when it is likely that the truncate can
	/// be freely folded with an instruction defining a value of FromTy. If
	/// the defining instruction is unknown (because you're looking at a
	/// function argument, PHI, etc.) then the target may require an
	/// explicit truncate, which is not necessarily free, but this function
	/// does not deal with those cases.
	/// Targets must return false when FromTy >= ToTy.
	virtual bool isZExtFree(Type FromTy, Type ToTy) const {
	return false;
	}

	/// EVT flavour of the zext-is-free query: tells whether zero-extending a
	/// \p FromTy value to \p ToTy is free. The default answer is false.
	virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; }

	/// Tells whether sign-extending from \p FromTy to \p ToTy is cheaper than
	/// zero-extending. The default answer is false.
	virtual bool isSExtCheaperThanZExt(EVT FromTy, EVT ToTy) const { return false; }

	/// Tells whether it is profitable to sink the operands of \p I into the
	/// same basic block as \p I, e.g. because the operands can be folded into a
	/// target instruction during instruction selection. After the call, \p Ops
	/// contains the Uses to sink ordered by dominance (dominating users come
	/// first). The default returns false and leaves \p Ops untouched.
	virtual bool shouldSinkOperands(Instruction *I,
	                                SmallVectorImpl<Use *> &Ops) const {
	  const bool Profitable = false;
	  return Profitable;
	}

	/// Return true if the target supplies and combines to a paired load
	/// two loaded values of type LoadedType next to each other in memory.
	/// RequiredAlignment gives the minimal alignment constraints that must be met
	/// to be able to select this paired load.
	///
	/// This information is not used to generate actual paired loads, but it is
	/// used to generate a sequence of loads that is easier to combine into a
	/// paired load.
	/// For instance, something like this:
	///   a = load i64* addr
	///   b = trunc i64 a to i32
	///   c = lshr i64 a, 32
	///   d = trunc i64 c to i32
	/// will be optimized into:
	///   b = load i32* addr1
	///   d = load i32* addr2
	/// Where addr1 = addr2 +/- sizeof(i32).
	///
	/// In other words, unless the target performs a post-isel load combining,
	/// this information should not be provided because it will generate more
	/// loads.
	///
	/// The default implementation ignores both (unnamed) parameters and returns
	/// false.
	virtual bool hasPairedLoad(EVT /*LoadedType*/,
	                           unsigned & /*RequiredAlignment*/) const {
	  return false;
	}

	/// Tells whether the target has a vector blend instruction.
	/// The default answer is false.
	virtual bool hasVectorBlend() const {
	  return false;
	}

	/// Reports the largest interleave factor supported for interleaved memory
	/// accesses. The default is the minimum interleave factor, 2.
	virtual unsigned getMaxSupportedInterleaveFactor() const {
	  return 2;
	}

	/// Lowers an interleaved load to target specific intrinsics.
	///
	/// \param LI the vector load instruction.
	/// \param Shuffles the shufflevector list used to DE-interleave the loaded
	/// vector.
	/// \param Indices the corresponding indices for each shufflevector.
	/// \param Factor the interleave factor.
	/// \returns true on success. The default implementation does no lowering
	/// and returns false.
	virtual bool lowerInterleavedLoad(LoadInst *LI,
	                                  ArrayRef<ShuffleVectorInst *> Shuffles,
	                                  ArrayRef<unsigned> Indices,
	                                  unsigned Factor) const {
	  const bool Lowered = false;
	  return Lowered;
	}

	/// Lower an interleaved store to target specific intrinsics. Return
	/// true on success.
	///
	/// \p SI is the vector store instruction.
	/// \p SVI is the shufflevector to RE-interleave the stored vector.
	/// \p Factor is the interleave factor.
	virtual bool lowerInterleavedStore(StoreInst SI, ShuffleVectorInst SVI,
	unsigned Factor) const {
	return false;
	}

	/// Tells whether zero-extending the specific node \p Val to type \p VT2 is
	/// free, either because it is implicitly zero-extended (such as ARM ldrb /
	/// ldrh) or because it is folded (such as X86 zero-extending loads).
	/// The default defers to the EVT-based isZExtFree on the node's value type.
	virtual bool isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT FromVT = Val.getValueType();
	  return isZExtFree(FromVT, VT2);
	}

	/// Tells whether an fpext from \p SrcVT to \p DestVT is free (for instance,
	/// because single-precision floating-point numbers are implicitly extended
	/// to double-precision). Both types must be floating point (asserted).
	/// The default answer is false.
	virtual bool isFPExtFree(EVT DestVT, EVT SrcVT) const {
	  assert(SrcVT.isFloatingPoint() && DestVT.isFloatingPoint() &&
	         "invalid fpext types");
	  const bool IsFree = false;
	  return IsFree;
	}

	/// Tells whether an fpext feeding an \p Opcode operation is free (for
	/// instance, because half-precision floating-point numbers are implicitly
	/// extended to float-precision) for an FMA instruction. Both types must be
	/// floating point (asserted). The default ignores \p Opcode and defers to
	/// isFPExtFree.
	virtual bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const {
	  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
	         "invalid fpext types");
	  const bool Foldable = isFPExtFree(DestVT, SrcVT);
	  return Foldable;
	}

	/// Tells whether it is profitable to fold a vector load into \p ExtVal (a
	/// sign, zero, or any extend node). The default answer is false.
	virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const {
	  return false;
	}

	/// Tells whether an fneg on \p VT is free to the point where replacing it
	/// with a bitwise operation is never worthwhile. \p VT must be a
	/// floating-point type (asserted). The default answer is false.
	virtual bool isFNegFree(EVT VT) const {
	  assert(VT.isFloatingPoint());
	  const bool IsFree = false;
	  return IsFree;
	}

	/// Tells whether an fabs on \p VT is free to the point where replacing it
	/// with a bitwise operation is never worthwhile. \p VT must be a
	/// floating-point type (asserted). The default answer is false.
	virtual bool isFAbsFree(EVT VT) const {
	  assert(VT.isFloatingPoint());
	  const bool IsFree = false;
	  return IsFree;
	}

	/// Tells whether an FMA operation is faster than a pair of fmul and fadd
	/// instructions. fmuladd intrinsics are expanded to FMAs when this returns
	/// true, and to fmul + fadd otherwise.
	///
	/// NOTE: This may be called before legalization on types for which FMAs are
	/// not legal; it should still return true if those types will eventually
	/// legalize to types that support FMAs. After legalization, it is only
	/// called on types that support FMAs (via Legal or Custom actions).
	/// The default answer is false.
	virtual bool isFMAFasterThanFMulAndFAdd(EVT) const { return false; }

	/// Return true if it's profitable to narrow operations of type VT1 to
	/// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
	/// i32 to i16.
	///
	/// The default implementation ignores both (unnamed) parameters and returns
	/// false.
	virtual bool isNarrowingProfitable(EVT /*VT1*/, EVT /*VT2*/) const {
	  return false;
	}

	/// Tells whether it is beneficial to replace a load of a constant by the
	/// constant itself. On some targets a combination of arithmetic
	/// instructions that materializes the constant may be more efficient than
	/// loading it from a constant pool. The default answer is false.
	virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                               Type *Ty) const {
	  const bool Convert = false;
	  return Convert;
	}

	/// Tells whether EXTRACT_SUBVECTOR is cheap when extracting result type
	/// \p ResVT from source type \p SrcVT at \p Index. The hook exists because
	/// EXTRACT_SUBVECTOR usually has custom lowering that depends on the index
	/// of the first element, and only the target knows which lowering is cheap.
	/// The default answer is false.
	virtual bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                     unsigned Index) const {
	  const bool IsCheap = false;
	  return IsCheap;
	}

	/// Tells whether an extract element of the vector binary operation
	/// \p VecOp should be converted into an extract element followed by a
	/// scalar operation. The default answer is false.
	virtual bool shouldScalarizeBinop(SDValue VecOp) const { return false; }

	/// Tells whether extracting a scalar element from vector type \p VT at
	/// \p Index is cheap. For example, if scalar operations occur on the same
	/// register file as vector operations, an extract element may be a
	/// sub-register rename rather than an actual instruction.
	/// The default answer is false.
	virtual bool isExtractVecEltCheap(EVT VT, unsigned Index) const { return false; }

	/// Try to convert math with an overflow comparison into the corresponding DAG
	/// node operation. Targets may want to override this independently of whether
	/// the operation is legal/custom for the given type because it may obscure
	/// matching of other patterns.
	///
	/// By default only ISD::UADDO on non-vector types is accepted, and then
	/// only if the type is simple or the operation is not marked Expand.
	virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
	  // TODO: The default logic is inherited from code in CodeGenPrepare.
	  // The opcode should not make a difference by default?
	  if (Opcode != ISD::UADDO)
	    return false;

	  // Allow the transform as long as we have an integer type that is not
	  // obviously illegal and unsupported.
	  if (VT.isVector())
	    return false;
	  return VT.isSimple() || !isOperationExpand(Opcode, VT);
	}

	/// Tells whether it is profitable to use a scalar input to a BUILD_VECTOR
	/// even if the vector itself has multiple uses. The default answer is
	/// false.
	virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { return false; }

	/// Tells whether CodeGenPrepare should consider splitting a large GEP
	/// offset so that the GEP fits into the addressing mode and can be sunk
	/// into the same blocks as its users. The default answer is false.
	virtual bool shouldConsiderGEPOffsetSplit() const {
	  return false;
	}

	//===--------------------------------------------------------------------===//
	// Runtime Library hooks
	//

	/// Replace the routine name recorded for libcall \p Call with \p Name.
	/// Only the pointer is stored; the string is not copied.
	void setLibcallName(RTLIB::Libcall Call, const char *Name) { LibcallRoutineNames[Call] = Name; }

	/// Look up the routine name recorded for libcall \p Call.
	const char *getLibcallName(RTLIB::Libcall Call) const {
	  const char *const RoutineName = LibcallRoutineNames[Call];
	  return RoutineName;
	}

	/// Replace the CondCode used to test the result of comparison libcall
	/// \p Call against zero with \p CC.
	void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { CmpLibcallCCs[Call] = CC; }

	/// Look up the CondCode used to test the result of comparison libcall
	/// \p Call against zero.
	ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
	  const ISD::CondCode CC = CmpLibcallCCs[Call];
	  return CC;
	}

	/// Record \p CC as the calling convention to use for libcall \p Call.
	void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { LibcallCallingConvs[Call] = CC; }

	/// Look up the calling convention to use for libcall \p Call.
	CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
	  const CallingConv::ID CC = LibcallCallingConvs[Call];
	  return CC;
	}

	/// Execute target specific actions to finalize target lowering.
	/// This is used to set extra flags in MachineFrameInformation and freezing
	/// the set of reserved registers.
	/// The default implementation just freezes the set of reserved registers.
	///
	/// Virtual, so targets may override it; the default definition lives
	/// outside this header.
	virtual void finalizeLowering(MachineFunction &MF) const;

	private:
	/// Reference to the associated TargetMachine.
	/// NOTE(review): presumably bound in the constructor, which is outside this
	/// view; confirm.
	const TargetMachine &TM;

	/// Tells the code generator that the target has multiple (allocatable)
	/// condition registers that can be used to store the results of comparisons
	/// for use by selects and conditional branches. With multiple condition
	/// registers, the code generator will not aggressively sink comparisons into
	/// the blocks of their users. Written by setHasMultipleConditionRegisters.
	bool HasMultipleConditionRegisters;

	/// Tells the code generator that the target has BitExtract instructions.
	/// The code generator will aggressively sink "shift"s into the blocks of
	/// their users if the users will generate "and" instructions which can be
	/// combined with "shift" to BitExtract instructions. Written by
	/// setHasExtractBitsInsn.
	bool HasExtractBitsInsn;

	/// Tells the code generator to bypass slow divide or remainder
	/// instructions. The map goes from the slow bit width to the fast bit width
	/// (see addBypassSlowDiv). For example, an entry mapping 32 to 8 tells the
	/// code generator to bypass 32-bit integer div/rem with an 8-bit unsigned
	/// integer div/rem when the operands are positive and less than 256.
	DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;

	/// Tells the code generator that it shouldn't generate extra flow control
	/// instructions and should attempt to combine flow control instructions via
	/// predication.
	bool JumpIsExpensive;

	/// This target prefers to use _setjmp to implement llvm.setjmp.
	///
	/// Defaults to false. Written by setUseUnderscoreSetJmp.
	bool UseUnderscoreSetJmp;

	/// This target prefers to use _longjmp to implement llvm.longjmp.
	///
	/// Defaults to false. Written by setUseUnderscoreLongJmp.
	bool UseUnderscoreLongJmp;

	/// Information about the contents of the high-bits in integer boolean
	/// values held in a type wider than i1. See getBooleanContents.
	BooleanContent BooleanContents;

	/// Information about the contents of the high-bits in floating-point
	/// boolean values held in a type wider than i1. See getBooleanContents.
	BooleanContent BooleanFloatContents;

	/// Information about the contents of the high-bits in boolean vector values
	/// when the element type is wider than i1. See getBooleanContents.
	BooleanContent BooleanVectorContents;

	/// The target scheduling preference: shortest possible total cycles or lowest
	/// register usage. Written by setSchedulingPreference.
	Sched::Preference SchedPreferenceInfo;

	/// The size, in bytes, of the target's jmp_buf buffers
	unsigned JumpBufSize;

	/// The alignment, in bytes, of the target's jmp_buf buffers
	unsigned JumpBufAlignment;

	/// The minimum alignment that any argument on the stack needs to have, in
	/// log2(bytes) as documented on setMinStackArgumentAlignment.
	unsigned MinStackArgumentAlignment;

	/// The minimum function alignment (used when optimizing for size, and to
	/// prevent explicitly provided alignment from leading to incorrect code).
	unsigned MinFunctionAlignment;

	/// The preferred function alignment (used when alignment unspecified and
	/// optimizing for speed).
	unsigned PrefFunctionAlignment;

	/// The preferred loop alignment.
	unsigned PrefLoopAlignment;

	/// Size in bits of the maximum atomics size the backend supports.
	/// Accesses larger than this will be expanded by AtomicExpandPass.
	unsigned MaxAtomicSizeInBitsSupported;

	/// Size in bits of the minimum cmpxchg or ll/sc operation the
	/// backend supports.
	unsigned MinCmpXchgSizeInBits;

	/// This indicates if the target supports unaligned atomic operations.
	bool SupportsUnalignedAtomics;

	/// If set to a physical register, this specifies the register that
	/// llvm.savestack/llvm.restorestack should save and restore.
	unsigned StackPointerRegisterToSaveRestore;

	/// This indicates the default register class to use for each ValueType the
	/// target supports natively.
	const TargetRegisterClass *RegClassForVT[MVT::LAST_VALUETYPE];
	unsigned char NumRegistersForVT[MVT::LAST_VALUETYPE];
	MVT RegisterTypeForVT[MVT::LAST_VALUETYPE];

	/// This indicates the "representative" register class to use for each
	/// ValueType the target supports natively. This information is used by the
	/// scheduler to track register pressure. By default, the representative
	/// register class is the largest legal super-reg register class of the
	/// register class of the specified type. e.g. On x86, i8, i16, and i32's
	/// representative class would be GR32.
	const TargetRegisterClass *RepRegClassForVT[MVT::LAST_VALUETYPE];

	/// This indicates the "cost" of the "representative" register class for each
	/// ValueType. The cost is used by the scheduler to approximate register
	/// pressure.
	uint8_t RepRegClassCostForVT[MVT::LAST_VALUETYPE];

	/// For any value types we are promoting or expanding, this contains the value
	/// type that we are changing to. For Expanded types, this contains one step
	/// of the expand (e.g. i64 -> i32), even if there are multiple steps required
	/// (e.g. i64 -> i16). For types natively supported by the system, this holds
	/// the same type (e.g. i32 -> i32).
	MVT TransformToType[MVT::LAST_VALUETYPE];

	/// For each operation and each value type, keep a LegalizeAction that
	/// indicates how instruction selection should deal with the operation. Most
	/// operations are Legal (aka, supported natively by the target), but
	/// operations that are not should be described. Note that operations on
	/// non-legal value types are not described here.
	LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END];

	/// For each load extension type and each value type, keep a LegalizeAction
	/// that indicates how instruction selection should deal with a load of a
	/// specific value type and extension type. Uses 4-bits to store the action
	/// for each of the 4 load ext types.
	uint16_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];

	/// For each value type pair keep a LegalizeAction that indicates whether a
	/// truncating store of a specific value type and truncating type is legal.
	LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];

	/// For each indexed mode and each value type, keep a pair of LegalizeAction
	/// that indicates how instruction selection should deal with the load /
	/// store.
	///
	/// The first dimension is the value_type for the reference. The second
	/// dimension represents the various modes for load store.
	uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];

	/// For each condition code (ISD::CondCode) keep a LegalizeAction that
	/// indicates how instruction selection should deal with the condition code.
	///
	/// Because each CC action takes up 4 bits, we need to have the array size be
	/// large enough to fit all of the value types. This can be done by rounding
	/// up the MVT::LAST_VALUETYPE value to the next multiple of 8.
	uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8];

	protected:
	ValueTypeActionImpl ValueTypeActions;

	private:
	LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;

	/// Targets can specify ISD nodes that they would like PerformDAGCombine
	/// callbacks for by calling setTargetDAGCombine(), which sets a bit in this
	/// array.
	unsigned char
	TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];

	/// For operations that must be promoted to a specific type, this holds the
	/// destination type. This map should be sparse, so don't hold it as an
	/// array.
	///
	/// Targets add entries to this map with AddPromotedToType(..), clients access
	/// this with getTypeToPromoteTo(..).
	std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
	PromoteToType;

	/// Stores the name of each libcall.
	const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];

	/// The ISD::CondCode that should be used to test the result of each
	/// comparison libcall against zero.
	ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];

	/// Stores the CallingConv that should be used for each libcall.
	CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];

	/// Set default libcall names and calling conventions.
	void InitLibcalls(const Triple &TT);

	protected:
	/// Return true if the extension represented by \p I is free.
	/// \pre \p I is a sign, zero, or fp extension and
	/// is[Z|FP]ExtFree of the related types is not true.
	virtual bool isExtFreeImpl(const Instruction *I) const {
	  // Base implementation: the instruction is not inspected and no extension
	  // is reported as free.
	  return false;
	}

	/// Depth that GatherAllAliases should continue looking for chain
	/// dependencies when trying to find a more preferable chain. As an
	/// approximation, this should be more than the number of consecutive stores
	/// expected to be merged.
	unsigned GatherAllAliasesMaxDepth;

	/// Specify maximum number of store instructions per memset call.
	///
	/// When lowering \@llvm.memset this field specifies the maximum number of
	/// store operations that may be substituted for the call to memset. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memset will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, storing 9 bytes on a 32-bit machine
	/// with 16-bit alignment would result in four 2-byte stores and one 1-byte
	/// store. This only applies to setting a constant array of a constant size.
	unsigned MaxStoresPerMemset;

	/// Maximum number of stores operations that may be substituted for the call
	/// to memset, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemsetOptSize;

	/// Specify maximum number of store instructions per memcpy call.
	///
	/// When lowering \@llvm.memcpy this field specifies the maximum number of
	/// store operations that may be substituted for a call to memcpy. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memcpy will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, storing 7 bytes on a 32-bit machine
	/// with 32-bit alignment would result in one 4-byte store, a one 2-byte store
	/// and one 1-byte store. This only applies to copying a constant array of
	/// constant size.
	unsigned MaxStoresPerMemcpy;


	/// \brief Specify max number of store instructions to glue in inlined memcpy.
	///
	/// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
	/// of store instructions to keep together. This helps in pairing and
	/// vectorization later on.
	unsigned MaxGluedStoresPerMemcpy = 0;

	/// Maximum number of store operations that may be substituted for a call to
	/// memcpy, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemcpyOptSize;
	// NOTE(review): the two fields below are undocumented in this header. By
	// analogy with the MaxStoresPerMem* fields they presumably cap the number of
	// loads used when a memcmp call is expanded inline, with the OptSize variant
	// applying to functions with the OptSize attribute -- confirm against the
	// users of these fields.
	unsigned MaxLoadsPerMemcmp;
	unsigned MaxLoadsPerMemcmpOptSize;

	/// Specify maximum number of store instructions per memmove call.
	///
	/// When lowering \@llvm.memmove this field specifies the maximum number of
	/// store instructions that may be substituted for a call to memmove. Targets
	/// must set this value based on the cost threshold for that target. Targets
	/// should assume that the memmove will be done using as many of the largest
	/// store operations first, followed by smaller ones, if necessary, per
	/// alignment restrictions. For example, moving 9 bytes on a 32-bit machine
	/// with 8-bit alignment would result in nine 1-byte stores. This only
	/// applies to copying a constant array of constant size.
	unsigned MaxStoresPerMemmove;

	/// Maximum number of store instructions that may be substituted for a call to
	/// memmove, used for functions with OptSize attribute.
	unsigned MaxStoresPerMemmoveOptSize;

	/// Tells the code generator that select is more expensive than a branch if
	/// the branch is usually predicted right.
	bool PredictableSelectIsExpensive;

	/// \see enableExtLdPromotion.
	bool EnableExtLdPromotion;

	/// Return true if the value types that can be represented by the specified
	/// register class are all legal.
	bool isLegalRC(const TargetRegisterInfo &TRI,
	const TargetRegisterClass &RC) const;

	/// Replace/modify any TargetFrameIndex operands with a target-dependent
	/// sequence of memory operands that is recognized by PrologEpilogInserter.
	MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	/// Replace/modify the XRay custom event operands with target-dependent
	/// details.
	MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI,
	MachineBasicBlock *MBB) const;

	/// Replace/modify the XRay typed event operands with target-dependent
	/// details.
	MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI,
	MachineBasicBlock *MBB) const;
	};

	/// This class defines information used to lower LLVM code to legal SelectionDAG
	/// operators that the target instruction selector can accept natively.
	///
	/// This class also defines callbacks that targets must implement to lower
	/// target-specific constructs to SelectionDAG operators.
	class TargetLowering : public TargetLoweringBase {
	public:
	struct DAGCombinerInfo;

	TargetLowering(const TargetLowering &) = delete;
	TargetLowering &operator=(const TargetLowering &) = delete;

	/// NOTE: The TargetMachine owns TLOF.
	explicit TargetLowering(const TargetMachine &TM);

	bool isPositionIndependent() const;

	virtual bool
	isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI,
	                           LegacyDivergenceAnalysis *DA) const {
	  // Base implementation: none of the arguments is consulted and no node is
	  // reported as a source of divergence.
	  return false;
	}

	virtual bool isSDNodeAlwaysUniform(const SDNode *N) const {
	  // Base implementation: the node is not inspected and is never reported as
	  // always uniform.
	  return false;
	}

	/// Returns true by value, base pointer and offset pointer and addressing mode
	/// by reference if the node's address can be legally represented as
	/// pre-indexed load / store address.
	virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue & /*Base*/,
	                                       SDValue & /*Offset*/,
	                                       ISD::MemIndexedMode & /*AM*/,
	                                       SelectionDAG & /*DAG*/) const {
	  // Base implementation: the parameters are deliberately unnamed (their names
	  // survive only as comments), the out-parameters are left untouched and no
	  // pre-indexed form is reported.
	  return false;
	}

	/// Returns true by value, base pointer and offset pointer and addressing mode
	/// by reference if this node can be combined with a load / store to form a
	/// post-indexed load / store.
	virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
	                                        SDValue & /*Base*/,
	                                        SDValue & /*Offset*/,
	                                        ISD::MemIndexedMode & /*AM*/,
	                                        SelectionDAG & /*DAG*/) const {
	  // Base implementation: the parameters are deliberately unnamed (their names
	  // survive only as comments), the out-parameters are left untouched and no
	  // post-indexed form is reported.
	  return false;
	}

	/// Return the entry encoding for a jump table in the current function. The
	/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
	virtual unsigned getJumpTableEncoding() const;

	virtual const MCExpr *
	LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
	                          const MachineBasicBlock * /*MBB*/,
	                          unsigned /*uid*/, MCContext & /*Ctx*/) const {
	  // The base implementation is marked unreachable: it has no result to
	  // return and is not meant to be executed.
	  llvm_unreachable("Need to implement this hook if target has custom JTIs");
	}

	/// Returns relocation base for the given PIC jumptable.
	virtual SDValue getPICJumpTableRelocBase(SDValue Table,
	SelectionDAG &DAG) const;

	/// This returns the relocation base for the given PIC jumptable, the same as
	/// getPICJumpTableRelocBase, but as an MCExpr.
	virtual const MCExpr *
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	unsigned JTI, MCContext &Ctx) const;

	/// Return true if folding a constant offset with the given GlobalAddress is
	/// legal. It is frequently not legal in PIC relocation models.
	virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;

	bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
	SDValue &Chain) const;

	void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS,
	SDValue &NewRHS, ISD::CondCode &CCCode,
	const SDLoc &DL) const;

	/// Returns a pair of (return value, chain).
	/// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC.
	std::pair<SDValue, SDValue> makeLibCall(
	SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef<SDValue> Ops,
	bool isSigned, const SDLoc &dl, bool doesNotReturn = false,
	bool isReturnValueUsed = true, bool isPostTypeLegalization = false) const;

	/// Check whether parameters to a call that are passed in callee saved
	/// registers are the same as from the calling function. This needs to be
	/// checked for tail call eligibility.
	bool parametersInCSRMatch(const MachineRegisterInfo &MRI,
	const uint32_t *CallerPreservedMask,
	const SmallVectorImpl<CCValAssign> &ArgLocs,
	const SmallVectorImpl<SDValue> &OutVals) const;

	//===--------------------------------------------------------------------===//
	// TargetLowering Optimization Methods
	//

	/// A convenience struct that encapsulates a DAG, and two SDValues for
	/// returning information from TargetLowering to its clients that want to
	/// combine.
	struct TargetLoweringOpt {
	  SelectionDAG &DAG;
	  bool LegalTys;
	  bool LegalOps;
	  // Old and New are default-constructed by the constructor; they are only
	  // assigned by CombineTo().
	  SDValue Old;
	  SDValue New;

	  explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO)
	      : DAG(InDAG), LegalTys(LT), LegalOps(LO) {}

	  // Read-only views of the two flags supplied at construction.
	  bool LegalTypes() const {
	    return LegalTys;
	  }
	  bool LegalOperations() const {
	    return LegalOps;
	  }

	  // Records the (O, N) pair in Old/New and unconditionally returns true.
	  bool CombineTo(SDValue O, SDValue N) {
	    Old = O;
	    New = N;
	    return true;
	  }
	};

	/// Determines the optimal series of memory ops to replace the memset / memcpy.
	/// Return true if the number of memory ops is below the threshold (Limit).
	/// It returns the types of the sequence of memory ops to perform
	/// memset / memcpy by reference.
	bool findOptimalMemOpLowering(std::vector<EVT> &MemOps,
	unsigned Limit, uint64_t Size,
	unsigned DstAlign, unsigned SrcAlign,
	bool IsMemset,
	bool ZeroMemset,
	bool MemcpyStrSrc,
	bool AllowOverlap,
	unsigned DstAS, unsigned SrcAS,
	const AttributeList &FuncAttributes) const;

	/// Check to see if the specified operand of the specified instruction is a
	/// constant integer. If so, check to see if there are any bits set in the
	/// constant that are not demanded. If so, shrink the constant and return
	/// true.
	bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
	TargetLoweringOpt &TLO) const;

	// Target hook to do target-specific const optimization, which is called by
	// ShrinkDemandedConstant. This function should return true if the target
	// doesn't want ShrinkDemandedConstant to further optimize the constant.
	virtual bool targetShrinkDemandedConstant(SDValue Op,
	                                          const APInt &Demanded,
	                                          TargetLoweringOpt &TLO) const {
	  // Base implementation: nothing is inspected or modified and false is
	  // returned.
	  return false;
	}

	/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This
	/// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
	/// generalized for targets with other types of implicit widening casts.
	bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
	TargetLoweringOpt &TLO) const;

	/// Look at Op. At this point, we know that only the DemandedBits bits of the
	/// result of Op are ever used downstream. If we can use this information to
	/// simplify Op, create a new simplified DAG node and return true, returning
	/// the original and new nodes in Old and New. Otherwise, analyze the
	/// expression and return a mask of KnownOne and KnownZero bits for the
	/// expression (used to simplify the caller). The KnownZero/One bits may only
	/// be accurate for those bits in the Demanded masks.
	/// \p AssumeSingleUse When this parameter is true, this function will
	/// attempt to simplify \p Op even if there are multiple uses.
	/// Callers are responsible for correctly updating the DAG based on the
	/// results of this function, because simply replacing TLO.Old
	/// with TLO.New will be incorrect when this parameter is true and TLO.Old
	/// has multiple uses.
	bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
	const APInt &DemandedElts, KnownBits &Known,
	TargetLoweringOpt &TLO, unsigned Depth = 0,
	bool AssumeSingleUse = false) const;

	/// Helper wrapper around SimplifyDemandedBits, demanding all elements.
	/// Adds Op back to the worklist upon success.
	bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
	KnownBits &Known, TargetLoweringOpt &TLO,
	unsigned Depth = 0,
	bool AssumeSingleUse = false) const;

	/// Helper wrapper around SimplifyDemandedBits.
	/// Adds Op back to the worklist upon success.
	bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
	DAGCombinerInfo &DCI) const;

	/// Look at Vector Op. At this point, we know that only the DemandedElts
	/// elements of the result of Op are ever used downstream. If we can use
	/// this information to simplify Op, create a new simplified DAG node and
	/// return true, storing the original and new nodes in TLO.
	/// Otherwise, analyze the expression and return a mask of KnownUndef and
	/// KnownZero elements for the expression (used to simplify the caller).
	/// The KnownUndef/Zero elements may only be accurate for those bits
	/// in the DemandedMask.
	/// \p AssumeSingleUse When this parameter is true, this function will
	/// attempt to simplify \p Op even if there are multiple uses.
	/// Callers are responsible for correctly updating the DAG based on the
	/// results of this function, because simply replacing TLO.Old
	/// with TLO.New will be incorrect when this parameter is true and TLO.Old
	/// has multiple uses.
	bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
	APInt &KnownUndef, APInt &KnownZero,
	TargetLoweringOpt &TLO, unsigned Depth = 0,
	bool AssumeSingleUse = false) const;

	/// Helper wrapper around SimplifyDemandedVectorElts.
	/// Adds Op back to the worklist upon success.
	bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
	APInt &KnownUndef, APInt &KnownZero,
	DAGCombinerInfo &DCI) const;

	/// Determine which of the bits specified in Mask are known to be either zero
	/// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
	/// argument allows us to only collect the known bits that are shared by the
	/// requested vector elements.
	virtual void computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
	/// Default implementation computes low bits based on alignment
	/// information. This should preserve known bits passed into it.
	virtual void computeKnownBitsForFrameIndex(const SDValue FIOp,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// This method can be implemented by targets that want to expose additional
	/// information about sign bits to the DAG Combiner. The DemandedElts
	/// argument allows us to only collect the minimum sign bits that are shared
	/// by the requested vector elements.
	virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const;

	/// Attempt to simplify any target nodes based on the demanded vector
	/// elements, returning true on success. Otherwise, analyze the expression and
	/// return a mask of KnownUndef and KnownZero elements for the expression
	/// (used to simplify the caller). The KnownUndef/Zero elements may only be
	/// accurate for those bits in the DemandedMask.
	virtual bool SimplifyDemandedVectorEltsForTargetNode(
	SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
	APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;

	/// Attempt to simplify any target nodes based on the demanded bits/elts,
	/// returning true on success. Otherwise, analyze the
	/// expression and return a mask of KnownOne and KnownZero bits for the
	/// expression (used to simplify the caller). The KnownZero/One bits may only
	/// be accurate for those bits in the Demanded masks.
	virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
	const APInt &DemandedBits,
	const APInt &DemandedElts,
	KnownBits &Known,
	TargetLoweringOpt &TLO,
	unsigned Depth = 0) const;

	/// This method returns the constant pool value that will be loaded by LD.
	/// NOTE: You must check for implicit extensions of the constant by LD.
	/// Both the load node and the returned constant are passed by pointer.
	virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const;

	/// If \p SNaN is false, \returns true if \p Op is known to never be any
	/// NaN. If \p SNaN is true, \returns true if \p Op is known to never be a
	/// signaling NaN.
	virtual bool isKnownNeverNaNForTargetNode(SDValue Op,
	const SelectionDAG &DAG,
	bool SNaN = false,
	unsigned Depth = 0) const;
	/// State handed to the DAG-combine hooks: the SelectionDAG being combined,
	/// the current combine level, whether the legalizer is the caller, and an
	/// opaque pointer to the combiner object.
	struct DAGCombinerInfo {
	  void *DC; // The DAG Combiner object.
	  CombineLevel Level;
	  bool CalledByLegalizer;

	public:
	  SelectionDAG &DAG;

	  DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc)
	      : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {}

	  // Predicates on the combine level supplied at construction.
	  bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }
	  bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }
	  bool isAfterLegalizeDAG() const { return Level == AfterLegalizeDAG; }
	  // const-qualified like the other read-only queries so that it can also be
	  // called through a const DAGCombinerInfo.
	  CombineLevel getDAGCombineLevel() const { return Level; }
	  bool isCalledByLegalizer() const { return CalledByLegalizer; }

	  // Defined out of line.
	  void AddToWorklist(SDNode *N);
	  SDValue CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo = true);
	  SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
	  SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);

	  void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
	};

	/// Return if the N is a constant or constant vector equal to the true value
	/// from getBooleanContents().
	bool isConstTrueVal(const SDNode *N) const;

	/// Return if the N is a constant or constant vector equal to the false value
	/// from getBooleanContents().
	bool isConstFalseVal(const SDNode *N) const;

	/// Return if \p N is a True value when extended to \p VT.
	bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool SExt) const;

	/// Try to simplify a setcc built with the specified operands and cc. If it is
	/// unable to simplify it, return a null SDValue.
	SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	bool foldBooleans, DAGCombinerInfo &DCI,
	const SDLoc &dl) const;

	// For targets which wrap address, unwrap for analysis.
	virtual SDValue unwrapAddress(SDValue N) const {
	  // Base implementation is the identity: the node is handed back unchanged.
	  return N;
	}

	/// Returns true (and the GlobalValue and the offset) if the node is a
	/// GlobalAddress + offset.
	virtual bool
	isGAPlusOffset(SDNode N, const GlobalValue &GA, int64_t &Offset) const;

	/// This method will be invoked for all target nodes and for any
	/// target-independent nodes that the target has registered to invoke it
	/// for.
	///
	/// The semantics are as follows:
	/// Return Value:
	/// SDValue.Val == 0 - No change was made
	/// SDValue.Val == N - N was replaced, is dead, and is already handled.
	/// otherwise - N should be replaced by the returned Operand.
	///
	/// In addition, methods provided by DAGCombinerInfo may be used to perform
	/// more complex transformations.
	///
	virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;

	/// Return true if it is profitable to move this shift by a constant amount
	/// through its operand, adjusting any immediate operands as necessary to
	/// preserve semantics. This transformation may not be desirable if it
	/// disrupts a particularly auspicious target-specific tree (e.g. bitfield
	/// extraction in AArch64). By default, it returns true.
	///
	/// @param N the shift node
	/// @param Level the current DAGCombine legalization level.
	virtual bool isDesirableToCommuteWithShift(const SDNode *N,
	                                           CombineLevel Level) const {
	  // Base implementation: neither argument is consulted and the commute is
	  // always reported as desirable.
	  return true;
	}

	// Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern
	// to a shuffle and a truncate.
	// Example of such a combine:
	// v4i32 build_vector((extract_elt V, 1),
	// (extract_elt V, 3),
	// (extract_elt V, 5),
	// (extract_elt V, 7))
	// -->
	// v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64)
	virtual bool
	isDesirableToCombineBuildVectorToShuffleTruncate(ArrayRef<int> ShuffleMask,
	                                                 EVT SrcVT,
	                                                 EVT TruncVT) const {
	  // Base implementation: the arguments are not consulted and the combine is
	  // never reported as desirable.
	  return false;
	}

	/// Return true if the target has native support for the specified value type
	/// and it is 'desirable' to use the type for the given node type. e.g. On x86
	/// i16 is legal, but undesirable since i16 instruction encodings are longer
	/// and some i16 instructions are slow.
	virtual bool isTypeDesirableForOp(unsigned /Opc/, EVT VT) const {
	// By default, assume all legal types are desirable.
	return isTypeLegal(VT);
	}

	/// Return true if it is profitable for dag combiner to transform a floating
	/// point op of specified opcode to a equivalent op of an integer
	/// type. e.g. f32 load -> i32 load can be profitable on ARM.
	virtual bool isDesirableToTransformToIntegerOp(unsigned /Opc/,
	EVT /VT/) const {
	return false;
	}

	/// This method queries the target whether it is beneficial for dag combiner to
	/// promote the specified node. If true, it should return the desired
	/// promotion type by reference.
	virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT & /*PVT*/) const {
	  // Base implementation: both parameters are deliberately unnamed, the
	  // out-parameter is left untouched and no promotion is requested.
	  return false;
	}

	/// Return true if the target supports swifterror attribute. It optimizes
	/// loads and stores to reading and writing a specific register.
	virtual bool supportSwiftError() const {
	  // Base implementation: support is not reported.
	  return false;
	}

	/// Return true if the target supports that a subset of CSRs for the given
	/// machine function is handled explicitly via copies.
	virtual bool supportSplitCSR(MachineFunction *MF) const {
	  // Base implementation: the function is not inspected and split-CSR
	  // handling is not reported as supported.
	  return false;
	}

	/// Perform necessary initialization to handle a subset of CSRs explicitly
	/// via copies. This function is called at the beginning of instruction
	/// selection.
	virtual void initializeSplitCSR(MachineBasicBlock *Entry) const {
	  // The base implementation is marked unreachable; it is not meant to be
	  // executed.
	  llvm_unreachable("Not Implemented");
	}

	/// Insert explicit copies in entry and exit blocks. We copy a subset of
	/// CSRs to virtual registers in the entry block, and copy them back to
	/// physical registers in the exit blocks. This function is called at the end
	/// of instruction selection.
	virtual void
	insertCopiesSplitCSR(MachineBasicBlock *Entry,
	                     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  // The base implementation is marked unreachable; it is not meant to be
	  // executed.
	  llvm_unreachable("Not Implemented");
	}

	//===--------------------------------------------------------------------===//
	// Lowering methods - These methods must be implemented by targets so that
	// the SelectionDAGBuilder code knows how to lower these.
	//

	/// This hook must be implemented to lower the incoming (formal) arguments,
	/// described by the Ins array, into the specified DAG. The implementation
	/// should fill in the InVals array with legal-type argument values, and
	/// return the resulting token chain value.
	virtual SDValue LowerFormalArguments(
	    SDValue /*Chain*/, CallingConv::ID /*CallConv*/, bool /*isVarArg*/,
	    const SmallVectorImpl<ISD::InputArg> & /*Ins*/, const SDLoc & /*dl*/,
	    SelectionDAG & /*DAG*/, SmallVectorImpl<SDValue> & /*InVals*/) const {
	  // All parameters are deliberately unnamed (their names survive only as
	  // comments). The base implementation is marked unreachable; it is not
	  // meant to be executed.
	  llvm_unreachable("Not Implemented");
	}

	/// This structure contains all information that is necessary for lowering
	/// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder
	/// needs to lower a call, and targets will see this struct in their LowerCall
	/// implementation.
	struct CallLoweringInfo {
	  SDValue Chain;
	  Type *RetTy = nullptr;
	  // One-bit flags. They carry no in-class initializers; each one is set
	  // explicitly in the constructor's initializer list.
	  bool RetSExt : 1;
	  bool RetZExt : 1;
	  bool IsVarArg : 1;
	  bool IsInReg : 1;
	  bool DoesNotReturn : 1;
	  bool IsReturnValueUsed : 1;
	  bool IsConvergent : 1;
	  bool IsPatchPoint : 1;

	  // IsTailCall should be modified by implementations of
	  // TargetLowering::LowerCall that perform tail call conversions.
	  bool IsTailCall = false;

	  // Is Call lowering done post SelectionDAG type legalization.
	  bool IsPostTypeLegalization = false;

	  // Starts out as -1 converted to unsigned; every setCallee/setLibCallee
	  // overload below overwrites it.
	  unsigned NumFixedArgs = -1;
	  CallingConv::ID CallConv = CallingConv::C;
	  SDValue Callee;
	  ArgListTy Args;
	  SelectionDAG &DAG;
	  SDLoc DL;
	  ImmutableCallSite CS;
	  SmallVector<ISD::OutputArg, 32> Outs;
	  SmallVector<SDValue, 32> OutVals;
	  SmallVector<ISD::InputArg, 32> Ins;
	  SmallVector<SDValue, 4> InVals;

	  // Every bit-field flag starts out false except IsReturnValueUsed, which
	  // starts out true.
	  CallLoweringInfo(SelectionDAG &DAG)
	      : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
	        DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
	        IsPatchPoint(false), DAG(DAG) {}

	  // All setters below return *this so that calls can be chained.

	  CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
	    DL = dl;
	    return *this;
	  }

	  CallLoweringInfo &setChain(SDValue InChain) {
	    Chain = InChain;
	    return *this;
	  }

	  // setCallee with target/module-specific attributes
	  CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType,
	                                 SDValue Target, ArgListTy &&ArgsList) {
	    RetTy = ResultType;
	    Callee = Target;
	    CallConv = CC;
	    // Read the size before ArgsList is moved from.
	    NumFixedArgs = ArgsList.size();
	    Args = std::move(ArgsList);

	    DAG.getTargetLoweringInfo().markLibCallAttributes(
	        &(DAG.getMachineFunction()), CC, Args);
	    return *this;
	  }

	  // Same as setLibCallee, but without the markLibCallAttributes() call.
	  CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
	                              SDValue Target, ArgListTy &&ArgsList) {
	    RetTy = ResultType;
	    Callee = Target;
	    CallConv = CC;
	    // Read the size before ArgsList is moved from.
	    NumFixedArgs = ArgsList.size();
	    Args = std::move(ArgsList);
	    return *this;
	  }

	  // Populates the call description from an IR call site: return attributes,
	  // var-arg-ness, calling convention and fixed parameter count are read from
	  // Call and FTy rather than supplied by the caller.
	  CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
	                              SDValue Target, ArgListTy &&ArgsList,
	                              ImmutableCallSite Call) {
	    RetTy = ResultType;

	    IsInReg = Call.hasRetAttr(Attribute::InReg);
	    // Treated as no-return when the call site says so, or when a non-invoke
	    // call is immediately followed by an unreachable instruction.
	    DoesNotReturn =
	        Call.doesNotReturn() ||
	        (!Call.isInvoke() &&
	         isa<UnreachableInst>(Call.getInstruction()->getNextNode()));
	    IsVarArg = FTy->isVarArg();
	    IsReturnValueUsed = !Call.getInstruction()->use_empty();
	    RetSExt = Call.hasRetAttr(Attribute::SExt);
	    RetZExt = Call.hasRetAttr(Attribute::ZExt);

	    Callee = Target;

	    CallConv = Call.getCallingConv();
	    NumFixedArgs = FTy->getNumParams();
	    Args = std::move(ArgsList);

	    CS = Call;

	    return *this;
	  }

	  CallLoweringInfo &setInRegister(bool Value = true) {
	    IsInReg = Value;
	    return *this;
	  }

	  CallLoweringInfo &setNoReturn(bool Value = true) {
	    DoesNotReturn = Value;
	    return *this;
	  }

	  CallLoweringInfo &setVarArg(bool Value = true) {
	    IsVarArg = Value;
	    return *this;
	  }

	  CallLoweringInfo &setTailCall(bool Value = true) {
	    IsTailCall = Value;
	    return *this;
	  }

	  // Note the inversion: discarding the result clears IsReturnValueUsed.
	  CallLoweringInfo &setDiscardResult(bool Value = true) {
	    IsReturnValueUsed = !Value;
	    return *this;
	  }

	  CallLoweringInfo &setConvergent(bool Value = true) {
	    IsConvergent = Value;
	    return *this;
	  }

	  CallLoweringInfo &setSExtResult(bool Value = true) {
	    RetSExt = Value;
	    return *this;
	  }

	  CallLoweringInfo &setZExtResult(bool Value = true) {
	    RetZExt = Value;
	    return *this;
	  }

	  CallLoweringInfo &setIsPatchPoint(bool Value = true) {
	    IsPatchPoint = Value;
	    return *this;
	  }

	  CallLoweringInfo &setIsPostTypeLegalization(bool Value = true) {
	    IsPostTypeLegalization = Value;
	    return *this;
	  }

	  // Mutable access to the argument list.
	  ArgListTy &getArgs() {
	    return Args;
	  }
	};

	/// This function lowers an abstract call to a function into an actual call.
	/// This returns a pair of operands. The first element is the return value
	/// for the function (if RetTy is not VoidTy). The second element is the
	/// outgoing token chain. It calls LowerCall to do the actual lowering.
	std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;

	/// This hook must be implemented to lower calls into the specified
	/// DAG. The outgoing arguments to the call are described by the Outs array,
	/// and the values to be returned by the call are described by the Ins
	/// array. The implementation should fill in the InVals array with legal-type
	/// return values from the call, and return the resulting token chain value.
	///
	/// The default implementation is unreachable, so it must be overridden by
	/// any target for which it can be called.
	virtual SDValue
	LowerCall(CallLoweringInfo &/*CLI*/,
	          SmallVectorImpl<SDValue> &/*InVals*/) const {
	  llvm_unreachable("Not Implemented");
	}

	/// Hook for target-specific cleanup of formal ByVal parameters. The default
	/// implementation does nothing.
	virtual void HandleByVal(CCState *, unsigned &, unsigned) const {
	}

	/// This hook should be implemented to check whether the return values
	/// described by the Outs array can fit into the return registers. If false
	/// is returned, an sret-demotion is performed.
	virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/,
	                            MachineFunction &/*MF*/, bool /*isVarArg*/,
	                            const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
	                            LLVMContext &/*Context*/) const {
	  // Return true by default to get preexisting behavior.
	  return true;
	}

	/// This hook must be implemented to lower outgoing return values, described
	/// by the Outs array, into the specified DAG. The implementation should
	/// return the resulting token chain value.
	///
	/// The default implementation is unreachable, so it must be overridden by
	/// any target for which it can be called.
	virtual SDValue LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
	                            bool /*isVarArg*/,
	                            const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
	                            const SmallVectorImpl<SDValue> & /*OutVals*/,
	                            const SDLoc & /*dl*/,
	                            SelectionDAG & /*DAG*/) const {
	  llvm_unreachable("Not Implemented");
	}

	/// Return true if result of the specified node is used by a return node
	/// only. It also compute and return the input chain for the tail call.
	///
	/// This is used to determine whether it is possible to codegen a libcall as
	/// tail call at legalization time.
	virtual bool isUsedByReturnOnly(SDNode , SDValue &/Chain*/) const {
	return false;
	}

	/// Report whether the target may be able to emit the given call instruction
	/// as a tail call. Optimization passes consult this to decide whether it is
	/// profitable to duplicate return instructions to enable tailcall
	/// optimization. The default answer is false.
	virtual bool mayBeEmittedAsTailCall(const CallInst * /*CI*/) const {
	  return false;
	}

	/// Name of the routine that implements the __builtin___clear_cache
	/// intrinsic. Unless a target overrides this hook, the "__clear_cache"
	/// library call is used.
	virtual const char *getClearCacheBuiltinName() const {
	  return "__clear_cache";
	}

	/// Look up the register ID for the register called \p RegName. This is used
	/// by the named register global variables extension. No target-independent
	/// behaviour exists, so the default implementation reports a fatal error.
	virtual unsigned getRegisterByName(const char* RegName, EVT VT,
	                                   SelectionDAG &DAG) const {
	  report_fatal_error("Named registers not implemented for this target");
	}

	/// Return the type that should be used to zero or sign extend a
	/// zeroext/signext integer return value. FIXME: Some C calling conventions
	/// require the return type to be promoted, but this is not true all the time,
	/// e.g. i1/i8/i16 on x86/x86_64. It is also not necessary for non-C calling
	/// conventions. The frontend should handle this and include all of the
	/// necessary information.
	///
	/// The default implementation widens any \p VT narrower than the register
	/// type used for MVT::i32 to that register type and returns wider types
	/// unchanged; the extension kind is ignored.
	virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                ISD::NodeType /*ExtendKind*/) const {
	  EVT MinVT = getRegisterType(Context, MVT::i32);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// On some targets an LLVM struct type has to be broken down into multiple
	/// simple types while the calling convention still requires the whole
	/// struct to be passed in a block of consecutive registers. Targets with
	/// that requirement override this hook; the default answer is false.
	virtual bool
	functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv,
	                                          bool isVarArg) const {
	  return false;
	}

	/// For most targets an LLVM type must be broken down into multiple smaller
	/// types. The halves are usually ordered according to the endianness, but on
	/// some platforms that would break. The default therefore follows the
	/// endianness reported by \p DL, and targets can override it.
	virtual bool
	shouldSplitFunctionArgumentsAsLittleEndian(const DataLayout &DL) const {
	  const bool LayoutIsLittleEndian = DL.isLittleEndian();
	  return LayoutIsLittleEndian;
	}

	/// Returns a 0 terminated array of registers that are safe to use as scratch
	/// registers. The default implementation returns a null pointer.
	virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const {
	  return nullptr;
	}

	/// Callback used to prepare for a volatile or atomic load. It receives a
	/// chain node and returns the chain to use for the load itself.
	///
	/// Such a callback is necessary for targets like SystemZ, which allows a CPU
	/// to reuse the result of a previous load indefinitely, even if a
	/// cache-coherent store is performed by another CPU. The default
	/// implementation does nothing and hands \p Chain back unchanged.
	virtual SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
	                                            SelectionDAG &DAG) const {
	  return Chain;
	}

	/// Callback used to inspect load/store instructions and attach
	/// target-specific MachineMemOperand flags to them. The default
	/// implementation adds nothing and returns MachineMemOperand::MONone.
	virtual MachineMemOperand::Flags getMMOFlags(const Instruction &I) const {
	  return MachineMemOperand::MONone;
	}

	/// This callback is invoked by the type legalizer to legalize nodes with an
	/// illegal operand type but legal result types. It replaces the
	/// LowerOperation callback in the type Legalizer. The reason we can not do
	/// away with LowerOperation entirely is that LegalizeDAG isn't yet ready to
	/// use this callback.
	///
	/// TODO: Consider merging with ReplaceNodeResults.
	///
	/// The target places new result values for the node in Results (their number
	/// and types must exactly match those of the original return values of
	/// the node), or leaves Results empty, which indicates that the node is not
	/// to be custom lowered after all.
	/// The default implementation calls LowerOperation.
	virtual void LowerOperationWrapper(SDNode *N,
	SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const;

	/// This callback is invoked for operations that are unsupported by the
	/// target, which are registered to use 'custom' lowering, and whose defined
	/// values are all legal. If the target has no operations that require custom
	/// lowering, it need not implement this. The default implementation of this
	/// aborts.
	virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;

	/// This callback is invoked when a node result type is illegal for the
	/// target, and the operation was registered to use 'custom' lowering for that
	/// result type. The target places new result values for the node in Results
	/// (their number and types must exactly match those of the original return
	/// values of the node), or leaves Results empty, which indicates that the
	/// node is not to be custom lowered after all.
	///
	/// If the target has no operations that require custom lowering, it need not
	/// implement this. The default implementation aborts.
	virtual void ReplaceNodeResults(SDNode * /*N*/,
	                                SmallVectorImpl<SDValue> &/*Results*/,
	                                SelectionDAG &/*DAG*/) const {
	  llvm_unreachable("ReplaceNodeResults not implemented for this target!");
	}

	/// This method returns the name of a target specific DAG node.
	virtual const char *getTargetNodeName(unsigned Opcode) const;

	/// Factory for a target specific FastISel object. A null result means the
	/// target does not support "fast" ISel, which is what the default
	/// implementation reports.
	virtual FastISel *createFastISel(FunctionLoweringInfo &,
	                                 const TargetLibraryInfo *) const {
	  return nullptr;
	}

	bool verifyReturnAddressArgumentIsConstant(SDValue Op,
	SelectionDAG &DAG) const;

	//===--------------------------------------------------------------------===//
	// Inline Asm Support hooks
	//

	/// Hook that lets the target expand an inline asm call into explicit llvm
	/// code if it wants to. Turning simple inline asms into LLVM intrinsics is
	/// useful because it gives the compiler more information about the behavior
	/// of the code. The default implementation returns false.
	virtual bool ExpandInlineAsm(CallInst * /*CI*/) const {
	  return false;
	}

	/// Classification of an inline asm constraint, as returned by
	/// getConstraintType and stored in AsmOperandInfo::ConstraintType.
	enum ConstraintType {
	C_Register, // Constraint represents specific register(s).
	C_RegisterClass, // Constraint represents any of register(s) in class.
	C_Memory, // Memory constraint.
	+ C_Immediate, // Requires an immediate.
	C_Other, // Something else.
	C_Unknown // Unsupported constraint.
	};

	/// Weight of a constraint match, as returned by
	/// getMultipleConstraintMatchWeight and getSingleConstraintMatchWeight.
	/// The "well-known" weights are aliases of the generic ones.
	enum ConstraintWeight {
	// Generic weights.
	CW_Invalid = -1, // No match.
	CW_Okay = 0, // Acceptable.
	CW_Good = 1, // Good weight.
	CW_Better = 2, // Better weight.
	CW_Best = 3, // Best weight.

	// Well-known weights.
	CW_SpecificReg = CW_Okay, // Specific register operands.
	CW_Register = CW_Good, // Register operands.
	CW_Memory = CW_Better, // Memory operands.
	CW_Constant = CW_Best, // Constant operand.
	CW_Default = CW_Okay // Default or don't know type.
	};

	/// This contains information for each constraint that we are lowering.
	struct AsmOperandInfo : public InlineAsm::ConstraintInfo {
	/// This contains the actual string for the code, like "m". TargetLowering
	/// picks the 'best' code from ConstraintInfo::Codes that most closely
	/// matches the operand.
	std::string ConstraintCode;

	/// Information about the constraint code, e.g. Register, RegisterClass,
	/// Memory, Immediate, Other, Unknown. Defaults to C_Unknown.
	TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;

	/// If this is the result output operand or a clobber, this is null,
	/// otherwise it is the incoming operand to the CallInst. This gets
	/// modified as the asm is processed.
	Value *CallOperandVal = nullptr;

	/// The ValueType for the operand value. Defaults to MVT::Other.
	MVT ConstraintVT = MVT::Other;

	/// Converting constructor from a ConstraintInfo: the argument is taken by
	/// value and moved into the base class subobject.
	AsmOperandInfo(InlineAsm::ConstraintInfo Info)
	: InlineAsm::ConstraintInfo(std::move(Info)) {}

	/// Return true if this is an input operand that is a matching constraint
	/// like "4".
	bool isMatchingInputConstraint() const;

	/// If this is an input matching constraint, this method returns the output
	/// operand it matches.
	unsigned getMatchedOperand() const;
	};

	using AsmOperandInfoVector = std::vector<AsmOperandInfo>;

	/// Split up the constraint string from the inline assembly value into the
	/// specific constraints and their prefixes, and also tie in the associated
	/// operand values. If this returns an empty vector, and if the constraint
	/// string itself isn't empty, there was an error parsing.
	virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL,
	const TargetRegisterInfo *TRI,
	ImmutableCallSite CS) const;

	/// Examine constraint type and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	virtual ConstraintWeight getMultipleConstraintMatchWeight(
	AsmOperandInfo &info, int maIndex) const;

	/// Examine constraint string and operand type and determine a weight value.
	/// The operand object must already have been set up with the operand type.
	virtual ConstraintWeight getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const;

	/// Determines the constraint code and constraint type to use for the specific
	/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
	/// If the actual operand being passed in is available, it can be passed in as
	/// Op, otherwise an empty SDValue can be passed.
	virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo,
	SDValue Op,
	SelectionDAG *DAG = nullptr) const;

	/// Given a constraint, return the type of constraint it is for this target.
	virtual ConstraintType getConstraintType(StringRef Constraint) const;

	/// Given a physical register constraint (e.g. {edx}), return the register
	/// number and the register class for the register.
	///
	/// Given a register class constraint, like 'r', if this corresponds directly
	/// to an LLVM register class, return a register of 0 and the register class
	/// pointer.
	///
	/// This should only be used for C_Register constraints. On error, this
	/// returns a register number of 0 and a null register class pointer.
	virtual std::pair<unsigned, const TargetRegisterClass *>
	getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint, MVT VT) const;

	/// Translate a constraint code string into its InlineAsm constraint ID.
	/// The default implementation recognises "i" (InlineAsm::Constraint_i) and
	/// "m" (InlineAsm::Constraint_m); every other code maps to
	/// InlineAsm::Constraint_Unknown.
	virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const {
	  if (ConstraintCode == "m")
	    return InlineAsm::Constraint_m;
	  if (ConstraintCode == "i")
	    return InlineAsm::Constraint_i;
	  return InlineAsm::Constraint_Unknown;
	}

	/// Try to replace an X constraint, which matches anything, with another that
	/// has more specific requirements based on the type of the corresponding
	/// operand. This returns null if there is no replacement to make.
	virtual const char *LowerXConstraint(EVT ConstraintVT) const;

	/// Lower the specified operand into the Ops vector. If it is invalid, don't
	/// add anything to Ops.
	virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const;

	// Lower custom output constraints. If invalid, return SDValue().
	virtual SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
	SDLoc DL,
	const AsmOperandInfo &OpInfo,
	SelectionDAG &DAG) const;

	//===--------------------------------------------------------------------===//
	// Div utility functions
	//
	SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
	SmallVectorImpl<SDNode *> &Created) const;
	SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
	SmallVectorImpl<SDNode *> &Created) const;

	/// Targets may override this function to provide custom SDIV lowering for
	/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
	/// assumes SDIV is expensive and replaces it with a series of other integer
	/// operations.
	virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	SelectionDAG &DAG,
	SmallVectorImpl<SDNode *> &Created) const;

	/// Tell whether this target prefers to combine FDIVs that share the same
	/// divisor. Zero means the transform should never be done; any other value
	/// is the minimum number of divisor uses that must exist for the transform
	/// to be done. The default is zero.
	virtual unsigned combineRepeatedFPDivisors() const {
	  return 0;
	}

	/// Hooks for building estimates in place of slower divisions and square
	/// roots.

	/// Produce an estimate of either the square root of the input operand or of
	/// the reciprocal of its square root.
	/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
	/// 'Enabled' as set by a potential default override attribute.
	/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
	/// refinement iterations required to generate a sufficient (though not
	/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
	/// The boolean output \p UseOneConstNR selects a Newton-Raphson algorithm
	/// implementation that uses either one or two constants.
	/// The boolean \p Reciprocal selects whether the estimate is for the square
	/// root of the input operand or for the reciprocal of its square root.
	/// A target may choose to implement its own refinement within this function.
	/// If that's true, then return '0' as the number of RefinementSteps to avoid
	/// any further refinement of the estimate.
	/// An empty SDValue return means no estimate sequence can be created, which
	/// is what the default implementation returns.
	virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
	                                int Enabled, int &RefinementSteps,
	                                bool &UseOneConstNR, bool Reciprocal) const {
	  return SDValue();
	}

	/// Produce a reciprocal estimate value for the input operand.
	/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
	/// 'Enabled' as set by a potential default override attribute.
	/// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
	/// refinement iterations required to generate a sufficient (though not
	/// necessarily IEEE-754 compliant) estimate is returned in that parameter.
	/// A target may choose to implement its own refinement within this function.
	/// If that's true, then return '0' as the number of RefinementSteps to avoid
	/// any further refinement of the estimate.
	/// An empty SDValue return means no estimate sequence can be created, which
	/// is what the default implementation returns.
	virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
	                                 int Enabled, int &RefinementSteps) const {
	  return SDValue();
	}

	//===--------------------------------------------------------------------===//
	// Legalization utility functions
	//

	/// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes,
	/// respectively, each computing an n/2-bit part of the result.
	/// \param Result A vector that will be filled with the parts of the result
	/// in little-endian order.
	/// \param LL Low bits of the LHS of the MUL. You can use this parameter
	/// if you want to control how low bits are extracted from the LHS.
	/// \param LH High bits of the LHS of the MUL. See LL for meaning.
	/// \param RL Low bits of the RHS of the MUL. See LL for meaning
	/// \param RH High bits of the RHS of the MUL. See LL for meaning.
	/// \returns true if the node has been expanded, false if it has not
	bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS,
	SDValue RHS, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
	SelectionDAG &DAG, MulExpansionKind Kind,
	SDValue LL = SDValue(), SDValue LH = SDValue(),
	SDValue RL = SDValue(), SDValue RH = SDValue()) const;

	/// Expand a MUL into two nodes. One that computes the high bits of
	/// the result and one that computes the low bits.
	/// \param HiLoVT The value type to use for the Lo and Hi nodes.
	/// \param LL Low bits of the LHS of the MUL. You can use this parameter
	/// if you want to control how low bits are extracted from the LHS.
	/// \param LH High bits of the LHS of the MUL. See LL for meaning.
	/// \param RL Low bits of the RHS of the MUL. See LL for meaning
	/// \param RH High bits of the RHS of the MUL. See LL for meaning.
	/// \returns true if the node has been expanded. false if it has not
	bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
	SelectionDAG &DAG, MulExpansionKind Kind,
	SDValue LL = SDValue(), SDValue LH = SDValue(),
	SDValue RL = SDValue(), SDValue RH = SDValue()) const;

	/// Expand funnel shift.
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandFunnelShift(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand rotations.
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandROT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand float(f32) to SINT(i64) conversion
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand float to UINT conversion
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand UINT(i64) to double(f64) conversion
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
	SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;

	/// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
	/// vector nodes can only succeed if all operations are legal/custom.
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandCTPOP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
	/// vector nodes can only succeed if all operations are legal/custom.
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandCTLZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
	/// vector nodes can only succeed if all operations are legal/custom.
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Expand ABS nodes. Expands vector/scalar ABS nodes,
	/// vector nodes can only succeed if all operations are legal/custom.
	/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))
	/// \param N Node to expand
	/// \param Result output after conversion
	/// \returns True, if the expansion was successful, false otherwise
	bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

	/// Turn load of vector type into a load of the individual elements.
	/// \param LD load to expand
	/// \returns MERGE_VALUEs of the scalar loads with their chains.
	SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const;

	/// Turn a store of a vector type into stores of the individual elements.
	/// \param ST Store with a vector value type
	/// \returns MERGE_VALUEs of the individual store chains.
	SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const;

	/// Expands an unaligned load to 2 half-size loads for an integer, and
	/// possibly more for vectors.
	std::pair<SDValue, SDValue> expandUnalignedLoad(LoadSDNode *LD,
	SelectionDAG &DAG) const;

	/// Expands an unaligned store to 2 half-size stores for integer values, and
	/// possibly more for vectors.
	SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const;

	/// Increments memory address \p Addr according to the type of the value
	/// \p DataVT that should be stored. If the data is stored in compressed
	/// form, the memory address should be incremented according to the number of
	/// the stored elements. This number is equal to the number of '1's bits
	/// in the \p Mask.
	/// \p DataVT is a vector type. \p Mask is a vector value.
	/// \p DataVT and \p Mask have the same number of vector elements.
	SDValue IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL,
	EVT DataVT, SelectionDAG &DAG,
	bool IsCompressedMemory) const;

	/// Get a pointer to vector element \p Idx located in memory for a vector of
	/// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of
	/// bounds the returned pointer is unspecified, but will be within the vector
	/// bounds.
	SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
	SDValue Index) const;

	/// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This
	/// method accepts integers as its arguments.
	SDValue expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const;

	/// Method for building the DAG expansion of ISD::SMULFIX. This method accepts
	/// integers as its arguments.
	SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const;

	/// Method for building the DAG expansion of ISD::U(ADD|SUB)O. Expansion
	/// always succeeds and populates the Result and Overflow arguments.
	void expandUADDSUBO(SDNode *Node, SDValue &Result, SDValue &Overflow,
	SelectionDAG &DAG) const;

	/// Method for building the DAG expansion of ISD::S(ADD|SUB)O. Expansion
	/// always succeeds and populates the Result and Overflow arguments.
	void expandSADDSUBO(SDNode *Node, SDValue &Result, SDValue &Overflow,
	SelectionDAG &DAG) const;

	/// Method for building the DAG expansion of ISD::[US]MULO. Returns whether
	/// expansion was successful and populates the Result and Overflow arguments.
	bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
	SelectionDAG &DAG) const;

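	/// Expand a VECREDUCE_* into an explicit calculation. If Count is specified,
	/// only the first Count elements of the vector are used.
	/// NOTE(review): the declaration below takes no Count parameter; confirm
	/// whether the sentence about Count is stale.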
	SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;

	//===--------------------------------------------------------------------===//
	// Instruction Emitting Hooks
	//

	/// This method should be implemented by targets that mark instructions with
	/// the 'usesCustomInserter' flag. These instructions are special in various
	/// ways, which require special support to insert. The specified MachineInstr
	/// is created but not inserted into any basic blocks, and this method is
	/// called to expand it into a sequence of instructions, potentially also
	/// creating new basic blocks and control flow.
	/// As long as the returned basic block is different (i.e., we created a new
	/// one), the custom inserter is free to modify the rest of \p MBB.
	virtual MachineBasicBlock *
	EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;

	/// This method should be implemented by targets that mark instructions with
	/// the 'hasPostISelHook' flag. These instructions must be adjusted after
	/// instruction selection by target hooks. e.g. To fill in optional defs for
	/// ARM 's' setting instructions.
	virtual void AdjustInstrPostInstrSelection(MachineInstr &MI,
	SDNode *Node) const;

	/// When this hook returns true, SelectionDAGBuilder emits a
	/// LOAD_STACK_GUARD node while lowering Intrinsic::stackprotector. The
	/// default is false.
	virtual bool useLoadStackGuardNode() const {
	  return false;
	}

	/// Target hook with no usable default: the base implementation is
	/// unreachable, so any target for which this can be called must override it.
	/// NOTE(review): the name suggests combining the stack guard value \p Val
	/// with the frame pointer via XOR -- confirm against the overriding targets.
	virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                    const SDLoc &DL) const {
	  llvm_unreachable("not implemented for this target");
	}

	/// Lower TLS global address SDNode for target independent emulated TLS model.
	virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
	SelectionDAG &DAG) const;

	/// Expands the target specific indirect branch for the case of JumpTable
	/// expansion. The default implementation builds an ISD::BRIND node of type
	/// MVT::Other whose operands are \p Value and \p Addr.
	virtual SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr,
	                                       SelectionDAG &DAG) const {
	  SDValue IndirectBranch =
	      DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr);
	  return IndirectBranch;
	}

	// seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits)))
	// If we're comparing for equality to zero and isCtlzFast is true, expose the
	// fact that this can be implemented as a ctlz/srl pair, so that the dag
	// combiner can fold the new nodes.
	SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;

	private:
	SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	const SDLoc &DL, DAGCombinerInfo &DCI) const;
	SDValue foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	const SDLoc &DL, DAGCombinerInfo &DCI) const;

	SDValue optimizeSetCCOfSignedTruncationCheck(EVT SCCVT, SDValue N0,
	SDValue N1, ISD::CondCode Cond,
	DAGCombinerInfo &DCI,
	const SDLoc &DL) const;

	SDValue prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
	SDValue CompTargetNode, ISD::CondCode Cond,
	DAGCombinerInfo &DCI, const SDLoc &DL,
	SmallVectorImpl<SDNode *> &Created) const;
	SDValue buildUREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode,
	ISD::CondCode Cond, DAGCombinerInfo &DCI,
	const SDLoc &DL) const;
	};

	/// Given an LLVM IR type and return type attributes, compute the return value
	/// EVTs and flags, and optionally also the offsets, if the return value is
	/// being lowered to memory.
	void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr,
	SmallVectorImpl<ISD::OutputArg> &Outs,
	const TargetLowering &TLI, const DataLayout &DL);

	} // end namespace llvm

	#endif // LLVM_CODEGEN_TARGETLOWERING_H
	Index: vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/ExecutionEngine/Orc/LambdaResolver.h (revision 351303)
	@@ -1,83 +1,84 @@
	//===- LambdaResolverMM - Redirect symbol lookup via a functor --*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Defines a RuntimeDyld::SymbolResolver subclass that uses a user-supplied
	// functor for symbol resolution.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H
	#define LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H

	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ExecutionEngine/JITSymbol.h"
	+#include "llvm/ExecutionEngine/OrcV1Deprecation.h"
	#include <memory>

	namespace llvm {
	namespace orc {

	/// A LegacyJITSymbolResolver whose two lookup entry points forward to
	/// user-supplied functors: findSymbolInLogicalDylib calls the dylib lookup
	/// functor and findSymbol calls the external lookup functor.
	template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
	class LambdaResolver : public LegacyJITSymbolResolver {
	public:
	  /// Deprecated constructor; it is only declared here and is defined out of
	  /// line after the class.
	  LLVM_ATTRIBUTE_DEPRECATED(
	      LambdaResolver(DylibLookupFtorT DylibLookupFtor,
	                     ExternalLookupFtorT ExternalLookupFtor),
	      "ORCv1 utilities (including resolvers) are deprecated and will be "
	      "removed "
	      "in the next release. Please use ORCv2 (see docs/ORCv2.rst)");

	  /// Constructor taking an explicit ORCv1 deprecation acknowledgement tag.
	  /// Both functors are received by value, so they are moved (not copied)
	  /// into the members.
	  LambdaResolver(ORCv1DeprecationAcknowledgement,
	                 DylibLookupFtorT DylibLookupFtor,
	                 ExternalLookupFtorT ExternalLookupFtor)
	      : DylibLookupFtor(std::move(DylibLookupFtor)),
	        ExternalLookupFtor(std::move(ExternalLookupFtor)) {}

	  /// Forwards \p Name to the dylib lookup functor.
	  JITSymbol findSymbolInLogicalDylib(const std::string &Name) final {
	    return DylibLookupFtor(Name);
	  }

	  /// Forwards \p Name to the external lookup functor.
	  JITSymbol findSymbol(const std::string &Name) final {
	    return ExternalLookupFtor(Name);
	  }

	private:
	  DylibLookupFtorT DylibLookupFtor;
	  ExternalLookupFtorT ExternalLookupFtor;
	};

	/// Out-of-line definition of the deprecated two-argument constructor. The
	/// functors arrive by value, so they are moved (not copied) into the members.
	template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
	LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>::LambdaResolver(
	    DylibLookupFtorT DylibLookupFtor, ExternalLookupFtorT ExternalLookupFtor)
	    : DylibLookupFtor(std::move(DylibLookupFtor)),
	      ExternalLookupFtor(std::move(ExternalLookupFtor)) {}

	/// Builds a LambdaResolver from the two lookup functors, using the
	/// constructor that takes no ORCv1DeprecationAcknowledgement.
	///
	/// \returns the new resolver, owned by a std::shared_ptr.
	template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
	std::shared_ptr<LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>>
	createLambdaResolver(DylibLookupFtorT DylibLookupFtor,
	                     ExternalLookupFtorT ExternalLookupFtor) {
	  using ResolverT = LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>;
	  // The unique_ptr from make_unique is converted to the shared_ptr result.
	  return std::shared_ptr<ResolverT>(make_unique<ResolverT>(
	      std::move(DylibLookupFtor), std::move(ExternalLookupFtor)));
	}

	/// Builds a LambdaResolver from the two lookup functors, using the
	/// constructor that takes an ORCv1DeprecationAcknowledgement.
	///
	/// \returns the new resolver, owned by a std::shared_ptr.
	template <typename DylibLookupFtorT, typename ExternalLookupFtorT>
	std::shared_ptr<LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>>
	createLambdaResolver(ORCv1DeprecationAcknowledgement,
	                     DylibLookupFtorT DylibLookupFtor,
	                     ExternalLookupFtorT ExternalLookupFtor) {
	  using ResolverT = LambdaResolver<DylibLookupFtorT, ExternalLookupFtorT>;
	  // The unique_ptr from make_unique is converted to the shared_ptr result.
	  return std::shared_ptr<ResolverT>(make_unique<ResolverT>(
	      AcknowledgeORCv1Deprecation, std::move(DylibLookupFtor),
	      std::move(ExternalLookupFtor)));
	}

	} // end namespace orc
	} // end namespace llvm

	#endif // LLVM_EXECUTIONENGINE_ORC_LAMBDARESOLVER_H
	Index: vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/MC/MCContext.h (revision 351303)
	@@ -1,754 +1,767 @@
	//===- MCContext.h - Machine Code Context -----------------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_MC_MCCONTEXT_H
	#define LLVM_MC_MCCONTEXT_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	#include "llvm/BinaryFormat/XCOFF.h"
	#include "llvm/MC/MCAsmMacro.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/SectionKind.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/MD5.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <map>
	#include <memory>
	#include <string>
	#include <utility>
	#include <vector>

	namespace llvm {

	class CodeViewContext;
	class MCAsmInfo;
	class MCLabel;
	class MCObjectFileInfo;
	class MCRegisterInfo;
	class MCSection;
	class MCSectionCOFF;
	class MCSectionELF;
	class MCSectionMachO;
	class MCSectionWasm;
	class MCSectionXCOFF;
	class MCStreamer;
	class MCSymbol;
	class MCSymbolELF;
	class MCSymbolWasm;
	class SMLoc;
	class SourceMgr;

	/// Context object for machine code objects. This class owns all of the
	/// sections that it creates.
	///
	class MCContext {
	public:
	using SymbolTable = StringMap<MCSymbol *, BumpPtrAllocator &>;

	private:
	/// The SourceMgr for this object, if any.
	const SourceMgr *SrcMgr;

	/// The SourceMgr for inline assembly, if any.
	SourceMgr *InlineSrcMgr;

	/// The MCAsmInfo for this target.
	const MCAsmInfo *MAI;

	/// The MCRegisterInfo for this target.
	const MCRegisterInfo *MRI;

	/// The MCObjectFileInfo for this target.
	const MCObjectFileInfo *MOFI;

	std::unique_ptr<CodeViewContext> CVContext;

	/// Allocator object used for creating machine code objects.
	///
	/// We use a bump pointer allocator to avoid the need to track all allocated
	/// objects.
	BumpPtrAllocator Allocator;

	SpecificBumpPtrAllocator<MCSectionCOFF> COFFAllocator;
	SpecificBumpPtrAllocator<MCSectionELF> ELFAllocator;
	SpecificBumpPtrAllocator<MCSectionMachO> MachOAllocator;
	SpecificBumpPtrAllocator<MCSectionWasm> WasmAllocator;
	SpecificBumpPtrAllocator<MCSectionXCOFF> XCOFFAllocator;

	/// Bindings of names to symbols.
	SymbolTable Symbols;

	/// A mapping from a local label number and an instance count to a symbol.
	/// For example, in the assembly
	/// 1:
	/// 2:
	/// 1:
	/// We have three labels represented by the pairs (1, 0), (2, 0) and (1, 1)
	DenseMap<std::pair<unsigned, unsigned>, MCSymbol *> LocalSymbols;

	/// Keeps track of names that were used both for user-declared and
	/// artificial symbols. The value is "true" if the name has been used for a
	/// non-section symbol (there can be at most one of those, plus an unlimited
	/// number of section symbols with the same name).
	StringMap<bool, BumpPtrAllocator &> UsedNames;

	+ /// Keeps track of labels that are used in inline assembly.
	+ SymbolTable InlineAsmUsedLabelNames;
	+
	/// The next ID to dole out to an unnamed assembler temporary symbol with
	/// a given prefix.
	StringMap<unsigned> NextID;

	/// Instances of directional local labels.
	DenseMap<unsigned, MCLabel *> Instances;
	/// NextInstance() creates the next instance of the directional local label
	/// for the LocalLabelVal and adds it to the map if needed.
	unsigned NextInstance(unsigned LocalLabelVal);
	/// GetInstance() gets the current instance of the directional local label
	/// for the LocalLabelVal and adds it to the map if needed.
	unsigned GetInstance(unsigned LocalLabelVal);

	/// The file name of the log file from the environment variable
	/// AS_SECURE_LOG_FILE. Which must be set before the .secure_log_unique
	/// directive is used or it is an error.
	char *SecureLogFile;
	/// The stream that gets written to for the .secure_log_unique directive.
	std::unique_ptr<raw_fd_ostream> SecureLog;
	/// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to
	/// catch errors if .secure_log_unique appears twice without
	/// .secure_log_reset appearing between them.
	bool SecureLogUsed = false;

	/// The compilation directory to use for DW_AT_comp_dir.
	SmallString<128> CompilationDir;

	/// Prefix replacement map for source file information.
	std::map<const std::string, const std::string> DebugPrefixMap;

	/// The main file name if passed in explicitly.
	std::string MainFileName;

	/// The dwarf file and directory tables from the dwarf .file directive.
	/// We now emit a line table for each compile unit. To reduce the prologue
	/// size of each line table, the files and directories used by each compile
	/// unit are separated.
	std::map<unsigned, MCDwarfLineTable> MCDwarfLineTablesCUMap;

	/// The current dwarf line information from the last dwarf .loc directive.
	MCDwarfLoc CurrentDwarfLoc;
	bool DwarfLocSeen = false;

	/// Generate dwarf debugging info for assembly source files.
	bool GenDwarfForAssembly = false;

	/// The current dwarf file number when generate dwarf debugging info for
	/// assembly source files.
	unsigned GenDwarfFileNumber = 0;

	/// Sections for generating the .debug_ranges and .debug_aranges sections.
	SetVector<MCSection *> SectionsForRanges;

	/// The information gathered from labels that will have dwarf label
	/// entries when generating dwarf assembly source files.
	std::vector<MCGenDwarfLabelEntry> MCGenDwarfLabelEntries;

	/// The string to embed in the debug information for the compile unit, if
	/// non-empty.
	StringRef DwarfDebugFlags;

	/// The string to embed in as the dwarf AT_producer for the compile unit, if
	/// non-empty.
	StringRef DwarfDebugProducer;

	/// The maximum version of dwarf that we should emit.
	uint16_t DwarfVersion = 4;

	/// Honor temporary labels, this is useful for debugging semantic
	/// differences between temporary and non-temporary labels (primarily on
	/// Darwin).
	bool AllowTemporaryLabels = true;
	bool UseNamesOnTempLabels = true;

	/// The Compile Unit ID that we are currently processing.
	unsigned DwarfCompileUnitID = 0;

	/// Key type of ELFUniquingMap: a section name, a group name and a unique ID.
	struct ELFSectionKey {
	  std::string SectionName;
	  StringRef GroupName;
	  unsigned UniqueID;

	  ELFSectionKey(StringRef SectionName, StringRef GroupName,
	                unsigned UniqueID)
	      : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {}

	  /// Lexicographic order over (SectionName, GroupName, UniqueID).
	  bool operator<(const ELFSectionKey &Other) const {
	    return std::tie(SectionName, GroupName, UniqueID) <
	           std::tie(Other.SectionName, Other.GroupName, Other.UniqueID);
	  }
	};

	/// Key type of COFFUniquingMap: a section name, a group name, a selection
	/// key and a unique ID.
	struct COFFSectionKey {
	  std::string SectionName;
	  StringRef GroupName;
	  int SelectionKey;
	  unsigned UniqueID;

	  COFFSectionKey(StringRef SectionName, StringRef GroupName,
	                 int SelectionKey, unsigned UniqueID)
	      : SectionName(SectionName), GroupName(GroupName),
	        SelectionKey(SelectionKey), UniqueID(UniqueID) {}

	  /// Lexicographic order over
	  /// (SectionName, GroupName, SelectionKey, UniqueID).
	  bool operator<(const COFFSectionKey &Other) const {
	    return std::tie(SectionName, GroupName, SelectionKey, UniqueID) <
	           std::tie(Other.SectionName, Other.GroupName, Other.SelectionKey,
	                    Other.UniqueID);
	  }
	};

	/// Key type of WasmUniquingMap: a section name, a group name and a unique
	/// ID.
	struct WasmSectionKey {
	  std::string SectionName;
	  StringRef GroupName;
	  unsigned UniqueID;

	  WasmSectionKey(StringRef SectionName, StringRef GroupName,
	                 unsigned UniqueID)
	      : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {}

	  /// Lexicographic order over (SectionName, GroupName, UniqueID).
	  bool operator<(const WasmSectionKey &Other) const {
	    return std::tie(SectionName, GroupName, UniqueID) <
	           std::tie(Other.SectionName, Other.GroupName, Other.UniqueID);
	  }
	};

	/// Key type of XCOFFUniquingMap: a section name and a storage mapping
	/// class.
	struct XCOFFSectionKey {
	  std::string SectionName;
	  XCOFF::StorageMappingClass MappingClass;

	  XCOFFSectionKey(StringRef SectionName,
	                  XCOFF::StorageMappingClass MappingClass)
	      : SectionName(SectionName), MappingClass(MappingClass) {}

	  /// Lexicographic order over (SectionName, MappingClass).
	  bool operator<(const XCOFFSectionKey &Other) const {
	    const auto Mine = std::tie(SectionName, MappingClass);
	    const auto Theirs = std::tie(Other.SectionName, Other.MappingClass);
	    return Mine < Theirs;
	  }
	};

	StringMap<MCSectionMachO *> MachOUniquingMap;
	std::map<ELFSectionKey, MCSectionELF *> ELFUniquingMap;
	std::map<COFFSectionKey, MCSectionCOFF *> COFFUniquingMap;
	std::map<WasmSectionKey, MCSectionWasm *> WasmUniquingMap;
	std::map<XCOFFSectionKey, MCSectionXCOFF *> XCOFFUniquingMap;
	StringMap<bool> RelSecNames;

	SpecificBumpPtrAllocator<MCSubtargetInfo> MCSubtargetAllocator;

	/// Do automatic reset in destructor
	bool AutoReset;

	bool HadError = false;

	/// Symbol-creation helper, defined out of line. MCSymbol is only
	/// forward-declared in this header, so the symbol is returned by pointer
	/// (as createSymbol() does) and the name entry is passed by pointer.
	MCSymbol *createSymbolImpl(const StringMapEntry<bool> *Name,
	                           bool CanBeUnnamed);
	MCSymbol *createSymbol(StringRef Name, bool AlwaysAddSuffix,
	bool IsTemporary);

	MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
	unsigned Instance);

	MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type,
	unsigned Flags, SectionKind K,
	unsigned EntrySize,
	const MCSymbolELF *Group,
	unsigned UniqueID,
	const MCSymbolELF *Associated);

	/// Map of currently defined macros.
	StringMap<MCAsmMacro> MacroMap;

	public:
	/// Constructs a context. The target descriptions are passed by pointer,
	/// matching the MAI / MRI / MOFI pointer members and the getAsmInfo(),
	/// getRegisterInfo() and getObjectFileInfo() accessors; their types are only
	/// forward-declared in this header.
	///
	/// \param Mgr          Optional source manager (defaults to nullptr).
	/// \param DoAutoReset  Defaults to true; presumably initialises AutoReset
	///                     ("Do automatic reset in destructor") - the definition
	///                     is not in this header, so confirm there.
	explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
	                   const MCObjectFileInfo *MOFI,
	                   const SourceMgr *Mgr = nullptr, bool DoAutoReset = true);
	MCContext(const MCContext &) = delete;
	MCContext &operator=(const MCContext &) = delete;
	~MCContext();

	const SourceMgr *getSourceManager() const { return SrcMgr; }

	void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }

	const MCAsmInfo *getAsmInfo() const { return MAI; }

	const MCRegisterInfo *getRegisterInfo() const { return MRI; }

	const MCObjectFileInfo *getObjectFileInfo() const { return MOFI; }

	CodeViewContext &getCVContext();

	void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; }
	void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; }

	/// \name Module Lifetime Management
	/// @{

	/// reset - return object to right after construction state to prepare
	/// to process a new module
	void reset();

	/// @}

	/// \name Symbol Management
	/// @{

	/// Create and return a new linker temporary symbol with a unique but
	/// unspecified name.
	MCSymbol *createLinkerPrivateTempSymbol();

	/// Create and return a new assembler temporary symbol with a unique but
	/// unspecified name.
	MCSymbol *createTempSymbol(bool CanBeUnnamed = true);

	MCSymbol *createTempSymbol(const Twine &Name, bool AlwaysAddSuffix,
	bool CanBeUnnamed = true);

	/// Create the definition of a directional local symbol for numbered label
	/// (used for "1:" definitions).
	MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal);

	/// Create and return a directional local symbol for numbered label (used
	/// for "1b" or 1f" references).
	MCSymbol *getDirectionalLocalSymbol(unsigned LocalLabelVal, bool Before);

	/// Lookup the symbol inside with the specified \p Name. If it exists,
	/// return it. If not, create a forward reference and return it.
	///
	/// \param Name - The symbol name, which must be unique across all symbols.
	MCSymbol *getOrCreateSymbol(const Twine &Name);

	/// Gets a symbol that will be defined to the final stack offset of a local
	/// variable after codegen.
	///
	/// \param Idx - The index of a local variable passed to \@llvm.localescape.
	MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx);

	MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName);

	MCSymbol *getOrCreateLSDASymbol(StringRef FuncName);

	/// Get the symbol for \p Name, or null.
	MCSymbol *lookupSymbol(const Twine &Name) const;

	/// Set value for a symbol.
	void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val);

	/// getSymbols - Get a reference for the symbol table for clients that
	/// want to, for example, iterate over all symbols. 'const' because we
	/// still want any modifications to the table itself to use the MCContext
	/// APIs.
	const SymbolTable &getSymbols() const { return Symbols; }
	+
	+ /// getInlineAsmLabel - Return the symbol registered under \p Name as a
	+ /// label referenced in inline assembly (null when lookup finds no entry).
	+ MCSymbol *getInlineAsmLabel(StringRef Name) const {
	+ return InlineAsmUsedLabelNames.lookup(Name);
	+ }
	+
	+ /// registerInlineAsmLabel - Records that the name is a label referenced in
	+ /// inline assembly.
	+ void registerInlineAsmLabel(MCSymbol *Sym);

	/// @}

	/// \name Section Management
	/// @{

	enum : unsigned {
	/// Pass this value as the UniqueID during section creation to get the
	/// generic section with the given name and characteristics. The usual
	/// sections such as .text use this ID.
	GenericSectionID = ~0U
	};

	/// Return the MCSection for the specified mach-o section. This requires
	/// the operands to be valid.
	MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
	unsigned TypeAndAttributes,
	unsigned Reserved2, SectionKind K,
	const char *BeginSymName = nullptr);

	MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
	unsigned TypeAndAttributes, SectionKind K,
	const char *BeginSymName = nullptr) {
	return getMachOSection(Segment, Section, TypeAndAttributes, 0, K,
	BeginSymName);
	}

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags) {
	return getELFSection(Section, Type, Flags, 0, "");
	}

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const Twine &Group) {
	return getELFSection(Section, Type, Flags, EntrySize, Group, ~0);
	}

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const Twine &Group, unsigned UniqueID) {
	return getELFSection(Section, Type, Flags, EntrySize, Group, UniqueID,
	nullptr);
	}

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const Twine &Group, unsigned UniqueID,
	const MCSymbolELF *Associated);

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const MCSymbolELF *Group, unsigned UniqueID,
	const MCSymbolELF *Associated);

	/// Get a section with the provided group identifier. This section is
	/// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type
	/// describes the type of the section and \p Flags are used to further
	/// configure this named section.
	MCSectionELF *getELFNamedSection(const Twine &Prefix, const Twine &Suffix,
	unsigned Type, unsigned Flags,
	unsigned EntrySize = 0);

	MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const MCSymbolELF *Group,
	const MCSectionELF *RelInfoSection);

	void renameELFSection(MCSectionELF *Section, StringRef Name);

	/// Creates the ELF group section for \p Group. Both the section and the
	/// group symbol are handled by pointer, like the other ELF section factories
	/// in this class (their types are only forward-declared here).
	MCSectionELF *createELFGroupSection(const MCSymbolELF *Group);

	MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
	SectionKind Kind, StringRef COMDATSymName,
	int Selection,
	unsigned UniqueID = GenericSectionID,
	const char *BeginSymName = nullptr);

	MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
	SectionKind Kind,
	const char *BeginSymName = nullptr);

	/// Gets or creates a section equivalent to Sec that is associated with the
	/// section containing KeySym. For example, to create a debug info section
	/// associated with an inline function, pass the normal debug info section
	/// as Sec and the function symbol as KeySym.
	///
	/// \p Sec and \p KeySym are passed by pointer: MCSectionCOFF and MCSymbol
	/// are only forward-declared in this header.
	MCSectionCOFF *
	getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym,
	                          unsigned UniqueID = GenericSectionID);

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K) {
	return getWasmSection(Section, K, nullptr);
	}

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const char *BeginSymName) {
	return getWasmSection(Section, K, "", ~0, BeginSymName);
	}

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const Twine &Group, unsigned UniqueID) {
	return getWasmSection(Section, K, Group, UniqueID, nullptr);
	}

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const Twine &Group, unsigned UniqueID,
	const char *BeginSymName);

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const MCSymbolWasm *Group, unsigned UniqueID,
	const char *BeginSymName);

	MCSectionXCOFF *getXCOFFSection(StringRef Section,
	XCOFF::StorageMappingClass MappingClass,
	SectionKind K,
	const char *BeginSymName = nullptr);

	// Create and save a copy of STI and return a reference to the copy.
	MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI);

	/// @}

	/// \name Dwarf Management
	/// @{

	/// Get the compilation directory for DW_AT_comp_dir
	/// The compilation directory should be set with \c setCompilationDir before
	/// calling this function. If it is unset, an empty string will be returned.
	StringRef getCompilationDir() const { return CompilationDir; }

	/// Set the compilation directory for DW_AT_comp_dir
	void setCompilationDir(StringRef S) { CompilationDir = S.str(); }

	/// Add an entry to the debug prefix map.
	void addDebugPrefixMapEntry(const std::string &From, const std::string &To);

	// Remaps all debug directory paths in-place as per the debug prefix map.
	void RemapDebugPaths();

	/// Get the main file name for use in error messages and debug
	/// info. This can be set to ensure we've got the correct file name
	/// after preprocessing or for -save-temps.
	const std::string &getMainFileName() const { return MainFileName; }

	/// Set the main file name and override the default.
	void setMainFileName(StringRef S) { MainFileName = S; }

	/// Creates an entry in the dwarf file and directory tables.
	Expected<unsigned> getDwarfFile(StringRef Directory, StringRef FileName,
	unsigned FileNumber,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source, unsigned CUID);

	bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0);

	const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const {
	return MCDwarfLineTablesCUMap;
	}

	MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) {
	return MCDwarfLineTablesCUMap[CUID];
	}

	/// Read-only access to the line table stored for \p CUID. Asserts that an
	/// entry exists; unlike the non-const overload it cannot create one.
	const MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) const {
	  const auto Entry = MCDwarfLineTablesCUMap.find(CUID);
	  assert(Entry != MCDwarfLineTablesCUMap.end());
	  return Entry->second;
	}

	const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) {
	return getMCDwarfLineTable(CUID).getMCDwarfFiles();
	}

	const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) {
	return getMCDwarfLineTable(CUID).getMCDwarfDirs();
	}

	unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; }

	void setDwarfCompileUnitID(unsigned CUIndex) {
	DwarfCompileUnitID = CUIndex;
	}

	/// Specifies the "root" file and directory of the compilation unit.
	/// These are "file 0" and "directory 0" in DWARF v5.
	void setMCLineTableRootFile(unsigned CUID, StringRef CompilationDir,
	StringRef Filename,
	Optional<MD5::MD5Result> Checksum,
	Optional<StringRef> Source) {
	getMCDwarfLineTable(CUID).setRootFile(CompilationDir, Filename, Checksum,
	Source);
	}

	/// Reports whether MD5 checksum usage is consistent (all-or-none).
	bool isDwarfMD5UsageConsistent(unsigned CUID) const {
	return getMCDwarfLineTable(CUID).isMD5UsageConsistent();
	}

	/// Saves the information from the currently parsed dwarf .loc directive
	/// and sets DwarfLocSeen. When the next instruction is assembled an entry
	/// in the line number table with this information and the address of the
	/// instruction will be created.
	void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column,
	unsigned Flags, unsigned Isa,
	unsigned Discriminator) {
	CurrentDwarfLoc.setFileNum(FileNum);
	CurrentDwarfLoc.setLine(Line);
	CurrentDwarfLoc.setColumn(Column);
	CurrentDwarfLoc.setFlags(Flags);
	CurrentDwarfLoc.setIsa(Isa);
	CurrentDwarfLoc.setDiscriminator(Discriminator);
	DwarfLocSeen = true;
	}

	void clearDwarfLocSeen() { DwarfLocSeen = false; }

	bool getDwarfLocSeen() { return DwarfLocSeen; }
	const MCDwarfLoc &getCurrentDwarfLoc() { return CurrentDwarfLoc; }

	bool getGenDwarfForAssembly() { return GenDwarfForAssembly; }
	void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; }
	unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; }

	void setGenDwarfFileNumber(unsigned FileNumber) {
	GenDwarfFileNumber = FileNumber;
	}

	/// Specifies information about the "root file" for assembler clients
	/// (e.g., llvm-mc). Assumes compilation dir etc. have been set up.
	void setGenDwarfRootFile(StringRef FileName, StringRef Buffer);

	const SetVector<MCSection *> &getGenDwarfSectionSyms() {
	return SectionsForRanges;
	}

	bool addGenDwarfSection(MCSection *Sec) {
	return SectionsForRanges.insert(Sec);
	}

	void finalizeDwarfSections(MCStreamer &MCOS);

	const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const {
	return MCGenDwarfLabelEntries;
	}

	void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) {
	MCGenDwarfLabelEntries.push_back(E);
	}

	void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; }
	StringRef getDwarfDebugFlags() { return DwarfDebugFlags; }

	void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; }
	StringRef getDwarfDebugProducer() { return DwarfDebugProducer; }

	dwarf::DwarfFormat getDwarfFormat() const {
	// TODO: Support DWARF64
	return dwarf::DWARF32;
	}

	void setDwarfVersion(uint16_t v) { DwarfVersion = v; }
	uint16_t getDwarfVersion() const { return DwarfVersion; }

	/// @}

	char *getSecureLogFile() { return SecureLogFile; }
	raw_fd_ostream *getSecureLog() { return SecureLog.get(); }

	void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) {
	SecureLog = std::move(Value);
	}

	bool getSecureLogUsed() { return SecureLogUsed; }
	void setSecureLogUsed(bool Value) { SecureLogUsed = Value; }

	void *allocate(unsigned Size, unsigned Align = 8) {
	return Allocator.Allocate(Size, Align);
	}

	void deallocate(void *Ptr) {}

	bool hadError() { return HadError; }
	void reportError(SMLoc L, const Twine &Msg);
	// Unrecoverable error has occurred. Display the best diagnostic we can
	// and bail via exit(1). For now, most MC backend errors are unrecoverable.
	// FIXME: We should really do something about that.
	LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
	const Twine &Msg);

	/// Returns the macro stored in MacroMap under \p Name, or nullptr when the
	/// map has no such entry.
	const MCAsmMacro *lookupMacro(StringRef Name) {
	  auto Found = MacroMap.find(Name);
	  if (Found == MacroMap.end())
	    return nullptr;
	  return &Found->getValue();
	}

	void defineMacro(StringRef Name, MCAsmMacro Macro) {
	MacroMap.insert(std::make_pair(Name, std::move(Macro)));
	}

	void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
	};

	} // end namespace llvm

	// operator new and delete aren't allowed inside namespaces.
	// The throw specifications are mandated by the standard.
	/// Placement new that draws memory from an MCContext.
	///
	/// The request is forwarded to MCContext::allocate(). The operator is
	/// declared noexcept, i.e. it is a non-throwing new that reports failure by
	/// returning null (if that is what the allocator does; should the allocator
	/// ever change, this operator has to change with it).
	/// Usage, assuming an MCContext named 'Context' is in scope:
	/// \code
	/// // Default alignment (8)
	/// IntegerLiteral *Ex = new (Context) IntegerLiteral(arguments);
	/// // Specific alignment
	/// IntegerLiteral *Ex2 = new (Context, 4) IntegerLiteral(arguments);
	/// \endcode
	/// Do not use delete on the resulting pointer; destroy the object with an
	/// explicit destructor call followed by \c Context.deallocate(Ptr).
	///
	/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
	/// \param C The MCContext that provides the allocator.
	/// \param Alignment The alignment of the allocated memory (if the underlying
	/// allocator supports it).
	/// \return The allocated memory. Could be NULL.
	inline void *operator new(size_t Bytes, llvm::MCContext &C,
	                          size_t Alignment = 8) noexcept {
	  void *Memory = C.allocate(Bytes, Alignment);
	  return Memory;
	}
	/// Placement delete paired with the MCContext placement new.
	///
	/// It cannot be invoked through a delete expression. The compiler calls it
	/// implicitly when a placement new expression using the MCContext throws
	/// from the object's constructor. The trailing size_t mirrors the alignment
	/// parameter of the new operator and is ignored.
	inline void operator delete(void *Ptr, llvm::MCContext &C,
	                            size_t /*Alignment*/) noexcept {
	  C.deallocate(Ptr);
	}

	/// Placement new[] that draws memory from an MCContext.
	///
	/// The request is forwarded to MCContext::allocate(). The operator is
	/// declared noexcept, i.e. it is a non-throwing new[] that reports failure
	/// by returning null.
	/// Usage, assuming an MCContext named 'Context' is in scope:
	/// \code
	/// // Default alignment (8)
	/// char *data = new (Context) char[10];
	/// // Specific alignment
	/// char *data = new (Context, 4) char[10];
	/// \endcode
	/// Do not use delete on the resulting pointer; destroy the elements with
	/// explicit destructor calls followed by \c Context.deallocate(Ptr).
	///
	/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
	/// \param C The MCContext that provides the allocator.
	/// \param Alignment The alignment of the allocated memory (if the underlying
	/// allocator supports it).
	/// \return The allocated memory. Could be NULL.
	inline void *operator new[](size_t Bytes, llvm::MCContext &C,
	                            size_t Alignment = 8) noexcept {
	  void *Memory = C.allocate(Bytes, Alignment);
	  return Memory;
	}

	/// Placement delete[] companion to the new[] above.
	///
	/// This operator is just a companion to the new[] above. There is no way of
	/// invoking it directly; see the new[] operator for more details. This operator
	/// is called implicitly by the compiler if a placement new[] expression using
	/// the MCContext throws in the object constructor.
	inline void operator delete[](void *Ptr, llvm::MCContext &C) noexcept {
	C.deallocate(Ptr);
	}

	#endif // LLVM_MC_MCCONTEXT_H
	Index: vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.def (revision 351303)
	@@ -1,141 +1,141 @@
	//===- AARCH64TargetParser.def - AARCH64 target parsing defines ---------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file provides defines to build up the AARCH64 target parser's logic.
	//
	//===----------------------------------------------------------------------===//

	// NOTE: NO INCLUDE GUARD DESIRED!

	// Architecture table. Each entry is
	//   AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU,
	//                ARCH_BASE_EXT)
	// where ARCH_BASE_EXT is a bitwise OR of AArch64::AEK_* flags. Includers
	// define AARCH64_ARCH before including this file; if it is not defined the
	// entries expand to nothing. The macro is undefined again after the table.
	#ifndef AARCH64_ARCH
	#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT)
	#endif
	AARCH64_ARCH("invalid", INVALID, "", "",
	ARMBuildAttrs::CPUArch::v8_A, FK_NONE, AArch64::AEK_NONE)
	AARCH64_ARCH("armv8-a", ARMV8A, "8-A", "v8", ARMBuildAttrs::CPUArch::v8_A,
	FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_SIMD))
	AARCH64_ARCH("armv8.1-a", ARMV8_1A, "8.1-A", "v8.1a",
	ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
	AArch64::AEK_SIMD | AArch64::AEK_LSE | AArch64::AEK_RDM))
	AARCH64_ARCH("armv8.2-a", ARMV8_2A, "8.2-A", "v8.2a",
	ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
	AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
	AArch64::AEK_RDM))
	AARCH64_ARCH("armv8.3-a", ARMV8_3A, "8.3-A", "v8.3a",
	ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
	AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
	AArch64::AEK_RDM | AArch64::AEK_RCPC))
	AARCH64_ARCH("armv8.4-a", ARMV8_4A, "8.4-A", "v8.4a",
	ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
	AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
	AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
	AARCH64_ARCH("armv8.5-a", ARMV8_5A, "8.5-A", "v8.5a",
	ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8,
	(AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP |
	AArch64::AEK_SIMD | AArch64::AEK_RAS | AArch64::AEK_LSE |
	AArch64::AEK_RDM | AArch64::AEK_RCPC | AArch64::AEK_DOTPROD))
	#undef AARCH64_ARCH

	#ifndef AARCH64_ARCH_EXT_NAME
	#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE)
	#endif
	// FIXME: This would be nicer were it tablegen
	-AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr)
	-AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr)
	-AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc")
	-AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse")
	-AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm")
	-AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto")
	-AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4")
	-AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3")
	-AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2")
	-AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes")
	-AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod")
	-AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8")
	-AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon")
	-AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16")
	-AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml")
	-AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe")
	-AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras")
	-AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve")
	-AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2")
	-AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes")
	-AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4")
	-AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3")
	-AARCH64_ARCH_EXT_NAME("bitperm", AArch64::AEK_BITPERM, "+bitperm", "-bitperm")
	-AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc")
	-AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand")
	-AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte")
	-AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs")
	-AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb")
	-AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres")
	+AARCH64_ARCH_EXT_NAME("invalid", AArch64::AEK_INVALID, nullptr, nullptr)
	+AARCH64_ARCH_EXT_NAME("none", AArch64::AEK_NONE, nullptr, nullptr)
	+AARCH64_ARCH_EXT_NAME("crc", AArch64::AEK_CRC, "+crc", "-crc")
	+AARCH64_ARCH_EXT_NAME("lse", AArch64::AEK_LSE, "+lse", "-lse")
	+AARCH64_ARCH_EXT_NAME("rdm", AArch64::AEK_RDM, "+rdm", "-rdm")
	+AARCH64_ARCH_EXT_NAME("crypto", AArch64::AEK_CRYPTO, "+crypto","-crypto")
	+AARCH64_ARCH_EXT_NAME("sm4", AArch64::AEK_SM4, "+sm4", "-sm4")
	+AARCH64_ARCH_EXT_NAME("sha3", AArch64::AEK_SHA3, "+sha3", "-sha3")
	+AARCH64_ARCH_EXT_NAME("sha2", AArch64::AEK_SHA2, "+sha2", "-sha2")
	+AARCH64_ARCH_EXT_NAME("aes", AArch64::AEK_AES, "+aes", "-aes")
	+AARCH64_ARCH_EXT_NAME("dotprod", AArch64::AEK_DOTPROD, "+dotprod","-dotprod")
	+AARCH64_ARCH_EXT_NAME("fp", AArch64::AEK_FP, "+fp-armv8", "-fp-armv8")
	+AARCH64_ARCH_EXT_NAME("simd", AArch64::AEK_SIMD, "+neon", "-neon")
	+AARCH64_ARCH_EXT_NAME("fp16", AArch64::AEK_FP16, "+fullfp16", "-fullfp16")
	+AARCH64_ARCH_EXT_NAME("fp16fml", AArch64::AEK_FP16FML, "+fp16fml", "-fp16fml")
	+AARCH64_ARCH_EXT_NAME("profile", AArch64::AEK_PROFILE, "+spe", "-spe")
	+AARCH64_ARCH_EXT_NAME("ras", AArch64::AEK_RAS, "+ras", "-ras")
	+AARCH64_ARCH_EXT_NAME("sve", AArch64::AEK_SVE, "+sve", "-sve")
	+AARCH64_ARCH_EXT_NAME("sve2", AArch64::AEK_SVE2, "+sve2", "-sve2")
	+AARCH64_ARCH_EXT_NAME("sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes")
	+AARCH64_ARCH_EXT_NAME("sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4")
	+AARCH64_ARCH_EXT_NAME("sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3")
	+AARCH64_ARCH_EXT_NAME("sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm")
	+AARCH64_ARCH_EXT_NAME("rcpc", AArch64::AEK_RCPC, "+rcpc", "-rcpc")
	+AARCH64_ARCH_EXT_NAME("rng", AArch64::AEK_RAND, "+rand", "-rand")
	+AARCH64_ARCH_EXT_NAME("memtag", AArch64::AEK_MTE, "+mte", "-mte")
	+AARCH64_ARCH_EXT_NAME("ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs")
	+AARCH64_ARCH_EXT_NAME("sb", AArch64::AEK_SB, "+sb", "-sb")
	+AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres")
	#undef AARCH64_ARCH_EXT_NAME

	// CPU table entries take the form
	// AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT).
	// When the including file has not defined the macro, fall back to an empty
	// expansion so the entries that follow generate nothing.
	#ifndef AARCH64_CPU_NAME
	#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT)
	#endif
	AARCH64_CPU_NAME("cortex-a35", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("cortex-a53", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("cortex-a55", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_FP16 \| AArch64::AEK_DOTPROD \| AArch64::AEK_RCPC))
	AARCH64_CPU_NAME("cortex-a57", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("cortex-a72", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("cortex-a73", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("cortex-a75", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_FP16 \| AArch64::AEK_DOTPROD \| AArch64::AEK_RCPC))
	AARCH64_CPU_NAME("cortex-a76", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_FP16 \| AArch64::AEK_DOTPROD \| AArch64::AEK_RCPC \|
	AArch64::AEK_SSBS))
	AARCH64_CPU_NAME("cortex-a76ae", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_FP16 \| AArch64::AEK_DOTPROD \| AArch64::AEK_RCPC \|
	AArch64::AEK_SSBS))
	AARCH64_CPU_NAME("cyclone", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_NONE))
	AARCH64_CPU_NAME("exynos-m1", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("exynos-m2", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("exynos-m3", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("exynos-m4", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_DOTPROD \| AArch64::AEK_FP16))
	AARCH64_CPU_NAME("exynos-m5", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_DOTPROD \| AArch64::AEK_FP16))
	AARCH64_CPU_NAME("falkor", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC \| AArch64::AEK_RDM))
	AARCH64_CPU_NAME("saphira", ARMV8_3A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_PROFILE))
	AARCH64_CPU_NAME("kryo", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC))
	AARCH64_CPU_NAME("thunderx2t99", ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_NONE))
	AARCH64_CPU_NAME("thunderx", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC \| AArch64::AEK_PROFILE))
	AARCH64_CPU_NAME("thunderxt88", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC \| AArch64::AEK_PROFILE))
	AARCH64_CPU_NAME("thunderxt81", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC \| AArch64::AEK_PROFILE))
	AARCH64_CPU_NAME("thunderxt83", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_CRC \| AArch64::AEK_PROFILE))
	AARCH64_CPU_NAME("tsv110", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false,
	(AArch64::AEK_DOTPROD \|
	AArch64::AEK_FP16 \| AArch64::AEK_FP16FML \|
	AArch64::AEK_PROFILE))
	// Invalid CPU
	AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID)
	#undef AARCH64_CPU_NAME
	Index: vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/Support/AArch64TargetParser.h (revision 351303)
	@@ -1,128 +1,128 @@
	//===-- AArch64TargetParser - Parser for AArch64 features -------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a target parser to recognise AArch64 hardware features
	// such as FPU/CPU/ARCH and extension names.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H
	#define LLVM_SUPPORT_AARCH64TARGETPARSERCOMMON_H

	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/ARMTargetParser.h"
	#include <vector>

	// FIXME: This should be made into a class design, to avoid duplication.
	namespace llvm {
	namespace AArch64 {

	// Arch extension modifiers for CPUs.
	// AEK_INVALID is 0; every other enumerator occupies its own bit, so a set of
	// extensions can be expressed as a bitwise OR of enumerators held in a single
	// unsigned value.
	enum ArchExtKind : unsigned {
	AEK_INVALID = 0,
	AEK_NONE = 1,
	AEK_CRC = 1 << 1,
	AEK_CRYPTO = 1 << 2,
	AEK_FP = 1 << 3,
	AEK_SIMD = 1 << 4,
	AEK_FP16 = 1 << 5,
	AEK_PROFILE = 1 << 6,
	AEK_RAS = 1 << 7,
	AEK_LSE = 1 << 8,
	AEK_SVE = 1 << 9,
	AEK_DOTPROD = 1 << 10,
	AEK_RCPC = 1 << 11,
	AEK_RDM = 1 << 12,
	AEK_SM4 = 1 << 13,
	AEK_SHA3 = 1 << 14,
	AEK_SHA2 = 1 << 15,
	AEK_AES = 1 << 16,
	AEK_FP16FML = 1 << 17,
	AEK_RAND = 1 << 18,
	AEK_MTE = 1 << 19,
	AEK_SSBS = 1 << 20,
	AEK_SB = 1 << 21,
	AEK_PREDRES = 1 << 22,
	AEK_SVE2 = 1 << 23,
	AEK_SVE2AES = 1 << 24,
	AEK_SVE2SM4 = 1 << 25,
	AEK_SVE2SHA3 = 1 << 26,
	- AEK_BITPERM = 1 << 27,
	+ AEK_SVE2BITPERM = 1 << 27,
	};

	// One enumerator per AARCH64_ARCH entry in AArch64TargetParser.def: the macro
	// below keeps only the ID argument of each entry.
	enum class ArchKind {
	#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
	#include "AArch64TargetParser.def"
	};

	// Architecture description table: one ARM::ArchNames<ArchKind> element per
	// AARCH64_ARCH entry in AArch64TargetParser.def.
	// The lengths are computed as sizeof(X) - 1, which yields the string length
	// (without the terminating NUL) only when NAME, CPU_ATTR and SUB_ARCH are
	// string literals.
	const ARM::ArchNames<ArchKind> AArch64ARCHNames[] = {
	#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \
	ARCH_BASE_EXT) \
	{NAME, \
	sizeof(NAME) - 1, \
	CPU_ATTR, \
	sizeof(CPU_ATTR) - 1, \
	SUB_ARCH, \
	sizeof(SUB_ARCH) - 1, \
	ARM::FPUKind::ARCH_FPU, \
	ARCH_BASE_EXT, \
	AArch64::ArchKind::ID, \
	ARCH_ATTR},
	#include "AArch64TargetParser.def"
	};

	// Extension name table: one ARM::ExtName element per AARCH64_ARCH_EXT_NAME
	// entry in AArch64TargetParser.def. NameLength is sizeof(NAME) - 1, i.e. the
	// literal's length without its terminating NUL.
	const ARM::ExtName AArch64ARCHExtNames[] = {
	#define AARCH64_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
	{NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
	#include "AArch64TargetParser.def"
	};

	// CPU table: one ARM::CpuNames<ArchKind> element per AARCH64_CPU_NAME entry
	// in AArch64TargetParser.def. The DEFAULT_FPU argument of each entry is not
	// stored in this table.
	const ARM::CpuNames<ArchKind> AArch64CPUNames[] = {
	#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
	{NAME, sizeof(NAME) - 1, AArch64::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
	#include "AArch64TargetParser.def"
	};

	// Flat array holding every ArchKind enumerator, in the order the AARCH64_ARCH
	// entries appear in AArch64TargetParser.def.
	const ArchKind ArchKinds[] = {
	#define AARCH64_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) \
	ArchKind::ID,
	#include "AArch64TargetParser.def"
	};

	// FIXME: These should be moved to TargetTuple once it exists
	bool getExtensionFeatures(unsigned Extensions,
	std::vector<StringRef> &Features);
	bool getArchFeatures(ArchKind AK, std::vector<StringRef> &Features);

	StringRef getArchName(ArchKind AK);
	unsigned getArchAttr(ArchKind AK);
	StringRef getCPUAttr(ArchKind AK);
	StringRef getSubArch(ArchKind AK);
	StringRef getArchExtName(unsigned ArchExtKind);
	StringRef getArchExtFeature(StringRef ArchExt);

	// Information by Name
	unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
	unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
	StringRef getDefaultCPU(StringRef Arch);
	ArchKind getCPUArchKind(StringRef CPU);

	// Parser
	ArchKind parseArch(StringRef Arch);
	ArchExtKind parseArchExt(StringRef ArchExt);
	ArchKind parseCPUArch(StringRef CPU);
	// Used by target parser tests
	void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);

	bool isX18ReservedByDefault(const Triple &TT);

	} // namespace AArch64
	} // namespace llvm

	#endif
	Index: vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/Support/ARMTargetParser.h (revision 351303)
	@@ -1,273 +1,267 @@
	//===-- ARMTargetParser - Parser for ARM target features --------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a target parser to recognise ARM hardware features
	// such as FPU/CPU/ARCH/extensions and specific support such as HWDIV.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_ARMTARGETPARSER_H
	#define LLVM_SUPPORT_ARMTARGETPARSER_H

	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/ARMBuildAttributes.h"
	#include <vector>

	namespace llvm {
	namespace ARM {

	// Arch extension modifiers for CPUs.
	// Note that this is not the same as the AArch64 list
	// AEK_INVALID is 0; every other enumerator occupies its own bit, so several
	// extensions can be combined with bitwise OR in one unsigned value.
	enum ArchExtKind : unsigned {
	AEK_INVALID = 0,
	AEK_NONE = 1,
	AEK_CRC = 1 << 1,
	AEK_CRYPTO = 1 << 2,
	AEK_FP = 1 << 3,
	AEK_HWDIVTHUMB = 1 << 4,
	AEK_HWDIVARM = 1 << 5,
	AEK_MP = 1 << 6,
	AEK_SIMD = 1 << 7,
	AEK_SEC = 1 << 8,
	AEK_VIRT = 1 << 9,
	AEK_DSP = 1 << 10,
	AEK_FP16 = 1 << 11,
	AEK_RAS = 1 << 12,
	- AEK_SVE = 1 << 13,
	- AEK_DOTPROD = 1 << 14,
	- AEK_SHA2 = 1 << 15,
	- AEK_AES = 1 << 16,
	- AEK_FP16FML = 1 << 17,
	- AEK_SB = 1 << 18,
	- AEK_SVE2 = 1 << 19,
	- AEK_SVE2AES = 1 << 20,
	- AEK_SVE2SM4 = 1 << 21,
	- AEK_SVE2SHA3 = 1 << 22,
	- AEK_BITPERM = 1 << 23,
	- AEK_FP_DP = 1 << 24,
	- AEK_LOB = 1 << 25,
	+ AEK_DOTPROD = 1 << 13,
	+ AEK_SHA2 = 1 << 14,
	+ AEK_AES = 1 << 15,
	+ AEK_FP16FML = 1 << 16,
	+ AEK_SB = 1 << 17,
	+ AEK_FP_DP = 1 << 18,
	+ AEK_LOB = 1 << 19,
	// Unsupported extensions.
	// These use fixed values in the top bits (bit 27 through bit 31).
	AEK_OS = 0x8000000,
	AEK_IWMMXT = 0x10000000,
	AEK_IWMMXT2 = 0x20000000,
	AEK_MAVERICK = 0x40000000,
	AEK_XSCALE = 0x80000000,
	};

	// List of Arch Extension names.
	// One row of an extension table: the extension's name, its ID, and the
	// feature strings passed for enabling / disabling it.
	// FIXME: TableGen this.
	struct ExtName {
	// Name as a pointer + length pair (not required to be read as a C string).
	const char *NameCStr;
	size_t NameLength;
	// Extension identifier; the tables in this header and in
	// AArch64TargetParser.h fill it with AEK_* values.
	unsigned ID;
	// Feature strings such as "+crc" / "-crc"; table entries may pass nullptr.
	const char *Feature;
	const char *NegFeature;

	// Returns the name as a StringRef built from the stored pointer and length.
	StringRef getName() const { return StringRef(NameCStr, NameLength); }
	};

	// Extension name table: one ExtName element per ARM_ARCH_EXT_NAME entry in
	// ARMTargetParser.def. NameLength is sizeof(NAME) - 1, i.e. the literal's
	// length without its terminating NUL.
	const ExtName ARCHExtNames[] = {
	#define ARM_ARCH_EXT_NAME(NAME, ID, FEATURE, NEGFEATURE) \
	{NAME, sizeof(NAME) - 1, ID, FEATURE, NEGFEATURE},
	#include "ARMTargetParser.def"
	};

	// List of HWDiv names (use getHWDivSynonym) and which architectural
	// features they correspond to (use getHWDivFeatures).
	// One element is generated per ARM_HW_DIV_NAME entry in ARMTargetParser.def.
	// FIXME: TableGen this.
	const struct {
	// Name as a pointer + length pair; the length is sizeof(NAME) - 1.
	const char *NameCStr;
	size_t NameLength;
	// Identifier supplied as the ID argument of the table entry.
	unsigned ID;

	// Returns the name as a StringRef built from the stored pointer and length.
	StringRef getName() const { return StringRef(NameCStr, NameLength); }
	} HWDivNames[] = {
	#define ARM_HW_DIV_NAME(NAME, ID) {NAME, sizeof(NAME) - 1, ID},
	#include "ARMTargetParser.def"
	};

	// Arch names.
	enum class ArchKind {
	#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, ARCH_BASE_EXT) ID,
	#include "ARMTargetParser.def"
	};

	// List of CPU names and their arches.
	// The same CPU can have multiple arches and can be default on multiple arches.
	// When finding the Arch for a CPU, first-found prevails. Sort them accordingly.
	// When this becomes table-generated, we'd probably need two tables.
	// T is the architecture enum type (instantiated with ArchKind in this header
	// and in AArch64TargetParser.h).
	// FIXME: TableGen this.
	template <typename T> struct CpuNames {
	// CPU name as a pointer + length pair.
	const char *NameCStr;
	size_t NameLength;
	// Architecture this CPU entry belongs to.
	T ArchID;
	bool Default; // is $Name the default CPU for $ArchID ?
	// Extension bits given as DEFAULT_EXT in the table entry.
	unsigned DefaultExtensions;

	// Returns the name as a StringRef built from the stored pointer and length.
	StringRef getName() const { return StringRef(NameCStr, NameLength); }
	};

	const CpuNames<ArchKind> CPUNames[] = {
	#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
	{NAME, sizeof(NAME) - 1, ARM::ArchKind::ID, IS_DEFAULT, DEFAULT_EXT},
	#include "ARMTargetParser.def"
	};

	// FPU names.
	enum FPUKind {
	#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) KIND,
	#include "ARMTargetParser.def"
	FK_LAST
	};

	// FPU Version
	enum class FPUVersion {
	NONE,
	VFPV2,
	VFPV3,
	VFPV3_FP16,
	VFPV4,
	VFPV5,
	VFPV5_FULLFP16,
	};

	// An FPU name restricts the FPU in one of three ways:
	enum class FPURestriction {
	None = 0, ///< No restriction
	D16, ///< Only 16 D registers
	SP_D16 ///< Only single-precision instructions, with 16 D registers
	};

	// An FPU name implies one of three levels of Neon support:
	enum class NeonSupportLevel {
	None = 0, ///< No Neon
	Neon, ///< Neon
	Crypto ///< Neon with Crypto
	};

	// ISA kinds.
	enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };

	// Endianness
	// FIXME: BE8 vs. BE32?
	enum class EndianKind { INVALID = 0, LITTLE, BIG };

	// v6/v7/v8 Profile
	enum class ProfileKind { INVALID = 0, A, R, M };

	// List of canonical FPU names (use getFPUSynonym) and which architectural
	// features they correspond to (use getFPUFeatures).
	// FIXME: TableGen this.
	// The entries must appear in the order listed in ARM::FPUKind for correct
	// indexing
	struct FPUName {
	// FPU name as a pointer + length pair.
	const char *NameCStr;
	size_t NameLength;
	// Kind, version, Neon level and register/precision restriction, taken from
	// the KIND, VERSION, NEON_SUPPORT and RESTRICTION arguments of ARM_FPU.
	FPUKind ID;
	FPUVersion FPUVer;
	NeonSupportLevel NeonSupport;
	FPURestriction Restriction;

	// Returns the name as a StringRef built from the stored pointer and length.
	StringRef getName() const { return StringRef(NameCStr, NameLength); }
	};

	static const FPUName FPUNames[] = {
	#define ARM_FPU(NAME, KIND, VERSION, NEON_SUPPORT, RESTRICTION) \
	{NAME, sizeof(NAME) - 1, KIND, VERSION, NEON_SUPPORT, RESTRICTION},
	#include "llvm/Support/ARMTargetParser.def"
	};

	// List of canonical arch names (use getArchSynonym).
	// This table also provides the build attribute fields for CPU arch
	// and Arch ID, according to the Addenda to the ARM ABI, chapters
	// 2.4 and 2.3.5.2 respectively.
	// FIXME: SubArch values were simplified to fit into the expectations
	// of the triples and are not conforming with their official names.
	// Check to see if the expectation should be changed.
	// T is the architecture enum type (instantiated with ArchKind in this header
	// and in AArch64TargetParser.h).
	// FIXME: TableGen this.
	template <typename T> struct ArchNames {
	// Each string is stored as a pointer + length pair.
	const char *NameCStr;
	size_t NameLength;
	const char *CPUAttrCStr;
	size_t CPUAttrLength;
	const char *SubArchCStr;
	size_t SubArchLength;
	// Values taken from the ARCH_FPU and ARCH_BASE_EXT arguments of the entry.
	unsigned DefaultFPU;
	unsigned ArchBaseExtensions;
	// Architecture enumerator this row describes.
	T ID;
	ARMBuildAttrs::CPUArch ArchAttr; // Arch ID in build attributes.

	// Returns the arch name as a StringRef built from the stored pointer/length.
	StringRef getName() const { return StringRef(NameCStr, NameLength); }

	// CPU class in build attributes.
	StringRef getCPUAttr() const { return StringRef(CPUAttrCStr, CPUAttrLength); }

	// Sub-Arch name.
	StringRef getSubArch() const { return StringRef(SubArchCStr, SubArchLength); }
	};

	static const ArchNames<ArchKind> ARCHNames[] = {
	#define ARM_ARCH(NAME, ID, CPU_ATTR, SUB_ARCH, ARCH_ATTR, ARCH_FPU, \
	ARCH_BASE_EXT) \
	{NAME, sizeof(NAME) - 1, \
	CPU_ATTR, sizeof(CPU_ATTR) - 1, \
	SUB_ARCH, sizeof(SUB_ARCH) - 1, \
	ARCH_FPU, ARCH_BASE_EXT, \
	ArchKind::ID, ARCH_ATTR},
	#include "llvm/Support/ARMTargetParser.def"
	};

	// Information by ID
	StringRef getFPUName(unsigned FPUKind);
	FPUVersion getFPUVersion(unsigned FPUKind);
	NeonSupportLevel getFPUNeonSupportLevel(unsigned FPUKind);
	FPURestriction getFPURestriction(unsigned FPUKind);

	// FIXME: These should be moved to TargetTuple once it exists
	bool getFPUFeatures(unsigned FPUKind, std::vector<StringRef> &Features);
	bool getHWDivFeatures(unsigned HWDivKind, std::vector<StringRef> &Features);
	bool getExtensionFeatures(unsigned Extensions,
	std::vector<StringRef> &Features);

	StringRef getArchName(ArchKind AK);
	unsigned getArchAttr(ArchKind AK);
	StringRef getCPUAttr(ArchKind AK);
	StringRef getSubArch(ArchKind AK);
	StringRef getArchExtName(unsigned ArchExtKind);
	StringRef getArchExtFeature(StringRef ArchExt);
	bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt,
	std::vector<StringRef> &Features);
	StringRef getHWDivName(unsigned HWDivKind);

	// Information by Name
	unsigned getDefaultFPU(StringRef CPU, ArchKind AK);
	unsigned getDefaultExtensions(StringRef CPU, ArchKind AK);
	StringRef getDefaultCPU(StringRef Arch);
	StringRef getCanonicalArchName(StringRef Arch);
	StringRef getFPUSynonym(StringRef FPU);
	StringRef getArchSynonym(StringRef Arch);

	// Parser
	unsigned parseHWDiv(StringRef HWDiv);
	unsigned parseFPU(StringRef FPU);
	ArchKind parseArch(StringRef Arch);
	unsigned parseArchExt(StringRef ArchExt);
	ArchKind parseCPUArch(StringRef CPU);
	ISAKind parseArchISA(StringRef Arch);
	EndianKind parseArchEndian(StringRef Arch);
	ProfileKind parseArchProfile(StringRef Arch);
	unsigned parseArchVersion(StringRef Arch);

	void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
	StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);

	} // namespace ARM
	} // namespace llvm

	#endif
	Index: vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h
	===================================================================
	--- vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h (revision 351302)
	+++ vendor/llvm/dist-release_90/include/llvm/Transforms/Utils/BypassSlowDivision.h (revision 351303)
	@@ -1,69 +1,72 @@
	//===- llvm/Transforms/Utils/BypassSlowDivision.h ---------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains an optimization for div and rem on architectures that
	// execute short instructions significantly faster than longer instructions.
	// For example, on Intel Atom 32-bit divides are slow enough that during
	// runtime it is profitable to check the value of the operands, and if they are
	// positive and less than 256 use an unsigned 8-bit divide.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
	#define LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseMapInfo.h"
	+#include "llvm/IR/ValueHandle.h"
	#include <cstdint>

	namespace llvm {

	class BasicBlock;
	class Value;

	// Key identifying one division/remainder computation: the signedness of the
	// operation together with its two operands.
	struct DivRemMapKey {
	bool SignedOp;
	- Value *Dividend;
	- Value *Divisor;
	+ AssertingVH<Value> Dividend;
	+ AssertingVH<Value> Divisor;

	// The operands are taken as pointers: the members are pointer-like handles,
	// and DenseMapInfo<DivRemMapKey> builds its empty/tombstone keys by passing
	// nullptr for both operands.
	DivRemMapKey(bool InSignedOp, Value *InDividend, Value *InDivisor)
	: SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
	};

	// DenseMapInfo specialization supplying equality, hashing and the two
	// reserved keys for DivRemMapKey.
	template <> struct DenseMapInfo<DivRemMapKey> {
	// Two keys are equal only when signedness and both operands match.
	static bool isEqual(const DivRemMapKey &Val1, const DivRemMapKey &Val2) {
	return Val1.SignedOp == Val2.SignedOp && Val1.Dividend == Val2.Dividend &&
	Val1.Divisor == Val2.Divisor;
	}

	// Empty key: null operands with SignedOp == false.
	static DivRemMapKey getEmptyKey() {
	return DivRemMapKey(false, nullptr, nullptr);
	}

	// Tombstone key: null operands with SignedOp == true, so it differs from the
	// empty key only in the SignedOp field.
	static DivRemMapKey getTombstoneKey() {
	return DivRemMapKey(true, nullptr, nullptr);
	}

	// Hash: XOR of the two operand addresses, truncated to unsigned, then XOR-ed
	// with the signedness flag.
	static unsigned getHashValue(const DivRemMapKey &Val) {
	- return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
	- reinterpret_cast<uintptr_t>(Val.Divisor)) ^
	+ return (unsigned)(reinterpret_cast<uintptr_t>(
	+ static_cast<Value *>(Val.Dividend)) ^
	+ reinterpret_cast<uintptr_t>(
	+ static_cast<Value *>(Val.Divisor))) ^
	(unsigned)Val.SignedOp;
	}
	};

	/// This optimization identifies DIV instructions in a BB that can be
	/// profitably bypassed and carried out with a shorter, faster divide.
	///
	/// This optimization may add basic blocks immediately after BB; for obvious
	/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
	bool bypassSlowDivision(
	BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);

	} // end namespace llvm

	#endif // LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
	Index: vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp (revision 351303)
	@@ -1,663 +1,664 @@
	//===-- AsmPrinterInlineAsm.cpp - AsmPrinter Inline Asm Handling ----------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the inline assembler pieces of the AsmPrinter class.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/AsmPrinter.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MemoryBuffer.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	using namespace llvm;

	#define DEBUG_TYPE "asm-printer"

	/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
	/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
	/// struct above.
	static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
	AsmPrinter::SrcMgrDiagInfo *DiagInfo =
	static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
	assert(DiagInfo && "Diagnostic context not passed down?");

	// Look up a LocInfo for the buffer this diagnostic is coming from.
	unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
	const MDNode *LocInfo = nullptr;
	if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
	LocInfo = DiagInfo->LocInfos[BufNum-1];

	// If the inline asm had metadata associated with it, pull out a location
	// cookie corresponding to which line the error occurred on.
	unsigned LocCookie = 0;
	if (LocInfo) {
	unsigned ErrorLine = Diag.getLineNo()-1;
	if (ErrorLine >= LocInfo->getNumOperands())
	ErrorLine = 0;

	if (LocInfo->getNumOperands() != 0)
	if (const ConstantInt *CI =
	mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
	LocCookie = CI->getZExtValue();
	}

	DiagInfo->DiagHandler(Diag, DiagInfo->DiagContext, LocCookie);
	}

	/// Register a copy of \p AsmStr as a new buffer with the inline-asm source
	/// manager and return its buffer number. When \p LocMDNode is non-null it is
	/// remembered under that buffer number so diagnostics can be mapped back.
	unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr,
	                                            const MDNode *LocMDNode) const {
	  // Create the shared diagnostic state the first time it is needed.
	  if (!DiagInfo) {
	    DiagInfo = make_unique<SrcMgrDiagInfo>();

	    MCContext &MCCtx = MMI->getContext();
	    MCCtx.setInlineSourceManager(&DiagInfo->SrcMgr);

	    // Only hook our handler in when the LLVMContext has one to forward to.
	    LLVMContext &Ctx = MMI->getModule()->getContext();
	    if (Ctx.getInlineAsmDiagnosticHandler()) {
	      DiagInfo->DiagHandler = Ctx.getInlineAsmDiagnosticHandler();
	      DiagInfo->DiagContext = Ctx.getInlineAsmDiagnosticContext();
	      DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get());
	    }
	  }

	  SourceMgr &InlineSrcMgr = DiagInfo->SrcMgr;

	  // The inline asm source manager will outlive AsmStr, so hand it a private
	  // copy of the text; the source manager takes ownership of that copy.
	  std::unique_ptr<MemoryBuffer> AsmCopy =
	      MemoryBuffer::getMemBufferCopy(AsmStr, "<inline asm>");
	  const unsigned BufferId =
	      InlineSrcMgr.AddNewSourceBuffer(std::move(AsmCopy), SMLoc());

	  // LocInfos is indexed by buffer number minus one.
	  if (LocMDNode) {
	    DiagInfo->LocInfos.resize(BufferId);
	    DiagInfo->LocInfos[BufferId - 1] = LocMDNode;
	  }

	  return BufferId;
	}


	/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
	///
	/// \p Str must not be empty; a single trailing NUL character is stripped.
	/// When neither the target's MCAsmInfo nor the streamer asks for the
	/// integrated assembler, the text is emitted verbatim. Otherwise it is parsed
	/// with an MCAsmParser using \p STI, \p MCOptions and \p Dialect, and
	/// \p LocMDNode is registered for diagnostics. A missing target asm parser is
	/// a fatal error, as is a parse failure when no diagnostic handler is
	/// installed.
	void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
	                               const MCTargetOptions &MCOptions,
	                               const MDNode *LocMDNode,
	                               InlineAsm::AsmDialect Dialect) const {
	  assert(!Str.empty() && "Can't emit empty inline asm block");

	  // Remember if the buffer is nul terminated or not so we can avoid a copy.
	  bool isNullTerminated = Str.back() == 0;
	  if (isNullTerminated)
	    Str = Str.substr(0, Str.size()-1);

	  // If the output streamer does not have mature MC support or the integrated
	  // assembler has been disabled, just emit the blob textually.
	  // Otherwise parse the asm and emit it via MC support.
	  // This is useful in case the asm parser doesn't handle something but the
	  // system assembler does.
	  const MCAsmInfo *MCAI = TM.getMCAsmInfo();
	  assert(MCAI && "No MCAsmInfo");
	  if (!MCAI->useIntegratedAssembler() &&
	      !OutStreamer->isIntegratedAssemblerRequired()) {
	    emitInlineAsmStart();
	    OutStreamer->EmitRawText(Str);
	    emitInlineAsmEnd(STI, nullptr);
	    return;
	  }

	  unsigned BufNum = addInlineAsmDiagBuffer(Str, LocMDNode);
	  DiagInfo->SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);

	  // createMCAsmParser takes the streamer and asm info by reference, so the
	  // owning pointer / raw pointer members are dereferenced here.
	  std::unique_ptr<MCAsmParser> Parser(createMCAsmParser(
	      DiagInfo->SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));

	  // Do not use assembler-level information for parsing inline assembly.
	  OutStreamer->setUseAssemblerInfoForParsing(false);

	  // We create a new MCInstrInfo here since we might be at the module level
	  // and not have a MachineFunction to initialize the TargetInstrInfo from and
	  // we only need MCInstrInfo for asm parsing. We create one unconditionally
	  // because it's not subtarget dependent.
	  std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
	  std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
	      STI, *Parser, *MII, MCOptions));
	  if (!TAP)
	    report_fatal_error("Inline asm not supported by this streamer because"
	                       " we don't have an asm parser for this target\n");
	  Parser->setAssemblerDialect(Dialect);
	  Parser->setTargetParser(*TAP.get());
	  // Enable lexing Masm binary and hex integer literals in intel inline
	  // assembly.
	  if (Dialect == InlineAsm::AD_Intel)
	    Parser->getLexer().setLexMasmIntegers(true);

	  emitInlineAsmStart();
	  // Don't implicitly switch to the text section before the asm.
	  int Res = Parser->Run(/*NoInitialTextSection*/ true,
	                        /*NoFinalize*/ true);
	  emitInlineAsmEnd(STI, &TAP->getSTI());

	  // With a diagnostic handler installed the error has already been reported
	  // through it; without one, a failed parse is fatal.
	  if (Res && !DiagInfo->DiagHandler)
	    report_fatal_error("Error parsing inline asm\n");
	}

	static void EmitMSInlineAsmStr(const char AsmStr, const MachineInstr MI,
	MachineModuleInfo MMI, AsmPrinter AP,
	unsigned LocCookie, raw_ostream &OS) {
	// Switch to the inline assembly variant.
	OS << "\t.intel_syntax\n\t";

	const char *LastEmitted = AsmStr; // One past the last character emitted.
	unsigned NumOperands = MI->getNumOperands();

	while (*LastEmitted) {
	switch (*LastEmitted) {
	default: {
	// Not a special case, emit the string section literally.
	const char *LiteralEnd = LastEmitted+1;
	while (LiteralEnd && LiteralEnd != '{' && *LiteralEnd != '\|' &&
	LiteralEnd != '}' && LiteralEnd != '$' && *LiteralEnd != '\n')
	++LiteralEnd;

	OS.write(LastEmitted, LiteralEnd-LastEmitted);
	LastEmitted = LiteralEnd;
	break;
	}
	case '\n':
	++LastEmitted; // Consume newline character.
	OS << '\n'; // Indent code with newline.
	break;
	case '$': {
	++LastEmitted; // Consume '$' character.
	bool Done = true;

	// Handle escapes.
	switch (*LastEmitted) {
	default: Done = false; break;
	case '$':
	++LastEmitted; // Consume second '$' character.
	break;
	}
	if (Done) break;

	// If we have ${:foo}, then this is not a real operand reference, it is a
	// "magic" string reference, just like in .td files. Arrange to call
	// PrintSpecial.
	if (LastEmitted[0] == '{' && LastEmitted[1] == ':') {
	LastEmitted += 2;
	const char *StrStart = LastEmitted;
	const char *StrEnd = strchr(StrStart, '}');
	if (!StrEnd)
	report_fatal_error("Unterminated ${:foo} operand in inline asm"
	" string: '" + Twine(AsmStr) + "'");

	std::string Val(StrStart, StrEnd);
	AP->PrintSpecial(MI, OS, Val.c_str());
	LastEmitted = StrEnd+1;
	break;
	}

	const char *IDStart = LastEmitted;
	const char *IDEnd = IDStart;
	while (IDEnd >= '0' && IDEnd <= '9') ++IDEnd;

	unsigned Val;
	if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
	report_fatal_error("Bad $ operand number in inline asm string: '" +
	Twine(AsmStr) + "'");
	LastEmitted = IDEnd;

	if (Val >= NumOperands-1)
	report_fatal_error("Invalid $ operand number in inline asm string: '" +
	Twine(AsmStr) + "'");

	// Okay, we finally have a value number. Ask the target to print this
	// operand!
	unsigned OpNo = InlineAsm::MIOp_FirstOperand;

	bool Error = false;

	// Scan to find the machine operand number for the operand.
	for (; Val; --Val) {
	if (OpNo >= MI->getNumOperands()) break;
	unsigned OpFlags = MI->getOperand(OpNo).getImm();
	OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
	}

	// We may have a location metadata attached to the end of the
	// instruction, and at no point should see metadata at any
	// other point while processing. It's an error if so.
	if (OpNo >= MI->getNumOperands() \|\|
	MI->getOperand(OpNo).isMetadata()) {
	Error = true;
	} else {
	unsigned OpFlags = MI->getOperand(OpNo).getImm();
	++OpNo; // Skip over the ID number.

	if (InlineAsm::isMemKind(OpFlags)) {
	Error = AP->PrintAsmMemoryOperand(MI, OpNo, /Modifier/ nullptr, OS);
	} else {
	Error = AP->PrintAsmOperand(MI, OpNo, /Modifier/ nullptr, OS);
	}
	}
	if (Error) {
	std::string msg;
	raw_string_ostream Msg(msg);
	Msg << "invalid operand in inline asm: '" << AsmStr << "'";
	MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
	}
	break;
	}
	}
	}
	OS << "\n\t.att_syntax\n" << (char)0; // null terminate string.
	}

	static void EmitGCCInlineAsmStr(const char AsmStr, const MachineInstr MI,
	MachineModuleInfo *MMI, int AsmPrinterVariant,
	AsmPrinter *AP, unsigned LocCookie,
	raw_ostream &OS) {
	int CurVariant = -1; // The number of the {.\|.\|.} region we are in.
	const char *LastEmitted = AsmStr; // One past the last character emitted.
	unsigned NumOperands = MI->getNumOperands();

	OS << '\t';

	while (*LastEmitted) {
	switch (*LastEmitted) {
	default: {
	// Not a special case, emit the string section literally.
	const char *LiteralEnd = LastEmitted+1;
	while (LiteralEnd && LiteralEnd != '{' && *LiteralEnd != '\|' &&
	LiteralEnd != '}' && LiteralEnd != '$' && *LiteralEnd != '\n')
	++LiteralEnd;
	if (CurVariant == -1 \|\| CurVariant == AsmPrinterVariant)
	OS.write(LastEmitted, LiteralEnd-LastEmitted);
	LastEmitted = LiteralEnd;
	break;
	}
	case '\n':
	++LastEmitted; // Consume newline character.
	OS << '\n'; // Indent code with newline.
	break;
	case '$': {
	++LastEmitted; // Consume '$' character.
	bool Done = true;

	// Handle escapes.
	switch (*LastEmitted) {
	default: Done = false; break;
	case '$': // $$ -> $
	if (CurVariant == -1 \|\| CurVariant == AsmPrinterVariant)
	OS << '$';
	++LastEmitted; // Consume second '$' character.
	break;
	case '(': // $( -> same as GCC's { character.
	++LastEmitted; // Consume '(' character.
	if (CurVariant != -1)
	report_fatal_error("Nested variants found in inline asm string: '" +
	Twine(AsmStr) + "'");
	CurVariant = 0; // We're in the first variant now.
	break;
	case '\|':
	++LastEmitted; // consume '\|' character.
	if (CurVariant == -1)
	OS << '\|'; // this is gcc's behavior for \| outside a variant
	else
	++CurVariant; // We're in the next variant.
	break;
	case ')': // $) -> same as GCC's } char.
	++LastEmitted; // consume ')' character.
	if (CurVariant == -1)
	OS << '}'; // this is gcc's behavior for } outside a variant
	else
	CurVariant = -1;
	break;
	}
	if (Done) break;

	bool HasCurlyBraces = false;
	if (*LastEmitted == '{') { // ${variable}
	++LastEmitted; // Consume '{' character.
	HasCurlyBraces = true;
	}

	// If we have ${:foo}, then this is not a real operand reference, it is a
	// "magic" string reference, just like in .td files. Arrange to call
	// PrintSpecial.
	if (HasCurlyBraces && *LastEmitted == ':') {
	++LastEmitted;
	const char *StrStart = LastEmitted;
	const char *StrEnd = strchr(StrStart, '}');
	if (!StrEnd)
	report_fatal_error("Unterminated ${:foo} operand in inline asm"
	" string: '" + Twine(AsmStr) + "'");

	std::string Val(StrStart, StrEnd);
	AP->PrintSpecial(MI, OS, Val.c_str());
	LastEmitted = StrEnd+1;
	break;
	}

	const char *IDStart = LastEmitted;
	const char *IDEnd = IDStart;
	while (IDEnd >= '0' && IDEnd <= '9') ++IDEnd;

	unsigned Val;
	if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
	report_fatal_error("Bad $ operand number in inline asm string: '" +
	Twine(AsmStr) + "'");
	LastEmitted = IDEnd;

	char Modifier[2] = { 0, 0 };

	if (HasCurlyBraces) {
	// If we have curly braces, check for a modifier character. This
	// supports syntax like ${0:u}, which correspond to "%u0" in GCC asm.
	if (*LastEmitted == ':') {
	++LastEmitted; // Consume ':' character.
	if (*LastEmitted == 0)
	report_fatal_error("Bad ${:} expression in inline asm string: '" +
	Twine(AsmStr) + "'");

	Modifier[0] = *LastEmitted;
	++LastEmitted; // Consume modifier character.
	}

	if (*LastEmitted != '}')
	report_fatal_error("Bad ${} expression in inline asm string: '" +
	Twine(AsmStr) + "'");
	++LastEmitted; // Consume '}' character.
	}

	if (Val >= NumOperands-1)
	report_fatal_error("Invalid $ operand number in inline asm string: '" +
	Twine(AsmStr) + "'");

	// Okay, we finally have a value number. Ask the target to print this
	// operand!
	if (CurVariant == -1 \|\| CurVariant == AsmPrinterVariant) {
	unsigned OpNo = InlineAsm::MIOp_FirstOperand;

	bool Error = false;

	// Scan to find the machine operand number for the operand.
	for (; Val; --Val) {
	if (OpNo >= MI->getNumOperands()) break;
	unsigned OpFlags = MI->getOperand(OpNo).getImm();
	OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
	}

	// We may have a location metadata attached to the end of the
	// instruction, and at no point should see metadata at any
	// other point while processing. It's an error if so.
	if (OpNo >= MI->getNumOperands() \|\|
	MI->getOperand(OpNo).isMetadata()) {
	Error = true;
	} else {
	unsigned OpFlags = MI->getOperand(OpNo).getImm();
	++OpNo; // Skip over the ID number.

	// FIXME: Shouldn't arch-independent output template handling go into
	// PrintAsmOperand?
	if (Modifier[0] == 'l') { // Labels are target independent.
	if (MI->getOperand(OpNo).isBlockAddress()) {
	const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
	MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
	Sym->print(OS, AP->MAI);
	+ MMI->getContext().registerInlineAsmLabel(Sym);
	} else if (MI->getOperand(OpNo).isMBB()) {
	const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
	Sym->print(OS, AP->MAI);
	} else {
	Error = true;
	}
	} else {
	if (InlineAsm::isMemKind(OpFlags)) {
	Error = AP->PrintAsmMemoryOperand(
	MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
	} else {
	Error = AP->PrintAsmOperand(MI, OpNo,
	Modifier[0] ? Modifier : nullptr, OS);
	}
	}
	}
	if (Error) {
	std::string msg;
	raw_string_ostream Msg(msg);
	Msg << "invalid operand in inline asm: '" << AsmStr << "'";
	MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
	}
	}
	break;
	}
	}
	}
	OS << '\n' << (char)0; // null terminate string.
	}

	/// EmitInlineAsm - This method formats and emits the specified machine
	/// instruction that is an inline asm.
	void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
	assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");

	// Count the number of register definitions to find the asm string.
	unsigned NumDefs = 0;
	for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
	++NumDefs)
	assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");

	assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");

	// Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
	const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();

	// If this asmstr is empty, just print the #APP/#NOAPP markers.
	// These are useful to see where empty asm's wound up.
	if (AsmStr[0] == 0) {
	OutStreamer->emitRawComment(MAI->getInlineAsmStart());
	OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
	return;
	}

	// Emit the #APP start marker. This has to happen even if verbose-asm isn't
	// enabled, so we use emitRawComment.
	OutStreamer->emitRawComment(MAI->getInlineAsmStart());

	// Get the !srcloc metadata node if we have it, and decode the loc cookie from
	// it.
	unsigned LocCookie = 0;
	const MDNode *LocMD = nullptr;
	for (unsigned i = MI->getNumOperands(); i != 0; --i) {
	if (MI->getOperand(i-1).isMetadata() &&
	(LocMD = MI->getOperand(i-1).getMetadata()) &&
	LocMD->getNumOperands() != 0) {
	if (const ConstantInt *CI =
	mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
	LocCookie = CI->getZExtValue();
	break;
	}
	}
	}

	// Emit the inline asm to a temporary string so we can emit it through
	// EmitInlineAsm.
	SmallString<256> StringData;
	raw_svector_ostream OS(StringData);

	// The variant of the current asmprinter.
	int AsmPrinterVariant = MAI->getAssemblerDialect();
	AsmPrinter AP = const_cast<AsmPrinter>(this);
	if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
	EmitGCCInlineAsmStr(AsmStr, MI, MMI, AsmPrinterVariant, AP, LocCookie, OS);
	else
	EmitMSInlineAsmStr(AsmStr, MI, MMI, AP, LocCookie, OS);

	// Emit warnings if we use reserved registers on the clobber list, as
	// that might give surprising results.
	std::vector<std::string> RestrRegs;
	// Start with the first operand descriptor, and iterate over them.
	for (unsigned I = InlineAsm::MIOp_FirstOperand, NumOps = MI->getNumOperands();
	I < NumOps; ++I) {
	const MachineOperand &MO = MI->getOperand(I);
	if (MO.isImm()) {
	unsigned Flags = MO.getImm();
	const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
	if (InlineAsm::getKind(Flags) == InlineAsm::Kind_Clobber &&
	!TRI->isAsmClobberable(*MF, MI->getOperand(I + 1).getReg())) {
	RestrRegs.push_back(TRI->getName(MI->getOperand(I + 1).getReg()));
	}
	// Skip to one before the next operand descriptor, if it exists.
	I += InlineAsm::getNumOperandRegisters(Flags);
	}
	}

	if (!RestrRegs.empty()) {
	unsigned BufNum = addInlineAsmDiagBuffer(OS.str(), LocMD);
	auto &SrcMgr = DiagInfo->SrcMgr;
	SMLoc Loc = SMLoc::getFromPointer(
	SrcMgr.getMemoryBuffer(BufNum)->getBuffer().begin());

	std::string Msg = "inline asm clobber list contains reserved registers: ";
	for (auto I = RestrRegs.begin(), E = RestrRegs.end(); I != E; I++) {
	if(I != RestrRegs.begin())
	Msg += ", ";
	Msg += *I;
	}
	std::string Note = "Reserved registers on the clobber list may not be "
	"preserved across the asm statement, and clobbering them may "
	"lead to undefined behaviour.";
	SrcMgr.PrintMessage(Loc, SourceMgr::DK_Warning, Msg);
	SrcMgr.PrintMessage(Loc, SourceMgr::DK_Note, Note);
	}

	EmitInlineAsm(OS.str(), getSubtargetInfo(), TM.Options.MCOptions, LocMD,
	MI->getInlineAsmDialect());

	// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
	// enabled, so we use emitRawComment.
	OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
	}


	/// PrintSpecial - Print information related to the specified machine instr
	/// that is independent of the operand, and may be independent of the instr
	/// itself. This can be useful for portably encoding the comment character
	/// or other bits of target-specific knowledge into the asmstrings. The
	/// syntax used is ${:comment}. Targets can override this to add support
	/// for their own strange codes.
	///
	/// Codes handled here:
	///   - "private": the DataLayout's private global prefix.
	///   - "comment": the MCAsmInfo comment string.
	///   - "uid":     a counter that is bumped whenever \p MI or the current
	///                function number differs from the previous "uid" request.
	/// Any other code aborts through report_fatal_error.
	void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
	                              const char *Code) const {
	  if (!strcmp(Code, "private")) {
	    const DataLayout &DL = MF->getDataLayout();
	    OS << DL.getPrivateGlobalPrefix();
	  } else if (!strcmp(Code, "comment")) {
	    OS << MAI->getCommentString();
	  } else if (!strcmp(Code, "uid")) {
	    // Comparing the address of MI isn't sufficient, because machineinstrs may
	    // be allocated to the same address across functions.

	    // If this is a new LastFn instruction, bump the counter.
	    if (LastMI != MI || LastFn != getFunctionNumber()) {
	      ++Counter;
	      LastMI = MI;
	      LastFn = getFunctionNumber();
	    }
	    OS << Counter;
	  } else {
	    std::string msg;
	    raw_string_ostream Msg(msg);
	    Msg << "Unknown special formatter '" << Code
	        << "' for machine instr: " << *MI;
	    report_fatal_error(Msg.str());
	  }
	}

	/// Print the symbol of the global referenced by \p MO to \p OS, followed by
	/// the operand's offset as formatted by printOffset. The caller must have
	/// checked that \p MO is a global operand.
	void AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, raw_ostream &OS) {
	  assert(MO.isGlobal() && "caller should check MO.isGlobal");
	  auto *GlobalSym = getSymbol(MO.getGlobal());
	  GlobalSym->print(OS, MAI);
	  printOffset(MO.getOffset(), OS);
	}

	/// PrintAsmOperand - Print the specified operand of MI, an INLINEASM
	/// instruction, using the specified assembler variant. Targets should
	/// override this to format as appropriate for machine specific ExtraCodes
	/// or when the arch-independent handling would be too complex otherwise.
	bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
	const char *ExtraCode, raw_ostream &O) {
	// Does this asm operand have a single letter operand modifier?
	if (ExtraCode && ExtraCode[0]) {
	if (ExtraCode[1] != 0) return true; // Unknown modifier.

	// https://gcc.gnu.org/onlinedocs/gccint/Output-Template.html
	const MachineOperand &MO = MI->getOperand(OpNo);
	switch (ExtraCode[0]) {
	default:
	return true; // Unknown modifier.
	case 'a': // Print as memory address.
	if (MO.isReg()) {
	PrintAsmMemoryOperand(MI, OpNo, nullptr, O);
	return false;
	}
	LLVM_FALLTHROUGH; // GCC allows '%a' to behave like '%c' with immediates.
	case 'c': // Substitute immediate value without immediate syntax
	if (MO.isImm()) {
	O << MO.getImm();
	return false;
	}
	if (MO.isGlobal()) {
	PrintSymbolOperand(MO, O);
	return false;
	}
	return true;
	case 'n': // Negate the immediate constant.
	if (!MO.isImm())
	return true;
	O << -MO.getImm();
	return false;
	case 's': // The GCC deprecated s modifier
	if (!MO.isImm())
	return true;
	O << ((32 - MO.getImm()) & 31);
	return false;
	}
	}
	return true;
	}

	/// Generic fallback for printing a memory operand of an INLINEASM
	/// instruction. Nothing is written to \p O and true is always returned;
	/// the inline asm emitters in this file treat a true result as an error.
	bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
	                                       const char *ExtraCode, raw_ostream &O) {
	  // Target doesn't support this yet!
	  const bool Failed = true;
	  return Failed;
	}

	/// Hook with an empty body in this implementation: it emits nothing.
	/// NOTE(review): presumably an extension point for targets - confirm against
	/// the declaration in AsmPrinter.h.
	void AsmPrinter::emitInlineAsmStart() const {}

	/// Hook with an empty body in this implementation: both subtarget-info
	/// arguments are ignored and nothing is emitted.
	/// NOTE(review): presumably an extension point for targets - confirm against
	/// the declaration in AsmPrinter.h.
	void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
	const MCSubtargetInfo *EndInfo) const {}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/CodeGenPrepare.cpp (revision 351303)
	@@ -1,7350 +1,7351 @@
	//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass munges the code in the input function to better prepare it for
	// SelectionDAG-based code generation. This works around limitations in its
	// basic-block-at-a-time approach. It should eventually be removed.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/BlockFrequencyInfo.h"
	#include "llvm/Analysis/BranchProbabilityInfo.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/ProfileSummaryInfo.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/Config/llvm-config.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/MDBuilder.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/IR/ValueMap.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/BlockFrequency.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
	#include "llvm/Transforms/Utils/BypassSlowDivision.h"
	#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <limits>
	#include <memory>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "codegenprepare"

	// Statistic counters declared for this pass under DEBUG_TYPE. Each
	// description string is the human-readable label for that counter.
	STATISTIC(NumBlocksElim, "Number of blocks eliminated");
	STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
	STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
	STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
	                      "sunken Cmps");
	STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
	                       "of sunken Casts");
	STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
	                          "computations were sunk");
	STATISTIC(NumMemoryInstsPhiCreated,
	          "Number of phis created when address "
	          "computations were sunk to memory instructions");
	STATISTIC(NumMemoryInstsSelectCreated,
	          "Number of select created when address "
	          "computations were sunk to memory instructions");
	STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
	STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
	STATISTIC(NumAndsAdded,
	          "Number of and mask instructions added to form ext loads");
	STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
	STATISTIC(NumRetsDup, "Number of return instructions duplicated");
	STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
	STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
	STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");

	// Hidden command-line switches for individual CodeGenPrepare transforms.
	// The value passed to cl::init is the default for each option.

	// -disable-cgp-branch-opts (default: false).
	static cl::opt<bool> DisableBranchOpts(
	"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
	cl::desc("Disable branch optimizations in CodeGenPrepare"));

	// -disable-cgp-gc-opts (default: false).
	static cl::opt<bool>
	DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
	cl::desc("Disable GC optimizations in CodeGenPrepare"));

	// -disable-cgp-select2branch (default: false).
	static cl::opt<bool> DisableSelectToBranch(
	"disable-cgp-select2branch", cl::Hidden, cl::init(false),
	cl::desc("Disable select to branch conversion."));

	// -addr-sink-using-gep (default: true).
	static cl::opt<bool> AddrSinkUsingGEPs(
	"addr-sink-using-gep", cl::Hidden, cl::init(true),
	cl::desc("Address sinking in CGP using GEPs."));

	// -enable-andcmp-sinking (hidden, default: true).
	static cl::opt<bool> EnableAndCmpSinking(
	    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
	    cl::desc("Enable sinking and/cmp into branches."));

	// Further hidden command-line switches for CodeGenPrepare. The value passed
	// to cl::init is the default for each option.

	// Store(extractelement) handling: disable it, or stress-test it.
	static cl::opt<bool> DisableStoreExtract(
	"disable-cgp-store-extract", cl::Hidden, cl::init(false),
	cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));

	static cl::opt<bool> StressStoreExtract(
	"stress-cgp-store-extract", cl::Hidden, cl::init(false),
	cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));

	// Extension/load promotion: disable it, or stress-test it.
	static cl::opt<bool> DisableExtLdPromotion(
	"disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
	cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
	"CodeGenPrepare"));

	static cl::opt<bool> StressExtLdPromotion(
	"stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
	cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
	"optimization in CodeGenPrepare"));

	static cl::opt<bool> DisablePreheaderProtect(
	"disable-preheader-prot", cl::Hidden, cl::init(false),
	cl::desc("Disable protection against removing loop preheaders"));

	// Consulted by runOnFunction before it assigns the ".hot" / ".unlikely"
	// section prefixes.
	static cl::opt<bool> ProfileGuidedSectionPrefix(
	"profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
	cl::desc("Use profile info to add section prefix for hot/cold functions"));

	// Frequency-ratio threshold (default: 2) for skipping empty-block merging.
	static cl::opt<unsigned> FreqRatioToSkipMerge(
	"cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
	cl::desc("Skip merging empty blocks if (frequency of empty block) / "
	"(frequency of destination block) is greater than this ratio"));

	static cl::opt<bool> ForceSplitStore(
	"force-split-store", cl::Hidden, cl::init(false),
	cl::desc("Force store splitting no matter what the target query says."));

	// Consulted by runOnFunction before it calls mergeSExts.
	static cl::opt<bool>
	EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
	cl::desc("Enable merging of redundant sexts when one is dominating"
	" the other."), cl::init(true));

	static cl::opt<bool> DisableComplexAddrModes(
	"disable-complex-addr-modes", cl::Hidden, cl::init(false),
	cl::desc("Disables combining addressing modes with different parts "
	"in optimizeMemoryInst."));

	// Address-sinking controls: whether new phis/selects may be created and
	// which addressing-mode fields may be combined.
	static cl::opt<bool>
	AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
	cl::desc("Allow creation of Phis in Address sinking."));

	static cl::opt<bool>
	AddrSinkNewSelects("addr-sink-new-select", cl::Hidden, cl::init(true),
	cl::desc("Allow creation of selects in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseReg(
	"addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
	cl::desc("Allow combining of BaseReg field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseGV(
	"addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
	cl::desc("Allow combining of BaseGV field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineBaseOffs(
	"addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
	cl::desc("Allow combining of BaseOffs field in Address sinking."));

	static cl::opt<bool> AddrSinkCombineScaledReg(
	"addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
	cl::desc("Allow combining of ScaledReg field in Address sinking."));

	// -cgp-split-large-offset-gep (default: true).
	static cl::opt<bool>
	EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
	cl::init(true),
	cl::desc("Enable splitting large offset of GEP."));

	namespace {

	/// Which kind(s) of extension have been seen for a promoted instruction.
	/// Stored in the 2-bit integer field of TypeIsSExt.
	enum ExtType {
	ZeroExtension, // Zero extension has been seen.
	SignExtension, // Sign extension has been seen.
	BothExtension // This extension type is used if we saw sext after
	// ZeroExtension had been set, or if we saw zext after
	// SignExtension had been set. It makes the type
	// information of a promoted instruction invalid.
	};

	// Container aliases used throughout the pass.
	// Set of instruction pointers.
	using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
	// A Type pointer packed together with a 2-bit ExtType.
	using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
	// Maps an instruction to its packed (type, extension kind) record.
	using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
	// List of instructions; used as the mapped type of ValueToSExts.
	using SExts = SmallVector<Instruction *, 16>;
	// Maps a value to a list of instructions (see SExts).
	using ValueToSExts = DenseMap<Value *, SExts>;

	// Forward declaration; the definition is not visible at this point.
	class TypePromotionTransaction;

	/// Function pass that rewrites the IR of a function to prepare it for
	/// SelectionDAG-based code generation.
	///
	/// The target-dependent pointers (TM, SubtargetInfo, TLI, TRI) are filled in
	/// by runOnFunction only when a TargetPassConfig is available; every member
	/// is default-initialised so none is read uninitialised before that.
	class CodeGenPrepare : public FunctionPass {
	  const TargetMachine *TM = nullptr;
	  const TargetSubtargetInfo *SubtargetInfo = nullptr;
	  const TargetLowering *TLI = nullptr;
	  const TargetRegisterInfo *TRI = nullptr;
	  const TargetTransformInfo *TTI = nullptr;
	  const TargetLibraryInfo *TLInfo = nullptr;
	  const LoopInfo *LI = nullptr;
	  std::unique_ptr<BlockFrequencyInfo> BFI;
	  std::unique_ptr<BranchProbabilityInfo> BPI;

	  /// As we scan instructions optimizing them, this is the next instruction
	  /// to optimize. Transforms that can invalidate this should update it.
	  BasicBlock::iterator CurInstIterator;

	  /// Keeps track of non-local addresses that have been sunk into a block.
	  /// This allows us to avoid inserting duplicate code for blocks with
	  /// multiple load/stores of the same address. The usage of WeakTrackingVH
	  /// enables SunkAddrs to be treated as a cache whose entries can be
	  /// invalidated if a sunken address computation has been erased.
	  ValueMap<Value*, WeakTrackingVH> SunkAddrs;

	  /// Keeps track of all instructions inserted for the current function.
	  SetOfInstrs InsertedInsts;

	  /// Keeps track of the type of the related instruction before their
	  /// promotion for the current function.
	  InstrToOrigTy PromotedInsts;

	  /// Keep track of instructions removed during promotion.
	  SetOfInstrs RemovedInsts;

	  /// Keep track of sext chains based on their initial value.
	  DenseMap<Value *, Instruction *> SeenChainsForSExt;

	  /// Keep track of GEPs accessing the same data structures such as structs or
	  /// arrays that are candidates to be split later because of their large
	  /// size.
	  MapVector<
	      AssertingVH<Value>,
	      SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
	      LargeOffsetGEPMap;

	  /// Keep track of new GEP base after splitting the GEPs having large offset.
	  SmallSet<AssertingVH<Value>, 2> NewGEPBases;

	  /// Map serial numbers to Large offset GEPs.
	  DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;

	  /// Keep track of SExt promoted.
	  ValueToSExts ValToSExtendedUses;

	  /// True if optimizing for size.
	  bool OptSize = false;

	  /// DataLayout for the Function being processed.
	  const DataLayout *DL = nullptr;

	  /// Building the dominator tree can be expensive, so we only build it
	  /// lazily and update it when required.
	  std::unique_ptr<DominatorTree> DT;

	public:
	  static char ID; // Pass identification, replacement for typeid

	  CodeGenPrepare() : FunctionPass(ID) {
	    initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
	  }

	  bool runOnFunction(Function &F) override;

	  StringRef getPassName() const override { return "CodeGen Prepare"; }

	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    // FIXME: When we can selectively preserve passes, preserve the domtree.
	    AU.addRequired<ProfileSummaryInfoWrapperPass>();
	    AU.addRequired<TargetLibraryInfoWrapperPass>();
	    AU.addRequired<TargetTransformInfoWrapperPass>();
	    AU.addRequired<LoopInfoWrapperPass>();
	  }

	private:
	  /// Run \p f, then restart the scan of \p BB (and drop the SunkAddrs cache)
	  /// if the instruction CurInstIterator pointed at was replaced or deleted
	  /// while \p f ran.
	  template <typename F>
	  void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
	    // Substituting can cause recursive simplifications, which can invalidate
	    // our iterator. Use a WeakTrackingVH to hold onto it in case this
	    // happens.
	    Value *CurValue = &*CurInstIterator;
	    WeakTrackingVH IterHandle(CurValue);

	    f();

	    // If the iterator instruction was recursively deleted, start over at the
	    // start of the block.
	    if (IterHandle != CurValue) {
	      CurInstIterator = BB->begin();
	      SunkAddrs.clear();
	    }
	  }

	  // Get the DominatorTree, building if necessary.
	  DominatorTree &getDT(Function &F) {
	    if (!DT)
	      DT = llvm::make_unique<DominatorTree>(F);
	    return *DT;
	  }

	  bool eliminateFallThrough(Function &F);
	  bool eliminateMostlyEmptyBlocks(Function &F);
	  BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
	  bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
	  void eliminateMostlyEmptyBlock(BasicBlock *BB);
	  bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
	                                     bool isPreheader);
	  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
	  bool optimizeInst(Instruction *I, bool &ModifiedDT);
	  bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
	                          Type *AccessTy, unsigned AddrSpace);
	  bool optimizeInlineAsmInst(CallInst *CS);
	  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
	  bool optimizeExt(Instruction *&I);
	  bool optimizeExtUses(Instruction *I);
	  bool optimizeLoadExt(LoadInst *Load);
	  bool optimizeShiftInst(BinaryOperator *BO);
	  bool optimizeSelectInst(SelectInst *SI);
	  bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
	  bool optimizeSwitchInst(SwitchInst *SI);
	  bool optimizeExtractElementInst(Instruction *Inst);
	  bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT);
	  bool placeDbgValues(Function &F);
	  bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
	                    LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
	  bool tryToPromoteExts(TypePromotionTransaction &TPT,
	                        const SmallVectorImpl<Instruction *> &Exts,
	                        SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
	                        unsigned CreatedInstsCost = 0);
	  bool mergeSExts(Function &F);
	  bool splitLargeGEPOffsets();
	  bool performAddressTypePromotion(
	      Instruction *&Inst,
	      bool AllowPromotionWithoutCommonHeader,
	      bool HasPromoted, TypePromotionTransaction &TPT,
	      SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
	  bool splitBranchCondition(Function &F, bool &ModifiedDT);
	  bool simplifyOffsetableRelocate(Instruction &I);

	  bool tryToSinkFreeOperands(Instruction *I);
	  bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
	                                   Intrinsic::ID IID);
	  bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
	  bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
	  bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
	};

	} // end anonymous namespace

	// Definition of the static pass-identification member declared in the class.
	char CodeGenPrepare::ID = 0;

	// Pass registration under the DEBUG_TYPE name, declaring a dependency on
	// ProfileSummaryInfoWrapperPass.
	INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
	"Optimize for code generation", false, false)
	INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
	INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
	"Optimize for code generation", false, false)

	/// Factory function: heap-allocates a CodeGenPrepare pass and returns it as
	/// a raw FunctionPass pointer.
	FunctionPass *llvm::createCodeGenPreparePass() {
	  FunctionPass *NewPass = new CodeGenPrepare();
	  return NewPass;
	}

	/// Pass entry point.
	///
	/// Gathers the analyses for \p F and then runs, in order: optional slow
	/// division bypassing, elimination of mostly-empty blocks, branch-condition
	/// splitting, indirectbr critical-edge splitting, per-block optimisation
	/// repeated until nothing changes, terminator folding with dead-block
	/// removal, statepoint relocate simplification and finally debug-value
	/// placement.
	///
	/// \returns true if any of these steps reported a change.
	bool CodeGenPrepare::runOnFunction(Function &F) {
	  if (skipFunction(F))
	    return false;

	  DL = &F.getParent()->getDataLayout();

	  bool EverMadeChange = false;
	  // Clear per function information.
	  InsertedInsts.clear();
	  PromotedInsts.clear();

	  // Target information is only available when a TargetPassConfig exists.
	  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
	    TM = &TPC->getTM<TargetMachine>();
	    SubtargetInfo = TM->getSubtargetImpl(F);
	    TLI = SubtargetInfo->getTargetLowering();
	    TRI = SubtargetInfo->getRegisterInfo();
	  }
	  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
	  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	  BPI.reset(new BranchProbabilityInfo(F, *LI));
	  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
	  OptSize = F.hasOptSize();

	  ProfileSummaryInfo *PSI =
	      &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
	  if (ProfileGuidedSectionPrefix) {
	    if (PSI->isFunctionHotInCallGraph(&F, *BFI))
	      F.setSectionPrefix(".hot");
	    else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
	      F.setSectionPrefix(".unlikely");
	  }

	  /// This optimization identifies DIV instructions that can be
	  /// profitably bypassed and carried out with a shorter, faster divide.
	  if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
	      TLI->isSlowDivBypassed()) {
	    const DenseMap<unsigned int, unsigned int> &BypassWidths =
	        TLI->getBypassSlowDivWidths();
	    BasicBlock* BB = &*F.begin();
	    while (BB != nullptr) {
	      // bypassSlowDivision may create new BBs, but we don't want to reapply the
	      // optimization to those blocks.
	      BasicBlock* Next = BB->getNextNode();
	      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
	      BB = Next;
	    }
	  }

	  // Eliminate blocks that contain only PHI nodes and an
	  // unconditional branch.
	  EverMadeChange |= eliminateMostlyEmptyBlocks(F);

	  bool ModifiedDT = false;
	  if (!DisableBranchOpts)
	    EverMadeChange |= splitBranchCondition(F, ModifiedDT);

	  // Split some critical edges where one of the sources is an indirect branch,
	  // to help generate sane code for PHIs involving such edges.
	  EverMadeChange |= SplitIndirectBrCriticalEdges(F);

	  bool MadeChange = true;
	  while (MadeChange) {
	    MadeChange = false;
	    DT.reset();
	    for (Function::iterator I = F.begin(); I != F.end(); ) {
	      // Advance the iterator before optimising the block it referred to.
	      BasicBlock *BB = &*I++;
	      bool ModifiedDTOnIteration = false;
	      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);

	      // Restart BB iteration if the dominator tree of the Function was changed
	      if (ModifiedDTOnIteration)
	        break;
	    }
	    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
	      MadeChange |= mergeSExts(F);
	    if (!LargeOffsetGEPMap.empty())
	      MadeChange |= splitLargeGEPOffsets();

	    // Really free removed instructions during promotion.
	    for (Instruction *I : RemovedInsts)
	      I->deleteValue();

	    EverMadeChange |= MadeChange;
	    SeenChainsForSExt.clear();
	    ValToSExtendedUses.clear();
	    RemovedInsts.clear();
	    LargeOffsetGEPMap.clear();
	    LargeOffsetGEPID.clear();
	  }

	  SunkAddrs.clear();

	  if (!DisableBranchOpts) {
	    MadeChange = false;
	    // Use a set vector to get deterministic iteration order. The order the
	    // blocks are removed may affect whether or not PHI nodes in successors
	    // are removed.
	    SmallSetVector<BasicBlock*, 8> WorkList;
	    for (BasicBlock &BB : F) {
	      // Snapshot the successors before the terminator is folded.
	      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
	      MadeChange |= ConstantFoldTerminator(&BB, true);
	      if (!MadeChange) continue;

	      // Queue every former successor that no longer has a predecessor.
	      for (SmallVectorImpl<BasicBlock*>::iterator
	             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
	        if (pred_begin(*II) == pred_end(*II))
	          WorkList.insert(*II);
	    }

	    // Delete the dead blocks and any of their dead successors.
	    MadeChange |= !WorkList.empty();
	    while (!WorkList.empty()) {
	      BasicBlock *BB = WorkList.pop_back_val();
	      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));

	      DeleteDeadBlock(BB);

	      for (SmallVectorImpl<BasicBlock*>::iterator
	             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
	        if (pred_begin(*II) == pred_end(*II))
	          WorkList.insert(*II);
	    }

	    // Merge pairs of basic blocks with unconditional branches, connected by
	    // a single edge.
	    if (EverMadeChange || MadeChange)
	      MadeChange |= eliminateFallThrough(F);

	    EverMadeChange |= MadeChange;
	  }

	  if (!DisableGCOpts) {
	    // Collect the statepoints first, then simplify them in a second loop.
	    SmallVector<Instruction *, 2> Statepoints;
	    for (BasicBlock &BB : F)
	      for (Instruction &I : BB)
	        if (isStatepoint(I))
	          Statepoints.push_back(&I);
	    for (auto &I : Statepoints)
	      EverMadeChange |= simplifyOffsetableRelocate(*I);
	  }

	  // Do this last to clean up use-before-def scenarios introduced by other
	  // preparatory transforms.
	  EverMadeChange |= placeDbgValues(F);

	  return EverMadeChange;
	}

	/// Merge basic blocks which are connected by a single edge, where one of the
	/// basic blocks has a single successor pointing to the other basic block,
	/// which has a single predecessor.
	///
	/// The entry block is never considered as the block to be merged away.
	/// \returns true if at least one block was merged into its predecessor.
	bool CodeGenPrepare::eliminateFallThrough(Function &F) {
	  bool Changed = false;
	  // Scan all of the blocks in the function, except for the entry block.
	  // Merging deletes blocks, so walk a snapshot of weak handles instead of
	  // the function's own block list; the null check below skips handles that
	  // no longer refer to a block.
	  SmallVector<WeakTrackingVH, 16> Blocks;
	  for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
	    Blocks.push_back(&Block);

	  for (auto &Block : Blocks) {
	    auto *BB = cast_or_null<BasicBlock>(Block);
	    if (!BB)
	      continue;
	    // If the destination block has a single pred, then this is a trivial
	    // edge, just collapse it.
	    BasicBlock *SinglePred = BB->getSinglePredecessor();

	    // Don't merge a self-loop, and don't merge if BB's address is taken.
	    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken())
	      continue;

	    // Only an unconditional branch in the predecessor makes this a pure
	    // fall-through edge.
	    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
	    if (Term && !Term->isConditional()) {
	      Changed = true;
	      LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");

	      // Merge BB into SinglePred and delete it.
	      MergeBlockIntoPredecessor(BB);
	    }
	  }
	  return Changed;
	}

	/// Find a destination block from BB if BB is mergeable empty block.
	///
	/// BB qualifies when it ends in an unconditional branch, the instruction
	/// preceding that branch (ignoring debug intrinsics) is a PHI node or does
	/// not exist, the branch does not target BB itself, and canMergeBlocks()
	/// accepts the pair.
	///
	/// \returns the branch target on success, nullptr otherwise.
	BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
	  // If this block doesn't end with an uncond branch, ignore it.
	  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
	  if (!BI || !BI->isUnconditional())
	    return nullptr;

	  // If the instruction before the branch (skipping debug info) isn't a phi
	  // node, then other stuff is happening here.
	  BasicBlock::iterator BBI = BI->getIterator();
	  if (BBI != BB->begin()) {
	    --BBI;
	    while (isa<DbgInfoIntrinsic>(BBI)) {
	      if (BBI == BB->begin())
	        break;
	      --BBI;
	    }
	    // BBI is either the first non-debug instruction found while walking
	    // backwards, or a debug intrinsic sitting at the very top of the block.
	    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
	      return nullptr;
	  }

	  // Do not break infinite loops.
	  BasicBlock *DestBB = BI->getSuccessor(0);
	  if (DestBB == BB)
	    return nullptr;

	  if (!canMergeBlocks(BB, DestBB))
	    DestBB = nullptr;

	  return DestBB;
	}

	/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
	/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
	/// edges in ways that are non-optimal for isel. Start by eliminating these
	/// blocks so we can split them the way we want them.
	///
	/// \returns true if at least one block was eliminated.
	bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
	  // Collect the preheader of every loop, including nested loops, using an
	  // explicit worklist seeded with the top-level loops.
	  SmallPtrSet<BasicBlock *, 16> Preheaders;
	  SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
	  while (!LoopList.empty()) {
	    Loop *L = LoopList.pop_back_val();
	    LoopList.insert(LoopList.end(), L->begin(), L->end());
	    if (BasicBlock *Preheader = L->getLoopPreheader())
	      Preheaders.insert(Preheader);
	  }

	  bool MadeChange = false;
	  // Copy blocks into a temporary array to avoid iterator invalidation issues
	  // as we remove them.
	  // Note that this intentionally skips the entry block.
	  SmallVector<WeakTrackingVH, 16> Blocks;
	  for (auto &Block : llvm::make_range(std::next(F.begin()), F.end()))
	    Blocks.push_back(&Block);

	  for (auto &Block : Blocks) {
	    // Skip handles that no longer refer to a block.
	    BasicBlock *BB = cast_or_null<BasicBlock>(Block);
	    if (!BB)
	      continue;
	    BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
	    if (!DestBB ||
	        !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
	      continue;

	    eliminateMostlyEmptyBlock(BB);
	    MadeChange = true;
	  }
	  return MadeChange;
	}

	/// Decide whether folding the mostly-empty block \p BB into \p DestBB is
	/// worthwhile.
	///
	/// \param BB          the mostly-empty block that would be removed.
	/// \param DestBB      the successor that BB would be folded into.
	/// \param isPreheader true if BB is a loop preheader.
	/// \returns true if the merge should go ahead, false to keep BB.
	bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
	                                                   BasicBlock *DestBB,
	                                                   bool isPreheader) {
	  // Do not delete loop preheaders if doing so would create a critical edge.
	  // Loop preheaders can be good locations to spill registers. If the
	  // preheader is deleted and we create a critical edge, registers may be
	  // spilled in the loop body instead.
	  if (!DisablePreheaderProtect && isPreheader &&
	      !(BB->getSinglePredecessor() &&
	        BB->getSinglePredecessor()->getSingleSuccessor()))
	    return false;

	  // Skip merging if the block's successor is also a successor to any callbr
	  // that leads to this block.
	  // FIXME: Is this really needed? Is this a correctness issue?
	  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
	    if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
	      for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
	        if (DestBB == CBI->getSuccessor(i))
	          return false;
	  }

	  // Try to skip merging if the unique predecessor of BB is terminated by a
	  // switch or indirect branch instruction, and BB is used as an incoming block
	  // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
	  // add COPY instructions in the predecessor of BB instead of BB (if it is not
	  // merged). Note that the critical edge created by merging such blocks won't
	  // be split in MachineSink because the jump table is not analyzable. By
	  // keeping such empty block (BB), ISel will place COPY instructions in BB,
	  // not in the predecessor of BB.
	  BasicBlock *Pred = BB->getUniquePredecessor();
	  if (!Pred ||
	      !(isa<SwitchInst>(Pred->getTerminator()) ||
	        isa<IndirectBrInst>(Pred->getTerminator())))
	    return true;

	  // BB holds something other than PHIs/debug info before its terminator.
	  if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
	    return true;

	  // We use a simple cost heuristic which determine skipping merging is
	  // profitable if the cost of skipping merging is less than the cost of
	  // merging : Cost(skipping merging) < Cost(merging BB), where the
	  // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
	  // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
	  // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
	  //   Freq(Pred) / Freq(BB) > 2.
	  // Note that if there are multiple empty blocks sharing the same incoming
	  // value for the PHIs in the DestBB, we consider them together. In such
	  // case, Cost(merging BB) will be the sum of their frequencies.

	  // Without PHIs in DestBB there is nothing to weigh.
	  if (!isa<PHINode>(DestBB->begin()))
	    return true;

	  SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;

	  // Find all other incoming blocks from which incoming values of all PHIs in
	  // DestBB are the same as the ones from BB.
	  for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
	       ++PI) {
	    BasicBlock *DestBBPred = *PI;
	    if (DestBBPred == BB)
	      continue;

	    if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
	          return DestPN.getIncomingValueForBlock(BB) ==
	                 DestPN.getIncomingValueForBlock(DestBBPred);
	        }))
	      SameIncomingValueBBs.insert(DestBBPred);
	  }

	  // See if all BB's incoming values are same as the value from Pred. In this
	  // case, no reason to skip merging because COPYs are expected to be place in
	  // Pred already.
	  if (SameIncomingValueBBs.count(Pred))
	    return true;

	  BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
	  BlockFrequency BBFreq = BFI->getBlockFreq(BB);

	  // Add in the frequency of sibling empty blocks that hang off the same
	  // predecessor, feed the same values to DestBB, and are mergeable into it.
	  for (auto SameValueBB : SameIncomingValueBBs)
	    if (SameValueBB->getUniquePredecessor() == Pred &&
	        DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
	      BBFreq += BFI->getBlockFreq(SameValueBB);

	  // Merge unless Pred is hotter than the combined empty blocks by more than
	  // the configured ratio.
	  return PredFreq.getFrequency() <=
	         BBFreq.getFrequency() * FreqRatioToSkipMerge;
	}

	/// Return true if we can merge BB into DestBB if there is a single
	/// unconditional branch between them, and BB contains no other non-phi
	/// instructions.
	///
	/// Two conditions are checked: every user of a PHI in BB must be a PHI in
	/// DestBB that receives the value along the BB edge, and for any block that
	/// is a predecessor of both BB and DestBB the PHIs in DestBB must receive
	/// the same value along either path.
	bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
	                                    const BasicBlock *DestBB) const {
	  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
	  // the successor. If there are more complex condition (e.g. preheaders),
	  // don't mess around with them.
	  for (const PHINode &PN : BB->phis()) {
	    for (const User *U : PN.users()) {
	      const Instruction *UI = cast<Instruction>(U);
	      if (UI->getParent() != DestBB || !isa<PHINode>(UI))
	        return false;
	      // The user is now known to be a PHI node inside DestBB. Check its
	      // incoming values: if a value defined in BB arrives from a block
	      // other than BB then this is a complex condition (e.g. preheaders)
	      // we want to avoid here.
	      const PHINode *UPN = cast<PHINode>(UI);
	      for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
	        Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
	        if (Insn && Insn->getParent() == BB &&
	            Insn->getParent() != UPN->getIncomingBlock(I))
	          return false;
	      }
	    }
	  }

	  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
	  // and DestBB may have conflicting incoming values for the block. If so, we
	  // can't merge the block.
	  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
	  if (!DestBBPN)
	    return true; // no conflict.

	  // Collect the preds of BB.
	  SmallPtrSet<const BasicBlock *, 16> BBPreds;
	  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
	    // It is faster to get preds from a PHI than with pred_iterator.
	    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
	      BBPreds.insert(BBPN->getIncomingBlock(i));
	  } else {
	    BBPreds.insert(pred_begin(BB), pred_end(BB));
	  }

	  // Walk the preds of DestBB.
	  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
	    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
	    if (BBPreds.count(Pred)) { // Common predecessor?
	      for (const PHINode &PN : DestBB->phis()) {
	        const Value *V1 = PN.getIncomingValueForBlock(Pred);
	        const Value *V2 = PN.getIncomingValueForBlock(BB);

	        // If V2 is a phi node in BB, look up what the mapped value will be.
	        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
	          if (V2PN->getParent() == BB)
	            V2 = V2PN->getIncomingValueForBlock(Pred);

	        // If there is a conflict, bail out.
	        if (V1 != V2)
	          return false;
	      }
	    }
	  }

	  return true;
	}

	/// Eliminate a basic block that has only phi's and an unconditional branch in
	/// it.
	///
	/// If BB is the sole predecessor of its successor, the successor is merged
	/// into BB. Otherwise the successor's PHIs are rewired to BB's predecessors
	/// and BB itself is erased.
	void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
	  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
	  BasicBlock *DestBB = BI->getSuccessor(0);

	  // Print the block contents (not their addresses), matching the AFTER dumps.
	  LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
	                    << *BB << *DestBB);

	  // If the destination block has a single pred, then this is a trivial edge,
	  // just collapse it.
	  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
	    if (SinglePred != DestBB) {
	      assert(SinglePred == BB &&
	             "Single predecessor not the same as predecessor");
	      // Merge DestBB into SinglePred/BB and delete it.
	      MergeBlockIntoPredecessor(DestBB);
	      // Note: BB(=SinglePred) will not be deleted on this path.
	      // DestBB(=its single successor) is the one that was deleted.
	      LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
	      return;
	    }
	  }

	  // Otherwise, DestBB has other predecessors besides BB. Update the PHIs in
	  // DestBB to handle the new incoming edges it is about to have.
	  for (PHINode &PN : DestBB->phis()) {
	    // Remove the incoming value for BB, and remember it.
	    Value *InVal = PN.removeIncomingValue(BB, false);

	    // Two options: either the InVal is a phi node defined in BB or it is some
	    // value that dominates BB.
	    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
	    if (InValPhi && InValPhi->getParent() == BB) {
	      // Add all of the input values of the input PHI as inputs of this phi.
	      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
	        PN.addIncoming(InValPhi->getIncomingValue(i),
	                       InValPhi->getIncomingBlock(i));
	    } else {
	      // Otherwise, add one instance of the dominating value for each edge that
	      // we will be adding.
	      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
	        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
	          PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
	      } else {
	        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
	          PN.addIncoming(InVal, *PI);
	      }
	    }
	  }

	  // The PHIs are now updated, change everything that refers to BB to use
	  // DestBB and remove BB.
	  BB->replaceAllUsesWith(DestBB);
	  BB->eraseFromParent();
	  ++NumBlocksElim;

	  LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
	}

	// Computes a map of base pointer relocation instructions to corresponding
	// derived pointer relocation instructions given a vector of all relocate calls.
	//
	// AllRelocateCalls: every relocate call to consider.
	// RelocateInstMap:  output; for each base relocate (base index == derived
	//                   index) that has derived relocates, the list of those
	//                   derived relocates. Derived relocates whose base has no
	//                   relocate of its own are left out.
	static void computeBaseDerivedRelocateMap(
	    const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
	    DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
	        &RelocateInstMap) {
	  // Collect information in two maps: one primarily for locating the base object
	  // while filling the second map; the second map is the final structure holding
	  // a mapping between Base and corresponding Derived relocate calls
	  DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
	  for (auto *ThisRelocate : AllRelocateCalls) {
	    auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
	                            ThisRelocate->getDerivedPtrIndex());
	    RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
	  }
	  for (auto &Item : RelocateIdxMap) {
	    std::pair<unsigned, unsigned> Key = Item.first;
	    if (Key.first == Key.second)
	      // Base relocation: nothing to insert
	      continue;

	    GCRelocateInst *I = Item.second;
	    auto BaseKey = std::make_pair(Key.first, Key.first);

	    // We're iterating over RelocateIdxMap so we cannot modify it.
	    auto MaybeBase = RelocateIdxMap.find(BaseKey);
	    if (MaybeBase == RelocateIdxMap.end())
	      // TODO: We might want to insert a new base object relocate and gep off
	      // that, if there are enough derived object relocates.
	      continue;

	    RelocateInstMap[MaybeBase->second].push_back(I);
	  }
	}

	// Accepts a GEP and extracts the operands into a vector provided they're all
	// small integer constants.
	//
	// Returns true and appends every index operand of GEP (operands 1..N-1) to
	// OffsetV when each one is a ConstantInt whose unsigned value is at most 20.
	// Returns false and leaves OffsetV untouched otherwise.
	static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
	                                          SmallVectorImpl<Value *> &OffsetV) {
	  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
	    // Only accept small constant integer operands. Compare on the APInt so
	    // that constants wider than 64 bits are rejected instead of tripping the
	    // width assertion in getZExtValue().
	    auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
	    if (!Op || Op->getValue().ugt(20))
	      return false;
	  }

	  // All indices validated; only now touch the output vector.
	  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
	    OffsetV.push_back(GEP->getOperand(i));
	  return true;
	}

	// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
	// replace, computes a replacement, and puts it into effect.
	//
	// Each target that lives in the same basic block as RelocatedBase and whose
	// derived pointer is a GEP off the base with small constant indices is
	// replaced by an equivalent GEP off RelocatedBase. Returns true if any target
	// was replaced.
	static bool
	simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
	                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
	  bool MadeChange = false;
	  // We must ensure the relocation of derived pointer is defined after
	  // relocation of base pointer. If we find a relocation corresponding to base
	  // defined earlier than relocation of base then we move relocation of base
	  // right before found relocation. We consider only relocation in the same
	  // basic block as relocation of base. Relocations from other basic block will
	  // be skipped by optimization and we do not care about them.
	  for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
	       &*R != RelocatedBase; ++R)
	    if (auto *RI = dyn_cast<GCRelocateInst>(R))
	      if (RI->getStatepoint() == RelocatedBase->getStatepoint())
	        if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
	          RelocatedBase->moveBefore(RI);
	          break;
	        }

	  for (GCRelocateInst *ToReplace : Targets) {
	    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
	           "Not relocating a derived object of the original base object");
	    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
	      // A duplicate relocate call. TODO: coalesce duplicates.
	      continue;
	    }

	    if (RelocatedBase->getParent() != ToReplace->getParent()) {
	      // Base and derived relocates are in different basic blocks.
	      // In this case transform is only valid when base dominates derived
	      // relocate. However it would be too expensive to check dominance
	      // for each such relocate, so we skip the whole transformation.
	      continue;
	    }

	    // The derived pointer must be a GEP directly off the base pointer.
	    Value *Base = ToReplace->getBasePtr();
	    auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
	    if (!Derived || Derived->getPointerOperand() != Base)
	      continue;

	    SmallVector<Value *, 2> OffsetV;
	    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
	      continue;

	    // Create a Builder and replace the target callsite with a gep
	    assert(RelocatedBase->getNextNode() &&
	           "Should always have one since it's not a terminator");

	    // Insert after RelocatedBase
	    IRBuilder<> Builder(RelocatedBase->getNextNode());
	    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

	    // If gc_relocate does not match the actual type, cast it to the right type.
	    // In theory, there must be a bitcast after gc_relocate if the type does not
	    // match, and we should reuse it to get the derived pointer. But it could be
	    // cases like this:
	    // bb1:
	    //  ...
	    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
	    //  br label %merge
	    //
	    // bb2:
	    //  ...
	    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
	    //  br label %merge
	    //
	    // merge:
	    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
	    //  %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
	    //
	    // In this case, we can not find the bitcast any more. So we insert a new
	    // bitcast whether or not one already exists. In this way, we can handle
	    // all cases, and the extra bitcast should be optimized away in later
	    // passes.
	    Value *ActualRelocatedBase = RelocatedBase;
	    if (RelocatedBase->getType() != Base->getType()) {
	      ActualRelocatedBase =
	          Builder.CreateBitCast(RelocatedBase, Base->getType());
	    }
	    Value *Replacement = Builder.CreateGEP(
	        Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
	    Replacement->takeName(ToReplace);
	    // If the newly generated derived pointer's type does not match the original
	    // derived pointer's type, cast the new derived pointer to match it. Same
	    // reasoning as above.
	    Value *ActualReplacement = Replacement;
	    if (Replacement->getType() != ToReplace->getType()) {
	      ActualReplacement =
	          Builder.CreateBitCast(Replacement, ToReplace->getType());
	    }
	    ToReplace->replaceAllUsesWith(ActualReplacement);
	    ToReplace->eraseFromParent();

	    MadeChange = true;
	  }
	  return MadeChange;
	}

	// Turns this:
	//
	// %base = ...
	// %ptr = gep %base + 15
	// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
	// %base' = relocate(%tok, i32 4, i32 4)
	// %ptr' = relocate(%tok, i32 4, i32 5)
	// %val = load %ptr'
	//
	// into this:
	//
	// %base = ...
	// %ptr = gep %base + 15
	// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
	// %base' = gc.relocate(%tok, i32 4, i32 4)
	// %ptr' = gep %base' + 15
	// %val = load %ptr'
	// I is the statepoint whose relocate users are examined. Returns true if any
	// derived relocate was rewritten as a GEP off its relocated base.
	bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
	  bool MadeChange = false;
	  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;

	  for (auto *U : I.users())
	    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
	      // Collect all the relocate calls associated with a statepoint
	      AllRelocateCalls.push_back(Relocate);

	  // We need at least one base pointer relocation + one derived pointer
	  // relocation to mangle
	  if (AllRelocateCalls.size() < 2)
	    return false;

	  // RelocateInstMap is a mapping from the base relocate instruction to the
	  // corresponding derived relocate instructions
	  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
	  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
	  if (RelocateInstMap.empty())
	    return false;

	  for (auto &Item : RelocateInstMap)
	    // Item.first is the RelocatedBase to offset against
	    // Item.second is the vector of Targets to replace
	    // Accumulate with |= so that a later base with nothing to rewrite does
	    // not hide a change made for an earlier base.
	    MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
	  return MadeChange;
	}

	/// Sink the specified cast instruction into its user blocks.
	///
	/// For every use of \p CI outside CI's own block, a copy of the cast is
	/// created at the first insertion point of the using block (once per block)
	/// and the use is redirected to it. If CI ends up with no uses it is erased.
	///
	/// \returns true if any use was rewritten or CI was erased.
	static bool SinkCast(CastInst *CI) {
	  BasicBlock *DefBB = CI->getParent();

	  /// InsertedCasts - Only insert a cast in each block once.
	  DenseMap<BasicBlock *, CastInst *> InsertedCasts;

	  bool MadeChange = false;
	  for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
	       UI != E; ) {
	    Use &TheUse = UI.getUse();
	    Instruction *User = cast<Instruction>(*UI);

	    // Figure out which BB this cast is used in. For PHI's this is the
	    // appropriate predecessor block.
	    BasicBlock *UserBB = User->getParent();
	    if (PHINode *PN = dyn_cast<PHINode>(User)) {
	      UserBB = PN->getIncomingBlock(TheUse);
	    }

	    // Preincrement use iterator so we don't invalidate it.
	    ++UI;

	    // The first insertion point of a block containing an EH pad is after the
	    // pad. If the pad is the user, we cannot sink the cast past the pad.
	    if (User->isEHPad())
	      continue;

	    // If the block selected to receive the cast is an EH pad that does not
	    // allow non-PHI instructions before the terminator, we can't sink the
	    // cast.
	    if (UserBB->getTerminator()->isEHPad())
	      continue;

	    // If this user is in the same block as the cast, don't change the cast.
	    if (UserBB == DefBB)
	      continue;

	    // If we have already inserted a cast into this block, use it.
	    CastInst *&InsertedCast = InsertedCasts[UserBB];

	    if (!InsertedCast) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
	                                      CI->getType(), "", &*InsertPt);
	      InsertedCast->setDebugLoc(CI->getDebugLoc());
	    }

	    // Replace a use of the cast with a use of the new cast.
	    TheUse = InsertedCast;
	    MadeChange = true;
	    ++NumCastUses;
	  }

	  // If we removed all uses, nuke the cast.
	  if (CI->use_empty()) {
	    salvageDebugInfo(*CI);
	    CI->eraseFromParent();
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	/// Sink \p CI into the blocks of its users when the cast is a no-op copy
	/// (e.g. pointer-to-pointer, or i32->i8 on PPC), so that fewer virtual
	/// registers have to be created and coalesced.
	///
	/// Address-space casts are only considered when the target reports them as
	/// free. Everything else qualifies when source and destination map to the
	/// same value type once integer promotion is taken into account.
	///
	/// Return true if any changes are made.
	static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
	                                       const DataLayout &DL) {
	  // "Cheap" (or nop) address-space casts are the only ones worth sinking.
	  // This is weaker than requiring a nop cast but helps on some platforms.
	  if (auto *AddrCast = dyn_cast<AddrSpaceCastInst>(CI))
	    if (!TLI.isFreeAddrSpaceCast(AddrCast->getSrcAddressSpace(),
	                                 AddrCast->getDestAddressSpace()))
	      return false;

	  EVT FromVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
	  EVT ToVT = TLI.getValueType(DL, CI->getType());

	  // An fp<->int conversion is never a noop.
	  if (FromVT.isInteger() != ToVT.isInteger())
	    return false;

	  // An extension will be a zero or sign extension, which isn't a noop.
	  if (FromVT.bitsLT(ToVT))
	    return false;

	  // Values that will be promoted are compared by the type they are promoted
	  // to. This lets truncates on PPC count as noop copies when they are.
	  auto &Ctx = CI->getContext();
	  auto PromotedType = [&](EVT VT) -> EVT {
	    if (TLI.getTypeAction(Ctx, VT) == TargetLowering::TypePromoteInteger)
	      return TLI.getTypeToTransformTo(Ctx, VT);
	    return VT;
	  };
	  FromVT = PromotedType(FromVT);
	  ToVT = PromotedType(ToVT);

	  // Identical types after promotion mean this is a noop copy: sink it.
	  return FromVT == ToVT && SinkCast(CI);
	}

	/// Replace the pair (\p BO, \p Cmp) with one call to the overflow intrinsic
	/// \p IID. Users of BO get the intrinsic's math result and users of Cmp get
	/// its overflow bit; both original instructions are erased.
	///
	/// \returns false, changing nothing, when BO and Cmp are in different blocks;
	/// true after the replacement otherwise.
	bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
	                                                 CmpInst *Cmp,
	                                                 Intrinsic::ID IID) {
	  if (BO->getParent() != Cmp->getParent()) {
	    // We used to use a dominator tree here to allow multi-block optimization.
	    // But that was problematic because:
	    // 1. It could cause a perf regression by hoisting the math op into the
	    //    critical path.
	    // 2. It could cause a perf regression by creating a value that was live
	    //    across multiple blocks and increasing register pressure.
	    // 3. Use of a dominator tree could cause large compile-time regression.
	    //    This is because we recompute the DT on every change in the main CGP
	    //    run-loop. The recomputing is probably unnecessary in many cases, so if
	    //    that was fixed, using a DT here would be ok.
	    return false;
	  }

	  // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
	  Value *Arg0 = BO->getOperand(0);
	  Value *Arg1 = BO->getOperand(1);
	  if (BO->getOpcode() == Instruction::Add &&
	      IID == Intrinsic::usub_with_overflow) {
	    assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
	    Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
	  }

	  // Insert at the first instruction of the pair.
	  Instruction *InsertPt = nullptr;
	  for (Instruction &Iter : *Cmp->getParent()) {
	    if (&Iter == BO || &Iter == Cmp) {
	      InsertPt = &Iter;
	      break;
	    }
	  }
	  assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");

	  IRBuilder<> Builder(InsertPt);
	  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
	  // Element 0 of the intrinsic result replaces BO, element 1 replaces Cmp.
	  Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
	  Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
	  BO->replaceAllUsesWith(Math);
	  Cmp->replaceAllUsesWith(OV);
	  BO->eraseFromParent();
	  Cmp->eraseFromParent();
	  return true;
	}

	/// Match special-case patterns that check for unsigned add overflow.
	///
	/// On success \p Add is set to the matching add instruction and true is
	/// returned; otherwise Add is left untouched and false is returned.
	static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
	                                                   BinaryOperator *&Add) {
	  // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
	  // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
	  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);

	  // We are not expecting non-canonical/degenerate code. Just bail out.
	  if (isa<Constant>(A))
	    return false;

	  // Turn the compare constant into the constant the paired add must use.
	  ICmpInst::Predicate Pred = Cmp->getPredicate();
	  if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
	    B = ConstantInt::get(B->getType(), 1);
	  else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
	    B = ConstantInt::get(B->getType(), -1);
	  else
	    return false;

	  // Check the users of the variable operand of the compare looking for an add
	  // with the adjusted constant.
	  for (User *U : A->users()) {
	    if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
	      Add = cast<BinaryOperator>(U);
	      return true;
	    }
	  }
	  return false;
	}

	/// Try to combine the compare into a call to the llvm.uadd.with.overflow
	/// intrinsic. Return true if any changes were made.
	///
	/// \p ModifiedDT is set to true when the replacement happens, so that callers
	/// stop iterating over the now-erased instructions.
	bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
	                                               bool &ModifiedDT) {
	  Value *A, *B;
	  BinaryOperator *Add;
	  // Try the general pattern first, then the constant edge cases.
	  if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
	    if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
	      return false;

	  if (!TLI->shouldFormOverflowOp(ISD::UADDO,
	                                 TLI->getValueType(*DL, Add->getType())))
	    return false;

	  // We don't want to move around uses of condition values this late, so we
	  // check if it is legal to create the call to the intrinsic in the basic
	  // block containing the icmp.
	  if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
	    return false;

	  if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
	    return false;

	  // Reset callers - do not crash by iterating over a dead instruction.
	  ModifiedDT = true;
	  return true;
	}

	/// Try to combine the compare and a matching subtract (or add of a negated
	/// constant) into a call to the llvm.usub.with.overflow intrinsic.
	///
	/// \p ModifiedDT is set to true when the replacement happens.
	/// \returns true if any changes were made.
	bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
	                                               bool &ModifiedDT) {
	  // We are not expecting non-canonical/degenerate code. Just bail out.
	  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
	  if (isa<Constant>(A) && isa<Constant>(B))
	    return false;

	  // Convert (A u> B) to (A u< B) to simplify pattern matching.
	  ICmpInst::Predicate Pred = Cmp->getPredicate();
	  if (Pred == ICmpInst::ICMP_UGT) {
	    std::swap(A, B);
	    Pred = ICmpInst::ICMP_ULT;
	  }
	  // Convert special-case: (A == 0) is the same as (A u< 1).
	  if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
	    B = ConstantInt::get(B->getType(), 1);
	    Pred = ICmpInst::ICMP_ULT;
	  }
	  // Convert special-case: (A != 0) is the same as (0 u< A).
	  if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
	    std::swap(A, B);
	    Pred = ICmpInst::ICMP_ULT;
	  }
	  if (Pred != ICmpInst::ICMP_ULT)
	    return false;

	  // Walk the users of a variable operand of a compare looking for a subtract or
	  // add with that same operand. Also match the 2nd operand of the compare to
	  // the add/sub, but that may be a negated constant operand of an add.
	  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
	  BinaryOperator *Sub = nullptr;
	  for (User *U : CmpVariableOperand->users()) {
	    // A - B, A u< B --> usubo(A, B)
	    if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
	      Sub = cast<BinaryOperator>(U);
	      break;
	    }

	    // A + (-C), A u< C (canonicalized form of (sub A, C))
	    const APInt *CmpC, *AddC;
	    if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
	        match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
	      Sub = cast<BinaryOperator>(U);
	      break;
	    }
	  }
	  if (!Sub)
	    return false;

	  if (!TLI->shouldFormOverflowOp(ISD::USUBO,
	                                 TLI->getValueType(*DL, Sub->getType())))
	    return false;

	  if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
	    return false;

	  // Reset callers - do not crash by iterating over a dead instruction.
	  ModifiedDT = true;
	  return true;
	}

	/// Sink the given CmpInst into user blocks to reduce the number of virtual
	/// registers that must be created and coalesced. This is a clear win except on
	/// targets with multiple condition code registers (PowerPC), where it might
	/// lose; some adjustment may be wanted there.
	///
	/// Each non-PHI user outside the compare's block is redirected to a copy of
	/// the compare placed at the first insertion point of the user's block (one
	/// copy per block). The original compare is erased once it has no uses.
	///
	/// Return true if any changes are made.
	static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
	  if (TLI.hasMultipleConditionRegisters())
	    return false;

	  // Avoid sinking soft-FP comparisons, since this can move them into a loop.
	  if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
	    return false;

	  // Only insert a cmp in each block once.
	  DenseMap<BasicBlock *, CmpInst *> InsertedCmps;

	  bool MadeChange = false;
	  for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
	       UI != E; ) {
	    Use &TheUse = UI.getUse();
	    Instruction *User = cast<Instruction>(*UI);

	    // Preincrement use iterator so we don't invalidate it.
	    ++UI;

	    // Don't bother for PHI nodes.
	    if (isa<PHINode>(User))
	      continue;

	    // Figure out which BB this cmp is used in.
	    BasicBlock *UserBB = User->getParent();
	    BasicBlock *DefBB = Cmp->getParent();

	    // If this user is in the same block as the cmp, don't change the cmp.
	    if (UserBB == DefBB)
	      continue;

	    // If we have already inserted a cmp into this block, use it.
	    CmpInst *&InsertedCmp = InsertedCmps[UserBB];

	    if (!InsertedCmp) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedCmp =
	          CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
	                          Cmp->getOperand(0), Cmp->getOperand(1), "",
	                          &*InsertPt);
	      // Propagate the debug info.
	      InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
	    }

	    // Replace a use of the cmp with a use of the new cmp.
	    TheUse = InsertedCmp;
	    MadeChange = true;
	    ++NumCmpUses;
	  }

	  // If we removed all uses, nuke the cmp.
	  if (Cmp->use_empty()) {
	    Cmp->eraseFromParent();
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	/// Run the compare optimizations in order and stop at the first one that
	/// changes anything: sinking into user blocks, then forming uadd.with.overflow,
	/// then forming usub.with.overflow. Return true if one of them made a change.
	bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) {
	  // Short-circuit evaluation keeps later transforms from touching Cmp once an
	  // earlier one has succeeded.
	  return sinkCmpExpression(Cmp, *TLI) ||
	         combineToUAddWithOverflow(Cmp, ModifiedDT) ||
	         combineToUSubWithOverflow(Cmp, ModifiedDT);
	}

	/// Duplicate and sink the given 'and' instruction into user blocks where it is
	/// used in a compare to allow isel to generate better code for targets where
	/// this operation can be combined.
	///
	/// \p InsertedInsts is only consulted by an assertion that \p AndI was not
	/// itself created by this pass.
	///
	/// Return true if any changes are made. On success \p AndI has been erased.
	static bool sinkAndCmp0Expression(Instruction *AndI,
	                                  const TargetLowering &TLI,
	                                  SetOfInstrs &InsertedInsts) {
	  // Double-check that we're not trying to optimize an instruction that was
	  // already optimized by some other part of this pass.
	  assert(!InsertedInsts.count(AndI) &&
	         "Attempting to optimize already optimized and instruction");
	  (void) InsertedInsts;

	  // Nothing to do for single use in same basic block.
	  if (AndI->hasOneUse() &&
	      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
	    return false;

	  // Try to avoid cases where sinking/duplicating is likely to increase register
	  // pressure.
	  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
	      !isa<ConstantInt>(AndI->getOperand(1)) &&
	      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
	    return false;

	  // Every user must be an 'icmp' whose second operand is the constant zero;
	  // otherwise leave the 'and' alone.
	  for (auto *U : AndI->users()) {
	    Instruction *User = cast<Instruction>(U);

	    // Only sink 'and' feeding icmp with 0.
	    if (!isa<ICmpInst>(User))
	      return false;

	    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
	    if (!CmpC || !CmpC->isZero())
	      return false;
	  }

	  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
	    return false;

	  LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
	  LLVM_DEBUG(AndI->getParent()->dump());

	  // Push the 'and' into the same block as the icmp 0.  There should only be
	  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
	  // others, so we don't need to keep track of which BBs we insert into.
	  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
	       UI != E;) {
	    Use &TheUse = UI.getUse();
	    Instruction *User = cast<Instruction>(*UI);

	    // Preincrement use iterator so we don't invalidate it.
	    ++UI;

	    LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");

	    // Keep the 'and' in the same place if the use is already in the same block.
	    Instruction *InsertPt =
	        User->getParent() == AndI->getParent() ? AndI : User;
	    Instruction *InsertedAnd =
	        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
	                               AndI->getOperand(1), "", InsertPt);
	    // Propagate the debug info.
	    InsertedAnd->setDebugLoc(AndI->getDebugLoc());

	    // Replace a use of the 'and' with a use of the new 'and'.
	    TheUse = InsertedAnd;
	    ++NumAndUses;
	    LLVM_DEBUG(User->getParent()->dump());
	  }

	  // We removed all uses, nuke the and.
	  AndI->eraseFromParent();
	  return true;
	}

	/// Check if the candidates could be combined with a shift instruction, which
	/// includes:
	/// 1. Truncate instruction
	/// 2. And instruction and the imm is a mask of the low bits:
	/// imm & (imm+1) == 0
	///
	/// Returns true if \p User is one of the two forms above.
	static bool isExtractBitsCandidateUse(Instruction *User) {
	  // Case 1: any truncate qualifies.
	  if (isa<TruncInst>(User))
	    return true;

	  // Case 2: must be an 'and' whose second operand is a constant integer.
	  if (User->getOpcode() != Instruction::And ||
	      !isa<ConstantInt>(User->getOperand(1)))
	    return false;

	  const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();

	  // A low-bits mask plus one shares no set bit with the mask itself.
	  return !(Cimm & (Cimm + 1)).getBoolValue();
	}

	/// Sink both shift and truncate instruction to the use of truncate's BB.
	static bool
	SinkShiftAndTruncate(BinaryOperator ShiftI, Instruction User, ConstantInt *CI,
	DenseMap<BasicBlock , BinaryOperator > &InsertedShifts,
	const TargetLowering &TLI, const DataLayout &DL) {
	BasicBlock *UserBB = User->getParent();
	DenseMap<BasicBlock , CastInst > InsertedTruncs;
	TruncInst *TruncI = dyn_cast<TruncInst>(User);
	bool MadeChange = false;

	for (Value::user_iterator TruncUI = TruncI->user_begin(),
	TruncE = TruncI->user_end();
	TruncUI != TruncE;) {

	Use &TruncTheUse = TruncUI.getUse();
	Instruction TruncUser = cast<Instruction>(TruncUI);
	// Preincrement use iterator so we don't invalidate it.

	++TruncUI;

	int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
	if (!ISDOpcode)
	continue;

	// If the use is actually a legal node, there will not be an
	// implicit truncate.
	// FIXME: always querying the result type is just an
	// approximation; some nodes' legality is determined by the
	// operand or other means. There's no good way to find out though.
	if (TLI.isOperationLegalOrCustom(
	ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
	continue;

	// Don't bother for PHI nodes.
	if (isa<PHINode>(TruncUser))
	continue;

	BasicBlock *TruncUserBB = TruncUser->getParent();

	if (UserBB == TruncUserBB)
	continue;

	BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
	CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];

	if (!InsertedShift && !InsertedTrunc) {
	BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
	assert(InsertPt != TruncUserBB->end());
	// Sink the shift
	if (ShiftI->getOpcode() == Instruction::AShr)
	InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	else
	InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	InsertedShift->setDebugLoc(ShiftI->getDebugLoc());

	// Sink the trunc
	BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
	TruncInsertPt++;
	assert(TruncInsertPt != TruncUserBB->end());

	InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
	TruncI->getType(), "", &*TruncInsertPt);
	InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());

	MadeChange = true;

	TruncTheUse = InsertedTrunc;
	}
	}
	return MadeChange;
	}

	/// Sink the shift right instruction into user blocks if the uses could
	/// potentially be combined with this shift instruction and generate BitExtract
	/// instruction. It will only be applied if the architecture supports BitExtract
	/// instruction. Here is an example:
	/// BB1:
	/// %x.extract.shift = lshr i64 %arg1, 32
	/// BB2:
	/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
	/// ==>
	///
	/// BB2:
	/// %x.extract.shift.1 = lshr i64 %arg1, 32
	/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
	///
	/// CodeGen will recognize the pattern in BB2 and generate BitExtract
	/// instruction.
	/// Return true if any changes are made.
	static bool OptimizeExtractBits(BinaryOperator ShiftI, ConstantInt CI,
	const TargetLowering &TLI,
	const DataLayout &DL) {
	BasicBlock *DefBB = ShiftI->getParent();

	/// Only insert instructions in each block once.
	DenseMap<BasicBlock , BinaryOperator > InsertedShifts;

	bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));

	bool MadeChange = false;
	for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
	UI != E;) {
	Use &TheUse = UI.getUse();
	Instruction User = cast<Instruction>(UI);
	// Preincrement use iterator so we don't invalidate it.
	++UI;

	// Don't bother for PHI nodes.
	if (isa<PHINode>(User))
	continue;

	if (!isExtractBitsCandidateUse(User))
	continue;

	BasicBlock *UserBB = User->getParent();

	if (UserBB == DefBB) {
	// If the shift and truncate instruction are in the same BB. The use of
	// the truncate(TruncUse) may still introduce another truncate if not
	// legal. In this case, we would like to sink both shift and truncate
	// instruction to the BB of TruncUse.
	// for example:
	// BB1:
	// i64 shift.result = lshr i64 opnd, imm
	// trunc.result = trunc shift.result to i16
	//
	// BB2:
	// ----> We will have an implicit truncate here if the architecture does
	// not have i16 compare.
	// cmp i16 trunc.result, opnd2
	//
	if (isa<TruncInst>(User) && shiftIsLegal
	// If the type of the truncate is legal, no truncate will be
	// introduced in other basic blocks.
	&&
	(!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
	MadeChange =
	SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);

	continue;
	}
	// If we have already inserted a shift into this block, use it.
	BinaryOperator *&InsertedShift = InsertedShifts[UserBB];

	if (!InsertedShift) {
	BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	assert(InsertPt != UserBB->end());

	if (ShiftI->getOpcode() == Instruction::AShr)
	InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	else
	InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
	"", &*InsertPt);
	InsertedShift->setDebugLoc(ShiftI->getDebugLoc());

	MadeChange = true;
	}

	// Replace a use of the shift with a use of the new shift.
	TheUse = InsertedShift;
	}

	- // If we removed all uses, nuke the shift.
	+ // If we removed all uses, or there are none, nuke the shift.
	if (ShiftI->use_empty()) {
	salvageDebugInfo(*ShiftI);
	ShiftI->eraseFromParent();
	+ MadeChange = true;
	}

	return MadeChange;
	}

	/// If counting leading or trailing zeros is an expensive operation and a zero
	/// input is defined, add a check for zero to avoid calling the intrinsic.
	///
	/// We want to transform:
	/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
	///
	/// into:
	/// entry:
	/// %cmpz = icmp eq i64 %A, 0
	/// br i1 %cmpz, label %cond.end, label %cond.false
	/// cond.false:
	/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
	/// br label %cond.end
	/// cond.end:
	/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
	///
	/// If the transform is performed, return true and set ModifiedDT to true.
	/// Returns false (leaving the IR untouched) when \p TLI or \p DL is null.
	static bool despeculateCountZeros(IntrinsicInst *CountZeros,
	                                  const TargetLowering *TLI,
	                                  const DataLayout *DL,
	                                  bool &ModifiedDT) {
	  if (!TLI || !DL)
	    return false;

	  // If a zero input is undefined, it doesn't make sense to despeculate that.
	  if (match(CountZeros->getOperand(1), m_One()))
	    return false;

	  // If it's cheap to speculate, there's nothing to do.
	  auto IntrinsicID = CountZeros->getIntrinsicID();
	  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
	      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
	    return false;

	  // Only handle legal scalar cases. Anything else requires too much work.
	  Type *Ty = CountZeros->getType();
	  unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
	  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
	    return false;

	  // The intrinsic will be sunk behind a compare against zero and branch.
	  BasicBlock *StartBlock = CountZeros->getParent();
	  BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");

	  // Create another block after the count zero intrinsic. A PHI will be added
	  // in this block to select the result of the intrinsic or the bit-width
	  // constant if the input to the intrinsic is zero.
	  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
	  BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");

	  // Set up a builder to create a compare, conditional branch, and PHI.
	  IRBuilder<> Builder(CountZeros->getContext());
	  Builder.SetInsertPoint(StartBlock->getTerminator());
	  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());

	  // Replace the unconditional branch that was created by the first split with
	  // a compare against zero and a conditional branch.
	  Value *Zero = Constant::getNullValue(Ty);
	  Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
	  Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
	  StartBlock->getTerminator()->eraseFromParent();

	  // Create a PHI in the end block to select either the output of the intrinsic
	  // or the bit width of the operand.
	  Builder.SetInsertPoint(&EndBlock->front());
	  PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
	  // Redirect the existing users first; the intrinsic is added as an incoming
	  // value only afterwards so that the PHI does not end up using itself.
	  CountZeros->replaceAllUsesWith(PN);
	  Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
	  PN->addIncoming(BitWidth, StartBlock);
	  PN->addIncoming(CountZeros, CallBlock);

	  // We are explicitly handling the zero case, so we can set the intrinsic's
	  // undefined zero argument to 'true'. This will also prevent reprocessing the
	  // intrinsic; we only despeculate when a zero input is defined.
	  CountZeros->setArgOperand(1, Builder.getTrue());
	  ModifiedDT = true;
	  return true;
	}

	/// Apply the call-site optimizations of this pass to \p CI: inline-asm
	/// lowering, pointer-argument alignment, address sinking for cold calls,
	/// intrinsic-specific lowering and fortified libcall simplification.
	///
	/// Returns true if the IR was changed in a way that requires the caller to
	/// re-examine the block; \p ModifiedDT is set when the CFG was altered.
	bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
	  BasicBlock *BB = CI->getParent();

	  // Lower inline assembly if we can.
	  // If we found an inline asm expression, and if the target knows how to
	  // lower it to normal LLVM code, do so now.
	  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
	    if (TLI->ExpandInlineAsm(CI)) {
	      // Avoid invalidating the iterator.
	      CurInstIterator = BB->begin();
	      // Avoid processing instructions out of order, which could cause
	      // reuse before a value is defined.
	      SunkAddrs.clear();
	      return true;
	    }
	    // Sink address computing for memory operands into the block.
	    if (optimizeInlineAsmInst(CI))
	      return true;
	  }

	  // Align the pointer arguments to this call if the target thinks it's a good
	  // idea
	  unsigned MinSize, PrefAlign;
	  if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
	    for (auto &Arg : CI->arg_operands()) {
	      // We want to align both objects whose address is used directly and
	      // objects whose address is used in casts and GEPs, though it only makes
	      // sense for GEPs if the offset is a multiple of the desired alignment and
	      // if size - offset meets the size threshold.
	      if (!Arg->getType()->isPointerTy())
	        continue;
	      APInt Offset(DL->getIndexSizeInBits(
	                       cast<PointerType>(Arg->getType())->getAddressSpace()),
	                   0);
	      Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
	      uint64_t Offset2 = Offset.getLimitedValue();
	      if ((Offset2 & (PrefAlign-1)) != 0)
	        continue;
	      AllocaInst *AI;
	      if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
	          DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
	        AI->setAlignment(PrefAlign);
	      // Global variables can only be aligned if they are defined in this
	      // object (i.e. they are uniquely initialized in this object), and
	      // over-aligning global variables that have an explicit section is
	      // forbidden.
	      GlobalVariable *GV;
	      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
	          GV->getPointerAlignment(*DL) < PrefAlign &&
	          DL->getTypeAllocSize(GV->getValueType()) >=
	              MinSize + Offset2)
	        GV->setAlignment(PrefAlign);
	    }
	    // If this is a memcpy (or similar) then we may be able to improve the
	    // alignment
	    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
	      unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL);
	      if (DestAlign > MI->getDestAlignment())
	        MI->setDestAlignment(DestAlign);
	      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
	        unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
	        if (SrcAlign > MTI->getSourceAlignment())
	          MTI->setSourceAlignment(SrcAlign);
	      }
	    }
	  }

	  // If we have a cold call site, try to sink addressing computation into the
	  // cold block. This interacts with our handling for loads and stores to
	  // ensure that we can fold all uses of a potential addressing computation
	  // into their uses. TODO: generalize this to work over profiling data
	  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
	    for (auto &Arg : CI->arg_operands()) {
	      if (!Arg->getType()->isPointerTy())
	        continue;
	      unsigned AS = Arg->getType()->getPointerAddressSpace();
	      // Only stop once something actually changed. Returning the result of
	      // the first pointer argument unconditionally would skip the remaining
	      // arguments and every optimization below whenever that first attempt
	      // did nothing.
	      if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS))
	        return true;
	    }

	  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
	  if (II) {
	    switch (II->getIntrinsicID()) {
	    default: break;
	    case Intrinsic::experimental_widenable_condition: {
	      // Give up on future widening opportunities so that we can fold away dead
	      // paths and merge blocks before going into block-local instruction
	      // selection.
	      if (II->use_empty()) {
	        II->eraseFromParent();
	        return true;
	      }
	      Constant *RetVal = ConstantInt::getTrue(II->getContext());
	      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
	        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
	      });
	      return true;
	    }
	    case Intrinsic::objectsize: {
	      // Lower all uses of llvm.objectsize.*
	      Value *RetVal =
	          lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);

	      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
	        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
	      });
	      return true;
	    }
	    case Intrinsic::is_constant: {
	      // If is_constant hasn't folded away yet, lower it to false now.
	      Constant *RetVal = ConstantInt::get(II->getType(), 0);
	      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
	        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
	      });
	      return true;
	    }
	    case Intrinsic::aarch64_stlxr:
	    case Intrinsic::aarch64_stxr: {
	      ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
	      if (!ExtVal || !ExtVal->hasOneUse() ||
	          ExtVal->getParent() == CI->getParent())
	        return false;
	      // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
	      ExtVal->moveBefore(CI);
	      // Mark this instruction as "inserted by CGP", so that other
	      // optimizations don't touch it.
	      InsertedInsts.insert(ExtVal);
	      return true;
	    }

	    case Intrinsic::launder_invariant_group:
	    case Intrinsic::strip_invariant_group: {
	      Value *ArgVal = II->getArgOperand(0);
	      auto it = LargeOffsetGEPMap.find(II);
	      if (it != LargeOffsetGEPMap.end()) {
	        // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
	        // Make sure not to have to deal with iterator invalidation
	        // after possibly adding ArgVal to LargeOffsetGEPMap.
	        auto GEPs = std::move(it->second);
	        LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
	        LargeOffsetGEPMap.erase(II);
	      }

	      II->replaceAllUsesWith(ArgVal);
	      II->eraseFromParent();
	      return true;
	    }
	    case Intrinsic::cttz:
	    case Intrinsic::ctlz:
	      // If counting zeros is expensive, try to avoid it.
	      return despeculateCountZeros(II, TLI, DL, ModifiedDT);
	    }

	    // Let the target name pointer operands of its own intrinsics whose
	    // addressing computation may be worth sinking.
	    if (TLI) {
	      SmallVector<Value*, 2> PtrOps;
	      Type *AccessTy;
	      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
	        while (!PtrOps.empty()) {
	          Value *PtrVal = PtrOps.pop_back_val();
	          unsigned AS = PtrVal->getType()->getPointerAddressSpace();
	          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
	            return true;
	        }
	    }
	  }

	  // From here on out we're working with named functions.
	  if (!CI->getCalledFunction()) return false;

	  // Lower all default uses of _chk calls. This is very similar
	  // to what InstCombineCalls does, but here we are only lowering calls
	  // to fortified library functions (e.g. __memcpy_chk) that have the default
	  // "don't know" as the objectsize. Anything else should be left alone.
	  FortifiedLibCallSimplifier Simplifier(TLInfo, true);
	  if (Value *V = Simplifier.optimizeCall(CI)) {
	    CI->replaceAllUsesWith(V);
	    CI->eraseFromParent();
	    return true;
	  }

	  return false;
	}

	/// Look for opportunities to duplicate return instructions to the predecessor
	/// to enable tail call optimizations. The case it is currently looking for is:
	/// @code
	/// bb0:
	/// %tmp0 = tail call i32 @f0()
	/// br label %return
	/// bb1:
	/// %tmp1 = tail call i32 @f1()
	/// br label %return
	/// bb2:
	/// %tmp2 = tail call i32 @f2()
	/// br label %return
	/// return:
	/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
	/// ret i32 %retval
	/// @endcode
	///
	/// =>
	///
	/// @code
	/// bb0:
	/// %tmp0 = tail call i32 @f0()
	/// ret i32 %tmp0
	/// bb1:
	/// %tmp1 = tail call i32 @f1()
	/// ret i32 %tmp1
	/// bb2:
	/// %tmp2 = tail call i32 @f2()
	/// ret i32 %tmp2
	/// @endcode
	///
	/// Returns true (and sets \p ModifiedDT) if at least one return was
	/// duplicated. \p BB is erased when it ends up with no predecessors and its
	/// address is not taken.
	bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
	  if (!TLI)
	    return false;

	  ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
	  if (!RetI)
	    return false;

	  // A returned value must be a PHI, optionally seen through one bitcast.
	  PHINode *PN = nullptr;
	  BitCastInst *BCI = nullptr;
	  Value *V = RetI->getReturnValue();
	  if (V) {
	    BCI = dyn_cast<BitCastInst>(V);
	    if (BCI)
	      V = BCI->getOperand(0);

	    PN = dyn_cast<PHINode>(V);
	    if (!PN)
	      return false;
	  }

	  if (PN && PN->getParent() != BB)
	    return false;

	  // Make sure there are no instructions between the PHI and return, or that the
	  // return is the first instruction in the block.
	  if (PN) {
	    BasicBlock::iterator BI = BB->begin();
	    // Skip over debug and the bitcast.
	    do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI);
	    if (&*BI != RetI)
	      return false;
	  } else {
	    BasicBlock::iterator BI = BB->begin();
	    while (isa<DbgInfoIntrinsic>(BI)) ++BI;
	    if (&*BI != RetI)
	      return false;
	  }

	  /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
	  /// call.
	  const Function *F = BB->getParent();
	  SmallVector<CallInst*, 4> TailCalls;
	  if (PN) {
	    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
	      // Look through bitcasts.
	      Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
	      CallInst *CI = dyn_cast<CallInst>(IncomingVal);
	      // Make sure the phi value is indeed produced by the tail call.
	      if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
	          TLI->mayBeEmittedAsTailCall(CI) &&
	          attributesPermitTailCall(F, CI, RetI, *TLI))
	        TailCalls.push_back(CI);
	    }
	  } else {
	    // No value is returned: look for a call right before the terminator of
	    // each distinct predecessor.
	    SmallPtrSet<BasicBlock*, 4> VisitedBBs;
	    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
	      if (!VisitedBBs.insert(*PI).second)
	        continue;

	      BasicBlock::InstListType &InstList = (*PI)->getInstList();
	      BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
	      BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
	      // Step past the terminator, then past any debug intrinsics.
	      do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
	      if (RI == RE)
	        continue;

	      CallInst *CI = dyn_cast<CallInst>(&*RI);
	      if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
	          attributesPermitTailCall(F, CI, RetI, *TLI))
	        TailCalls.push_back(CI);
	    }
	  }

	  bool Changed = false;
	  for (CallInst *CI : TailCalls) {
	    // Make sure the call instruction is followed by an unconditional branch to
	    // the return block.
	    BasicBlock *CallBB = CI->getParent();
	    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
	    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
	      continue;

	    // Duplicate the return into CallBB.
	    (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB);
	    ModifiedDT = Changed = true;
	    ++NumRetsDup;
	  }

	  // If we eliminated all predecessors of the block, delete the block now.
	  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
	    BB->eraseFromParent();

	  return Changed;
	}

	//===----------------------------------------------------------------------===//
	// Memory Optimization
	//===----------------------------------------------------------------------===//

	namespace {

	/// This is an extended version of TargetLowering::AddrMode
	/// which holds actual Value*'s for register values.
	struct ExtAddrMode : public TargetLowering::AddrMode {
	  Value *BaseReg = nullptr;
	  Value *ScaledReg = nullptr;
	  Value *OriginalValue = nullptr;
	  bool InBounds = true;

	  /// One bit per addressing-mode component, so that the set of components in
	  /// which two modes differ can be accumulated in a mask. MultipleFields is
	  /// the "more than one (or incomparable)" answer of compare().
	  enum FieldName {
	    NoField        = 0x00,
	    BaseRegField   = 0x01,
	    BaseGVField    = 0x02,
	    BaseOffsField  = 0x04,
	    ScaledRegField = 0x08,
	    ScaleField     = 0x10,
	    MultipleFields = 0xff
	  };


	  ExtAddrMode() = default;

	  void print(raw_ostream &OS) const;
	  void dump() const;

	  /// Report which single field differs between this mode and \p other:
	  /// NoField if none does, the field's name if exactly one does, and
	  /// MultipleFields if several do or the modes cannot be compared (type or
	  /// 'inbounds' mismatch).
	  FieldName compare(const ExtAddrMode &other) {
	    // First check that the types are the same on each field, as differing types
	    // is something we can't cope with later on.
	    if (BaseReg && other.BaseReg &&
	        BaseReg->getType() != other.BaseReg->getType())
	      return MultipleFields;
	    if (BaseGV && other.BaseGV &&
	        BaseGV->getType() != other.BaseGV->getType())
	      return MultipleFields;
	    if (ScaledReg && other.ScaledReg &&
	        ScaledReg->getType() != other.ScaledReg->getType())
	      return MultipleFields;

	    // Conservatively reject 'inbounds' mismatches.
	    if (InBounds != other.InBounds)
	      return MultipleFields;

	    // Check each field to see if it differs.
	    unsigned Result = NoField;
	    if (BaseReg != other.BaseReg)
	      Result |= BaseRegField;
	    if (BaseGV != other.BaseGV)
	      Result |= BaseGVField;
	    if (BaseOffs != other.BaseOffs)
	      Result |= BaseOffsField;
	    if (ScaledReg != other.ScaledReg)
	      Result |= ScaledRegField;
	    // Don't count 0 as being a different scale, because that actually means
	    // unscaled (which will already be counted by having no ScaledReg).
	    if (Scale && other.Scale && Scale != other.Scale)
	      Result |= ScaleField;

	    if (countPopulation(Result) > 1)
	      return MultipleFields;
	    else
	      return static_cast<FieldName>(Result);
	  }

	  // An AddrMode is trivial if it involves no calculation i.e. it is just a base
	  // with no offset.
	  bool isTrivial() {
	    // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
	    // trivial if at most one of these terms is nonzero, except that BaseGV and
	    // BaseReg both being zero actually means a null pointer value, which we
	    // consider to be 'non-zero' here.
	    return !BaseOffs && !Scale && !(BaseGV && BaseReg);
	  }

	  /// Return the value stored in \p Field. BaseOffs is materialized as a
	  /// constant of type \p IntPtrTy; fields without a Value representation
	  /// (including ScaleField) yield nullptr.
	  Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
	    switch (Field) {
	    default:
	      return nullptr;
	    case BaseRegField:
	      return BaseReg;
	    case BaseGVField:
	      return BaseGV;
	    case ScaledRegField:
	      return ScaledReg;
	    case BaseOffsField:
	      return ConstantInt::get(IntPtrTy, BaseOffs);
	    }
	  }

	  /// Store the combined value \p V in the slot for \p Field. \p AddrModes is
	  /// only consulted to pick a non-zero scale when ScaledReg is set on a mode
	  /// that has none.
	  void SetCombinedField(FieldName Field, Value *V,
	                        const SmallVectorImpl<ExtAddrMode> &AddrModes) {
	    switch (Field) {
	    default:
	      llvm_unreachable("Unhandled fields are expected to be rejected earlier");
	      break;
	    case ExtAddrMode::BaseRegField:
	      BaseReg = V;
	      break;
	    case ExtAddrMode::BaseGVField:
	      // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
	      // in the BaseReg field.
	      assert(BaseReg == nullptr);
	      BaseReg = V;
	      BaseGV = nullptr;
	      break;
	    case ExtAddrMode::ScaledRegField:
	      ScaledReg = V;
	      // If we have a mix of scaled and unscaled addrmodes then we want scale
	      // to be the scale and not zero.
	      if (!Scale)
	        for (const ExtAddrMode &AM : AddrModes)
	          if (AM.Scale) {
	            Scale = AM.Scale;
	            break;
	          }
	      break;
	    case ExtAddrMode::BaseOffsField:
	      // The offset is no longer a constant, so it goes in ScaledReg with a
	      // scale of 1.
	      assert(ScaledReg == nullptr);
	      ScaledReg = V;
	      Scale = 1;
	      BaseOffs = 0;
	      break;
	    }
	  }
	};

	} // end anonymous namespace

	#ifndef NDEBUG
	static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
	AM.print(OS);
	return OS;
	}
	#endif

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void ExtAddrMode::print(raw_ostream &OS) const {
	bool NeedPlus = false;
	OS << "[";
	if (InBounds)
	OS << "inbounds ";
	if (BaseGV) {
	OS << (NeedPlus ? " + " : "")
	<< "GV:";
	BaseGV->printAsOperand(OS, /PrintType=/false);
	NeedPlus = true;
	}

	if (BaseOffs) {
	OS << (NeedPlus ? " + " : "")
	<< BaseOffs;
	NeedPlus = true;
	}

	if (BaseReg) {
	OS << (NeedPlus ? " + " : "")
	<< "Base:";
	BaseReg->printAsOperand(OS, /PrintType=/false);
	NeedPlus = true;
	}
	if (Scale) {
	OS << (NeedPlus ? " + " : "")
	<< Scale << "*";
	ScaledReg->printAsOperand(OS, /PrintType=/false);
	}

	OS << ']';
	}

	LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
	print(dbgs());
	dbgs() << '\n';
	}
	#endif

	namespace {

	/// This class provides transaction based operation on the IR.
	/// Every change made through this class is recorded in the internal state and
	/// can be undone (rollback) until commit is called.
	class TypePromotionTransaction {
	/// Abstract base of every individual, undoable IR modification recorded by
	/// TypePromotionTransaction. Each subclass implements one specific kind of
	/// modification.
	class TypePromotionAction {
	protected:
	  /// The instruction this action operates on.
	  Instruction *Inst;

	public:
	  /// Record the instruction being modified. Subclass constructors carry out
	  /// the actual modification of the IR.
	  TypePromotionAction(Instruction *Inst) : Inst(Inst) {}

	  virtual ~TypePromotionAction() = default;

	  /// Revert the modification performed by this action, bringing the IR back
	  /// to the state it had before the action was applied.
	  /// \pre Undoing the action works if and only if the IR is in the exact same
	  /// state as it was directly after this action was applied.
	  virtual void undo() = 0;

	  /// Make the modification permanent. Call this when the effects of the
	  /// action are to be kept; otherwise hidden information may be kept
	  /// forever. The default implementation has nothing to release.
	  virtual void commit() {}
	};

	/// Utility to remember the position of an instruction.
	class InsertionHandler {
	/// Position of an instruction.
	/// Either an instruction:
	/// - Is the first in a basic block: BB is used.
	/// - Has a previous instruction: PrevInst is used.
	union {
	Instruction *PrevInst;
	BasicBlock *BB;
	} Point;

	/// Remember whether or not the instruction had a previous instruction.
	bool HasPrevInstruction;

	public:
	/// Record the position of \p Inst.
	InsertionHandler(Instruction *Inst) {
	BasicBlock::iterator It = Inst->getIterator();
	HasPrevInstruction = (It != (Inst->getParent()->begin()));
	if (HasPrevInstruction)
	Point.PrevInst = &*--It;
	else
	Point.BB = Inst->getParent();
	}

	/// Insert \p Inst at the recorded position.
	void insert(Instruction *Inst) {
	if (HasPrevInstruction) {
	if (Inst->getParent())
	Inst->removeFromParent();
	Inst->insertAfter(Point.PrevInst);
	} else {
	Instruction Position = &Point.BB->getFirstInsertionPt();
	if (Inst->getParent())
	Inst->moveBefore(Position);
	else
	Inst->insertBefore(Position);
	}
	}
	};

	/// Move an instruction before another.
	class InstructionMoveBefore : public TypePromotionAction {
	/// Original position of the instruction.
	InsertionHandler Position;

	public:
	/// Move \p Inst before \p Before.
	InstructionMoveBefore(Instruction Inst, Instruction Before)
	: TypePromotionAction(Inst), Position(Inst) {
	LLVM_DEBUG(dbgs() << "Do: move: " << Inst << "\nbefore: " << Before
	<< "\n");
	Inst->moveBefore(Before);
	}

	/// Move the instruction back to its original position.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
	Position.insert(Inst);
	}
	};

	/// Set the operand of an instruction with a new value.
	class OperandSetter : public TypePromotionAction {
	/// Original operand of the instruction.
	Value *Origin;

	/// Index of the modified instruction.
	unsigned Idx;

	public:
	/// Set \p Idx operand of \p Inst with \p NewVal.
	OperandSetter(Instruction Inst, unsigned Idx, Value NewVal)
	: TypePromotionAction(Inst), Idx(Idx) {
	LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
	<< "for:" << *Inst << "\n"
	<< "with:" << *NewVal << "\n");
	Origin = Inst->getOperand(Idx);
	Inst->setOperand(Idx, NewVal);
	}

	/// Restore the original value of the instruction.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
	<< "for: " << *Inst << "\n"
	<< "with: " << *Origin << "\n");
	Inst->setOperand(Idx, Origin);
	}
	};

	/// Hide the operands of an instruction.
	/// Do as if this instruction was not using any of its operands.
	class OperandsHider : public TypePromotionAction {
	/// The list of original operands.
	SmallVector<Value *, 4> OriginalValues;

	public:
	/// Remove \p Inst from the uses of the operands of \p Inst.
	OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
	LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
	unsigned NumOpnds = Inst->getNumOperands();
	OriginalValues.reserve(NumOpnds);
	for (unsigned It = 0; It < NumOpnds; ++It) {
	// Save the current operand.
	Value *Val = Inst->getOperand(It);
	OriginalValues.push_back(Val);
	// Set a dummy one.
	// We could use OperandSetter here, but that would imply an overhead
	// that we are not willing to pay.
	Inst->setOperand(It, UndefValue::get(Val->getType()));
	}
	}

	/// Restore the original list of uses.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
	for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
	Inst->setOperand(It, OriginalValues[It]);
	}
	};

	/// Build a truncate instruction.
	class TruncBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// Build a truncate instruction of \p Opnd producing a \p Ty
	/// result.
	/// trunc Opnd to Ty.
	TruncBuilder(Instruction Opnd, Type Ty) : TypePromotionAction(Opnd) {
	IRBuilder<> Builder(Opnd);
	Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
	LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
	}

	/// Get the built value.
	Value *getBuiltValue() { return Val; }

	/// Remove the built instruction.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// Build a sign extension instruction.
	class SExtBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// Build a sign extension instruction of \p Opnd producing a \p Ty
	/// result.
	/// sext Opnd to Ty.
	SExtBuilder(Instruction InsertPt, Value Opnd, Type *Ty)
	: TypePromotionAction(InsertPt) {
	IRBuilder<> Builder(InsertPt);
	Val = Builder.CreateSExt(Opnd, Ty, "promoted");
	LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
	}

	/// Get the built value.
	Value *getBuiltValue() { return Val; }

	/// Remove the built instruction.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// Build a zero extension instruction.
	class ZExtBuilder : public TypePromotionAction {
	Value *Val;

	public:
	/// Build a zero extension instruction of \p Opnd producing a \p Ty
	/// result.
	/// zext Opnd to Ty.
	ZExtBuilder(Instruction InsertPt, Value Opnd, Type *Ty)
	: TypePromotionAction(InsertPt) {
	IRBuilder<> Builder(InsertPt);
	Val = Builder.CreateZExt(Opnd, Ty, "promoted");
	LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
	}

	/// Get the built value.
	Value *getBuiltValue() { return Val; }

	/// Remove the built instruction.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
	if (Instruction *IVal = dyn_cast<Instruction>(Val))
	IVal->eraseFromParent();
	}
	};

	/// Mutate an instruction to another type.
	class TypeMutator : public TypePromotionAction {
	/// Record the original type.
	Type *OrigTy;

	public:
	/// Mutate the type of \p Inst into \p NewTy.
	TypeMutator(Instruction Inst, Type NewTy)
	: TypePromotionAction(Inst), OrigTy(Inst->getType()) {
	LLVM_DEBUG(dbgs() << "Do: MutateType: " << Inst << " with " << NewTy
	<< "\n");
	Inst->mutateType(NewTy);
	}

	/// Mutate the instruction back to its original type.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: MutateType: " << Inst << " with " << OrigTy
	<< "\n");
	Inst->mutateType(OrigTy);
	}
	};

	/// Replace the uses of an instruction by another instruction.
	/// Every (user, operand index) pair and every debug user is recorded before
	/// the replacement so that undo() can point them back at the instruction.
	class UsesReplacer : public TypePromotionAction {
	  /// Helper structure to keep track of the replaced uses.
	  struct InstructionAndIdx {
	    /// The instruction using the instruction.
	    Instruction *Inst;

	    /// The index where this instruction is used for Inst.
	    unsigned Idx;

	    InstructionAndIdx(Instruction *Inst, unsigned Idx)
	        : Inst(Inst), Idx(Idx) {}
	  };

	  /// Keep track of the original uses (pair Instruction, Index).
	  SmallVector<InstructionAndIdx, 4> OriginalUses;
	  /// Keep track of the debug users.
	  SmallVector<DbgValueInst *, 1> DbgValues;

	  using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;

	public:
	  /// Replace all the use of \p Inst by \p New.
	  UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
	    // Print the instruction and the new value, not their addresses.
	    LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
	                      << "\n");
	    // Record the original uses.
	    for (Use &U : Inst->uses()) {
	      Instruction *UserI = cast<Instruction>(U.getUser());
	      OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
	    }
	    // Record the debug uses separately. They are not in the instruction's
	    // use list, but they are replaced by RAUW.
	    findDbgValues(DbgValues, Inst);

	    // Now, we can replace the uses.
	    Inst->replaceAllUsesWith(New);
	  }

	  /// Reassign the original uses of Inst to Inst.
	  void undo() override {
	    LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
	    for (use_iterator UseIt = OriginalUses.begin(),
	                      EndIt = OriginalUses.end();
	         UseIt != EndIt; ++UseIt) {
	      UseIt->Inst->setOperand(UseIt->Idx, Inst);
	    }
	    // RAUW has replaced all original uses with references to the new value,
	    // including the debug uses. Since we are undoing the replacements,
	    // the original debug uses must also be reinstated to maintain the
	    // correctness and utility of debug value instructions.
	    for (auto *DVI : DbgValues) {
	      LLVMContext &Ctx = Inst->getType()->getContext();
	      auto *MV = MetadataAsValue::get(Ctx, ValueAsMetadata::get(Inst));
	      DVI->setOperand(0, MV);
	    }
	  }
	};

	/// Remove an instruction from the IR.
	/// The instruction is unlinked from its parent (not deleted), its operands
	/// are hidden, and optionally all of its uses are redirected to another
	/// value. undo() reverses each of these steps.
	class InstructionRemover : public TypePromotionAction {
	/// Original position of the instruction.
	InsertionHandler Inserter;

	/// Helper structure to hide all the links to the instruction. In other
	/// words, this helps to do as if the instruction was removed.
	OperandsHider Hider;

	/// Keep track of the uses replaced, if any. Allocated in the constructor
	/// only when a replacement value is given; released in the destructor.
	UsesReplacer *Replacer = nullptr;

	/// Keep track of instructions removed.
	SetOfInstrs &RemovedInsts;

	public:
	/// Remove all references of \p Inst and optionally replace all its
	/// uses with \p New.
	/// \p RemovedInsts Keep track of the instructions removed by this Action.
	/// \pre If !Inst->use_empty(), then New != nullptr
	InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
	Value *New = nullptr)
	: TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
	RemovedInsts(RemovedInsts) {
	// Redirect the uses first, while the instruction is still in the IR.
	if (New)
	Replacer = new UsesReplacer(Inst, New);
	LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
	RemovedInsts.insert(Inst);
	/// The instructions removed here will be freed after completing
	/// optimizeBlock() for all blocks as we need to keep track of the
	/// removed instructions during promotion.
	Inst->removeFromParent();
	}

	/// Release the UsesReplacer created by the constructor, if any.
	~InstructionRemover() override { delete Replacer; }

	/// Resurrect the instruction and reassign it to the proper uses if a
	/// new value was provided when this action was built.
	void undo() override {
	LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
	// Re-insert first, then restore the uses and the operands, and finally
	// stop reporting the instruction as removed.
	Inserter.insert(Inst);
	if (Replacer)
	Replacer->undo();
	Hider.undo();
	RemovedInsts.erase(Inst);
	}
	};

	public:
	/// Restoration point.
	/// The restoration point is a pointer to an action instead of an iterator
	/// because the iterator may be invalidated but not the pointer.
	using ConstRestorationPt = const TypePromotionAction *;

	TypePromotionTransaction(SetOfInstrs &RemovedInsts)
	: RemovedInsts(RemovedInsts) {}

	/// Advocate every changes made in that transaction.
	void commit();

	/// Undo all the changes made after the given point.
	void rollback(ConstRestorationPt Point);

	/// Get the current restoration point.
	ConstRestorationPt getRestorationPoint() const;

	/// \name API for IR modification with state keeping to support rollback.
	/// @{
	/// Same as Instruction::setOperand.
	void setOperand(Instruction Inst, unsigned Idx, Value NewVal);

	/// Same as Instruction::eraseFromParent.
	void eraseInstruction(Instruction Inst, Value NewVal = nullptr);

	/// Same as Value::replaceAllUsesWith.
	void replaceAllUsesWith(Instruction Inst, Value New);

	/// Same as Value::mutateType.
	void mutateType(Instruction Inst, Type NewTy);

	/// Same as IRBuilder::createTrunc.
	Value createTrunc(Instruction Opnd, Type *Ty);

	/// Same as IRBuilder::createSExt.
	Value createSExt(Instruction Inst, Value Opnd, Type Ty);

	/// Same as IRBuilder::createZExt.
	Value createZExt(Instruction Inst, Value Opnd, Type Ty);

	/// Same as Instruction::moveBefore.
	void moveBefore(Instruction Inst, Instruction Before);
	/// @}

	private:
	/// The ordered list of actions made so far.
	SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;

	using CommitPt = SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;

	SetOfInstrs &RemovedInsts;
	};

	} // end anonymous namespace

	/// Set operand \p Idx of \p Inst to \p NewVal through an OperandSetter
	/// action and append that action to the transaction.
	void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
	                                          Value *NewVal) {
	  auto Action = llvm::make_unique<TypePromotionTransaction::OperandSetter>(
	      Inst, Idx, NewVal);
	  Actions.push_back(std::move(Action));
	}

	/// Remove \p Inst through an InstructionRemover action, forwarding
	/// \p NewVal as the optional replacement for its uses, and append that
	/// action to the transaction.
	void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
	                                                Value *NewVal) {
	  auto Action =
	      llvm::make_unique<TypePromotionTransaction::InstructionRemover>(
	          Inst, RemovedInsts, NewVal);
	  Actions.push_back(std::move(Action));
	}

	/// Replace every use of \p Inst by \p New through a UsesReplacer action and
	/// append that action to the transaction.
	void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
	                                                  Value *New) {
	  auto Action =
	      llvm::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New);
	  Actions.push_back(std::move(Action));
	}

	void TypePromotionTransaction::mutateType(Instruction Inst, Type NewTy) {
	Actions.push_back(
	llvm::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
	}

	/// Build "trunc \p Opnd to \p Ty" through a TruncBuilder action and append
	/// that action to the transaction.
	/// \returns the value built by the action.
	Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
	                                             Type *Ty) {
	  std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
	  // Read the result before ownership of the action moves into Actions.
	  Value *Val = Ptr->getBuiltValue();
	  Actions.push_back(std::move(Ptr));
	  return Val;
	}

	/// Build "sext \p Opnd to \p Ty" through a SExtBuilder action constructed
	/// with \p Inst as its insertion point, and append that action to the
	/// transaction.
	/// \returns the value built by the action.
	Value *TypePromotionTransaction::createSExt(Instruction *Inst,
	                                            Value *Opnd, Type *Ty) {
	  std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
	  // Read the result before ownership of the action moves into Actions.
	  Value *Val = Ptr->getBuiltValue();
	  Actions.push_back(std::move(Ptr));
	  return Val;
	}

	/// Build "zext \p Opnd to \p Ty" through a ZExtBuilder action constructed
	/// with \p Inst as its insertion point, and append that action to the
	/// transaction.
	/// \returns the value built by the action.
	Value *TypePromotionTransaction::createZExt(Instruction *Inst,
	                                            Value *Opnd, Type *Ty) {
	  std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
	  // Read the result before ownership of the action moves into Actions.
	  Value *Val = Ptr->getBuiltValue();
	  Actions.push_back(std::move(Ptr));
	  return Val;
	}

	/// Move \p Inst before \p Before through an InstructionMoveBefore action
	/// and append that action to the transaction.
	void TypePromotionTransaction::moveBefore(Instruction *Inst,
	                                          Instruction *Before) {
	  auto Action =
	      llvm::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
	          Inst, Before);
	  Actions.push_back(std::move(Action));
	}

	/// \returns the most recently recorded action, or nullptr when no action has
	/// been recorded.
	TypePromotionTransaction::ConstRestorationPt
	TypePromotionTransaction::getRestorationPoint() const {
	  if (Actions.empty())
	    return nullptr;
	  return Actions.back().get();
	}

	void TypePromotionTransaction::commit() {
	for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
	++It)
	(*It)->commit();
	Actions.clear();
	}

	/// Undo recorded actions, newest first, until \p Point is the last action
	/// left or no action remains. The action \p Point refers to is itself kept.
	void TypePromotionTransaction::rollback(
	    TypePromotionTransaction::ConstRestorationPt Point) {
	  while (!Actions.empty()) {
	    if (Actions.back().get() == Point)
	      break;
	    std::unique_ptr<TypePromotionAction> Latest = Actions.pop_back_val();
	    Latest->undo();
	  }
	}

	namespace {

	/// A helper class for matching addressing modes.
	///
	/// This encapsulates the logic for matching the target-legal addressing modes.
	/// Instances are only created by the static Match() entry point; the
	/// constructor is private.
	class AddressingModeMatcher {
	  SmallVectorImpl<Instruction*> &AddrModeInsts;
	  const TargetLowering &TLI;
	  const TargetRegisterInfo &TRI;
	  const DataLayout &DL;

	  /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
	  /// the memory instruction that we're computing this address for.
	  Type *AccessTy;
	  unsigned AddrSpace;
	  Instruction *MemoryInst;

	  /// This is the addressing mode that we're building up. This is
	  /// part of the return value of this addressing mode matching stuff.
	  ExtAddrMode &AddrMode;

	  /// The instructions inserted by other CodeGenPrepare optimizations.
	  const SetOfInstrs &InsertedInsts;

	  /// A map from the instructions to their type before promotion.
	  InstrToOrigTy &PromotedInsts;

	  /// The ongoing transaction where every action should be registered.
	  TypePromotionTransaction &TPT;

	  // A GEP which has too large offset to be folded into the addressing mode.
	  std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;

	  /// This is set to true when we should not do profitability checks.
	  /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
	  bool IgnoreProfitability;

	  /// Bind the matcher to its inputs and outputs. The data layout is taken
	  /// from the module that contains \p MI. Profitability checks start
	  /// enabled.
	  AddressingModeMatcher(
	      SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
	      const TargetRegisterInfo &TRI, Type *AT, unsigned AS, Instruction *MI,
	      ExtAddrMode &AM, const SetOfInstrs &InsertedInsts,
	      InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
	      std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP)
	      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
	        DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
	        MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
	        PromotedInsts(PromotedInsts), TPT(TPT), LargeOffsetGEP(LargeOffsetGEP) {
	    IgnoreProfitability = false;
	  }

	public:
	  /// Find the maximal addressing mode that a load/store of V can fold,
	  /// give an access type of AccessTy. This returns a list of involved
	  /// instructions in AddrModeInsts.
	  /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
	  /// optimizations.
	  /// \p PromotedInsts maps the instructions to their type before promotion.
	  /// \p TPT The ongoing transaction where every action should be registered.
	  static ExtAddrMode
	  Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
	        SmallVectorImpl<Instruction *> &AddrModeInsts,
	        const TargetLowering &TLI, const TargetRegisterInfo &TRI,
	        const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
	        TypePromotionTransaction &TPT,
	        std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP) {
	    ExtAddrMode Result;

	    // The temporary matcher writes the addressing mode into Result.
	    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, AccessTy, AS,
	                                         MemoryInst, Result, InsertedInsts,
	                                         PromotedInsts, TPT, LargeOffsetGEP)
	                       .matchAddr(V, 0);
	    (void)Success; assert(Success && "Couldn't select anything?");
	    return Result;
	  }

	private:
	  bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
	  bool matchAddr(Value *Addr, unsigned Depth);
	  bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
	                          bool *MovedAway = nullptr);
	  bool isProfitableToFoldIntoAddressingMode(Instruction *I,
	                                            ExtAddrMode &AMBefore,
	                                            ExtAddrMode &AMAfter);
	  bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
	  bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
	                             Value *PromotedOperand) const;
	};

	class PhiNodeSet;

	/// Iterator over the elements of a PhiNodeSet.
	/// It is a (set, index) pair; the index refers to a position in the vector
	/// that backs the set.
	class PhiNodeSetIterator {
	  /// The set being iterated.
	  PhiNodeSet *const Set;
	  /// Position in the set's underlying vector.
	  size_t CurrentIndex = 0;

	public:
	  /// The constructor. Start should point to either a valid element, or be
	  /// equal to the size of the underlying SmallVector of the PhiNodeSet.
	  PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start);

	  /// Element at the current position.
	  PHINode *operator*() const;

	  /// Advance to the next element that is still in the set.
	  PhiNodeSetIterator &operator++();

	  /// Two iterators are equal when they are at the same position.
	  bool operator==(const PhiNodeSetIterator &RHS) const;
	  bool operator!=(const PhiNodeSetIterator &RHS) const;
	};

	/// Keeps a set of PHINodes.
	///
	/// This is a minimal set implementation for a specific use case:
	/// It is very fast when there are very few elements, but also provides good
	/// performance when there are many. It is similar to SmallPtrSet, but also
	/// provides iteration by insertion order, which is deterministic and stable
	/// across runs. It is also similar to SmallSetVector, but provides removing
	/// elements in O(1) time. This is achieved by not actually removing the element
	/// from the underlying vector, so comes at the cost of using more memory, but
	/// that is fine, since PhiNodeSets are used as short lived objects.
	class PhiNodeSet {
	  friend class PhiNodeSetIterator;

	  using MapType = SmallDenseMap<PHINode *, size_t, 32>;
	  using iterator = PhiNodeSetIterator;

	  /// Elements in insertion order. Erasing from the set never shrinks this
	  /// vector; that is what keeps removal constant time.
	  SmallVector<PHINode *, 32> NodeList;

	  /// Maps each element currently in the set to its index in NodeList. This
	  /// map, not NodeList, is the source of truth on whether an element is
	  /// actually in the collection.
	  MapType NodeMap;

	  /// Points to the first valid (not deleted) element when the set is not empty
	  /// and the value is not zero. Equals to the size of the underlying vector
	  /// when the set is empty. When the value is 0, as in the beginning, the
	  /// first element may or may not be valid.
	  size_t FirstValidElement = 0;

	public:
	  /// Inserts a new element to the collection.
	  /// \returns true if the element is actually added, i.e. was not in the
	  /// collection before the operation.
	  bool insert(PHINode *Ptr) {
	    const bool Inserted =
	        NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second;
	    if (Inserted)
	      NodeList.push_back(Ptr);
	    return Inserted;
	  }

	  /// Removes the element from the collection.
	  /// \returns whether the element is actually removed, i.e. was in the
	  /// collection before the operation.
	  bool erase(PHINode *Ptr) {
	    if (NodeMap.find(Ptr) == NodeMap.end())
	      return false;
	    NodeMap.erase(Ptr);
	    // The erased element may have been the first valid one; move forward.
	    SkipRemovedElements(FirstValidElement);
	    return true;
	  }

	  /// Removes all elements and clears the collection.
	  void clear() {
	    NodeMap.clear();
	    NodeList.clear();
	    FirstValidElement = 0;
	  }

	  /// \returns an iterator that will iterate the elements in the order of
	  /// insertion.
	  iterator begin() {
	    // An index of 0 is not known to be valid yet, so resolve it first.
	    if (FirstValidElement == 0)
	      SkipRemovedElements(FirstValidElement);
	    return PhiNodeSetIterator(this, FirstValidElement);
	  }

	  /// \returns an iterator that points to the end of the collection.
	  iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }

	  /// Returns the number of elements in the collection.
	  size_t size() const { return NodeMap.size(); }

	  /// \returns 1 if the given element is in the collection, and 0 if otherwise.
	  size_t count(PHINode *Ptr) const { return NodeMap.count(Ptr); }

	private:
	  /// Updates the CurrentIndex so that it will point to a valid element.
	  ///
	  /// If the element of NodeList at CurrentIndex is valid, it does not
	  /// change it. If there are no more valid elements, it updates CurrentIndex
	  /// to point to the end of the NodeList.
	  void SkipRemovedElements(size_t &CurrentIndex) {
	    for (; CurrentIndex < NodeList.size(); ++CurrentIndex) {
	      auto Entry = NodeMap.find(NodeList[CurrentIndex]);
	      // If the element has been deleted and added again later, NodeMap will
	      // point to a different index, so CurrentIndex will still be invalid.
	      if (Entry != NodeMap.end() && Entry->second == CurrentIndex)
	        return;
	    }
	  }
	};

	/// Create an iterator over \p Set positioned at index \p Start.
	PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
	    : Set(Set), CurrentIndex(Start) {
	}

	/// \returns the element stored at the current index of the set's node list.
	/// Dereferencing an iterator positioned at the end is asserted against.
	PHINode *PhiNodeSetIterator::operator*() const {
	  const auto &Nodes = Set->NodeList;
	  assert(CurrentIndex < Nodes.size() &&
	         "PhiNodeSet access out of range");
	  return Nodes[CurrentIndex];
	}

	/// Step past the current element, then skip forward over entries that are
	/// no longer in the set. Incrementing an end iterator is asserted against.
	PhiNodeSetIterator &PhiNodeSetIterator::operator++() {
	  assert(CurrentIndex < Set->NodeList.size() &&
	         "PhiNodeSet access out of range");
	  Set->SkipRemovedElements(++CurrentIndex);
	  return *this;
	}

	/// Iterators compare equal when their indices are equal; the set they
	/// belong to is not compared.
	bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
	  const bool SamePosition = (CurrentIndex == RHS.CurrentIndex);
	  return SamePosition;
	}

	/// Negation of operator==.
	bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
	  return !operator==(RHS);
	}

	/// Keep track of simplification of Phi nodes.
	/// Accept the set of all phi nodes and erase phi node from this set
	/// if it is simplified.
	class SimplificationTracker {
	DenseMap<Value , Value > Storage;
	const SimplifyQuery &SQ;
	// Tracks newly created Phi nodes. The elements are iterated by insertion
	// order.
	PhiNodeSet AllPhiNodes;
	// Tracks newly created Select nodes.
	SmallPtrSet<SelectInst *, 32> AllSelectNodes;

	public:
	SimplificationTracker(const SimplifyQuery &sq)
	: SQ(sq) {}

	Value Get(Value V) {
	do {
	auto SV = Storage.find(V);
	if (SV == Storage.end())
	return V;
	V = SV->second;
	} while (true);
	}

	Value Simplify(Value Val) {
	SmallVector<Value *, 32> WorkList;
	SmallPtrSet<Value *, 32> Visited;
	WorkList.push_back(Val);
	while (!WorkList.empty()) {
	auto P = WorkList.pop_back_val();
	if (!Visited.insert(P).second)
	continue;
	if (auto *PI = dyn_cast<Instruction>(P))
	if (Value *V = SimplifyInstruction(cast<Instruction>(PI), SQ)) {
	for (auto *U : PI->users())
	WorkList.push_back(cast<Value>(U));
	Put(PI, V);
	PI->replaceAllUsesWith(V);
	if (auto *PHI = dyn_cast<PHINode>(PI))
	AllPhiNodes.erase(PHI);
	if (auto *Select = dyn_cast<SelectInst>(PI))
	AllSelectNodes.erase(Select);
	PI->eraseFromParent();
	}
	}
	return Get(Val);
	}

	void Put(Value From, Value To) {
	Storage.insert({ From, To });
	}

	void ReplacePhi(PHINode From, PHINode To) {
	Value* OldReplacement = Get(From);
	while (OldReplacement != From) {
	From = To;
	To = dyn_cast<PHINode>(OldReplacement);
	OldReplacement = Get(From);
	}
	assert(Get(To) == To && "Replacement PHI node is already replaced.");
	Put(From, To);
	From->replaceAllUsesWith(To);
	AllPhiNodes.erase(From);
	From->eraseFromParent();
	}

	PhiNodeSet& newPhiNodes() { return AllPhiNodes; }

	void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }

	void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }

	unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }

	unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }

	void destroyNewNodes(Type *CommonType) {
	// For safe erasing, replace the uses with dummy value first.
	auto Dummy = UndefValue::get(CommonType);
	for (auto I : AllPhiNodes) {
	I->replaceAllUsesWith(Dummy);
	I->eraseFromParent();
	}
	AllPhiNodes.clear();
	for (auto I : AllSelectNodes) {
	I->replaceAllUsesWith(Dummy);
	I->eraseFromParent();
	}
	AllSelectNodes.clear();
	}
	};

	/// A helper class for combining addressing modes.
	class AddressingModeCombiner {
	typedef DenseMap<Value , Value > FoldAddrToValueMapping;
	typedef std::pair<PHINode , PHINode > PHIPair;

	private:
	/// The addressing modes we've collected.
	SmallVector<ExtAddrMode, 16> AddrModes;

	/// The field in which the AddrModes differ, when we have more than one.
	ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;

	/// Are the AddrModes that we have all just equal to their original values?
	bool AllAddrModesTrivial = true;

	/// Common Type for all different fields in addressing modes.
	Type *CommonType;

	/// SimplifyQuery for simplifyInstruction utility.
	const SimplifyQuery &SQ;

	/// Original Address.
	Value *Original;

	public:
	/// Create a combiner for the address \p OriginalValue. \p SQ is kept by
	/// reference for later instruction simplification. No common type is known
	/// yet.
	AddressingModeCombiner(const SimplifyQuery &SQ, Value *OriginalValue)
	    : CommonType(nullptr), SQ(SQ), Original(OriginalValue) {}

	/// \returns the combined addressing mode, i.e. the first collected one.
	const ExtAddrMode &getAddrMode() const { return AddrModes.front(); }

	/// Add a new AddrMode if it's compatible with the AddrModes we already
	/// have.
	/// On rejection every collected AddrMode is dropped.
	/// \return True iff we succeeded in doing so.
	bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
	  // Take note of if we have any non-trivial AddrModes, as we need to detect
	  // when all AddrModes are trivial as then we would introduce a phi or select
	  // which just duplicates what's already there.
	  AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();

	  // If this is the first addrmode then everything is fine.
	  if (AddrModes.empty()) {
	    AddrModes.emplace_back(NewAddrMode);
	    return true;
	  }

	  // Figure out how different this is from the other address modes, which we
	  // can do just by comparing against the first one given that we only care
	  // about the cumulative difference.
	  ExtAddrMode::FieldName ThisDifferentField =
	      AddrModes[0].compare(NewAddrMode);
	  if (DifferentField == ExtAddrMode::NoField)
	    DifferentField = ThisDifferentField;
	  else if (DifferentField != ThisDifferentField)
	    DifferentField = ExtAddrMode::MultipleFields;

	  // If NewAddrMode differs in more than one dimension we cannot handle it.
	  bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;

	  // If Scale Field is different then we reject.
	  CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;

	  // We also must reject the case when base offset is different and
	  // scale reg is not null, we cannot handle this case due to merge of
	  // different offsets will be used as ScaleReg.
	  CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
	                            !NewAddrMode.ScaledReg);

	  // We also must reject the case when GV is different and BaseReg installed
	  // due to we want to use base reg as a merge of GV values.
	  CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
	                            !NewAddrMode.HasBaseReg);

	  // Even if NewAddMode is the same we still need to collect it due to
	  // original value is different. And later we will need all original values
	  // as anchors during finding the common Phi node.
	  if (CanHandle)
	    AddrModes.emplace_back(NewAddrMode);
	  else
	    AddrModes.clear();

	  return CanHandle;
	}

	/// Combine the addressing modes we've collected into a single
	/// addressing mode.
	/// On success with several differing modes, the first collected mode has
	/// its differing field set to the common value that was found.
	/// \return True iff we successfully combined them or we only had one so
	/// didn't need to combine them anyway.
	bool combineAddrModes() {
	  // If we have no AddrModes then they can't be combined.
	  if (AddrModes.size() == 0)
	    return false;

	  // A single AddrMode can trivially be combined.
	  if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
	    return true;

	  // If the AddrModes we collected are all just equal to the value they are
	  // derived from then combining them wouldn't do anything useful.
	  if (AllAddrModesTrivial)
	    return false;

	  if (!addrModeCombiningAllowed())
	    return false;

	  // Build a map between <original value, basic block where we saw it> to
	  // value of base register.
	  // Bail out if there is no common type.
	  FoldAddrToValueMapping Map;
	  if (!initializeMap(Map))
	    return false;

	  Value *CommonValue = findCommon(Map);
	  if (CommonValue)
	    AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
	  return CommonValue != nullptr;
	}

	private:
	/// Seed \p Map with the anchor values: each collected address is mapped to
	/// the value of the differing field seen in its addressing mode.
	/// While doing so, determine the common type of that field, which is later
	/// used to create new Phi/Select nodes, and store it in CommonType.
	/// Addresses whose field has no value are mapped to the null constant of the
	/// common type.
	/// \return false if the field values do not share a single type.
	bool initializeMap(FoldAddrToValueMapping &Map) {
	  // Addresses whose field value is null. They can only be filled in with a
	  // constant null once the common type is known.
	  SmallVector<Value *, 2> NullValue;
	  Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
	  for (auto &AM : AddrModes) {
	    Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
	    if (!DV) {
	      NullValue.push_back(AM.OriginalValue);
	      continue;
	    }
	    auto *FieldTy = DV->getType();
	    if (CommonType && CommonType != FieldTy)
	      return false;
	    CommonType = FieldTy;
	    Map[AM.OriginalValue] = DV;
	  }
	  assert(CommonType && "At least one non-null value must be!");
	  for (auto *Anchor : NullValue)
	    Map[Anchor] = Constant::getNullValue(CommonType);
	  return true;
	}

	/// We have a mapping from a value A to another value B, where B is a field
	/// of the addressing mode represented by A. We also have an original value C
	/// representing the address we started with. Traversing from C through phis
	/// and selects we end up at the A's in the map. This function tries to find
	/// a value V that is the corresponding field for C, such that traversing V
	/// through phi nodes and selects ends up at the corresponding B's in the map.
	/// New Phi/Select nodes are created if needed.
	// A simple example:
	//   BB1:
	//     p1 = b1 + 40
	//     br cond BB2, BB3
	//   BB2:
	//     p2 = b2 + 40
	//     br BB3
	//   BB3:
	//     p = phi [p1, BB1], [p2, BB2]
	//     v = load p
	// Map is
	//   p1 -> b1
	//   p2 -> b2
	// Request is
	//   p -> ?
	// The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
	Value *findCommon(FoldAddrToValueMapping &Map) {
	  // Tracks the simplification of newly created phi nodes. We need it
	  // because newly created Phi nodes are added to AddrToBase, and since
	  // simplification of Phi nodes is recursive, a Phi node may be simplified
	  // after it was added there. In reality this is possible only if the
	  // original phi/selects were not simplified yet. Through this tracker the
	  // current value in AddrToBase can always be found.
	  SimplificationTracker ST(SQ);

	  // Step one: DFS that creates placeholder nodes for all intermediate
	  // blocks and records the traversal order for step two.
	  SmallVector<Value *, 32> TraverseOrder;
	  InsertPlaceholders(Map, TraverseOrder, ST);

	  // Step two: fill the new nodes with merged values, simplifying if possible.
	  FillPlaceholders(Map, TraverseOrder, ST);

	  // Give up if selects were created but are not allowed, or if the new Phi
	  // nodes cannot be matched to existing ones. The matching is attempted
	  // only when the select check passes.
	  unsigned PhiNotMatchedCount = 0;
	  const bool SelectsRejected =
	      !AddrSinkNewSelects && ST.countNewSelectNodes() > 0;
	  if (SelectsRejected ||
	      !MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
	    ST.destroyNewNodes(CommonType);
	    return nullptr;
	  }

	  auto *Result = ST.Get(Map.find(Original)->second);
	  if (!Result)
	    return Result;
	  NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
	  NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
	  return Result;
	}

	/// Try to match PHI node to Candidate.
	/// Matcher tracks the matched Phi nodes.
	bool MatchPhiNode(PHINode PHI, PHINode Candidate,
	SmallSetVector<PHIPair, 8> &Matcher,
	PhiNodeSet &PhiNodesToMatch) {
	SmallVector<PHIPair, 8> WorkList;
	Matcher.insert({ PHI, Candidate });
	SmallSet<PHINode *, 8> MatchedPHIs;
	MatchedPHIs.insert(PHI);
	WorkList.push_back({ PHI, Candidate });
	SmallSet<PHIPair, 8> Visited;
	while (!WorkList.empty()) {
	auto Item = WorkList.pop_back_val();
	if (!Visited.insert(Item).second)
	continue;
	// We iterate over all incoming values to Phi to compare them.
	// If values are different and both of them Phi and the first one is a
	// Phi we added (subject to match) and both of them is in the same basic
	// block then we can match our pair if values match. So we state that
	// these values match and add it to work list to verify that.
	for (auto B : Item.first->blocks()) {
	Value *FirstValue = Item.first->getIncomingValueForBlock(B);
	Value *SecondValue = Item.second->getIncomingValueForBlock(B);
	if (FirstValue == SecondValue)
	continue;

	PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
	PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);

	// One of them is not Phi or
	// The first one is not Phi node from the set we'd like to match or
	// Phi nodes from different basic blocks then
	// we will not be able to match.
	if (!FirstPhi \|\| !SecondPhi \|\| !PhiNodesToMatch.count(FirstPhi) \|\|
	FirstPhi->getParent() != SecondPhi->getParent())
	return false;

	// If we already matched them then continue.
	if (Matcher.count({ FirstPhi, SecondPhi }))
	continue;
	// So the values are different and does not match. So we need them to
	// match. (But we register no more than one match per PHI node, so that
	// we won't later try to replace them twice.)
	if (!MatchedPHIs.insert(FirstPhi).second)
	Matcher.insert({ FirstPhi, SecondPhi });
	// But me must check it.
	WorkList.push_back({ FirstPhi, SecondPhi });
	}
	}
	return true;
	}

	/// For the given set of PHI nodes (in the SimplificationTracker) try
	/// to find their equivalents.
	/// \p PhiNotMatchedCount is increased by the number of new phi nodes for
	/// which no equivalent was found.
	/// Returns false if this matching fails and creation of new Phi is disabled.
	bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
	                 unsigned &PhiNotMatchedCount) {
	  // Matched and PhiNodesToMatch iterate their elements in a deterministic
	  // order, so the replacements (ReplacePhi) are also done in a deterministic
	  // order.
	  SmallSetVector<PHIPair, 8> Matched;
	  SmallPtrSet<PHINode *, 8> WillNotMatch;
	  PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
	  while (PhiNodesToMatch.size()) {
	    // Take the first phi node that is still waiting for a match.
	    PHINode *PHI = *PhiNodesToMatch.begin();

	    // Add us, if no Phi nodes in the basic block we do not match.
	    WillNotMatch.clear();
	    WillNotMatch.insert(PHI);

	    // Traverse all Phis until we found equivalent or fail to do that.
	    bool IsMatched = false;
	    for (auto &P : PHI->getParent()->phis()) {
	      if (&P == PHI)
	        continue;
	      if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
	        break;
	      // If it does not match, collect all Phi nodes from matcher.
	      // If we end up with no match, then all these Phi nodes will not match
	      // later.
	      for (auto M : Matched)
	        WillNotMatch.insert(M.first);
	      Matched.clear();
	    }
	    if (IsMatched) {
	      // Replace all matched values and erase them.
	      for (auto MV : Matched)
	        ST.ReplacePhi(MV.first, MV.second);
	      Matched.clear();
	      continue;
	    }
	    // If we are not allowed to create new nodes then bail out.
	    if (!AllowNewPhiNodes)
	      return false;
	    // Just remove all seen values in matcher. They will not match anything.
	    PhiNotMatchedCount += WillNotMatch.size();
	    for (auto *P : WillNotMatch)
	      PhiNodesToMatch.erase(P);
	  }
	  return true;
	}
	/// Fill the placeholders with values from predecessors and simplify them.
	/// \p TraverseOrder is consumed from the back; for every value taken from
	/// it, \p Map must already hold the placeholder created for that value.
	/// After filling, the map entry is updated to the simplified placeholder.
	void FillPlaceholders(FoldAddrToValueMapping &Map,
	                      SmallVectorImpl<Value *> &TraverseOrder,
	                      SimplificationTracker &ST) {
	  while (!TraverseOrder.empty()) {
	    Value *Current = TraverseOrder.pop_back_val();
	    assert(Map.find(Current) != Map.end() && "No node to fill!!!");
	    Value *V = Map[Current];

	    if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
	      // CurrentValue also must be Select.
	      auto *CurrentSelect = cast<SelectInst>(Current);
	      auto *TrueValue = CurrentSelect->getTrueValue();
	      assert(Map.find(TrueValue) != Map.end() && "No True Value!");
	      Select->setTrueValue(ST.Get(Map[TrueValue]));
	      auto *FalseValue = CurrentSelect->getFalseValue();
	      assert(Map.find(FalseValue) != Map.end() && "No False Value!");
	      Select->setFalseValue(ST.Get(Map[FalseValue]));
	    } else {
	      // Must be a Phi node then.
	      PHINode *PHI = cast<PHINode>(V);
	      // Current is dereferenced unconditionally below, so use the checked
	      // cast rather than dyn_cast, mirroring the select branch above.
	      auto *CurrentPhi = cast<PHINode>(Current);
	      // Fill the Phi node with values from predecessors.
	      for (auto B : predecessors(PHI->getParent())) {
	        Value *PV = CurrentPhi->getIncomingValueForBlock(B);
	        assert(Map.find(PV) != Map.end() && "No predecessor Value!");
	        PHI->addIncoming(ST.Get(Map[PV]), B);
	      }
	    }
	    Map[Current] = ST.Simplify(V);
	  }
	}

	/// Starting from original value recursively iterates over def-use chain up to
	/// known ending values represented in a map. For each traversed phi/select
	/// inserts a placeholder Phi or Select.
	/// Reports all new created Phi/Select nodes by adding them to set.
	/// Also reports the order in which values have been traversed.
	///
	/// \param Map on entry holds the known ending values; every traversed
	/// phi/select is added to it, mapped to its freshly created placeholder.
	/// \param TraverseOrder receives each traversed value, in visiting order.
	/// \param ST tracker that is told about each new placeholder
	/// (insertNewSelect() / insertNewPhi()).
	void InsertPlaceholders(FoldAddrToValueMapping &Map,
	                        SmallVectorImpl<Value *> &TraverseOrder,
	                        SimplificationTracker &ST) {
	  SmallVector<Value *, 32> Worklist;
	  assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
	         "Address must be a Phi or Select node");
	  // Operands of placeholders are filled with this dummy until
	  // they are given their real values.
	  auto *Dummy = UndefValue::get(CommonType);
	  Worklist.push_back(Original);
	  while (!Worklist.empty()) {
	    Value *Current = Worklist.pop_back_val();
	    // if it is already visited or it is an ending value then skip it.
	    if (Map.find(Current) != Map.end())
	      continue;
	    TraverseOrder.push_back(Current);

	    // CurrentValue must be a Phi node or select. All others must be covered
	    // by anchors.
	    if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
	      // Is it OK to get metadata from OrigSelect?!
	      // Create a Select placeholder with dummy value.
	      SelectInst *Select = SelectInst::Create(
	          CurrentSelect->getCondition(), Dummy, Dummy,
	          CurrentSelect->getName(), CurrentSelect, CurrentSelect);
	      Map[Current] = Select;
	      ST.insertNewSelect(Select);
	      // We are interested in True and False values.
	      Worklist.push_back(CurrentSelect->getTrueValue());
	      Worklist.push_back(CurrentSelect->getFalseValue());
	    } else {
	      // It must be a Phi node then.
	      PHINode *CurrentPhi = cast<PHINode>(Current);
	      unsigned PredCount = CurrentPhi->getNumIncomingValues();
	      PHINode *PHI =
	          PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
	      Map[Current] = PHI;
	      ST.insertNewPhi(PHI);
	      for (Value *P : CurrentPhi->incoming_values())
	        Worklist.push_back(P);
	    }
	  }
	}

	/// Report whether combining addressing modes is permitted for the field
	/// recorded in DifferentField.
	/// Always false when DisableComplexAddrModes is set or when DifferentField is
	/// not one of the four fields handled below; otherwise the answer is the
	/// value of the AddrSinkCombine* flag associated with that field.
	bool addrModeCombiningAllowed() {
	  bool Allowed = false;
	  if (!DisableComplexAddrModes) {
	    switch (DifferentField) {
	    case ExtAddrMode::BaseRegField:
	      Allowed = AddrSinkCombineBaseReg;
	      break;
	    case ExtAddrMode::BaseGVField:
	      Allowed = AddrSinkCombineBaseGV;
	      break;
	    case ExtAddrMode::BaseOffsField:
	      Allowed = AddrSinkCombineBaseOffs;
	      break;
	    case ExtAddrMode::ScaledRegField:
	      Allowed = AddrSinkCombineScaledReg;
	      break;
	    default:
	      // Any other field is never combined.
	      break;
	    }
	  }
	  return Allowed;
	}
	};
	} // end anonymous namespace

	/// Try adding ScaleReg*Scale to the current addressing mode.
	/// Return true and update AddrMode if this addr mode is legal for the target,
	/// false if not.
	///
	/// \param ScaleReg value to be used as the scaled register.
	/// \param Scale multiplier applied to \p ScaleReg. A scale of 1 is handled
	/// by matchAddr(); a scale of 0 trivially succeeds.
	/// \param Depth current matching depth, forwarded to matchAddr().
	bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
	                                             unsigned Depth) {
	  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
	  // mode. Just process that directly.
	  if (Scale == 1)
	    return matchAddr(ScaleReg, Depth);

	  // If the scale is 0, it takes nothing to add this.
	  if (Scale == 0)
	    return true;

	  // If we already have a scale of this value, we can add to it, otherwise, we
	  // need an available scale field.
	  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
	    return false;

	  ExtAddrMode TestAddrMode = AddrMode;

	  // Add scale to turn X*4+X*3 -> X*7. This could also do things like
	  // [A+B + A*7] -> [B+A*8].
	  TestAddrMode.Scale += Scale;
	  TestAddrMode.ScaledReg = ScaleReg;

	  // If the new address isn't legal, bail out.
	  if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
	    return false;

	  // It was legal, so commit it.
	  AddrMode = TestAddrMode;

	  // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
	  // to see if ScaleReg is actually X+C. If so, we can turn this into adding
	  // X*Scale + C*Scale to addr mode.
	  ConstantInt *CI = nullptr;
	  Value *AddLHS = nullptr;
	  if (isa<Instruction>(ScaleReg) && // not a constant expr.
	      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
	      // getSExtValue() is only meaningful for constants that fit in 64
	      // bits; wider ones are simply not folded (same guard as the GEP
	      // constant-index handling in this file).
	      CI->getValue().getMinSignedBits() <= 64) {
	    TestAddrMode.InBounds = false;
	    TestAddrMode.ScaledReg = AddLHS;
	    TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;

	    // If this addressing mode is legal, commit it and remember that we folded
	    // this instruction.
	    if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
	      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
	      AddrMode = TestAddrMode;
	      return true;
	    }
	  }

	  // Otherwise, not (x+c)*scale, just return what we have.
	  return true;
	}

	/// Cheap pre-filter: tells whether an addressing computation involving \p I
	/// might be folded into a load/store accessing it.
	/// It does not have to be exact, but it must accept at least every
	/// instruction that MatchOperationAddr can handle.
	static bool MightBeFoldableInst(Instruction *I) {
	  const unsigned Opcode = I->getOpcode();

	  // Casts between pointer/integer types: foldable unless they are identity
	  // casts, which we do not touch.
	  if (Opcode == Instruction::BitCast ||
	      Opcode == Instruction::AddrSpaceCast) {
	    Type *ResultTy = I->getType();
	    if (ResultTy == I->getOperand(0)->getType())
	      return false;
	    return ResultTy->isIntOrPtrTy();
	  }

	  // Multiplications and shifts: only X*C and X << C can be handled.
	  if (Opcode == Instruction::Mul || Opcode == Instruction::Shl)
	    return isa<ConstantInt>(I->getOperand(1));

	  // PtrToInt is always a noop (the int type is pointer sized), the input of
	  // IntToPtr is known to be intptr_t, and Add/GetElementPtr are always
	  // candidates. Everything else is rejected.
	  return Opcode == Instruction::PtrToInt ||
	         Opcode == Instruction::IntToPtr ||
	         Opcode == Instruction::Add ||
	         Opcode == Instruction::GetElementPtr;
	}

	/// Tell whether \p Val is a legal instruction for \p TLI.
	/// \note \p Val is expected to be the result of some type promotion. For that
	/// reason, an opcode with no ISD equivalent in \p TLI is treated as legal:
	/// the non-promoted value would have been in the same situation.
	/// \returns false when \p Val is not an instruction; otherwise true when the
	/// opcode has no ISD equivalent or when the operation is legal or custom for
	/// the value type of the promoted instruction.
	static bool isPromotedInstructionLegal(const TargetLowering &TLI,
	                                       const DataLayout &DL, Value *Val) {
	  if (auto *PromotedInst = dyn_cast<Instruction>(Val)) {
	    const int ISDOpcode =
	        TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
	    // An undefined ISD opcode was already undefined before the promotion;
	    // otherwise ask the target about the promoted type.
	    return ISDOpcode == 0 ||
	           TLI.isOperationLegalOrCustom(
	               ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
	  }
	  return false;
	}

	namespace {

	/// Helper class to perform type promotion.
	class TypePromotionHelper {
	  /// Utility function to add a promoted instruction \p ExtOpnd to
	  /// \p PromotedInsts and record the type of extension we have seen.
	  static void addPromotedInst(InstrToOrigTy &PromotedInsts,
	                              Instruction *ExtOpnd,
	                              bool IsSExt) {
	    ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
	    InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
	    if (It != PromotedInsts.end()) {
	      // If the new extension is same as original, the information in
	      // PromotedInsts[ExtOpnd] is still correct.
	      if (It->second.getInt() == ExtTy)
	        return;

	      // Now the new extension is different from old extension, we make
	      // the type information invalid by setting extension type to
	      // BothExtension.
	      ExtTy = BothExtension;
	    }
	    PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
	  }

	  /// Utility function to query the original type of instruction \p Opnd
	  /// with a matched extension type. If the extension doesn't match, we
	  /// cannot use the information we had on the original type.
	  /// BothExtension doesn't match any extension type.
	  /// \return The recorded type, or nullptr if \p Opnd is unknown or was
	  /// recorded with a different kind of extension.
	  static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
	                                 Instruction *Opnd,
	                                 bool IsSExt) {
	    ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
	    InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
	    if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
	      return It->second.getPointer();
	    return nullptr;
	  }

	  /// Utility function to check whether or not a sign or zero extension
	  /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
	  /// either using the operands of \p Inst or promoting \p Inst.
	  /// The type of the extension is defined by \p IsSExt.
	  /// In other words, check if:
	  /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
	  /// #1 Promotion applies:
	  /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
	  /// #2 Operand reuses:
	  /// ext opnd1 to ConsideredExtType.
	  /// \p PromotedInsts maps the instructions to their type before promotion.
	  static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
	                            const InstrToOrigTy &PromotedInsts, bool IsSExt);

	  /// Utility function to determine if \p OpIdx should be promoted when
	  /// promoting \p Inst.
	  /// Every operand is promoted except operand 0 of a select instruction.
	  static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
	    return !(isa<SelectInst>(Inst) && OpIdx == 0);
	  }

	  /// Utility function to promote the operand of \p Ext when this
	  /// operand is a promotable trunc or sext or zext.
	  /// \p PromotedInsts maps the instructions to their type before promotion.
	  /// \p CreatedInstsCost[out] contains the cost of all instructions
	  /// created to promote the operand of Ext.
	  /// Newly added extensions are inserted in \p Exts.
	  /// Newly added truncates are inserted in \p Truncs.
	  /// Should never be called directly.
	  /// \return The promoted value which is used instead of Ext.
	  static Value *promoteOperandForTruncAndAnyExt(
	      Instruction *Ext, TypePromotionTransaction &TPT,
	      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	      SmallVectorImpl<Instruction *> *Exts,
	      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);

	  /// Utility function to promote the operand of \p Ext when this
	  /// operand is promotable and is not a supported trunc or sext.
	  /// \p PromotedInsts maps the instructions to their type before promotion.
	  /// \p CreatedInstsCost[out] contains the cost of all the instructions
	  /// created to promote the operand of Ext.
	  /// Newly added extensions are inserted in \p Exts.
	  /// Newly added truncates are inserted in \p Truncs.
	  /// Should never be called directly.
	  /// \return The promoted value which is used instead of Ext.
	  static Value *promoteOperandForOther(Instruction *Ext,
	                                       TypePromotionTransaction &TPT,
	                                       InstrToOrigTy &PromotedInsts,
	                                       unsigned &CreatedInstsCost,
	                                       SmallVectorImpl<Instruction *> *Exts,
	                                       SmallVectorImpl<Instruction *> *Truncs,
	                                       const TargetLowering &TLI, bool IsSExt);

	  /// \see promoteOperandForOther.
	  static Value *signExtendOperandForOther(
	      Instruction *Ext, TypePromotionTransaction &TPT,
	      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	      SmallVectorImpl<Instruction *> *Exts,
	      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
	    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
	                                  Exts, Truncs, TLI, true);
	  }

	  /// \see promoteOperandForOther.
	  static Value *zeroExtendOperandForOther(
	      Instruction *Ext, TypePromotionTransaction &TPT,
	      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	      SmallVectorImpl<Instruction *> *Exts,
	      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
	    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
	                                  Exts, Truncs, TLI, false);
	  }

	public:
	  /// Type for the utility function that promotes the operand of Ext.
	  /// This is a pointer to function: getAction() returns either one of the
	  /// static promotion helpers above or nullptr.
	  using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
	                            InstrToOrigTy &PromotedInsts,
	                            unsigned &CreatedInstsCost,
	                            SmallVectorImpl<Instruction *> *Exts,
	                            SmallVectorImpl<Instruction *> *Truncs,
	                            const TargetLowering &TLI);

	  /// Given a sign/zero extend instruction \p Ext, return the appropriate
	  /// action to promote the operand of \p Ext instead of using Ext.
	  /// \return NULL if no promotable action is possible with the current
	  /// sign extension.
	  /// \p InsertedInsts keeps track of all the instructions inserted by the
	  /// other CodeGenPrepare optimizations. This information is important
	  /// because we do not want to promote these instructions as CodeGenPrepare
	  /// will reinsert them later. Thus creating an infinite loop: create/remove.
	  /// \p PromotedInsts maps the instructions to their type before promotion.
	  static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
	                          const TargetLowering &TLI,
	                          const InstrToOrigTy &PromotedInsts);
	};

	} // end anonymous namespace

	bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
	Type *ConsideredExtType,
	const InstrToOrigTy &PromotedInsts,
	bool IsSExt) {
	// The promotion helper does not know how to deal with vector types yet.
	// To be able to fix that, we would need to fix the places where we
	// statically extend, e.g., constants and such.
	if (Inst->getType()->isVectorTy())
	return false;

	// We can always get through zext.
	if (isa<ZExtInst>(Inst))
	return true;

	// sext(sext) is ok too.
	if (IsSExt && isa<SExtInst>(Inst))
	return true;

	// We can get through binary operator, if it is legal. In other words, the
	// binary operator must have a nuw or nsw flag.
	const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
	if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
	((!IsSExt && BinOp->hasNoUnsignedWrap()) \|\|
	(IsSExt && BinOp->hasNoSignedWrap())))
	return true;

	// ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
	if ((Inst->getOpcode() == Instruction::And \|\|
	Inst->getOpcode() == Instruction::Or))
	return true;

	// ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
	if (Inst->getOpcode() == Instruction::Xor) {
	const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
	// Make sure it is not a NOT.
	if (Cst && !Cst->getValue().isAllOnesValue())
	return true;
	}

	// zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst))
	// It may change a poisoned value into a regular value, like
	// zext i32 (shrl i8 %val, 12) --> shrl i32 (zext i8 %val), 12
	// poisoned value regular value
	// It should be OK since undef covers valid value.
	if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
	return true;

	// and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
	// It may change a poisoned value into a regular value, like
	// zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 12
	// poisoned value regular value
	// It should be OK since undef covers valid value.
	if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
	const Instruction *ExtInst =
	dyn_cast<const Instruction>(*Inst->user_begin());
	if (ExtInst->hasOneUse()) {
	const Instruction *AndInst =
	dyn_cast<const Instruction>(*ExtInst->user_begin());
	if (AndInst && AndInst->getOpcode() == Instruction::And) {
	const ConstantInt *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
	if (Cst &&
	Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
	return true;
	}
	}
	}

	// Check if we can do the following simplification.
	// ext(trunc(opnd)) --> ext(opnd)
	if (!isa<TruncInst>(Inst))
	return false;

	Value *OpndVal = Inst->getOperand(0);
	// Check if we can use this operand in the extension.
	// If the type is larger than the result type of the extension, we cannot.
	if (!OpndVal->getType()->isIntegerTy() \|\|
	OpndVal->getType()->getIntegerBitWidth() >
	ConsideredExtType->getIntegerBitWidth())
	return false;

	// If the operand of the truncate is not an instruction, we will not have
	// any information on the dropped bits.
	// (Actually we could for constant but it is not worth the extra logic).
	Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
	if (!Opnd)
	return false;

	// Check if the source of the type is narrow enough.
	// I.e., check that trunc just drops extended bits of the same kind of
	// the extension.
	// #1 get the type of the operand and check the kind of the extended bits.
	const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
	if (OpndType)
	;
	else if ((IsSExt && isa<SExtInst>(Opnd)) \|\| (!IsSExt && isa<ZExtInst>(Opnd)))
	OpndType = Opnd->getOperand(0)->getType();
	else
	return false;

	// #2 check that the truncate just drops extended bits.
	return Inst->getType()->getIntegerBitWidth() >=
	OpndType->getIntegerBitWidth();
	}

	/// Select the function able to promote the operand of the sign/zero
	/// extension \p Ext.
	/// \param Ext the extension; must be a SExtInst or a ZExtInst.
	/// \param InsertedInsts instructions already inserted by CodeGenPrepare; a
	/// trunc operand found in this set is never promoted.
	/// \param TLI target lowering information, used to check whether the
	/// truncate needed for a multi-use operand is free.
	/// \param PromotedInsts maps the instructions to their type before promotion.
	/// \return The promotion function to call, or nullptr when no promotion is
	/// possible.
	TypePromotionHelper::Action TypePromotionHelper::getAction(
	    Instruction *Ext, const SetOfInstrs &InsertedInsts,
	    const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
	  assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
	         "Unexpected instruction type");
	  Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
	  Type *ExtTy = Ext->getType();
	  bool IsSExt = isa<SExtInst>(Ext);
	  // If the operand of the extension is not an instruction, we cannot
	  // get through.
	  // If it is, check we can get through.
	  if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
	    return nullptr;

	  // Do not promote if the operand has been added by codegenprepare.
	  // Otherwise, it means we are undoing an optimization that is likely to be
	  // redone, thus causing potential infinite loop.
	  if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
	    return nullptr;

	  // SExt or Trunc instructions.
	  // Return the related handler.
	  if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
	      isa<ZExtInst>(ExtOpnd))
	    return promoteOperandForTruncAndAnyExt;

	  // Regular instruction.
	  // Abort early if we will have to insert non-free instructions.
	  if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
	    return nullptr;
	  return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
	}

	/// Promote the operand of \p SExt when that operand is a trunc, sext or zext,
	/// by making the extension bypass it.
	/// \param SExt the extension (sign or zero, despite the name) whose operand
	/// is looked through.
	/// \param TPT transaction through which every IR change is made.
	/// \param PromotedInsts not used by this function.
	/// \param CreatedInstsCost [out] set to 1 when the resulting extension is
	/// still needed, is not free, and no non-free zext was merged; 0 otherwise.
	/// \param Exts if non-null, receives the resulting extension when it is still
	/// needed.
	/// \param Truncs not used by this function.
	/// \param TLI target lowering information used for the cost queries.
	/// \return The value to use instead of \p SExt.
	Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
	    Instruction *SExt, TypePromotionTransaction &TPT,
	    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	    SmallVectorImpl<Instruction *> *Exts,
	    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
	  // By construction, the operand of SExt is an instruction. Otherwise we cannot
	  // get through it and this method should not be called.
	  Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
	  Value *ExtVal = SExt;
	  bool HasMergedNonFreeExt = false;
	  if (isa<ZExtInst>(SExtOpnd)) {
	    // Replace s|zext(zext(opnd))
	    // => zext(opnd).
	    HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
	    Value *ZExt =
	        TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
	    TPT.replaceAllUsesWith(SExt, ZExt);
	    TPT.eraseInstruction(SExt);
	    ExtVal = ZExt;
	  } else {
	    // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
	    // => z|sext(opnd).
	    TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
	  }
	  CreatedInstsCost = 0;

	  // Remove dead code.
	  if (SExtOpnd->use_empty())
	    TPT.eraseInstruction(SExtOpnd);

	  // Check if the extension is still needed.
	  Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
	  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
	    if (ExtInst) {
	      if (Exts)
	        Exts->push_back(ExtInst);
	      CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
	    }
	    return ExtVal;
	  }

	  // At this point we have: ext ty opnd to ty.
	  // Reassign the uses of ExtInst to the opnd and remove ExtInst.
	  Value *NextVal = ExtInst->getOperand(0);
	  TPT.eraseInstruction(ExtInst, NextVal);
	  return NextVal;
	}

	/// Promote the operand of \p Ext when that operand is a regular instruction
	/// (not a trunc/sext/zext): the operand takes the type of \p Ext, replaces
	/// \p Ext, and its own operands are extended as needed.
	/// \param Ext the extension to move through its operand.
	/// \param TPT transaction through which the IR changes are made.
	/// \param PromotedInsts records the operand together with the kind of
	/// extension it was promoted with.
	/// \param CreatedInstsCost [out] reset to 0, then incremented for each
	/// non-free extension placed on an operand.
	/// \param Exts if non-null, receives every extension placed on an operand.
	/// \param Truncs if non-null, receives the truncate created when the operand
	/// has other users.
	/// \param TLI target lowering information used for the cost queries.
	/// \param IsSExt true for a sign extension, false for a zero extension.
	/// \return The promoted operand, to be used instead of \p Ext.
	Value *TypePromotionHelper::promoteOperandForOther(
	    Instruction *Ext, TypePromotionTransaction &TPT,
	    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
	    SmallVectorImpl<Instruction *> *Exts,
	    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
	    bool IsSExt) {
	  // By construction, the operand of Ext is an instruction. Otherwise we cannot
	  // get through it and this method should not be called.
	  Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
	  CreatedInstsCost = 0;
	  if (!ExtOpnd->hasOneUse()) {
	    // ExtOpnd will be promoted.
	    // All its uses, but Ext, will need to use a truncated value of the
	    // promoted version.
	    // Create the truncate now.
	    Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
	    if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
	      // Insert it just after the definition.
	      ITrunc->moveAfter(ExtOpnd);
	      if (Truncs)
	        Truncs->push_back(ITrunc);
	    }

	    TPT.replaceAllUsesWith(ExtOpnd, Trunc);
	    // Restore the operand of Ext (which has been replaced by the previous call
	    // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
	    TPT.setOperand(Ext, 0, ExtOpnd);
	  }

	  // Get through the Instruction:
	  // 1. Update its type.
	  // 2. Replace the uses of Ext by Inst.
	  // 3. Extend each operand that needs to be extended.

	  // Remember the original type of the instruction before promotion.
	  // This is useful to know that the high bits are sign extended bits.
	  addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
	  // Step #1.
	  TPT.mutateType(ExtOpnd, Ext->getType());
	  // Step #2.
	  TPT.replaceAllUsesWith(Ext, ExtOpnd);
	  // Step #3.
	  // Ext itself is reused for the first operand that needs an explicit
	  // extension; nullptr afterwards means a new one has to be created.
	  Instruction *ExtForOpnd = Ext;

	  LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
	  for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
	       ++OpIdx) {
	    LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
	    if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
	        !shouldExtOperand(ExtOpnd, OpIdx)) {
	      LLVM_DEBUG(dbgs() << "No need to propagate\n");
	      continue;
	    }
	    // Check if we can statically extend the operand.
	    Value *Opnd = ExtOpnd->getOperand(OpIdx);
	    if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
	      LLVM_DEBUG(dbgs() << "Statically extend\n");
	      unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
	      APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
	                            : Cst->getValue().zext(BitWidth);
	      TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
	      continue;
	    }
	    // UndefValue are typed, so we have to statically sign extend them.
	    if (isa<UndefValue>(Opnd)) {
	      LLVM_DEBUG(dbgs() << "Statically extend\n");
	      TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
	      continue;
	    }

	    // Otherwise we have to explicitly sign extend the operand.
	    // Check if Ext was reused to extend an operand.
	    if (!ExtForOpnd) {
	      // If yes, create a new one.
	      LLVM_DEBUG(dbgs() << "More operands to ext\n");
	      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
	                                    : TPT.createZExt(Ext, Opnd, Ext->getType());
	      if (!isa<Instruction>(ValForExtOpnd)) {
	        // The created value is not an instruction: use it directly.
	        TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
	        continue;
	      }
	      ExtForOpnd = cast<Instruction>(ValForExtOpnd);
	    }
	    if (Exts)
	      Exts->push_back(ExtForOpnd);
	    TPT.setOperand(ExtForOpnd, 0, Opnd);

	    // Move the sign extension before the insertion point.
	    TPT.moveBefore(ExtForOpnd, ExtOpnd);
	    TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
	    CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
	    // If more sext are required, new instructions will have to be created.
	    ExtForOpnd = nullptr;
	  }
	  // Ext was never reused for an operand, so it has no purpose anymore.
	  if (ExtForOpnd == Ext) {
	    LLVM_DEBUG(dbgs() << "Extension is useless now\n");
	    TPT.eraseInstruction(Ext);
	  }
	  return ExtOpnd;
	}

	/// Decide whether promoting an instruction to a wider type pays off.
	/// \p NewCost is the cost of the extension instructions created by the
	/// promotion.
	/// \p OldCost is the cost of the extension instructions before the promotion
	/// plus the number of instructions that have been matched in the addressing
	/// mode thanks to the promotion.
	/// \p PromotedOperand is the value that has been promoted.
	/// \return True if the promotion is profitable, false otherwise. When both
	/// costs are equal, the promotion is accepted only if the promoted
	/// instruction is legal.
	bool AddressingModeMatcher::isPromotionProfitable(
	    unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
	  LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
	                    << '\n');
	  // Different costs settle the question: the promotion is profitable exactly
	  // when the new extensions are cheaper than the old extension plus what we
	  // folded.
	  if (NewCost != OldCost)
	    return NewCost < OldCost;
	  // The promotion is neutral but it may help folding the sign extension in
	  // loads for instance.
	  // Check that we did not create an illegal instruction.
	  return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
	}

	/// Given an instruction or constant expr, see if we can fold the operation
	/// into the addressing mode. If so, update the addressing mode and return
	/// true, otherwise return false without modifying AddrMode.
	/// If \p MovedAway is not NULL, it contains the information of whether or
	/// not AddrInst has to be folded into the addressing mode on success.
	/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
	/// because it has been moved away.
	/// Thus AddrInst must not be added in the matched instructions.
	/// This state can happen when AddrInst is a sext, since it may be moved away.
	/// Therefore, AddrInst may not be valid when MovedAway is true and it must
	/// not be referenced anymore.
	bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
	unsigned Depth,
	bool *MovedAway) {
	// Avoid exponential behavior on extremely deep expression trees.
	if (Depth >= 5) return false;

	// By default, all matched instructions stay in place.
	if (MovedAway)
	*MovedAway = false;

	switch (Opcode) {
	case Instruction::PtrToInt:
	// PtrToInt is always a noop, as we know that the int type is pointer sized.
	return matchAddr(AddrInst->getOperand(0), Depth);
	case Instruction::IntToPtr: {
	auto AS = AddrInst->getType()->getPointerAddressSpace();
	auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
	// This inttoptr is a no-op if the integer type is pointer sized.
	if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
	return matchAddr(AddrInst->getOperand(0), Depth);
	return false;
	}
	case Instruction::BitCast:
	// BitCast is always a noop, and we can handle it as long as it is
	// int->int or pointer->pointer (we don't want int<->fp or something).
	if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
	// Don't touch identity bitcasts. These were probably put here by LSR,
	// and we don't want to mess around with them. Assume it knows what it
	// is doing.
	AddrInst->getOperand(0)->getType() != AddrInst->getType())
	return matchAddr(AddrInst->getOperand(0), Depth);
	return false;
	case Instruction::AddrSpaceCast: {
	unsigned SrcAS
	= AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
	unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
	if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
	return matchAddr(AddrInst->getOperand(0), Depth);
	return false;
	}
	case Instruction::Add: {
	// Check to see if we can merge in the RHS then the LHS. If so, we win.
	ExtAddrMode BackupAddrMode = AddrMode;
	unsigned OldSize = AddrModeInsts.size();
	// Start a transaction at this point.
	// The LHS may match but not the RHS.
	// Therefore, we need a higher level restoration point to undo partially
	// matched operation.
	TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	TPT.getRestorationPoint();

	AddrMode.InBounds = false;
	if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
	matchAddr(AddrInst->getOperand(0), Depth+1))
	return true;

	// Restore the old addr mode info.
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	TPT.rollback(LastKnownGood);

	// Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
	if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
	matchAddr(AddrInst->getOperand(1), Depth+1))
	return true;

	// Otherwise we definitely can't merge the ADD in.
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	TPT.rollback(LastKnownGood);
	break;
	}
	//case Instruction::Or:
	// TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
	//break;
	case Instruction::Mul:
	case Instruction::Shl: {
	// Can only handle X*C and X << C.
	AddrMode.InBounds = false;
	ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
	if (!RHS \|\| RHS->getBitWidth() > 64)
	return false;
	int64_t Scale = RHS->getSExtValue();
	if (Opcode == Instruction::Shl)
	Scale = 1LL << Scale;

	return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
	}
	case Instruction::GetElementPtr: {
	// Scan the GEP. We check it if it contains constant offsets and at most
	// one variable offset.
	int VariableOperand = -1;
	unsigned VariableScale = 0;

	int64_t ConstantOffset = 0;
	gep_type_iterator GTI = gep_type_begin(AddrInst);
	for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
	if (StructType *STy = GTI.getStructTypeOrNull()) {
	const StructLayout *SL = DL.getStructLayout(STy);
	unsigned Idx =
	cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
	ConstantOffset += SL->getElementOffset(Idx);
	} else {
	uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
	if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
	const APInt &CVal = CI->getValue();
	if (CVal.getMinSignedBits() <= 64) {
	ConstantOffset += CVal.getSExtValue() * TypeSize;
	continue;
	}
	}
	if (TypeSize) { // Scales of zero don't do anything.
	// We only allow one variable index at the moment.
	if (VariableOperand != -1)
	return false;

	// Remember the variable index.
	VariableOperand = i;
	VariableScale = TypeSize;
	}
	}
	}

	// A common case is for the GEP to only do a constant offset. In this case,
	// just add it to the disp field and check validity.
	if (VariableOperand == -1) {
	AddrMode.BaseOffs += ConstantOffset;
	if (ConstantOffset == 0 \|\|
	TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
	// Check to see if we can fold the base pointer in too.
	if (matchAddr(AddrInst->getOperand(0), Depth+1)) {
	if (!cast<GEPOperator>(AddrInst)->isInBounds())
	AddrMode.InBounds = false;
	return true;
	}
	} else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
	TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
	ConstantOffset > 0) {
	// Record GEPs with non-zero offsets as candidates for splitting in the
	// event that the offset cannot fit into the r+i addressing mode.
	// Simple and common case that only one GEP is used in calculating the
	// address for the memory access.
	Value *Base = AddrInst->getOperand(0);
	auto *BaseI = dyn_cast<Instruction>(Base);
	auto *GEP = cast<GetElementPtrInst>(AddrInst);
	if (isa<Argument>(Base) \|\| isa<GlobalValue>(Base) \|\|
	(BaseI && !isa<CastInst>(BaseI) &&
	!isa<GetElementPtrInst>(BaseI))) {
	// Make sure the parent block allows inserting non-PHI instructions
	// before the terminator.
	BasicBlock *Parent =
	BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock();
	if (!Parent->getTerminator()->isEHPad())
	LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
	}
	}
	AddrMode.BaseOffs -= ConstantOffset;
	return false;
	}

	// Save the valid addressing mode in case we can't match.
	ExtAddrMode BackupAddrMode = AddrMode;
	unsigned OldSize = AddrModeInsts.size();

	// See if the scale and offset amount is valid for this target.
	AddrMode.BaseOffs += ConstantOffset;
	if (!cast<GEPOperator>(AddrInst)->isInBounds())
	AddrMode.InBounds = false;

	// Match the base operand of the GEP.
	if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
	// If it couldn't be matched, just stuff the value in a register.
	if (AddrMode.HasBaseReg) {
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	return false;
	}
	AddrMode.HasBaseReg = true;
	AddrMode.BaseReg = AddrInst->getOperand(0);
	}

	// Match the remaining variable portion of the GEP.
	if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
	Depth)) {
	// If it couldn't be matched, try stuffing the base into a register
	// instead of matching it, and retrying the match of the scale.
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	if (AddrMode.HasBaseReg)
	return false;
	AddrMode.HasBaseReg = true;
	AddrMode.BaseReg = AddrInst->getOperand(0);
	AddrMode.BaseOffs += ConstantOffset;
	if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
	VariableScale, Depth)) {
	// If even that didn't work, bail.
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	return false;
	}
	}

	return true;
	}
	case Instruction::SExt:
	case Instruction::ZExt: {
	Instruction *Ext = dyn_cast<Instruction>(AddrInst);
	if (!Ext)
	return false;

	// Try to move this ext out of the way of the addressing mode.
	// Ask for a method for doing so.
	TypePromotionHelper::Action TPH =
	TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
	if (!TPH)
	return false;

	TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	TPT.getRestorationPoint();
	unsigned CreatedInstsCost = 0;
	unsigned ExtCost = !TLI.isExtFree(Ext);
	Value *PromotedOperand =
	TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
	// SExt has been moved away.
	// Thus either it will be rematched later in the recursive calls or it is
	// gone. Anyway, we must not fold it into the addressing mode at this point.
	// E.g.,
	// op = add opnd, 1
	// idx = ext op
	// addr = gep base, idx
	// is now:
	// promotedOpnd = ext opnd <- no match here
	// op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)
	// addr = gep base, op <- match
	if (MovedAway)
	*MovedAway = true;

	assert(PromotedOperand &&
	"TypePromotionHelper should have filtered out those cases");

	ExtAddrMode BackupAddrMode = AddrMode;
	unsigned OldSize = AddrModeInsts.size();

	if (!matchAddr(PromotedOperand, Depth) \|\|
	// The total of the new cost is equal to the cost of the created
	// instructions.
	// The total of the old cost is equal to the cost of the extension plus
	// what we have saved in the addressing mode.
	!isPromotionProfitable(CreatedInstsCost,
	ExtCost + (AddrModeInsts.size() - OldSize),
	PromotedOperand)) {
	AddrMode = BackupAddrMode;
	AddrModeInsts.resize(OldSize);
	LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
	TPT.rollback(LastKnownGood);
	return false;
	}
	return true;
	}
	}
	return false;
	}

	/// If we can, try to add the value of 'Addr' into the current addressing mode.
	/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
	/// unmodified. This assumes that Addr is either a pointer type or intptr_t
	/// for the target.
	///
	/// \p Depth is forwarded unchanged to matchOperationAddr when \p Addr is an
	/// instruction or a constant expression.
	bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
	  // Start a transaction at this point that we will rollback if the matching
	  // fails.
	  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	      TPT.getRestorationPoint();
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
	    // BaseOffs is a 64-bit quantity and getSExtValue() is only meaningful for
	    // constants representable in 64 signed bits. Wider constants are not
	    // folded; they fall through to the register-based matching below.
	    if (CI->getValue().isSignedIntN(64)) {
	      // Fold in immediates if legal for the target.
	      AddrMode.BaseOffs += CI->getSExtValue();
	      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	        return true;
	      AddrMode.BaseOffs -= CI->getSExtValue();
	    }
	  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
	    // If this is a global variable, try to fold it into the addressing mode.
	    if (!AddrMode.BaseGV) {
	      AddrMode.BaseGV = GV;
	      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	        return true;
	      AddrMode.BaseGV = nullptr;
	    }
	  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
	    // Snapshot the state so an unprofitable fold can be undone.
	    ExtAddrMode BackupAddrMode = AddrMode;
	    unsigned OldSize = AddrModeInsts.size();

	    // Check to see if it is possible to fold this operation.
	    bool MovedAway = false;
	    if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
	      // This instruction may have been moved away. If so, there is nothing
	      // to check here.
	      if (MovedAway)
	        return true;
	      // Okay, it's possible to fold this. Check to see if it is actually
	      // profitable to do so. We use a simple cost model to avoid increasing
	      // register pressure too much.
	      if (I->hasOneUse() ||
	          isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
	        AddrModeInsts.push_back(I);
	        return true;
	      }

	      // It isn't profitable to do this, roll back.
	      AddrMode = BackupAddrMode;
	      AddrModeInsts.resize(OldSize);
	      TPT.rollback(LastKnownGood);
	    }
	  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
	    if (matchOperationAddr(CE, CE->getOpcode(), Depth))
	      return true;
	    TPT.rollback(LastKnownGood);
	  } else if (isa<ConstantPointerNull>(Addr)) {
	    // Null pointer gets folded without affecting the addressing mode.
	    return true;
	  }

	  // Worst case, the target should support [reg] addressing modes. :)
	  if (!AddrMode.HasBaseReg) {
	    AddrMode.HasBaseReg = true;
	    AddrMode.BaseReg = Addr;
	    // Still check for legality in case the target supports [imm] but not [i+r].
	    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	      return true;
	    AddrMode.HasBaseReg = false;
	    AddrMode.BaseReg = nullptr;
	  }

	  // If the base register is already taken, see if we can do [r+r].
	  if (AddrMode.Scale == 0) {
	    AddrMode.Scale = 1;
	    AddrMode.ScaledReg = Addr;
	    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
	      return true;
	    AddrMode.Scale = 0;
	    AddrMode.ScaledReg = nullptr;
	  }
	  // Couldn't match.
	  TPT.rollback(LastKnownGood);
	  return false;
	}

	/// Check to see if all uses of OpVal by the specified inline asm call are due
	/// to memory operands. If so, return true, otherwise return false.
	static bool IsOperandAMemoryOperand(CallInst CI, InlineAsm IA, Value *OpVal,
	const TargetLowering &TLI,
	const TargetRegisterInfo &TRI) {
	const Function *F = CI->getFunction();
	TargetLowering::AsmOperandInfoVector TargetConstraints =
	TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
	ImmutableCallSite(CI));

	for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
	TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];

	// Compute the constraint code and ConstraintType to use.
	TLI.ComputeConstraintToUse(OpInfo, SDValue());

	// If this asm operand is our Value*, and if it isn't an indirect memory
	// operand, we can't fold it!
	if (OpInfo.CallOperandVal == OpVal &&
	(OpInfo.ConstraintType != TargetLowering::C_Memory \|\|
	!OpInfo.isIndirect))
	return false;
	}

	return true;
	}

	// Upper bound on the number of uses the memory-use search examines before it
	// gives up; this keeps compile time bounded.
	static constexpr int MaxMemoryUsesToScan{20};

	/// Worker for FindAllMemoryUses.
	///
	/// \p SeenInsts is taken by reference so that a single scan budget is shared
	/// by every level of the recursion. With a by-value counter each sibling
	/// subtree would start again from its parent's count, so the total number of
	/// uses scanned could greatly exceed MaxMemoryUsesToScan.
	static bool FindAllMemoryUsesImpl(
	    Instruction *I,
	    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
	    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
	    const TargetRegisterInfo &TRI, int &SeenInsts) {
	  // If we already considered this instruction, we're done.
	  if (!ConsideredInsts.insert(I).second)
	    return false;

	  // If this is an obviously unfoldable instruction, bail out.
	  if (!MightBeFoldableInst(I))
	    return true;

	  const bool OptSize = I->getFunction()->hasOptSize();

	  // Loop over all the uses, recursively processing them.
	  for (Use &U : I->uses()) {
	    // Conservatively return true if we're seeing a large number or a deep chain
	    // of users. This avoids excessive compilation times in pathological cases.
	    if (SeenInsts++ >= MaxMemoryUsesToScan)
	      return true;

	    Instruction *UserI = cast<Instruction>(U.getUser());
	    if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
	      MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
	      continue;
	    }

	    if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != StoreInst::getPointerOperandIndex())
	        return true; // Storing addr, not into addr.
	      MemoryUses.push_back(std::make_pair(SI, opNo));
	      continue;
	    }

	    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != AtomicRMWInst::getPointerOperandIndex())
	        return true; // Storing addr, not into addr.
	      MemoryUses.push_back(std::make_pair(RMW, opNo));
	      continue;
	    }

	    if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
	      unsigned opNo = U.getOperandNo();
	      if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
	        return true; // Storing addr, not into addr.
	      MemoryUses.push_back(std::make_pair(CmpX, opNo));
	      continue;
	    }

	    if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
	      // If this is a cold call, we can sink the addressing calculation into
	      // the cold path. See optimizeCallInst
	      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
	        continue;

	      // Any other call is only acceptable if it is inline asm.
	      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
	      if (!IA)
	        return true;

	      // If this is a memory operand, we're cool, otherwise bail out.
	      if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
	        return true;
	      continue;
	    }

	    // Not a memory user itself: look through it at its own users.
	    if (FindAllMemoryUsesImpl(UserI, MemoryUses, ConsideredInsts, TLI, TRI,
	                              SeenInsts))
	      return true;
	  }

	  return false;
	}

	/// Recursively walk all the uses of I until we find a memory use.
	/// If we find an obviously non-foldable instruction, return true.
	/// Add the ultimately found memory instructions to MemoryUses.
	///
	/// \p ConsideredInsts records the instructions already visited. \p SeenInsts
	/// is the number of uses already charged against MaxMemoryUsesToScan; once
	/// that limit is reached the search conservatively returns true.
	static bool FindAllMemoryUses(
	    Instruction *I,
	    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
	    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
	    const TargetRegisterInfo &TRI, int SeenInsts = 0) {
	  return FindAllMemoryUsesImpl(I, MemoryUses, ConsideredInsts, TLI, TRI,
	                               SeenInsts);
	}

	/// Return true if Val is already known to be live at the use site that we're
	/// folding it into. If so, there is no cost to include it in the addressing
	/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
	/// instruction already.
	bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value Val,Value KnownLive1,
	Value *KnownLive2) {
	// If Val is either of the known-live values, we know it is live!
	if (Val == nullptr \|\| Val == KnownLive1 \|\| Val == KnownLive2)
	return true;

	// All values other than instructions and arguments (e.g. constants) are live.
	if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;

	// If Val is a constant sized alloca in the entry block, it is live, this is
	// true because it is just a reference to the stack/frame pointer, which is
	// live for the whole function.
	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
	if (AI->isStaticAlloca())
	return true;

	// Check to see if this value is already used in the memory instruction's
	// block. If so, it's already live into the block at the very least, so we
	// can reasonably fold it.
	return Val->isUsedInBasicBlock(MemoryInst->getParent());
	}

	/// It is possible for the addressing mode of the machine to fold the specified
	/// instruction into a load or store that ultimately uses it.
	/// However, the specified instruction has multiple uses.
	/// Given this, it may actually increase register pressure to fold it
	/// into the load. For example, consider this code:
	///
	/// X = ...
	/// Y = X+1
	/// use(Y) -> nonload/store
	/// Z = Y+1
	/// load Z
	///
	/// In this case, Y has multiple uses, and can be folded into the load of Z
	/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
	/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
	/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
	/// number of computations either.
	///
	/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
	/// X was live across 'load Z' for other reasons, we actually would want to
	/// fold the addressing mode in the Z case. This would make Y die earlier.
	///
	/// \return true if folding \p I is considered profitable (always true when
	/// IgnoreProfitability is set), false otherwise.
	bool AddressingModeMatcher::
	isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
	                                     ExtAddrMode &AMAfter) {
	  if (IgnoreProfitability)
	    return true;

	  // AMBefore is the addressing mode before this instruction was folded into it,
	  // and AMAfter is the addressing mode after the instruction was folded. Get
	  // the set of registers referenced by AMAfter and subtract out those
	  // referenced by AMBefore: this is the set of values which folding in this
	  // address extends the lifetime of.
	  //
	  // Note that there are only two potential values being referenced here,
	  // BaseReg and ScaledReg (global addresses are always available, as are any
	  // folded immediates).
	  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;

	  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
	  // lifetime wasn't extended by adding this instruction.
	  if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
	    BaseReg = nullptr;
	  if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
	    ScaledReg = nullptr;

	  // If folding this instruction (and its subexprs) didn't extend any live
	  // ranges, we're ok with it.
	  if (!BaseReg && !ScaledReg)
	    return true;

	  // If all uses of this instruction can have the address mode sunk into them,
	  // we can remove the addressing mode and effectively trade one live register
	  // for another (at worst.) In this context, folding an addressing mode into
	  // the use is just a particularly nice way of sinking it.
	  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
	  SmallPtrSet<Instruction*, 16> ConsideredInsts;
	  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
	    return false; // Has a non-memory, non-foldable use!

	  // Now that we know that all uses of this instruction are part of a chain of
	  // computation involving only operations that could theoretically be folded
	  // into a memory use, loop over each of these memory operation uses and see
	  // if they could actually fold the instruction. The assumption is that
	  // addressing modes are cheap and that duplicating the computation involved
	  // many times is worthwhile, even on a fastpath. For sinking candidates
	  // (i.e. cold call sites), this serves as a way to prevent excessive code
	  // growth since most architectures have some reasonable small and fast way to
	  // compute an effective address. (i.e LEA on x86)
	  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
	  for (const std::pair<Instruction *, unsigned> &MemUse : MemoryUses) {
	    Instruction *MemUser = MemUse.first;
	    unsigned OpNo = MemUse.second;

	    // Get the access type of this use. If the use isn't a pointer, we don't
	    // know what it accesses.
	    Value *Address = MemUser->getOperand(OpNo);
	    PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
	    if (!AddrTy)
	      return false;
	    Type *AddressAccessTy = AddrTy->getElementType();
	    unsigned AS = AddrTy->getAddressSpace();

	    // Do a match against the root of this address, ignoring profitability. This
	    // will tell us if the addressing mode for the memory operation will
	    // actually cover the shared instruction.
	    ExtAddrMode Result;
	    std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
	                                                                      0);
	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();
	    AddressingModeMatcher Matcher(
	        MatchedAddrModeInsts, TLI, TRI, AddressAccessTy, AS, MemoryInst, Result,
	        InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);
	    Matcher.IgnoreProfitability = true;
	    bool Success = Matcher.matchAddr(Address, 0);
	    (void)Success;
	    assert(Success && "Couldn't select anything?");

	    // The match was to check the profitability, the changes made are not
	    // part of the original matcher. Therefore, they should be dropped
	    // otherwise the original matcher will not present the right state.
	    TPT.rollback(LastKnownGood);

	    // If the match didn't cover I, then it won't be shared by it.
	    if (!is_contained(MatchedAddrModeInsts, I))
	      return false;

	    // Reuse the scratch vector for the next memory use.
	    MatchedAddrModeInsts.clear();
	  }

	  return true;
	}

	/// Return true if the specified values are defined in a
	/// different basic block than BB.
	static bool IsNonLocalValue(Value V, BasicBlock BB) {
	if (Instruction *I = dyn_cast<Instruction>(V))
	return I->getParent() != BB;
	return false;
	}

	/// Sink addressing mode computation immediate before MemoryInst if doing so
	/// can be done without increasing register pressure. The need for the
	/// register pressure constraint means this can end up being an all or nothing
	/// decision for all uses of the same addressing computation.
	///
	/// Load and Store Instructions often have addressing modes that can do
	/// significant amounts of computation. As such, instruction selection will try
	/// to get the load or store to do as much computation as possible for the
	/// program. The problem is that isel can only see within a single block. As
	/// such, we sink as much legal addressing mode work into the block as possible.
	///
	/// This method is used to optimize both load/store and inline asms with memory
	/// operands. It's also used to sink addressing computations feeding into cold
	/// call sites into their (cold) basic block.
	///
	/// The motivation for handling sinking into cold blocks is that doing so can
	/// both enable other address mode sinking (by satisfying the register pressure
	/// constraint above), and reduce register pressure globally (by removing the
	/// addressing mode computation from the fast path entirely.).
	bool CodeGenPrepare::optimizeMemoryInst(Instruction MemoryInst, Value Addr,
	Type *AccessTy, unsigned AddrSpace) {
	Value *Repl = Addr;

	// Try to collapse single-value PHI nodes. This is necessary to undo
	// unprofitable PRE transformations.
	SmallVector<Value*, 8> worklist;
	SmallPtrSet<Value*, 16> Visited;
	worklist.push_back(Addr);

	// Use a worklist to iteratively look through PHI and select nodes, and
	// ensure that the addressing mode obtained from the non-PHI/select roots of
	// the graph are compatible.
	bool PhiOrSelectSeen = false;
	SmallVector<Instruction*, 16> AddrModeInsts;
	const SimplifyQuery SQ(*DL, TLInfo);
	AddressingModeCombiner AddrModes(SQ, Addr);
	TypePromotionTransaction TPT(RemovedInsts);
	TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	TPT.getRestorationPoint();
	while (!worklist.empty()) {
	Value *V = worklist.back();
	worklist.pop_back();

	// We allow traversing cyclic Phi nodes.
	// In case of success after this loop we ensure that traversing through
	// Phi nodes ends up with all cases to compute address of the form
	// BaseGV + Base + Scale * Index + Offset
	// where Scale and Offset are constans and BaseGV, Base and Index
	// are exactly the same Values in all cases.
	// It means that BaseGV, Scale and Offset dominate our memory instruction
	// and have the same value as they had in address computation represented
	// as Phi. So we can safely sink address computation to memory instruction.
	if (!Visited.insert(V).second)
	continue;

	// For a PHI node, push all of its incoming values.
	if (PHINode *P = dyn_cast<PHINode>(V)) {
	for (Value *IncValue : P->incoming_values())
	worklist.push_back(IncValue);
	PhiOrSelectSeen = true;
	continue;
	}
	// Similar for select.
	if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
	worklist.push_back(SI->getFalseValue());
	worklist.push_back(SI->getTrueValue());
	PhiOrSelectSeen = true;
	continue;
	}

	// For non-PHIs, determine the addressing mode being computed. Note that
	// the result may differ depending on what other uses our candidate
	// addressing instructions might have.
	AddrModeInsts.clear();
	std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
	0);
	ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
	V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, TLI, TRI,
	InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP);

	GetElementPtrInst *GEP = LargeOffsetGEP.first;
	if (GEP && !NewGEPBases.count(GEP)) {
	// If splitting the underlying data structure can reduce the offset of a
	// GEP, collect the GEP. Skip the GEPs that are the new bases of
	// previously split data structures.
	LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
	if (LargeOffsetGEPID.find(GEP) == LargeOffsetGEPID.end())
	LargeOffsetGEPID[GEP] = LargeOffsetGEPID.size();
	}

	NewAddrMode.OriginalValue = V;
	if (!AddrModes.addNewAddrMode(NewAddrMode))
	break;
	}

	// Try to combine the AddrModes we've collected. If we couldn't collect any,
	// or we have multiple but either couldn't combine them or combining them
	// wouldn't do anything useful, bail out now.
	if (!AddrModes.combineAddrModes()) {
	TPT.rollback(LastKnownGood);
	return false;
	}
	TPT.commit();

	// Get the combined AddrMode (or the only AddrMode, if we only had one).
	ExtAddrMode AddrMode = AddrModes.getAddrMode();

	// If all the instructions matched are already in this BB, don't do anything.
	// If we saw a Phi node then it is not local definitely, and if we saw a select
	// then we want to push the address calculation past it even if it's already
	// in this BB.
	if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
	return IsNonLocalValue(V, MemoryInst->getParent());
	})) {
	LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
	<< "\n");
	return false;
	}

	// Insert this computation right after this user. Since our caller is
	// scanning from the top of the BB to the bottom, reuse of the expr are
	// guaranteed to happen later.
	IRBuilder<> Builder(MemoryInst);

	// Now that we determined the addressing expression we want to use and know
	// that we have to sink it into this block. Check to see if we have already
	// done this for some other load/store instr in this block. If so, reuse
	// the computation. Before attempting reuse, check if the address is valid
	// as it may have been erased.

	WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];

	Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
	if (SunkAddr) {
	LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
	<< " for " << *MemoryInst << "\n");
	if (SunkAddr->getType() != Addr->getType())
	SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
	} else if (AddrSinkUsingGEPs \|\|
	(!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) {
	// By default, we use the GEP-based method when AA is used later. This
	// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
	LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
	<< " for " << *MemoryInst << "\n");
	Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
	Value ResultPtr = nullptr, ResultIndex = nullptr;

	// First, find the pointer.
	if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
	ResultPtr = AddrMode.BaseReg;
	AddrMode.BaseReg = nullptr;
	}

	if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
	// We can't add more than one pointer together, nor can we scale a
	// pointer (both of which seem meaningless).
	if (ResultPtr \|\| AddrMode.Scale != 1)
	return false;

	ResultPtr = AddrMode.ScaledReg;
	AddrMode.Scale = 0;
	}

	// It is only safe to sign extend the BaseReg if we know that the math
	// required to create it did not overflow before we extend it. Since
	// the original IR value was tossed in favor of a constant back when
	// the AddrMode was created we need to bail out gracefully if widths
	// do not match instead of extending it.
	//
	// (See below for code to add the scale.)
	if (AddrMode.Scale) {
	Type *ScaledRegTy = AddrMode.ScaledReg->getType();
	if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
	cast<IntegerType>(ScaledRegTy)->getBitWidth())
	return false;
	}

	if (AddrMode.BaseGV) {
	if (ResultPtr)
	return false;

	ResultPtr = AddrMode.BaseGV;
	}

	// If the real base value actually came from an inttoptr, then the matcher
	// will look through it and provide only the integer value. In that case,
	// use it here.
	if (!DL->isNonIntegralPointerType(Addr->getType())) {
	if (!ResultPtr && AddrMode.BaseReg) {
	ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
	"sunkaddr");
	AddrMode.BaseReg = nullptr;
	} else if (!ResultPtr && AddrMode.Scale == 1) {
	ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
	"sunkaddr");
	AddrMode.Scale = 0;
	}
	}

	if (!ResultPtr &&
	!AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
	SunkAddr = Constant::getNullValue(Addr->getType());
	} else if (!ResultPtr) {
	return false;
	} else {
	Type *I8PtrTy =
	Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
	Type *I8Ty = Builder.getInt8Ty();

	// Start with the base register. Do this first so that subsequent address
	// matching finds it last, which will prevent it from trying to match it
	// as the scaled value in case it happens to be a mul. That would be
	// problematic if we've sunk a different mul for the scale, because then
	// we'd end up sinking both muls.
	if (AddrMode.BaseReg) {
	Value *V = AddrMode.BaseReg;
	if (V->getType() != IntPtrTy)
	V = Builder.CreateIntCast(V, IntPtrTy, /isSigned=/true, "sunkaddr");

	ResultIndex = V;
	}

	// Add the scale value.
	if (AddrMode.Scale) {
	Value *V = AddrMode.ScaledReg;
	if (V->getType() == IntPtrTy) {
	// done.
	} else {
	assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
	cast<IntegerType>(V->getType())->getBitWidth() &&
	"We can't transform if ScaledReg is too narrow");
	V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
	}

	if (AddrMode.Scale != 1)
	V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
	"sunkaddr");
	if (ResultIndex)
	ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
	else
	ResultIndex = V;
	}

	// Add in the Base Offset if present.
	if (AddrMode.BaseOffs) {
	Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
	if (ResultIndex) {
	// We need to add this separately from the scale above to help with
	// SDAG consecutive load/store merging.
	if (ResultPtr->getType() != I8PtrTy)
	ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
	ResultPtr =
	AddrMode.InBounds
	? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
	"sunkaddr")
	: Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
	}

	ResultIndex = V;
	}

	if (!ResultIndex) {
	SunkAddr = ResultPtr;
	} else {
	if (ResultPtr->getType() != I8PtrTy)
	ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
	SunkAddr =
	AddrMode.InBounds
	? Builder.CreateInBoundsGEP(I8Ty, ResultPtr, ResultIndex,
	"sunkaddr")
	: Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
	}

	if (SunkAddr->getType() != Addr->getType())
	SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
	}
	} else {
	// We'd require a ptrtoint/inttoptr down the line, which we can't do for
	// non-integral pointers, so in that case bail out now.
	Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
	Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
	PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
	PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
	if (DL->isNonIntegralPointerType(Addr->getType()) \|\|
	(BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) \|\|
	(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) \|\|
	(AddrMode.BaseGV &&
	DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
	return false;

	LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
	<< " for " << *MemoryInst << "\n");
	Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
	Value *Result = nullptr;

	// Start with the base register. Do this first so that subsequent address
	// matching finds it last, which will prevent it from trying to match it
	// as the scaled value in case it happens to be a mul. That would be
	// problematic if we've sunk a different mul for the scale, because then
	// we'd end up sinking both muls.
	if (AddrMode.BaseReg) {
	Value *V = AddrMode.BaseReg;
	if (V->getType()->isPointerTy())
	V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
	if (V->getType() != IntPtrTy)
	V = Builder.CreateIntCast(V, IntPtrTy, /isSigned=/true, "sunkaddr");
	Result = V;
	}

	// Add the scale value.
	if (AddrMode.Scale) {
	Value *V = AddrMode.ScaledReg;
	if (V->getType() == IntPtrTy) {
	// done.
	} else if (V->getType()->isPointerTy()) {
	V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
	} else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
	cast<IntegerType>(V->getType())->getBitWidth()) {
	V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
	} else {
	// It is only safe to sign extend the BaseReg if we know that the math
	// required to create it did not overflow before we extend it. Since
	// the original IR value was tossed in favor of a constant back when
	// the AddrMode was created we need to bail out gracefully if widths
	// do not match instead of extending it.
	Instruction *I = dyn_cast_or_null<Instruction>(Result);
	if (I && (Result != AddrMode.BaseReg))
	I->eraseFromParent();
	return false;
	}
	if (AddrMode.Scale != 1)
	V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
	"sunkaddr");
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	// Add in the BaseGV if present.
	if (AddrMode.BaseGV) {
	Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	// Add in the Base Offset if present.
	if (AddrMode.BaseOffs) {
	Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
	if (Result)
	Result = Builder.CreateAdd(Result, V, "sunkaddr");
	else
	Result = V;
	}

	if (!Result)
	SunkAddr = Constant::getNullValue(Addr->getType());
	else
	SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
	}

	MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
	// Store the newly computed address into the cache. In the case we reused a
	// value, this should be idempotent.
	SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);

	// If we have no uses, recursively delete the value and all dead instructions
	// using it.
	if (Repl->use_empty()) {
	// This can cause recursive deletion, which can invalidate our iterator.
	// Use a WeakTrackingVH to hold onto it in case this happens.
	Value CurValue = &CurInstIterator;
	WeakTrackingVH IterHandle(CurValue);
	BasicBlock *BB = CurInstIterator->getParent();

	RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);

	if (IterHandle != CurValue) {
	// If the iterator instruction was recursively deleted, start over at the
	// start of the block.
	CurInstIterator = BB->begin();
	SunkAddrs.clear();
	}
	}
	++NumMemoryInsts;
	return true;
	}

	/// If there are any memory operands, use OptimizeMemoryInst to sink their
	/// address computing into the block when possible / profitable.
	///
	/// \return true if optimizeMemoryInst changed at least one operand of \p CS.
	bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
	  bool MadeChange = false;

	  const TargetRegisterInfo *TRI =
	      TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
	  TargetLowering::AsmOperandInfoVector TargetConstraints =
	      TLI->ParseConstraints(*DL, TRI, CS);
	  // Index of the call argument that corresponds to the current constraint.
	  unsigned ArgNo = 0;
	  for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
	    // Compute the constraint code and ConstraintType to use.
	    TLI->ComputeConstraintToUse(OpInfo, SDValue());

	    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	        OpInfo.isIndirect) {
	      Value *OpVal = CS->getArgOperand(ArgNo++);
	      MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
	    } else if (OpInfo.Type == InlineAsm::isInput)
	      ArgNo++;
	  }

	  return MadeChange;
	}

	/// Check if all the uses of \p Val are equivalent (or free) zero or
	/// sign extensions.
	///
	/// \p Val must have at least one use. The first user decides whether sign or
	/// zero extensions are expected; every other user must be of the same kind.
	static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
	  assert(!Val->use_empty() && "Input must have at least one use");
	  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
	  bool IsSExt = isa<SExtInst>(FirstUser);
	  Type *ExtTy = FirstUser->getType();
	  for (const User *U : Val->users()) {
	    const Instruction *UI = cast<Instruction>(U);
	    if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
	      return false;
	    Type *CurTy = UI->getType();
	    // Same input and output types: Same instruction after CSE.
	    if (CurTy == ExtTy)
	      continue;

	    // If IsSExt is true, we are in this situation:
	    // a = Val
	    // b = sext ty1 a to ty2
	    // c = sext ty1 a to ty3
	    // Assuming ty2 is shorter than ty3, this could be turned into:
	    // a = Val
	    // b = sext ty1 a to ty2
	    // c = sext ty2 b to ty3
	    // However, the last sext is not free.
	    if (IsSExt)
	      return false;

	    // This is a ZExt, maybe this is free to extend from one type to another.
	    // In that case, we would not account for a different use.
	    Type *NarrowTy;
	    Type *LargeTy;
	    if (ExtTy->getScalarType()->getIntegerBitWidth() >
	        CurTy->getScalarType()->getIntegerBitWidth()) {
	      NarrowTy = CurTy;
	      LargeTy = ExtTy;
	    } else {
	      NarrowTy = ExtTy;
	      LargeTy = CurTy;
	    }

	    if (!TLI.isZExtFree(NarrowTy, LargeTy))
	      return false;
	  }
	  // All uses are the same or can be derived from one another for free.
	  return true;
	}

	/// Try to speculatively promote extensions in \p Exts and continue
	/// promoting through newly promoted operands recursively as far as doing so is
	/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
	/// When some promotion happened, \p TPT contains the proper state to revert
	/// them.
	///
	/// \param TPT transaction recording every speculative change so that it can be
	/// rolled back.
	/// \param Exts extensions to try to move up.
	/// \param ProfitablyMovedExts output list; receives either the extensions that
	/// were moved up or, when moving was impossible or unprofitable, the original
	/// extension.
	/// \param CreatedInstsCost cost accumulated by the enclosing recursion levels.
	/// \return true if some promotion happened, false otherwise.
	bool CodeGenPrepare::tryToPromoteExts(
	    TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
	    SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
	    unsigned CreatedInstsCost) {
	  bool Promoted = false;

	  // Iterate over all the extensions to try to promote them.
	  for (auto I : Exts) {
	    // Early check if we directly have ext(load).
	    if (isa<LoadInst>(I->getOperand(0))) {
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }

	    // Check whether or not we want to do any promotion.  The reason we have
	    // this check inside the for loop is to catch the case where an extension
	    // is directly fed by a load because in such case the extension can be moved
	    // up without any promotion on its operands.
	    // The condition does not depend on I, so no promotion can have been
	    // recorded by an earlier iteration when this returns.
	    if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
	      return false;

	    // Get the action to perform the promotion.
	    TypePromotionHelper::Action TPH =
	        TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
	    // Check if we can promote.
	    if (!TPH) {
	      // Save the current extension as we cannot move up through its operand.
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }

	    // Save the current state.
	    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	        TPT.getRestorationPoint();
	    SmallVector<Instruction *, 4> NewExts;
	    unsigned NewCreatedInstsCost = 0;
	    // 1 when the extension is not free for the target, 0 otherwise.
	    unsigned ExtCost = !TLI->isExtFree(I);
	    // Promote.
	    Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
	                             &NewExts, nullptr, *TLI);
	    assert(PromotedVal &&
	           "TypePromotionHelper should have filtered out those cases");

	    // We would be able to merge only one extension in a load.
	    // Therefore, if we have more than 1 new extension we heuristically
	    // cut this search path, because it means we degrade the code quality.
	    // With exactly 2, the transformation is neutral, because we will merge
	    // one extension but leave one. However, we optimistically keep going,
	    // because the new extension may be removed too.
	    long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
	    // FIXME: It would be possible to propagate a negative value instead of
	    // conservatively ceiling it to 0.
	    TotalCreatedInstsCost =
	        std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
	    if (!StressExtLdPromotion &&
	        (TotalCreatedInstsCost > 1 ||
	         !isPromotedInstructionLegal(TLI, DL, PromotedVal))) {
	      // This promotion is not profitable, rollback to the previous state, and
	      // save the current extension in ProfitablyMovedExts as the latest
	      // speculative promotion turned out to be unprofitable.
	      TPT.rollback(LastKnownGood);
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }
	    // Continue promoting NewExts as far as doing so is profitable.
	    SmallVector<Instruction *, 2> NewlyMovedExts;
	    (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
	    bool NewPromoted = false;
	    for (Instruction *MovedExt : NewlyMovedExts) {
	      Value *ExtOperand = MovedExt->getOperand(0);
	      // If we have reached to a load, we need this extra profitability check
	      // as it could potentially be merged into an ext(load).
	      if (isa<LoadInst>(ExtOperand) &&
	          !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
	            (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
	        continue;

	      ProfitablyMovedExts.push_back(MovedExt);
	      NewPromoted = true;
	    }

	    // If none of speculative promotions for NewExts is profitable, rollback
	    // and save the current extension (I) as the last profitable extension.
	    if (!NewPromoted) {
	      TPT.rollback(LastKnownGood);
	      ProfitablyMovedExts.push_back(I);
	      continue;
	    }
	    // The promotion is profitable.
	    Promoted = true;
	  }
	  return Promoted;
	}

	/// Merging redundant sexts when one is dominating the other.
	///
	/// Walks every entry of ValToSExtendedUses. For each recorded sext of the
	/// entry's value, it is merged with an already kept sext when one of the two
	/// dominates the other; the dominated instruction has its uses replaced, is
	/// recorded in RemovedInsts and is unlinked from its parent (it is not deleted
	/// here).
	///
	/// \param F function whose dominator tree (obtained through getDT) is used.
	/// \return true if at least one sext was merged into another.
	bool CodeGenPrepare::mergeSExts(Function &F) {
	  bool Changed = false;
	  for (auto &Entry : ValToSExtendedUses) {
	    SExts &Insts = Entry.second;
	    // Sexts kept so far for this value; none of them dominates another.
	    SExts CurPts;
	    for (Instruction *Inst : Insts) {
	      // Skip instructions already removed, no longer a sext, or no longer
	      // extending the value this entry is keyed on.
	      if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
	          Inst->getOperand(0) != Entry.first)
	        continue;
	      bool inserted = false;
	      for (auto &Pt : CurPts) {
	        if (getDT(F).dominates(Inst, Pt)) {
	          // Inst dominates the kept sext: Inst takes its place.
	          Pt->replaceAllUsesWith(Inst);
	          RemovedInsts.insert(Pt);
	          Pt->removeFromParent();
	          Pt = Inst;
	          inserted = true;
	          Changed = true;
	          break;
	        }
	        if (!getDT(F).dominates(Pt, Inst))
	          // Give up if we need to merge in a common dominator as the
	          // experiments show it is not profitable.
	          continue;
	        // The kept sext dominates Inst: Inst is redundant.
	        Inst->replaceAllUsesWith(Pt);
	        RemovedInsts.insert(Inst);
	        Inst->removeFromParent();
	        inserted = true;
	        Changed = true;
	        break;
	      }
	      if (!inserted)
	        CurPts.push_back(Inst);
	    }
	  }
	  return Changed;
	}

	// Splitting large data structures so that the GEPs accessing them can have
	// smaller offsets so that they can be sunk to the same blocks as their users.
	// For example, a large struct starting from %base is split into two parts
	// where the second part starts from %new_base.
	//
	// Before:
	// BB0:
	//   %base =
	//
	// BB1:
	//   %gep0 = gep %base, off0
	//   %gep1 = gep %base, off1
	//   %gep2 = gep %base, off2
	//
	// BB2:
	//   %load1 = load %gep0
	//   %load2 = load %gep1
	//   %load3 = load %gep2
	//
	// After:
	// BB0:
	//   %base =
	//   %new_base = gep %base, off0
	//
	// BB1:
	//   %new_gep0 = %new_base
	//   %new_gep1 = gep %new_base, off1 - off0
	//   %new_gep2 = gep %new_base, off2 - off0
	//
	// BB2:
	//   %load1 = load i32, i32* %new_gep0
	//   %load2 = load i32, i32* %new_gep1
	//   %load3 = load i32, i32* %new_gep2
	//
	// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
	// their offsets are small enough to fit into the addressing mode.
	//
	// Returns true if at least one GEP was rewritten.
	bool CodeGenPrepare::splitLargeGEPOffsets() {
	bool Changed = false;
	for (auto &Entry : LargeOffsetGEPMap) {
	Value *OldBase = Entry.first;
	SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
	&LargeOffsetGEPs = Entry.second;
	// Orders by offset; GEPs with equal offsets are ordered by the ID recorded
	// in LargeOffsetGEPID. The same GEP never orders before itself.
	auto compareGEPOffset =
	[&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
	const std::pair<GetElementPtrInst *, int64_t> &RHS) {
	if (LHS.first == RHS.first)
	return false;
	if (LHS.second != RHS.second)
	return LHS.second < RHS.second;
	return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
	};
	// Sorting all the GEPs of the same data structures based on the offsets.
	llvm::sort(LargeOffsetGEPs, compareGEPOffset);
	// Drop adjacent duplicate (GEP, offset) pairs left after sorting.
	LargeOffsetGEPs.erase(
	std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
	LargeOffsetGEPs.end());
	// Skip if all the GEPs have the same offsets.
	if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
	continue;
	// The GEP with the smallest offset provides the first split point.
	GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
	int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
	Value *NewBaseGEP = nullptr;

	// Each iteration rewrites one GEP and erases it from the list, so the
	// iterator is advanced by the erase() at the bottom of the loop.
	auto LargeOffsetGEP = LargeOffsetGEPs.begin();
	while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
	GetElementPtrInst *GEP = LargeOffsetGEP->first;
	int64_t Offset = LargeOffsetGEP->second;
	if (Offset != BaseOffset) {
	TargetLowering::AddrMode AddrMode;
	AddrMode.BaseOffs = Offset - BaseOffset;
	// The result type of the GEP might not be the type of the memory
	// access.
	if (!TLI->isLegalAddressingMode(*DL, AddrMode,
	GEP->getResultElementType(),
	GEP->getAddressSpace())) {
	// We need to create a new base if the offset to the current base is
	// too large to fit into the addressing mode. So, a very large struct
	// may be split into several parts.
	BaseGEP = GEP;
	BaseOffset = Offset;
	NewBaseGEP = nullptr;
	}
	}

	// Generate a new GEP to replace the current one.
	LLVMContext &Ctx = GEP->getContext();
	Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
	Type *I8PtrTy =
	Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
	Type *I8Ty = Type::getInt8Ty(Ctx);

	if (!NewBaseGEP) {
	// Create a new base if we don't have one yet. Find the insertion
	// pointer for the new base first.
	BasicBlock::iterator NewBaseInsertPt;
	BasicBlock *NewBaseInsertBB;
	if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
	// If the base of the struct is an instruction, the new base will be
	// inserted close to it.
	NewBaseInsertBB = BaseI->getParent();
	if (isa<PHINode>(BaseI))
	NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
	else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
	// For an invoke, insert into the block returned by splitting the
	// edge to its normal destination.
	NewBaseInsertBB =
	SplitEdge(NewBaseInsertBB, Invoke->getNormalDest());
	NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
	} else
	NewBaseInsertPt = std::next(BaseI->getIterator());
	} else {
	// If the current base is an argument or global value, the new base
	// will be inserted to the entry block.
	NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
	NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
	}
	IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
	// Create a new base.
	Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset);
	NewBaseGEP = OldBase;
	if (NewBaseGEP->getType() != I8PtrTy)
	NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
	NewBaseGEP =
	NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
	// Record the new base in NewGEPBases.
	NewGEPBases.insert(NewBaseGEP);
	}

	// The replacement is built right before the GEP it replaces.
	IRBuilder<> Builder(GEP);
	Value *NewGEP = NewBaseGEP;
	if (Offset == BaseOffset) {
	// The GEP addresses the new base itself; only a cast may be needed.
	if (GEP->getType() != I8PtrTy)
	NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
	} else {
	// Calculate the new offset for the new GEP.
	Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset);
	NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);

	if (GEP->getType() != I8PtrTy)
	NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
	}
	GEP->replaceAllUsesWith(NewGEP);
	// Drop the bookkeeping entries for the old GEP before erasing the
	// instruction itself.
	LargeOffsetGEPID.erase(GEP);
	LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
	GEP->eraseFromParent();
	Changed = true;
	}
	}
	return Changed;
	}

	/// Return true, if an ext(load) can be formed from an extension in
	/// \p MovedExts.
	///
	/// \param MovedExts candidate extensions; the first one whose operand 0 is a
	/// load is selected.
	/// \param LI [in/out] set to the selected load when one is found; otherwise
	/// left as passed in. The function returns false if it is null after the scan.
	/// \param Inst [out] set to the selected extension; written only when a load
	/// is found.
	/// \param HasPromoted whether a promotion happened before this call; when
	/// false, an extension already in the same block as its load is rejected.
	/// \return the result of TLI->isExtLoad() for the selected pair, or false when
	/// no candidate passes the checks above.
	bool CodeGenPrepare::canFormExtLd(
	    const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
	    Instruction *&Inst, bool HasPromoted) {
	  for (auto *MovedExtInst : MovedExts) {
	    if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
	      LI = cast<LoadInst>(MovedExtInst->getOperand(0));
	      Inst = MovedExtInst;
	      break;
	    }
	  }
	  if (!LI)
	    return false;

	  // If they're already in the same block, there's nothing to do.
	  // Make the cheap checks first if we did not promote.
	  // If we promoted, we need to check if it is indeed profitable.
	  if (!HasPromoted && LI->getParent() == Inst->getParent())
	    return false;

	  return TLI->isExtLoad(LI, Inst, *DL);
	}

	/// Move a zext or sext fed by a load into the same basic block as the load,
	/// unless conditions are unfavorable. This allows SelectionDAG to fold the
	/// extend into the load.
	///
	/// E.g.,
	/// \code
	/// %ld = load i32* %addr
	/// %add = add nuw i32 %ld, 4
	/// %zext = zext i32 %add to i64
	/// \endcode
	/// =>
	/// \code
	/// %ld = load i32* %addr
	/// %zext = zext i32 %ld to i64
	/// %add = add nuw i64 %zext, 4
	/// \endcode
	/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which
	/// allows us to match zext(load i32*) to i64.
	///
	/// Also, try to promote the computations used to obtain a sign extended
	/// value used into memory accesses.
	/// E.g.,
	/// \code
	/// a = add nsw i32 b, 3
	/// d = sext i32 a to i64
	/// e = getelementptr ..., i64 d
	/// \endcode
	/// =>
	/// \code
	/// f = sext i32 b to i64
	/// a = add nsw i64 f, 3
	/// e = getelementptr ..., i64 a
	/// \endcode
	///
	/// \p Inst[in/out] the extension may be modified during the process if some
	/// promotions apply.
	/// \return true if an extension was moved next to its load or if address type
	/// promotion reported a change; false otherwise, in which case every
	/// speculative change has been rolled back.
	bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
	  // ExtLoad formation and address type promotion infrastructure requires TLI to
	  // be effective.
	  if (!TLI)
	    return false;

	  bool AllowPromotionWithoutCommonHeader = false;
	  // See if it is an interesting sext operations for the address type
	  // promotion before trying to promote it, e.g., the ones with the right
	  // type and used in memory accesses.
	  bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
	      *Inst, AllowPromotionWithoutCommonHeader);
	  TypePromotionTransaction TPT(RemovedInsts);
	  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
	      TPT.getRestorationPoint();
	  SmallVector<Instruction *, 1> Exts;
	  SmallVector<Instruction *, 2> SpeculativelyMovedExts;
	  Exts.push_back(Inst);

	  bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);

	  // Look for a load being extended.
	  LoadInst *LI = nullptr;
	  // Initialized so that the assert below never reads an indeterminate value.
	  Instruction *ExtFedByLoad = nullptr;

	  // Try to promote a chain of computation if it allows to form an extended
	  // load.
	  if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
	    assert(LI && ExtFedByLoad && "Expect a valid load and extension");
	    TPT.commit();
	    // Move the extend into the same block as the load
	    ExtFedByLoad->moveAfter(LI);
	    // CGP does not check if the zext would be speculatively executed when moved
	    // to the same basic block as the load. Preserving its original location
	    // would pessimize the debugging experience, as well as negatively impact
	    // the quality of sample pgo. We don't want to use "line 0" as that has a
	    // size cost in the line-table section and logically the zext can be seen as
	    // part of the load. Therefore we conservatively reuse the same debug
	    // location for the load and the zext.
	    ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
	    ++NumExtsMoved;
	    Inst = ExtFedByLoad;
	    return true;
	  }

	  // Continue promoting SExts if known as considerable depending on targets.
	  if (ATPConsiderable &&
	      performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
	                                  HasPromoted, TPT, SpeculativelyMovedExts))
	    return true;

	  TPT.rollback(LastKnownGood);
	  return false;
	}

	// Perform address type promotion if doing so is profitable.
	// If AllowPromotionWithoutCommonHeader == false, we should find other sext
	// instructions that sign extended the same initial value. However, if
	// AllowPromotionWithoutCommonHeader == true, we expect promoting the
	// extension is just profitable.
	//
	// Inst [in/out]: the extension being processed; replaced by the last element
	//   of SpeculativelyMovedExts when the speculative promotion is committed.
	// HasPromoted: whether the speculative promotion recorded in TPT changed
	//   anything.
	// TPT: transaction holding the speculative promotion; committed here when the
	//   promotion is kept.
	// SpeculativelyMovedExts: extensions produced by the speculative promotion;
	//   its last element is popped when the promotion is committed.
	// Returns true if a promotion was committed and actually changed something.
	bool CodeGenPrepare::performAddressTypePromotion(
	    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
	    bool HasPromoted, TypePromotionTransaction &TPT,
	    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
	  bool Promoted = false;
	  SmallPtrSet<Instruction *, 1> UnhandledExts;
	  bool AllSeenFirst = true;
	  for (auto I : SpeculativelyMovedExts) {
	    Value *HeadOfChain = I->getOperand(0);
	    DenseMap<Value *, Instruction *>::iterator AlreadySeen =
	        SeenChainsForSExt.find(HeadOfChain);
	    // If there is an unhandled SExt which has the same header, try to promote
	    // it as well.
	    if (AlreadySeen != SeenChainsForSExt.end()) {
	      if (AlreadySeen->second != nullptr)
	        UnhandledExts.insert(AlreadySeen->second);
	      AllSeenFirst = false;
	    }
	  }

	  if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
	                        SpeculativelyMovedExts.size() == 1)) {
	    TPT.commit();
	    if (HasPromoted)
	      Promoted = true;
	    for (auto I : SpeculativelyMovedExts) {
	      Value *HeadOfChain = I->getOperand(0);
	      // A null entry marks the header as handled.
	      SeenChainsForSExt[HeadOfChain] = nullptr;
	      ValToSExtendedUses[HeadOfChain].push_back(I);
	    }
	    // Update Inst as promotion happen.
	    Inst = SpeculativelyMovedExts.pop_back_val();
	  } else {
	    // This is the first chain visited from the header, keep the current chain
	    // as unhandled. Defer to promote this until we encounter another SExt
	    // chain derived from the same header.
	    for (auto I : SpeculativelyMovedExts) {
	      Value *HeadOfChain = I->getOperand(0);
	      SeenChainsForSExt[HeadOfChain] = Inst;
	    }
	    return false;
	  }

	  if (!AllSeenFirst && !UnhandledExts.empty())
	    for (auto VisitedSExt : UnhandledExts) {
	      if (RemovedInsts.count(VisitedSExt))
	        continue;
	      // Each deferred chain gets its own transaction, committed
	      // unconditionally. The locals are named distinctly so that they do not
	      // shadow the TPT and HasPromoted parameters.
	      TypePromotionTransaction ChainTPT(RemovedInsts);
	      SmallVector<Instruction *, 1> Exts;
	      SmallVector<Instruction *, 2> Chains;
	      Exts.push_back(VisitedSExt);
	      bool ChainPromoted = tryToPromoteExts(ChainTPT, Exts, Chains);
	      ChainTPT.commit();
	      if (ChainPromoted)
	        Promoted = true;
	      for (auto I : Chains) {
	        Value *HeadOfChain = I->getOperand(0);
	        // Mark this as handled.
	        SeenChainsForSExt[HeadOfChain] = nullptr;
	        ValToSExtendedUses[HeadOfChain].push_back(I);
	      }
	    }
	  return Promoted;
	}

	/// If an extension \p I and its source are both used outside the block that
	/// defines them, rewrite the out-of-block uses of the source to use a trunc of
	/// the extension instead. At most one trunc is inserted per user block.
	///
	/// \param I the {s|z}ext instruction; operand 0 is its source.
	/// \return true if at least one use of the source was rewritten.
	bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
	  BasicBlock *DefBB = I->getParent();

	  // If the result of a {s|z}ext and its source are both live out, rewrite all
	  // other uses of the source with result of extension.
	  Value *Src = I->getOperand(0);
	  if (Src->hasOneUse())
	    return false;

	  // Only do this xform if truncating is free.
	  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
	    return false;

	  // Only safe to perform the optimization if the source is also defined in
	  // this block.
	  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
	    return false;

	  // The extension is live out if any of its users is in another block.
	  bool DefIsLiveOut = false;
	  for (User *U : I->users()) {
	    Instruction *UI = cast<Instruction>(U);

	    // Figure out which BB this ext is used in.
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB)
	      continue;
	    DefIsLiveOut = true;
	    break;
	  }
	  if (!DefIsLiveOut)
	    return false;

	  // Make sure none of the uses are PHI nodes.
	  for (User *U : Src->users()) {
	    Instruction *UI = cast<Instruction>(U);
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB)
	      continue;
	    // Be conservative. We don't want this xform to end up introducing
	    // reloads just before load / store instructions.
	    if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
	      return false;
	  }

	  // InsertedTruncs - Only insert one trunc in each block once.
	  DenseMap<BasicBlock *, Instruction *> InsertedTruncs;

	  bool MadeChange = false;
	  for (Use &U : Src->uses()) {
	    Instruction *User = cast<Instruction>(U.getUser());

	    // Figure out which BB this ext is used in.
	    BasicBlock *UserBB = User->getParent();
	    if (UserBB == DefBB)
	      continue;

	    // Both src and def are live in this block. Rewrite the use.
	    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];

	    if (!InsertedTrunc) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
	      InsertedInsts.insert(InsertedTrunc);
	    }

	    // Replace a use of the {s|z}ext source with a use of the result.
	    U = InsertedTrunc;
	    ++NumExtUses;
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	// Find loads whose uses only use some of the loaded value's bits.  Add an "and"
	// just after the load if the target can fold this into one extload instruction,
	// with the hope of eliminating some of the other later "and" instructions using
	// the loaded value.  "and"s that are made trivially redundant by the insertion
	// of the new "and" are removed by this function, while others (e.g. those whose
	// path from the load goes through a phi) are left for isel to potentially
	// remove.
	//
	// For example:
	//
	// b0:
	//   x = load i32
	//   ...
	// b1:
	//   y = and x, 0xff
	//   z = use y
	//
	// becomes:
	//
	// b0:
	//   x = load i32
	//   x' = and x, 0xff
	//   ...
	// b1:
	//   z = use x'
	//
	// whereas:
	//
	// b0:
	//   x1 = load i32
	//   ...
	// b1:
	//   x2 = load i32
	//   ...
	// b2:
	//   x = phi x1, x2
	//   y = and x, 0xff
	//
	// becomes (after a call to optimizeLoadExt for each load):
	//
	// b0:
	//   x1 = load i32
	//   x1' = and x1, 0xff
	//   ...
	// b1:
	//   x2 = load i32
	//   x2' = and x2, 0xff
	//   ...
	// b2:
	//   x = phi x1', x2'
	//   y = and x, 0xff
	//
	// Returns true if a new "and" was inserted after Load, false if the load was
	// left untouched.
	bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
	  if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
	    return false;

	  // Skip loads we've already transformed.
	  if (Load->hasOneUse() &&
	      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
	    return false;

	  // Look at all uses of Load, looking through phis, to determine how many bits
	  // of the loaded value are needed.
	  SmallVector<Instruction *, 8> WorkList;
	  SmallPtrSet<Instruction *, 16> Visited;
	  SmallVector<Instruction *, 8> AndsToMaybeRemove;
	  for (auto *U : Load->users())
	    WorkList.push_back(cast<Instruction>(U));

	  EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
	  unsigned BitWidth = LoadResultVT.getSizeInBits();
	  APInt DemandBits(BitWidth, 0);
	  APInt WidestAndBits(BitWidth, 0);

	  while (!WorkList.empty()) {
	    Instruction *I = WorkList.back();
	    WorkList.pop_back();

	    // Break use-def graph loops.
	    if (!Visited.insert(I).second)
	      continue;

	    // For a PHI node, push all of its users.
	    if (auto *Phi = dyn_cast<PHINode>(I)) {
	      for (auto *U : Phi->users())
	        WorkList.push_back(cast<Instruction>(U));
	      continue;
	    }

	    // Any user other than and/shl/trunc with a constant operand makes the
	    // demanded bits unknown, so the transformation is abandoned.
	    switch (I->getOpcode()) {
	    case Instruction::And: {
	      auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
	      if (!AndC)
	        return false;
	      APInt AndBits = AndC->getValue();
	      DemandBits |= AndBits;
	      // Keep track of the widest and mask we see.
	      if (AndBits.ugt(WidestAndBits))
	        WidestAndBits = AndBits;
	      if (AndBits == WidestAndBits && I->getOperand(0) == Load)
	        AndsToMaybeRemove.push_back(I);
	      break;
	    }

	    case Instruction::Shl: {
	      auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
	      if (!ShlC)
	        return false;
	      uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
	      DemandBits.setLowBits(BitWidth - ShiftAmt);
	      break;
	    }

	    case Instruction::Trunc: {
	      EVT TruncVT = TLI->getValueType(*DL, I->getType());
	      unsigned TruncBitWidth = TruncVT.getSizeInBits();
	      DemandBits.setLowBits(TruncBitWidth);
	      break;
	    }

	    default:
	      return false;
	    }
	  }

	  uint32_t ActiveBits = DemandBits.getActiveBits();
	  // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
	  // target even if isLoadExtLegal says an i1 EXTLOAD is valid.  For example,
	  // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
	  // (and (load x) 1) is not matched as a single instruction, rather as a LDR
	  // followed by an AND.
	  // TODO: Look into removing this restriction by fixing backends to either
	  // return false for isLoadExtLegal for i1 or have them select this pattern to
	  // a single instruction.
	  //
	  // Also avoid hoisting if we didn't see any ands with the exact DemandBits
	  // mask, since these are the only ands that will be removed by isel.
	  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
	      WidestAndBits != DemandBits)
	    return false;

	  LLVMContext &Ctx = Load->getType()->getContext();
	  Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
	  EVT TruncVT = TLI->getValueType(*DL, TruncTy);

	  // Reject cases that won't be matched as extloads.
	  if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
	      !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
	    return false;

	  IRBuilder<> Builder(Load->getNextNode());
	  // The result is dereferenced unconditionally below, so use cast<> (which
	  // asserts on a type mismatch) rather than dyn_cast<> (which would yield a
	  // null pointer that is never checked).
	  auto *NewAnd = cast<Instruction>(
	      Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
	  // Mark this instruction as "inserted by CGP", so that other
	  // optimizations don't touch it.
	  InsertedInsts.insert(NewAnd);

	  // Replace all uses of load with new and (except for the use of load in the
	  // new and itself).
	  Load->replaceAllUsesWith(NewAnd);
	  NewAnd->setOperand(0, Load);

	  // Remove any and instructions that are now redundant.
	  for (auto *And : AndsToMaybeRemove)
	    // Check that the and mask is the same as the one we decided to put on the
	    // new and.
	    if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
	      And->replaceAllUsesWith(NewAnd);
	      // Keep the block walk valid if it currently points at the erased and.
	      if (&*CurInstIterator == And)
	        CurInstIterator = std::next(And->getIterator());
	      And->eraseFromParent();
	      ++NumAndUses;
	    }

	  ++NumAndsAdded;
	  return true;
	}

	/// Check if V (an operand of a select instruction) is an expensive instruction
	/// that is only used once.
	static bool sinkSelectOperand(const TargetTransformInfo TTI, Value V) {
	auto *I = dyn_cast<Instruction>(V);
	// If it's safe to speculatively execute, then it should not have side
	// effects; therefore, it's safe to sink and possibly not execute.
	return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
	TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
	}

	/// Returns true if a SelectInst should be turned into an explicit branch.
	///
	/// \param TTI cost model forwarded to sinkSelectOperand().
	/// \param TLI target lowering providing isPredictableSelectExpensive() and
	/// getPredictableBranchThreshold().
	/// \param SI the select under consideration.
	/// \return true when profile metadata shows one side is taken with a
	/// probability above the target's threshold, or when the condition is a
	/// single-use compare and one of the select operands is worth sinking.
	static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
	                                                const TargetLowering *TLI,
	                                                SelectInst *SI) {
	  // If even a predictable select is cheap, then a branch can't be cheaper.
	  if (!TLI->isPredictableSelectExpensive())
	    return false;

	  // FIXME: This should use the same heuristics as IfConversion to determine
	  // whether a select is better represented as a branch.

	  // If metadata tells us that the select condition is obviously predictable,
	  // then we want to replace the select with a branch.
	  uint64_t TrueWeight, FalseWeight;
	  if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
	    uint64_t Max = std::max(TrueWeight, FalseWeight);
	    uint64_t Sum = TrueWeight + FalseWeight;
	    // A zero sum carries no information and would make the ratio undefined.
	    if (Sum != 0) {
	      auto Probability = BranchProbability::getBranchProbability(Max, Sum);
	      if (Probability > TLI->getPredictableBranchThreshold())
	        return true;
	    }
	  }

	  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());

	  // If a branch is predictable, an out-of-order CPU can avoid blocking on its
	  // comparison condition. If the compare has more than one use, there's
	  // probably another cmov or setcc around, so it's not worth emitting a branch.
	  if (!Cmp || !Cmp->hasOneUse())
	    return false;

	  // If either operand of the select is expensive and only needed on one side
	  // of the select, we should form a branch.
	  return sinkSelectOperand(TTI, SI->getTrueValue()) ||
	         sinkSelectOperand(TTI, SI->getFalseValue());
	}

	/// Pick the true operand of \p SI when \p isTrue is set, and its false operand
	/// otherwise. Whenever the picked operand is itself a select contained in
	/// \p Selects, keep following the same side of that select until reaching a
	/// value that is not produced by a select in \p Selects.
	static Value *getTrueOrFalseValue(
	    SelectInst *SI, bool isTrue,
	    const SmallPtrSet<const Instruction *, 2> &Selects) {
	  Value *Picked = nullptr;
	  SelectInst *Cur = SI;

	  while (Cur && Selects.count(Cur)) {
	    assert(Cur->getCondition() == SI->getCondition() &&
	           "The condition of DefSI does not match with SI");
	    if (isTrue)
	      Picked = Cur->getTrueValue();
	    else
	      Picked = Cur->getFalseValue();
	    // Continue only while the chosen operand is another select.
	    Cur = dyn_cast<SelectInst>(Picked);
	  }

	  assert(Picked && "Failed to get select true/false value");
	  return Picked;
	}

	/// Hoist a vector shift above a select of splat shift amounts when the target
	/// reports that shifting a vector by a scalar is cheap.
	///
	/// \param Shift the shift instruction; it is erased when the rewrite applies.
	/// \return true if \p Shift was replaced by a select of two shifts.
	bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
	  assert(Shift->isShift() && "Expected a shift");

	  // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
	  // general vector shifts, and (3) the shift amount is a select-of-splatted
	  // values, hoist the shifts before the select:
	  //   shift Op0, (select Cond, TVal, FVal) -->
	  //   select Cond, (shift Op0, TVal), (shift Op0, FVal)
	  //
	  // This is inverting a generic IR transform when we know that the cost of a
	  // general vector shift is more than the cost of 2 shift-by-scalars.
	  // We can't do this effectively in SDAG because we may not be able to
	  // determine if the select operands are splats from within a basic block.
	  Type *Ty = Shift->getType();
	  if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
	    return false;
	  Value *Cond, *TVal, *FVal;
	  if (!match(Shift->getOperand(1),
	             m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
	    return false;
	  if (!isSplatValue(TVal) || !isSplatValue(FVal))
	    return false;

	  IRBuilder<> Builder(Shift);
	  BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
	  Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
	  Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
	  Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
	  Shift->replaceAllUsesWith(NewSel);
	  Shift->eraseFromParent();
	  return true;
	}

	/// If we have a SelectInst that will likely profit from branch prediction,
	/// turn it into a branch.
	///
	/// All select instructions that immediately follow \p SI and share its
	/// condition are handled together with it. CurInstIterator is advanced past
	/// that group even when the function later returns false.
	///
	/// \param SI first select of the group.
	/// \return true if the group was replaced by a conditional branch and PHI
	/// nodes, false if nothing was lowered.
	bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
	  // If branch conversion isn't desirable, exit early.
	  if (DisableSelectToBranch || OptSize || !TLI)
	    return false;

	  // Find all consecutive select instructions that share the same condition.
	  SmallVector<SelectInst *, 2> ASI;
	  ASI.push_back(SI);
	  for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
	       It != SI->getParent()->end(); ++It) {
	    SelectInst *I = dyn_cast<SelectInst>(&*It);
	    if (I && SI->getCondition() == I->getCondition()) {
	      ASI.push_back(I);
	    } else {
	      break;
	    }
	  }

	  SelectInst *LastSI = ASI.back();
	  // Increment the current iterator to skip all the rest of select instructions
	  // because they will be either "not lowered" or "all lowered" to branch.
	  CurInstIterator = std::next(LastSI->getIterator());

	  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);

	  // Can we convert the 'select' to CF ?
	  if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
	    return false;

	  // NOTE: VectorCond is always false past the check above, so the first arm
	  // below cannot be taken; it is kept to leave the classification unchanged.
	  TargetLowering::SelectSupportKind SelectKind;
	  if (VectorCond)
	    SelectKind = TargetLowering::VectorMaskSelect;
	  else if (SI->getType()->isVectorTy())
	    SelectKind = TargetLowering::ScalarCondVectorVal;
	  else
	    SelectKind = TargetLowering::ScalarValSelect;

	  if (TLI->isSelectSupported(SelectKind) &&
	      !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
	    return false;

	  // The DominatorTree needs to be rebuilt by any consumers after this
	  // transformation. We simply reset here rather than setting the ModifiedDT
	  // flag to avoid restarting the function walk in runOnFunction for each
	  // select optimized.
	  DT.reset();

	  // Transform a sequence like this:
	  //    start:
	  //       %cmp = cmp uge i32 %a, %b
	  //       %sel = select i1 %cmp, i32 %c, i32 %d
	  //
	  // Into:
	  //    start:
	  //       %cmp = cmp uge i32 %a, %b
	  //       br i1 %cmp, label %select.true, label %select.false
	  //    select.true:
	  //       br label %select.end
	  //    select.false:
	  //       br label %select.end
	  //    select.end:
	  //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
	  //
	  // In addition, we may sink instructions that produce %c or %d from
	  // the entry block into the destination(s) of the new branch.
	  // If the true or false blocks do not contain a sunken instruction, that
	  // block and its branch may be optimized away. In that case, one side of the
	  // first branch will point directly to select.end, and the corresponding PHI
	  // predecessor block will be the start block.

	  // First, we split the block containing the select into 2 blocks.
	  BasicBlock *StartBlock = SI->getParent();
	  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
	  BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");

	  // Delete the unconditional branch that was just created by the split.
	  StartBlock->getTerminator()->eraseFromParent();

	  // These are the new basic blocks for the conditional branch.
	  // At least one will become an actual new basic block.
	  BasicBlock *TrueBlock = nullptr;
	  BasicBlock *FalseBlock = nullptr;
	  BranchInst *TrueBranch = nullptr;
	  BranchInst *FalseBranch = nullptr;

	  // Sink expensive instructions into the conditional blocks to avoid executing
	  // them speculatively.
	  for (SelectInst *SI : ASI) {
	    if (sinkSelectOperand(TTI, SI->getTrueValue())) {
	      if (TrueBlock == nullptr) {
	        TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
	                                       EndBlock->getParent(), EndBlock);
	        TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
	        TrueBranch->setDebugLoc(SI->getDebugLoc());
	      }
	      auto *TrueInst = cast<Instruction>(SI->getTrueValue());
	      TrueInst->moveBefore(TrueBranch);
	    }
	    if (sinkSelectOperand(TTI, SI->getFalseValue())) {
	      if (FalseBlock == nullptr) {
	        FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
	                                        EndBlock->getParent(), EndBlock);
	        FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
	        FalseBranch->setDebugLoc(SI->getDebugLoc());
	      }
	      auto *FalseInst = cast<Instruction>(SI->getFalseValue());
	      FalseInst->moveBefore(FalseBranch);
	    }
	  }

	  // If there was nothing to sink, then arbitrarily choose the 'false' side
	  // for a new input value to the PHI.
	  if (TrueBlock == FalseBlock) {
	    assert(TrueBlock == nullptr &&
	           "Unexpected basic block transform while optimizing select");

	    FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
	                                    EndBlock->getParent(), EndBlock);
	    auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
	    FalseBranch->setDebugLoc(SI->getDebugLoc());
	  }

	  // Insert the real conditional branch based on the original condition.
	  // If we did not create a new block for one of the 'true' or 'false' paths
	  // of the condition, it means that side of the branch goes to the end block
	  // directly and the path originates from the start block from the point of
	  // view of the new PHI.
	  BasicBlock *TT, *FT;
	  if (TrueBlock == nullptr) {
	    TT = EndBlock;
	    FT = FalseBlock;
	    TrueBlock = StartBlock;
	  } else if (FalseBlock == nullptr) {
	    TT = TrueBlock;
	    FT = EndBlock;
	    FalseBlock = StartBlock;
	  } else {
	    TT = TrueBlock;
	    FT = FalseBlock;
	  }
	  IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);

	  SmallPtrSet<const Instruction *, 2> INS;
	  INS.insert(ASI.begin(), ASI.end());
	  // Use reverse iterator because later select may use the value of the
	  // earlier select, and we need to propagate value through earlier select
	  // to get the PHI operand.
	  for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
	    SelectInst *SI = *It;
	    // The select itself is replaced with a PHI Node.
	    PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
	    PN->takeName(SI);
	    PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
	    PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
	    PN->setDebugLoc(SI->getDebugLoc());

	    SI->replaceAllUsesWith(PN);
	    SI->eraseFromParent();
	    INS.erase(SI);
	    ++NumSelectsExpanded;
	  }

	  // Instruct OptimizeBlock to skip to the next block.
	  CurInstIterator = StartBlock->end();
	  return true;
	}

	/// Return true if every defined lane of \p SVI's shuffle mask selects the same
	/// element, i.e. the shuffle broadcasts (splats) a single element.
	/// Lanes whose mask value is -1 are ignored: they are compatible with any
	/// splat element. A mask made only of such lanes is accepted.
	static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
	  SmallVector<int, 16> Mask(SVI->getShuffleMask());
	  int SplatElem = -1;
	  for (int Elt : Mask) {
	    // A -1 lane must neither constrain nor reset the element seen so far;
	    // otherwise a mask such as <0, -1, 1> would be accepted as a splat.
	    if (Elt == -1)
	      continue;
	    if (SplatElem != -1 && Elt != SplatElem)
	      return false;
	    SplatElem = Elt;
	  }

	  return true;
	}

	/// Some targets have expensive vector shifts if the lanes aren't all the same
	/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
	/// it's often worth sinking a shufflevector splat down to its use so that
	/// codegen can spot all lanes are identical.
	///
	/// For every shift instruction that uses \p SVI from a block other than the
	/// defining block, a copy of the shuffle is created at the first insertion
	/// point of the user's block (at most one copy per block) and the user is
	/// rewritten to use that copy. \p SVI is erased if it ends up without uses.
	///
	/// \return true if the IR was changed.
	bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
	  BasicBlock *DefBB = SVI->getParent();

	  // Only do this xform if variable vector shifts are particularly expensive.
	  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
	    return false;

	  // We only expect better codegen by sinking a shuffle if we can recognise a
	  // constant splat.
	  if (!isBroadcastShuffle(SVI))
	    return false;

	  // InsertedShuffles - Only insert a shuffle in each block once.
	  DenseMap<BasicBlock *, Instruction *> InsertedShuffles;

	  // Rewriting a user below unlinks its use from SVI's use list, which would
	  // derail a live iteration over SVI->users(). Walk a snapshot instead.
	  SmallVector<User *, 8> Users(SVI->user_begin(), SVI->user_end());

	  bool MadeChange = false;
	  for (User *U : Users) {
	    Instruction *UI = cast<Instruction>(U);

	    // Figure out which BB this shuffle is used in.
	    BasicBlock *UserBB = UI->getParent();
	    if (UserBB == DefBB)
	      continue;

	    // For now only apply this when the splat is used by a shift instruction.
	    if (!UI->isShift())
	      continue;

	    // Everything checks out, sink the shuffle if the user's block doesn't
	    // already have a copy.
	    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];

	    if (!InsertedShuffle) {
	      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	      assert(InsertPt != UserBB->end());
	      InsertedShuffle =
	          new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
	                                SVI->getOperand(2), "", &*InsertPt);
	      InsertedShuffle->setDebugLoc(SVI->getDebugLoc());
	    }

	    // A user listed twice in the snapshot (it uses SVI in several operands)
	    // is fully rewritten the first time; the second visit is a no-op.
	    UI->replaceUsesOfWith(SVI, InsertedShuffle);
	    MadeChange = true;
	  }

	  // If we removed all uses, nuke the shuffle.
	  if (SVI->use_empty()) {
	    SVI->eraseFromParent();
	    MadeChange = true;
	  }

	  return MadeChange;
	}

	/// If the target reports (via TargetLowering::shouldSinkOperands) operands of
	/// \p I that can be folded into a target instruction together with \p I,
	/// clone those operands right before \p I and make \p I use the clones.
	/// Operands already defined in \p I's block, and PHI nodes, are left alone.
	/// Original operand instructions left without uses are erased.
	///
	/// \return true if at least one operand was sunk.
	bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
	  // If the operands of I can be folded into a target instruction together with
	  // I, duplicate and sink them.
	  SmallVector<Use *, 4> OpsToSink;
	  if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
	    return false;

	  // OpsToSink can contain multiple uses in a use chain (e.g.
	  // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
	  // uses must come first, which means they are sunk first, temporarily creating
	  // invalid IR. This will be fixed once their dominated users are sunk and
	  // updated.
	  BasicBlock *TargetBB = I->getParent();
	  bool Changed = false;
	  SmallVector<Use *, 4> ToReplace;
	  for (Use *U : OpsToSink) {
	    auto *UI = cast<Instruction>(U->get());
	    if (UI->getParent() == TargetBB || isa<PHINode>(UI))
	      continue;
	    ToReplace.push_back(U);
	  }

	  SmallPtrSet<Instruction *, 4> MaybeDead;
	  for (Use *U : ToReplace) {
	    auto *UI = cast<Instruction>(U->get());
	    Instruction *NI = UI->clone();
	    MaybeDead.insert(UI);
	    // Print the instructions themselves rather than their addresses.
	    LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
	    NI->insertBefore(I);
	    // Remember the clone in InsertedInsts.
	    InsertedInsts.insert(NI);
	    U->set(NI);
	    Changed = true;
	  }

	  // Remove instructions that are dead after sinking.
	  for (Instruction *Dead : MaybeDead)
	    if (Dead->use_empty())
	      Dead->eraseFromParent();

	  return Changed;
	}

	/// Widen the condition of \p SI, and every case constant, to the width of the
	/// register type the target uses for the condition's type, when that register
	/// is wider than the condition.
	///
	/// \return true if the switch was rewritten, false if there is no target or
	/// data layout information or the register is not wider than the condition.
	bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
	  if (!TLI || !DL)
	    return false;

	  Value *Cond = SI->getCondition();
	  Type *OldType = Cond->getType();
	  LLVMContext &Context = Cond->getContext();
	  MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
	  unsigned RegWidth = RegType.getSizeInBits();

	  if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
	    return false;

	  // If the register width is greater than the type width, expand the condition
	  // of the switch instruction and each case constant to the width of the
	  // register. By widening the type of the switch condition, subsequent
	  // comparisons (for case comparisons) will not need to be extended to the
	  // preferred register width, so we will potentially eliminate N-1 extends,
	  // where N is the number of cases in the switch.
	  auto *NewType = Type::getIntNTy(Context, RegWidth);

	  // Zero-extend the switch condition and case constants unless the switch
	  // condition is a function argument that is already being sign-extended.
	  // In that case, we can avoid an unnecessary mask/extension by sign-extending
	  // everything instead.
	  Instruction::CastOps ExtType = Instruction::ZExt;
	  if (auto *Arg = dyn_cast<Argument>(Cond))
	    if (Arg->hasSExtAttr())
	      ExtType = Instruction::SExt;

	  // Extend the condition right before the switch and retarget the switch.
	  auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
	  ExtInst->insertBefore(SI);
	  ExtInst->setDebugLoc(SI->getDebugLoc());
	  SI->setCondition(ExtInst);

	  // Extend every case value the same way the condition was extended.
	  for (auto Case : SI->cases()) {
	    APInt NarrowConst = Case.getCaseValue()->getValue();
	    APInt WideConst = (ExtType == Instruction::ZExt)
	                          ? NarrowConst.zext(RegWidth)
	                          : NarrowConst.sext(RegWidth);
	    Case.setValue(ConstantInt::get(Context, WideConst));
	  }

	  return true;
	}


	namespace {

	/// Helper class to promote a scalar operation to a vector one.
	/// This class is used to move downward extractelement transition.
	/// E.g.,
	/// a = vector_op <2 x i32>
	/// b = extractelement <2 x i32> a, i32 0
	/// c = scalar_op b
	/// store c
	///
	/// =>
	/// a = vector_op <2 x i32>
	/// c = vector_op a (equivalent to scalar_op on the related lane)
	/// * d = extractelement <2 x i32> c, i32 0
	/// * store d
	/// Assuming both extractelement and store can be combine, we get rid of the
	/// transition.
	class VectorPromoteHelper {
	/// DataLayout associated with the current module.
	const DataLayout &DL;

	/// Used to perform some checks on the legality of vector operations.
	const TargetLowering &TLI;

	/// Used to estimated the cost of the promoted chain.
	const TargetTransformInfo &TTI;

	/// The transition being moved downwards.
	Instruction *Transition;

	/// The sequence of instructions to be promoted.
	SmallVector<Instruction *, 4> InstsToBePromoted;

	/// Cost of combining a store and an extract.
	unsigned StoreExtractCombineCost;

	/// Instruction that will be combined with the transition.
	Instruction *CombineInst = nullptr;

	/// The instruction that represents the current end of the transition.
	/// Since we are faking the promotion until we reach the end of the chain
	/// of computation, we need a way to get the current end of the transition.
	Instruction *getEndOfTransition() const {
	if (InstsToBePromoted.empty())
	return Transition;
	return InstsToBePromoted.back();
	}

	/// Return the index of the original value in the transition.
	/// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
	/// c, is at index 0.
	unsigned getTransitionOriginalValueIdx() const {
	assert(isa<ExtractElementInst>(Transition) &&
	"Other kind of transitions are not supported yet");
	return 0;
	}

	/// Return the index of the index in the transition.
	/// E.g., for "extractelement <2 x i32> c, i32 0" the index
	/// is at index 1.
	unsigned getTransitionIdx() const {
	assert(isa<ExtractElementInst>(Transition) &&
	"Other kind of transitions are not supported yet");
	return 1;
	}

	/// Get the type of the transition.
	/// This is the type of the original value.
	/// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
	/// transition is <2 x i32>.
	Type *getTransitionType() const {
	return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
	}

	/// Promote \p ToBePromoted by moving \p Def downward through.
	/// I.e., we have the following sequence:
	/// Def = Transition <ty1> a to <ty2>
	/// b = ToBePromoted <ty2> Def, ...
	/// =>
	/// b = ToBePromoted <ty1> a, ...
	/// Def = Transition <ty1> ToBePromoted to <ty2>
	void promoteImpl(Instruction *ToBePromoted);

	/// Check whether or not it is profitable to promote all the
	/// instructions enqueued to be promoted.
	bool isProfitableToPromote() {
	Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
	unsigned Index = isa<ConstantInt>(ValIdx)
	? cast<ConstantInt>(ValIdx)->getZExtValue()
	: -1;
	Type *PromotedType = getTransitionType();

	StoreInst *ST = cast<StoreInst>(CombineInst);
	unsigned AS = ST->getPointerAddressSpace();
	unsigned Align = ST->getAlignment();
	// Check if this store is supported.
	if (!TLI.allowsMisalignedMemoryAccesses(
	TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
	Align)) {
	// If this is not supported, there is no way we can combine
	// the extract with the store.
	return false;
	}

	// The scalar chain of computation has to pay for the transition
	// scalar to vector.
	// The vector chain has to account for the combining cost.
	uint64_t ScalarCost =
	TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
	uint64_t VectorCost = StoreExtractCombineCost;
	for (const auto &Inst : InstsToBePromoted) {
	// Compute the cost.
	// By construction, all instructions being promoted are arithmetic ones.
	// Moreover, one argument is a constant that can be viewed as a splat
	// constant.
	Value *Arg0 = Inst->getOperand(0);
	bool IsArg0Constant = isa<UndefValue>(Arg0) \|\| isa<ConstantInt>(Arg0) \|\|
	isa<ConstantFP>(Arg0);
	TargetTransformInfo::OperandValueKind Arg0OVK =
	IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
	: TargetTransformInfo::OK_AnyValue;
	TargetTransformInfo::OperandValueKind Arg1OVK =
	!IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
	: TargetTransformInfo::OK_AnyValue;
	ScalarCost += TTI.getArithmeticInstrCost(
	Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
	VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
	Arg0OVK, Arg1OVK);
	}
	LLVM_DEBUG(
	dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
	<< ScalarCost << "\nVector: " << VectorCost << '\n');
	return ScalarCost > VectorCost;
	}

	/// Generate a constant vector with \p Val with the same
	/// number of elements as the transition.
	/// \p UseSplat defines whether or not \p Val should be replicated
	/// across the whole vector.
	/// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
	/// otherwise we generate a vector with as many undef as possible:
	/// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
	/// used at the index of the extract.
	Value getConstantVector(Constant Val, bool UseSplat) const {
	unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
	if (!UseSplat) {
	// If we cannot determine where the constant must be, we have to
	// use a splat constant.
	Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
	if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
	ExtractIdx = CstVal->getSExtValue();
	else
	UseSplat = true;
	}

	unsigned End = getTransitionType()->getVectorNumElements();
	if (UseSplat)
	return ConstantVector::getSplat(End, Val);

	SmallVector<Constant *, 4> ConstVec;
	UndefValue *UndefVal = UndefValue::get(Val->getType());
	for (unsigned Idx = 0; Idx != End; ++Idx) {
	if (Idx == ExtractIdx)
	ConstVec.push_back(Val);
	else
	ConstVec.push_back(UndefVal);
	}
	return ConstantVector::get(ConstVec);
	}

	/// Check if promoting to a vector type an operand at \p OperandIdx
	/// in \p Use can trigger undefined behavior.
	static bool canCauseUndefinedBehavior(const Instruction *Use,
	unsigned OperandIdx) {
	// This is not safe to introduce undef when the operand is on
	// the right hand side of a division-like instruction.
	if (OperandIdx != 1)
	return false;
	switch (Use->getOpcode()) {
	default:
	return false;
	case Instruction::SDiv:
	case Instruction::UDiv:
	case Instruction::SRem:
	case Instruction::URem:
	return true;
	case Instruction::FDiv:
	case Instruction::FRem:
	return !Use->hasNoNaNs();
	}
	llvm_unreachable(nullptr);
	}

	public:
	VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
	const TargetTransformInfo &TTI, Instruction *Transition,
	unsigned CombineCost)
	: DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
	StoreExtractCombineCost(CombineCost) {
	assert(Transition && "Do not know how to promote null");
	}

	/// Check if we can promote \p ToBePromoted to \p Type.
	bool canPromote(const Instruction *ToBePromoted) const {
	// We could support CastInst too.
	return isa<BinaryOperator>(ToBePromoted);
	}

	/// Check if it is profitable to promote \p ToBePromoted
	/// by moving downward the transition through.
	bool shouldPromote(const Instruction *ToBePromoted) const {
	// Promote only if all the operands can be statically expanded.
	// Indeed, we do not want to introduce any new kind of transitions.
	for (const Use &U : ToBePromoted->operands()) {
	const Value *Val = U.get();
	if (Val == getEndOfTransition()) {
	// If the use is a division and the transition is on the rhs,
	// we cannot promote the operation, otherwise we may create a
	// division by zero.
	if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
	return false;
	continue;
	}
	if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
	!isa<ConstantFP>(Val))
	return false;
	}
	// Check that the resulting operation is legal.
	int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
	if (!ISDOpcode)
	return false;
	return StressStoreExtract \|\|
	TLI.isOperationLegalOrCustom(
	ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
	}

	/// Check whether or not \p Use can be combined
	/// with the transition.
	/// I.e., is it possible to do Use(Transition) => AnotherUse?
	bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }

	/// Record \p ToBePromoted as part of the chain to be promoted.
	void enqueueForPromotion(Instruction *ToBePromoted) {
	InstsToBePromoted.push_back(ToBePromoted);
	}

	/// Set the instruction that will be combined with the transition.
	void recordCombineInstruction(Instruction *ToBeCombined) {
	assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
	CombineInst = ToBeCombined;
	}

	/// Promote all the instructions enqueued for promotion if it is
	/// is profitable.
	/// \return True if the promotion happened, false otherwise.
	bool promote() {
	// Check if there is something to promote.
	// Right now, if we do not have anything to combine with,
	// we assume the promotion is not profitable.
	if (InstsToBePromoted.empty() \|\| !CombineInst)
	return false;

	// Check cost.
	if (!StressStoreExtract && !isProfitableToPromote())
	return false;

	// Promote.
	for (auto &ToBePromoted : InstsToBePromoted)
	promoteImpl(ToBePromoted);
	InstsToBePromoted.clear();
	return true;
	}
	};

	} // end anonymous namespace

	/// Rewrite \p ToBePromoted so that it operates on the transition's vector
	/// type, and move the transition right after it.
	/// Every operand of \p ToBePromoted must be either the transition itself or
	/// an undef / integer / floating-point constant; anything else is
	/// unreachable.
	void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
	  // At this point, we know that all the operands of ToBePromoted but Def
	  // can be statically promoted.
	  // For Def, we need to use its parameter in ToBePromoted:
	  // b = ToBePromoted ty1 a
	  // Def = Transition ty1 b to ty2
	  // Move the transition down.
	  // 1. Replace all uses of the promoted operation by the transition.
	  // = ... b => = ... Def.
	  assert(ToBePromoted->getType() == Transition->getType() &&
	         "The type of the result of the transition does not match "
	         "the final type");
	  ToBePromoted->replaceAllUsesWith(Transition);
	  // 2. Update the type of the uses.
	  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
	  Type *TransitionTy = getTransitionType();
	  ToBePromoted->mutateType(TransitionTy);
	  // 3. Update all the operands of the promoted operation with promoted
	  // operands.
	  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
	  for (Use &U : ToBePromoted->operands()) {
	    Value *Val = U.get();
	    Value *NewVal = nullptr;
	    if (Val == Transition)
	      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
	    else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
	             isa<ConstantFP>(Val)) {
	      // Use a splat constant if it is not safe to use undef.
	      NewVal = getConstantVector(
	          cast<Constant>(Val),
	          isa<UndefValue>(Val) ||
	              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
	    } else
	      llvm_unreachable("Did you modified shouldPromote and forgot to update "
	                       "this?");
	    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
	  }
	  // 4. Re-attach the transition below the promoted operation so that it now
	  // extracts from the promoted (vector) result.
	  Transition->moveAfter(ToBePromoted);
	  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
	}

	/// Some targets can do store(extractelement) with one instruction.
	/// Try to push the extractelement towards the stores when the target
	/// has this feature and this is profitable.
	///
	/// Starting at \p Inst, follow the chain of single users inside \p Inst's
	/// block. Promotable users are enqueued; when a combinable user (a store)
	/// is reached, the enqueued chain is promoted if VectorPromoteHelper accepts
	/// it.
	///
	/// \return true if the promotion happened.
	bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
	  unsigned CombineCost = std::numeric_limits<unsigned>::max();
	  if (DisableStoreExtract || !TLI ||
	      (!StressStoreExtract &&
	       !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
	                                       Inst->getOperand(1), CombineCost)))
	    return false;

	  // At this point we know that Inst is a vector to scalar transition.
	  // Try to move it down the def-use chain, until:
	  // - We can combine the transition with its single use
	  //   => we got rid of the transition.
	  // - We escape the current basic block
	  //   => we would need to check that we are moving it at a cheaper place and
	  //      we do not do that for now.
	  BasicBlock *Parent = Inst->getParent();
	  LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
	  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
	  // If the transition has more than one use, assume this is not going to be
	  // beneficial.
	  while (Inst->hasOneUse()) {
	    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
	    LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');

	    if (ToBePromoted->getParent() != Parent) {
	      LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
	                        << ToBePromoted->getParent()->getName()
	                        << ") than the transition (" << Parent->getName()
	                        << ").\n");
	      return false;
	    }

	    if (VPH.canCombine(ToBePromoted)) {
	      LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
	                        << "will be combined with: " << *ToBePromoted << '\n');
	      VPH.recordCombineInstruction(ToBePromoted);
	      bool Changed = VPH.promote();
	      NumStoreExtractExposed += Changed;
	      return Changed;
	    }

	    LLVM_DEBUG(dbgs() << "Try promoting.\n");
	    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
	      return false;

	    LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");

	    // Continue the walk from the instruction just enqueued.
	    VPH.enqueueForPromotion(ToBePromoted);
	    Inst = ToBePromoted;
	  }
	  return false;
	}

	/// For the instruction sequence of store below, F and I values
	/// are bundled together as an i64 value before being stored into memory.
	/// Sometimes it is more efficient to generate separate stores for F and I,
	/// which can remove the bitwise instructions or sink them to colder places.
	///
	/// (store (or (zext (bitcast F to i32) to i64),
	/// (shl (zext I to i64), 32)), addr) -->
	/// (store F, addr) and (store I, addr+4)
	///
	/// Similarly, splitting for other merged store can also be beneficial, like:
	/// For pair of {i32, i32}, i64 store --> two i32 stores.
	/// For pair of {i32, i16}, i64 store --> two i32 stores.
	/// For pair of {i16, i16}, i32 store --> two i16 stores.
	/// For pair of {i16, i8}, i32 store --> two i16 stores.
	/// For pair of {i8, i8}, i16 store --> two i8 stores.
	///
	/// We allow each target to determine specifically which kind of splitting is
	/// supported.
	///
	/// The store patterns are commonly seen from the simple code snippet below
	/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
	/// void goo(const std::pair<int, float> &);
	/// hoo() {
	/// ...
	/// goo(std::make_pair(tmp, ftmp));
	/// ...
	/// }
	///
	/// Although we already have similar splitting in DAG Combine, we duplicate
	/// it in CodeGenPrepare to catch the case in which pattern is across
	/// multiple BBs. The logic in DAG Combine is kept to catch case generated
	/// during code expansion.
	static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
	                                const TargetLowering &TLI) {
	  // Handle simple but common cases only.
	  Type *StoreType = SI.getValueOperand()->getType();
	  if (!DL.typeSizeEqualsStoreSize(StoreType) ||
	      DL.getTypeSizeInBits(StoreType) == 0)
	    return false;

	  unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
	  Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
	  if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
	    return false;

	  // Don't split the store if it is volatile.
	  if (SI.isVolatile())
	    return false;

	  // Match the following patterns:
	  // (store (or (zext LValue to i64),
	  //            (shl (zext HValue to i64), 32)), HalfValBitSize)
	  //  or
	  // (store (or (shl (zext HValue to i64), 32)), HalfValBitSize)
	  //            (zext LValue to i64),
	  // Expect both operands of OR and the first operand of SHL have only
	  // one use.
	  Value *LValue, *HValue;
	  if (!match(SI.getValueOperand(),
	             m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
	                    m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
	                                   m_SpecificInt(HalfValBitSize))))))
	    return false;

	  // Check LValue and HValue are integers no wider than half the stored value.
	  if (!LValue->getType()->isIntegerTy() ||
	      DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
	      !HValue->getType()->isIntegerTy() ||
	      DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
	    return false;

	  // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
	  // as the input of target query.
	  auto *LBC = dyn_cast<BitCastInst>(LValue);
	  auto *HBC = dyn_cast<BitCastInst>(HValue);
	  EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
	                  : EVT::getEVT(LValue->getType());
	  EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
	                   : EVT::getEVT(HValue->getType());
	  if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
	    return false;

	  // Start to split store.
	  IRBuilder<> Builder(SI.getContext());
	  Builder.SetInsertPoint(&SI);

	  // If LValue/HValue is a bitcast in another BB, create a new one in current
	  // BB so it may be merged with the split stores by dag combiner.
	  if (LBC && LBC->getParent() != SI.getParent())
	    LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
	  if (HBC && HBC->getParent() != SI.getParent())
	    HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());

	  // Alignment known for the address of the original store. An alignment of 0
	  // on the instruction stands for the ABI alignment of the stored type.
	  unsigned OrigAlign = SI.getAlignment();
	  if (OrigAlign == 0)
	    OrigAlign = DL.getABITypeAlignment(StoreType);
	  const unsigned HalfValByteSize = HalfValBitSize / 8;

	  bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
	  auto CreateSplitStore = [&](Value *V, bool Upper) {
	    V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
	    Value *Addr = Builder.CreateBitCast(
	        SI.getOperand(1),
	        SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
	    // The half stored at the original address keeps the original alignment.
	    // The half stored HalfValByteSize bytes further is only aligned to the
	    // largest power of two dividing both the original alignment and that
	    // offset. Which half is the offset one depends on endianness.
	    unsigned Alignment = OrigAlign;
	    const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
	    if (IsOffsetStore) {
	      Addr = Builder.CreateGEP(
	          SplitStoreType, Addr,
	          ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
	      unsigned Bits = OrigAlign | HalfValByteSize;
	      Alignment = Bits & (~Bits + 1); // Lowest set bit.
	    }
	    Builder.CreateAlignedStore(V, Addr, Alignment);
	  };

	  CreateSplitStore(LValue, false);
	  CreateSplitStore(HValue, true);

	  // Delete the old store.
	  SI.eraseFromParent();
	  return true;
	}

	/// Return true if \p GEP has exactly two operands, its first indexed type is
	/// sequential, and its second operand is a ConstantInt.
	static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
	  if (GEP->getNumOperands() != 2)
	    return false;
	  gep_type_iterator FirstIdxTy = gep_type_begin(*GEP);
	  if (!FirstIdxTy.isSequential())
	    return false;
	  return isa<ConstantInt>(GEP->getOperand(1));
	}

	// Try unmerging GEPs to reduce liveness interference (register pressure) across
	// IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
	// reducing liveness interference across those edges benefits global register
	// allocation. Currently handles only certain cases.
	//
	// For example, unmerge %GEPI and %UGEPI as below.
	//
	// ---------- BEFORE ----------
	// SrcBlock:
	// ...
	// %GEPIOp = ...
	// ...
	// %GEPI = gep %GEPIOp, Idx
	// ...
	// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
	// (* %GEPI is alive on the indirectbr edges due to other uses ahead)
	// (* %GEPIOp is alive on the indirectbr edges only because it's used by
	// %UGEPI)
	//
	// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
	// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
	// ...
	//
	// DstBi:
	// ...
	// %UGEPI = gep %GEPIOp, UIdx
	// ...
	// ---------------------------
	//
	// ---------- AFTER ----------
	// SrcBlock:
	// ... (same as above)
	// (* %GEPI is still alive on the indirectbr edges)
	// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
	// unmerging)
	// ...
	//
	// DstBi:
	// ...
	// %UGEPI = gep %GEPI, (UIdx-Idx)
	// ...
	// ---------------------------
	//
	// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
	// no longer alive on them.
	//
	// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
	// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
	// not to disable further simplifications and optimizations as a result of GEP
	// merging.
	//
	// Note this unmerging may increase the length of the data flow critical path
	// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
	// between the register pressure and the length of data-flow critical
	// path. Restricting this to the uncommon IndirectBr case would minimize the
	// impact of potentially longer critical path, if any, and the impact on compile
	// time.
	static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
	                                             const TargetTransformInfo *TTI) {
	  BasicBlock *SrcBlock = GEPI->getParent();
	  // Only blocks terminated by an IndirectBr are of interest. The common
	  // (non-IndirectBr) cases bail out right here.
	  if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
	    return false;
	  // GEPI has to be a simple gep with a single constant index...
	  if (!GEPSequentialConstIndexed(GEPI))
	    return false;
	  ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
	  // ...and a cheap one.
	  if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType())
	      > TargetTransformInfo::TCC_Basic)
	    return false;
	  // The pointer operand has to be an instruction defined in SrcBlock too.
	  Value *GEPIOp = GEPI->getOperand(0);
	  auto *GEPIOpI = dyn_cast<Instruction>(GEPIOp);
	  if (!GEPIOpI || GEPIOpI->getParent() != SrcBlock)
	    return false;
	  // GEPI has to be used outside the block, meaning it's alive on the
	  // IndirectBr edge(s).
	  bool UsedOutsideSrcBlock = false;
	  for (User *Usr : GEPI->users()) {
	    auto *UserInst = dyn_cast<Instruction>(Usr);
	    if (UserInst && UserInst->getParent() != SrcBlock) {
	      UsedOutsideSrcBlock = true;
	      break;
	    }
	  }
	  if (!UsedOutsideSrcBlock)
	    return false;
	  // The second elements of the GEP chains to be unmerged.
	  std::vector<GetElementPtrInst *> UGEPIs;
	  // Inspect each user of GEPIOp to find out whether unmerging would make
	  // GEPIOp not alive on IndirectBr edges.
	  for (User *Usr : GEPIOp->users()) {
	    if (Usr == GEPI)
	      continue;
	    // Anything that is not an instruction makes us give up.
	    auto *UsrInst = dyn_cast<Instruction>(Usr);
	    if (!UsrInst)
	      return false;
	    // Users living in the same block as GEPIOp are harmless.
	    if (UsrInst->getParent() == SrcBlock)
	      continue;
	    // Outside SrcBlock only geps are acceptable.
	    auto *UGEPI = dyn_cast<GetElementPtrInst>(Usr);
	    if (!UGEPI)
	      return false;
	    // The gep must be a simple one with a single constant index, with GEPIOp
	    // as its pointer operand and an index of the same type as GEPI's.
	    if (!GEPSequentialConstIndexed(UGEPI))
	      return false;
	    if (UGEPI->getOperand(0) != GEPIOp)
	      return false;
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    if (GEPIIdx->getType() != UGEPIIdx->getType())
	      return false;
	    if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType())
	        > TargetTransformInfo::TCC_Basic)
	      return false;
	    UGEPIs.push_back(UGEPI);
	  }
	  if (UGEPIs.empty())
	    return false;
	  // Make sure every rebased index (UIdx-Idx) is cheap to materialize.
	  for (GetElementPtrInst *UGEPI : UGEPIs) {
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    APInt RebasedIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
	    if (TTI->getIntImmCost(RebasedIdx, GEPIIdx->getType()) >
	        TargetTransformInfo::TCC_Basic)
	      return false;
	  }
	  // Now unmerge between GEPI and UGEPIs: rebase each UGEPI on top of GEPI.
	  for (GetElementPtrInst *UGEPI : UGEPIs) {
	    ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
	    APInt RebasedIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
	    UGEPI->setOperand(0, GEPI);
	    UGEPI->setOperand(1, ConstantInt::get(GEPIIdx->getType(), RebasedIdx));
	    // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
	    // inbounds to avoid UB.
	    if (!GEPI->isInBounds())
	      UGEPI->setIsInBounds(false);
	  }
	  // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
	  // alive on IndirectBr edges).
	  assert(find_if(GEPIOp->users(), [&](User *Usr) {
	           return cast<Instruction>(Usr)->getParent() != SrcBlock;
	         }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
	  return true;
	}

	/// Run the per-instruction CodeGenPrepare transforms on \p I, dispatching on
	/// the kind of instruction.
	///
	/// \param I          The instruction to optimize. Some paths erase or replace
	///                   it (trivial PHIs, all-zero-index GEPs).
	/// \param ModifiedDT Forwarded to the helpers that accept it (optimizeCmp,
	///                   optimizeCallInst, and the recursive call made for a GEP
	///                   that was rewritten into a bitcast).
	/// \returns true if one of the transforms reported a change.
	bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
	  // Bail out if we inserted the instruction to prevent optimizations from
	  // stepping on each other's toes.
	  if (InsertedInsts.count(I))
	    return false;

	  // TODO: Move into the switch on opcode below here.
	  if (PHINode *P = dyn_cast<PHINode>(I)) {
	    // It is possible for very late stage optimizations (such as SimplifyCFG)
	    // to introduce PHI nodes too late to be cleaned up. If we detect such a
	    // trivial PHI, go ahead and zap it here.
	    if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
	      LargeOffsetGEPMap.erase(P);
	      P->replaceAllUsesWith(V);
	      P->eraseFromParent();
	      ++NumPHIsElim;
	      return true;
	    }
	    return false;
	  }

	  if (CastInst *CI = dyn_cast<CastInst>(I)) {
	    // If the source of the cast is a constant, then this should have
	    // already been constant folded. The only reason NOT to constant fold
	    // it is if something (e.g. LSR) was careful to place the constant
	    // evaluation in a block other than the one that uses it (e.g. to hoist
	    // the address of globals out of a loop). If this is the case, we don't
	    // want to forward-subst the cast.
	    if (isa<Constant>(CI->getOperand(0)))
	      return false;

	    if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
	      return true;

	    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
	      /// Sink a zext or sext into its user blocks if the target type doesn't
	      /// fit in one register
	      if (TLI &&
	          TLI->getTypeAction(CI->getContext(),
	                             TLI->getValueType(*DL, CI->getType())) ==
	              TargetLowering::TypeExpandInteger) {
	        return SinkCast(CI);
	      } else {
	        bool MadeChange = optimizeExt(I);
	        // Bitwise OR on purpose: optimizeExtUses must run even when
	        // optimizeExt already reported a change.
	        return MadeChange | optimizeExtUses(I);
	      }
	    }
	    return false;
	  }

	  if (auto *Cmp = dyn_cast<CmpInst>(I))
	    if (TLI && optimizeCmp(Cmp, ModifiedDT))
	      return true;

	  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
	    LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
	    if (TLI) {
	      bool Modified = optimizeLoadExt(LI);
	      unsigned AS = LI->getPointerAddressSpace();
	      Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
	      return Modified;
	    }
	    return false;
	  }

	  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
	    if (TLI && splitMergedValStore(*SI, *DL, *TLI))
	      return true;
	    SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
	    if (TLI) {
	      unsigned AS = SI->getPointerAddressSpace();
	      return optimizeMemoryInst(I, SI->getOperand(1),
	                                SI->getOperand(0)->getType(), AS);
	    }
	    return false;
	  }

	  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
	    unsigned AS = RMW->getPointerAddressSpace();
	    return optimizeMemoryInst(I, RMW->getPointerOperand(),
	                              RMW->getType(), AS);
	  }

	  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
	    unsigned AS = CmpX->getPointerAddressSpace();
	    return optimizeMemoryInst(I, CmpX->getPointerOperand(),
	                              CmpX->getCompareOperand()->getType(), AS);
	  }

	  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

	  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
	      EnableAndCmpSinking && TLI)
	    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);

	  // TODO: Move this into the switch on opcode - it handles shifts already.
	  if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
	                BinOp->getOpcode() == Instruction::LShr)) {
	    ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
	    if (TLI && CI && TLI->hasExtractBitsInsn())
	      if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
	        return true;
	  }

	  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
	    if (GEPI->hasAllZeroIndices()) {
	      /// The GEP operand must be a pointer, so must its result -> BitCast
	      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
	                                        GEPI->getName(), GEPI);
	      NC->setDebugLoc(GEPI->getDebugLoc());
	      GEPI->replaceAllUsesWith(NC);
	      GEPI->eraseFromParent();
	      ++NumGEPsElim;
	      // Give the freshly created bitcast a chance to be optimized as well.
	      optimizeInst(NC, ModifiedDT);
	      return true;
	    }
	    if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
	      return true;
	    }
	    return false;
	  }

	  if (tryToSinkFreeOperands(I))
	    return true;

	  switch (I->getOpcode()) {
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	    return optimizeShiftInst(cast<BinaryOperator>(I));
	  case Instruction::Call:
	    return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
	  case Instruction::Select:
	    return optimizeSelectInst(cast<SelectInst>(I));
	  case Instruction::ShuffleVector:
	    return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
	  case Instruction::Switch:
	    return optimizeSwitchInst(cast<SwitchInst>(I));
	  case Instruction::ExtractElement:
	    return optimizeExtractElementInst(cast<ExtractElementInst>(I));
	  }

	  return false;
	}

	/// Given an OR instruction, check to see if this is a bitreverse
	/// idiom. If so, insert the new intrinsic and return true.
	///
	/// \param I   Candidate root of the idiom; on success its uses are redirected
	///            to the last instruction reported by the idiom recognizer and
	///            \p I is deleted if trivially dead.
	/// \param DL  Data layout used to map the IR type to a target value type.
	/// \param TLI Queried for whether ISD::BITREVERSE is legal or custom.
	/// \returns true if the idiom was recognized and replaced.
	static bool makeBitReverse(Instruction &I, const DataLayout &DL,
	                           const TargetLowering &TLI) {
	  // Only integer values for which the target can lower BITREVERSE qualify.
	  if (!I.getType()->isIntegerTy() ||
	      !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
	                                    TLI.getValueType(DL, I.getType(), true)))
	    return false;

	  SmallVector<Instruction*, 4> Insts;
	  if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
	    return false;
	  Instruction *LastInst = Insts.back();
	  I.replaceAllUsesWith(LastInst);
	  RecursivelyDeleteTriviallyDeadInstructions(&I);
	  return true;
	}

	// In this pass we look for GEP and cast instructions that are used
	// across basic blocks and rewrite them to improve basic-block-at-a-time
	// selection.
	//
	// Runs optimizeInst over every instruction of BB, then repeatedly looks for
	// bitreverse idioms (only when TLI is available) and finally tries
	// dupRetToEnableTailCallOpts. Returns true if anything changed; returns early
	// with true as soon as optimizeInst sets ModifiedDT.
	bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
	  SunkAddrs.clear();
	  bool MadeChange = false;

	  CurInstIterator = BB.begin();
	  while (CurInstIterator != BB.end()) {
	    // The iterator is advanced before the call so that it stays valid if
	    // optimizeInst erases the current instruction.
	    MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
	    if (ModifiedDT)
	      return true;
	  }

	  bool MadeBitReverse = true;
	  while (TLI && MadeBitReverse) {
	    MadeBitReverse = false;
	    for (auto &I : reverse(BB)) {
	      if (makeBitReverse(I, *DL, *TLI)) {
	        MadeBitReverse = MadeChange = true;
	        ModifiedDT = true;
	        // The block was mutated; restart the reverse walk from scratch.
	        break;
	      }
	    }
	  }
	  MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);

	  return MadeChange;
	}

	// If a llvm.dbg.value is far away from the value it describes, iSel may not
	// be able to handle it properly: iSel will drop the llvm.dbg.value if it
	// cannot find a node corresponding to the value. Move such dbg.values next
	// to the instruction that defines the value.
	//
	// Returns true if at least one dbg.value was moved.
	bool CodeGenPrepare::placeDbgValues(Function &F) {
	  bool MadeChange = false;
	  for (BasicBlock &BB : F) {
	    Instruction *PrevNonDbgInst = nullptr;
	    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
	      // Advance first: the current instruction may be moved below.
	      Instruction *Insn = &*BI++;
	      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
	      // Leave dbg.values that refer to an alloca alone. These
	      // intrinsics describe the address of a variable (= the alloca)
	      // being taken. They should not be moved next to the alloca
	      // (and to the beginning of the scope), but rather stay close to
	      // where said address is used.
	      if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
	        PrevNonDbgInst = Insn;
	        continue;
	      }

	      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
	      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
	        // If VI is a phi in a block with an EHPad terminator, we can't insert
	        // after it.
	        if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
	          continue;
	        LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
	                          << *DVI << ' ' << *VI);
	        DVI->removeFromParent();
	        // A dbg.value cannot sit between PHIs, so for a PHI it goes to the
	        // first insertion point of the PHI's block.
	        if (isa<PHINode>(VI))
	          DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
	        else
	          DVI->insertAfter(VI);
	        MadeChange = true;
	        ++NumDbgValueMoved;
	      }
	    }
	  }
	  return MadeChange;
	}

	/// Scale down both weights to fit into uint32_t.
	///
	/// Both weights are divided by the same factor, chosen so that the larger of
	/// the two ends up no greater than the maximum uint32_t value. Weights that
	/// already fit (strictly below that maximum) are left untouched.
	///
	/// \param NewTrue  In/out: first weight, replaced by its scaled value.
	/// \param NewFalse In/out: second weight, replaced by its scaled value.
	static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
	  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
	  // The scale must be computed in 64 bits: for very large inputs the quotient
	  // plus one does not fit into 32 bits, and a truncated value could become
	  // zero (division by zero below) or too small to bring the weights in range.
	  uint64_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
	  NewTrue = NewTrue / Scale;
	  NewFalse = NewFalse / Scale;
	}

	/// Some targets prefer to split a conditional branch like:
	/// \code
	/// %0 = icmp ne i32 %a, 0
	/// %1 = icmp ne i32 %b, 0
	/// %or.cond = or i1 %0, %1
	/// br i1 %or.cond, label %TrueBB, label %FalseBB
	/// \endcode
	/// into multiple branch instructions like:
	/// \code
	/// bb1:
	/// %0 = icmp ne i32 %a, 0
	/// br i1 %0, label %TrueBB, label %bb2
	/// bb2:
	/// %1 = icmp ne i32 %b, 0
	/// br i1 %1, label %TrueBB, label %FalseBB
	/// \endcode
	/// This usually allows instruction selection to do even further optimizations
	/// and combine the compare with the branch instruction. Currently this is
	/// applied for targets which have "cheap" jump instructions.
	///
	/// \param F          Function whose conditional branches are inspected.
	/// \param ModifiedDT Set to true whenever a branch is split (a new block is
	///                   created).
	/// \returns true if at least one branch was split.
	///
	/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
	///
	bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
	  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
	    return false;

	  bool MadeChange = false;
	  for (auto &BB : F) {
	    // Does this BB end with the following?
	    // %cond1 = icmp|fcmp|binary instruction ...
	    // %cond2 = icmp|fcmp|binary instruction ...
	    // %cond.or = or|and i1 %cond1, cond2
	    // br i1 %cond.or label %dest1, label %dest2"
	    BinaryOperator *LogicOp;
	    BasicBlock *TBB, *FBB;
	    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
	      continue;

	    auto *Br1 = cast<BranchInst>(BB.getTerminator());
	    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
	      continue;

	    unsigned Opc;
	    Value *Cond1, *Cond2;
	    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
	                             m_OneUse(m_Value(Cond2)))))
	      Opc = Instruction::And;
	    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
	                                 m_OneUse(m_Value(Cond2)))))
	      Opc = Instruction::Or;
	    else
	      continue;

	    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
	        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
	      continue;

	    LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());

	    // Create a new BB.
	    auto TmpBB =
	        BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
	                           BB.getParent(), BB.getNextNode());

	    // Update original basic block by using the first condition directly by the
	    // branch instruction and removing the no longer needed and/or instruction.
	    Br1->setCondition(Cond1);
	    LogicOp->eraseFromParent();

	    // Depending on the condition we have to either replace the true or the
	    // false successor of the original branch instruction.
	    if (Opc == Instruction::And)
	      Br1->setSuccessor(0, TmpBB);
	    else
	      Br1->setSuccessor(1, TmpBB);

	    // Fill in the new basic block.
	    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
	    if (auto *I = dyn_cast<Instruction>(Cond2)) {
	      I->removeFromParent();
	      I->insertBefore(Br2);
	    }

	    // Update PHI nodes in both successors. The original BB needs to be
	    // replaced in one successor's PHI nodes, because the branch comes now from
	    // the newly generated BB (NewBB). In the other successor we need to add one
	    // incoming edge to the PHI nodes, because both branch instructions target
	    // now the same successor. Depending on the original branch condition
	    // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
	    // we perform the correct update for the PHI nodes.
	    // This doesn't change the successor order of the just created branch
	    // instruction (or any other instruction).
	    if (Opc == Instruction::Or)
	      std::swap(TBB, FBB);

	    // Replace the old BB with the new BB.
	    TBB->replacePhiUsesWith(&BB, TmpBB);

	    // Add another incoming edge from the new BB.
	    for (PHINode &PN : FBB->phis()) {
	      auto *Val = PN.getIncomingValueForBlock(&BB);
	      PN.addIncoming(Val, TmpBB);
	    }

	    // Update the branch weights (from SelectionDAGBuilder::
	    // FindMergedConditions).
	    if (Opc == Instruction::Or) {
	      // Codegen X | Y as:
	      // BB1:
	      //   jmp_if_X TBB
	      //   jmp TmpBB
	      // TmpBB:
	      //   jmp_if_Y TBB
	      //   jmp FBB
	      //

	      // We have flexibility in setting Prob for BB1 and Prob for NewBB.
	      // The requirement is that
	      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
	      //     = TrueProb for original BB.
	      // Assuming the original weights are A and B, one choice is to set BB1's
	      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
	      // assumes that
	      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
	      // Another choice is to assume TrueProb for BB1 equals to TrueProb for
	      // TmpBB, but the math is more complicated.
	      uint64_t TrueWeight, FalseWeight;
	      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
	        uint64_t NewTrueWeight = TrueWeight;
	        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        // Attach the recomputed (and scaled) weights, not the original ones.
	        Br1->setMetadata(LLVMContext::MD_prof,
	                         MDBuilder(Br1->getContext())
	                             .createBranchWeights(
	                                 static_cast<uint32_t>(NewTrueWeight),
	                                 static_cast<uint32_t>(NewFalseWeight)));

	        NewTrueWeight = TrueWeight;
	        NewFalseWeight = 2 * FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br2->setMetadata(LLVMContext::MD_prof,
	                         MDBuilder(Br2->getContext())
	                             .createBranchWeights(
	                                 static_cast<uint32_t>(NewTrueWeight),
	                                 static_cast<uint32_t>(NewFalseWeight)));
	      }
	    } else {
	      // Codegen X & Y as:
	      // BB1:
	      //   jmp_if_X TmpBB
	      //   jmp FBB
	      // TmpBB:
	      //   jmp_if_Y TBB
	      //   jmp FBB
	      //
	      // This requires creation of TmpBB after CurBB.

	      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	      // The requirement is that
	      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
	      //     = FalseProb for original BB.
	      // Assuming the original weights are A and B, one choice is to set BB1's
	      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
	      // assumes that
	      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
	      uint64_t TrueWeight, FalseWeight;
	      if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
	        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
	        uint64_t NewFalseWeight = FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        // Attach the recomputed (and scaled) weights, not the original ones.
	        Br1->setMetadata(LLVMContext::MD_prof,
	                         MDBuilder(Br1->getContext())
	                             .createBranchWeights(
	                                 static_cast<uint32_t>(NewTrueWeight),
	                                 static_cast<uint32_t>(NewFalseWeight)));

	        NewTrueWeight = 2 * TrueWeight;
	        NewFalseWeight = FalseWeight;
	        scaleWeights(NewTrueWeight, NewFalseWeight);
	        Br2->setMetadata(LLVMContext::MD_prof,
	                         MDBuilder(Br2->getContext())
	                             .createBranchWeights(
	                                 static_cast<uint32_t>(NewTrueWeight),
	                                 static_cast<uint32_t>(NewFalseWeight)));
	      }
	    }

	    ModifiedDT = true;
	    MadeChange = true;

	    LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
	               TmpBB->dump());
	  }
	  return MadeChange;
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/LiveDebugValues.cpp (revision 351303)
	@@ -1,1316 +1,1329 @@
	//===- LiveDebugValues.cpp - Tracking Debug Value MIs ---------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// This pass implements a data flow analysis that propagates debug location
	/// information by inserting additional DBG_VALUE instructions into the machine
	/// instruction stream. The pass internally builds debug location liveness
	/// ranges to determine the points where additional DBG_VALUEs need to be
	/// inserted.
	///
	/// This is a separate pass from DbgValueHistoryCalculator to facilitate
	/// testing and improve modularity.
	///
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/SparseBitVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/UniqueVector.h"
	#include "llvm/CodeGen/LexicalScopes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/Config/llvm-config.h"
	#include "llvm/IR/DIBuilder.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Module.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <functional>
	#include <queue>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "livedebugvalues"

	STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");

	// Returns the register held in operand 0 of the DBG_VALUE @MI when that
	// operand is a register operand; otherwise returns a default-constructed
	// Register. @MI must be a DBG_VALUE with exactly four operands.
	static Register isDbgValueDescribedByReg(const MachineInstr &MI) {
	  assert(MI.isDebugValue() && "expected a DBG_VALUE");
	  assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
	  // When the variable location is expressed through a register (directly or
	  // indirectly), that register is always the first operand.
	  const MachineOperand &LocOp = MI.getOperand(0);
	  if (!LocOp.isReg())
	    return Register();
	  return LocOp.getReg();
	}

	namespace {

	class LiveDebugValues : public MachineFunctionPass {
	private:
	const TargetRegisterInfo *TRI;
	const TargetInstrInfo *TII;
	const TargetFrameLowering *TFI;
	BitVector CalleeSavedRegs;
	LexicalScopes LS;

	enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore };

	/// Keeps track of lexical scopes associated with a user value's source
	/// location.
	class UserValueScopes {
	  DebugLoc DL;
	  LexicalScopes &LS;
	  /// Blocks reported by LS.getMachineBasicBlocks for DL; filled lazily on
	  /// the first dominates() query.
	  SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;

	public:
	  UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(std::move(D)), LS(L) {}

	  /// Return true if current scope dominates at least one machine
	  /// instruction in a given machine basic block.
	  bool dominates(MachineBasicBlock *MBB) {
	    if (LBlocks.empty())
	      LS.getMachineBasicBlocks(DL, LBlocks);
	    return LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB);
	  }
	};

	using FragmentInfo = DIExpression::FragmentInfo;
	using OptFragmentInfo = Optional<DIExpression::FragmentInfo>;

	/// Storage for identifying a potentially inlined instance of a variable,
	/// or a fragment thereof.
	class DebugVariable {
	const DILocalVariable *Variable;
	OptFragmentInfo Fragment;
	const DILocation *InlinedAt;

	/// Fragment that will overlap all other fragments. Used as default when
	/// caller demands a fragment.
	static const FragmentInfo DefaultFragment;

	public:
	DebugVariable(const DILocalVariable *Var, OptFragmentInfo &&FragmentInfo,
	const DILocation *InlinedAt)
	: Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {}

	DebugVariable(const DILocalVariable *Var, OptFragmentInfo &FragmentInfo,
	const DILocation *InlinedAt)
	: Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {}

	DebugVariable(const DILocalVariable Var, const DIExpression DIExpr,
	const DILocation *InlinedAt)
	: DebugVariable(Var, DIExpr->getFragmentInfo(), InlinedAt) {}

	DebugVariable(const MachineInstr &MI)
	: DebugVariable(MI.getDebugVariable(),
	MI.getDebugExpression()->getFragmentInfo(),
	MI.getDebugLoc()->getInlinedAt()) {}

	const DILocalVariable *getVar() const { return Variable; }
	const OptFragmentInfo &getFragment() const { return Fragment; }
	const DILocation *getInlinedAt() const { return InlinedAt; }

	const FragmentInfo getFragmentDefault() const {
	return Fragment.getValueOr(DefaultFragment);
	}

	static bool isFragmentDefault(FragmentInfo &F) {
	return F == DefaultFragment;
	}

	bool operator==(const DebugVariable &Other) const {
	return std::tie(Variable, Fragment, InlinedAt) ==
	std::tie(Other.Variable, Other.Fragment, Other.InlinedAt);
	}

	bool operator<(const DebugVariable &Other) const {
	return std::tie(Variable, Fragment, InlinedAt) <
	std::tie(Other.Variable, Other.Fragment, Other.InlinedAt);
	}
	};

	friend struct llvm::DenseMapInfo<DebugVariable>;

	/// A pair of debug variable and value location.
	struct VarLoc {
	// The location at which a spilled variable resides. It consists of a
	// register and an offset.
	struct SpillLoc {
	unsigned SpillBase;
	int SpillOffset;
	bool operator==(const SpillLoc &Other) const {
	return SpillBase == Other.SpillBase && SpillOffset == Other.SpillOffset;
	}
	};

	const DebugVariable Var;
	const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE.
	mutable UserValueScopes UVS;
	enum VarLocKind {
	InvalidKind = 0,
	RegisterKind,
	SpillLocKind,
	ImmediateKind,
	EntryValueKind
	} Kind = InvalidKind;

	/// The value location. Stored separately to avoid repeatedly
	/// extracting it from MI.
	union {
	uint64_t RegNo;
	SpillLoc SpillLocation;
	uint64_t Hash;
	int64_t Immediate;
	const ConstantFP *FPImm;
	const ConstantInt *CImm;
	} Loc;

	VarLoc(const MachineInstr &MI, LexicalScopes &LS,
	VarLocKind K = InvalidKind)
	: Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS){
	static_assert((sizeof(Loc) == sizeof(uint64_t)),
	"hash does not cover all members of Loc");
	assert(MI.isDebugValue() && "not a DBG_VALUE");
	assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
	if (int RegNo = isDbgValueDescribedByReg(MI)) {
	Kind = MI.isDebugEntryValue() ? EntryValueKind : RegisterKind;
	Loc.RegNo = RegNo;
	} else if (MI.getOperand(0).isImm()) {
	Kind = ImmediateKind;
	Loc.Immediate = MI.getOperand(0).getImm();
	} else if (MI.getOperand(0).isFPImm()) {
	Kind = ImmediateKind;
	Loc.FPImm = MI.getOperand(0).getFPImm();
	} else if (MI.getOperand(0).isCImm()) {
	Kind = ImmediateKind;
	Loc.CImm = MI.getOperand(0).getCImm();
	}
	assert((Kind != ImmediateKind \|\| !MI.isDebugEntryValue()) &&
	"entry values must be register locations");
	}

	/// The constructor for spill locations.
	VarLoc(const MachineInstr &MI, unsigned SpillBase, int SpillOffset,
	LexicalScopes &LS)
	: Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS) {
	assert(MI.isDebugValue() && "not a DBG_VALUE");
	assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
	Kind = SpillLocKind;
	Loc.SpillLocation = {SpillBase, SpillOffset};
	}

	// Is the Loc field a constant or constant object?
	bool isConstant() const { return Kind == ImmediateKind; }

	/// If this variable is described by a register, return it,
	/// otherwise return 0.
	unsigned isDescribedByReg() const {
	if (Kind == RegisterKind)
	return Loc.RegNo;
	return 0;
	}

	/// Determine whether the lexical scope of this value's debug location
	/// dominates MBB.
	bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void dump() const { MI.dump(); }
	#endif

	bool operator==(const VarLoc &Other) const {
	return Kind == Other.Kind && Var == Other.Var &&
	Loc.Hash == Other.Loc.Hash;
	}

	/// This operator guarantees that VarLocs are sorted by Variable first.
	bool operator<(const VarLoc &Other) const {
	return std::tie(Var, Kind, Loc.Hash) <
	std::tie(Other.Var, Other.Kind, Other.Loc.Hash);
	}
	};

	// Maps a variable to a MachineInstr; emitEntryValues looks entries up by
	// the debug variable of a DBG_VALUE.
	using DebugParamMap = SmallDenseMap<const DILocalVariable *, MachineInstr *>;
	using VarLocMap = UniqueVector<VarLoc>;
	using VarLocSet = SparseBitVector<>;
	using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>;
	struct TransferDebugPair {
	  MachineInstr *TransferInst;
	  MachineInstr *DebugInst;
	};
	using TransferMap = SmallVector<TransferDebugPair, 4>;

	// Types for recording sets of variable fragments that overlap. For a given
	// local variable, we record all other fragments of that variable that could
	// overlap it, to reduce search time.
	using FragmentOfVar =
	    std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
	using OverlapMap =
	    DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;

	// Helper while building OverlapMap, a map of all fragments seen for a given
	// DILocalVariable.
	using VarToFragments =
	    DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;

	/// This holds the working set of currently open ranges. For fast
	/// access, this is done both as a set of VarLocIDs, and a map of
	/// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all
	/// previous open ranges for the same variable.
	class OpenRangesSet {
	VarLocSet VarLocs;
	SmallDenseMap<DebugVariable, unsigned, 8> Vars;
	OverlapMap &OverlappingFragments;

	public:
	OpenRangesSet(OverlapMap &_OLapMap) : OverlappingFragments(_OLapMap) {}

	const VarLocSet &getVarLocs() const { return VarLocs; }

	/// Terminate all open ranges for Var by removing it from the set.
	void erase(DebugVariable Var);

	/// Terminate all open ranges listed in \c KillSet by removing
	/// them from the set.
	void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs) {
	VarLocs.intersectWithComplement(KillSet);
	for (unsigned ID : KillSet)
	Vars.erase(VarLocIDs[ID].Var);
	}

	/// Insert a new range into the set.
	void insert(unsigned VarLocID, DebugVariable Var) {
	VarLocs.set(VarLocID);
	Vars.insert({Var, VarLocID});
	}

	/// Empty the set.
	void clear() {
	VarLocs.clear();
	Vars.clear();
	}

	/// Return whether the set is empty or not.
	bool empty() const {
	assert(Vars.empty() == VarLocs.empty() && "open ranges are inconsistent");
	return VarLocs.empty();
	}
	};

	bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
	unsigned &Reg);
	/// If a given instruction is identified as a spill, return the spill location
	/// and set \p Reg to the spilled register.
	Optional<VarLoc::SpillLoc> isRestoreInstruction(const MachineInstr &MI,
	MachineFunction *MF,
	unsigned &Reg);
	/// Given a spill instruction, extract the register and offset used to
	/// address the spill location in a target independent way.
	VarLoc::SpillLoc extractSpillBaseRegAndOffset(const MachineInstr &MI);
	void insertTransferDebugPair(MachineInstr &MI, OpenRangesSet &OpenRanges,
	TransferMap &Transfers, VarLocMap &VarLocIDs,
	unsigned OldVarID, TransferKind Kind,
	unsigned NewReg = 0);

	void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs);
	void transferSpillOrRestoreInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs, TransferMap &Transfers);
	void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs, TransferMap &Transfers,
	DebugParamMap &DebugEntryVals,
	SparseBitVector<> &KillSet);
	void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs, TransferMap &Transfers);
	void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs, TransferMap &Transfers,
	DebugParamMap &DebugEntryVals);
	bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);

	bool process(MachineInstr &MI, OpenRangesSet &OpenRanges,
	VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
	TransferMap &Transfers, DebugParamMap &DebugEntryVals,
	bool transferChanges, OverlapMap &OverlapFragments,
	VarToFragments &SeenFragments);

	void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments,
	OverlapMap &OLapMap);

	bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
	const VarLocMap &VarLocIDs,
	SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
	SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks);

	bool ExtendRanges(MachineFunction &MF);

	public:
	static char ID;

	/// Default construct and initialize the pass.
	LiveDebugValues();

	/// Tell the pass manager which passes we depend on and what
	/// information we preserve.
	void getAnalysisUsage(AnalysisUsage &AU) const override;

	/// Report the machine-function properties this pass requires: NoVRegs.
	MachineFunctionProperties getRequiredProperties() const override {
	  MachineFunctionProperties Required;
	  Required.set(MachineFunctionProperties::Property::NoVRegs);
	  return Required;
	}

	/// Print to ostream with a message.
	void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
	const VarLocMap &VarLocIDs, const char *msg,
	raw_ostream &Out) const;

	/// Calculate the liveness information for the given machine function.
	bool runOnMachineFunction(MachineFunction &MF) override;
	};

	} // end anonymous namespace

	namespace llvm {

	template <> struct DenseMapInfo<LiveDebugValues::DebugVariable> {
	using DV = LiveDebugValues::DebugVariable;
	using OptFragmentInfo = LiveDebugValues::OptFragmentInfo;
	using FragmentInfo = LiveDebugValues::FragmentInfo;

	// Empty key: no key should be generated that has no DILocalVariable.
	static inline DV getEmptyKey() {
	return DV(nullptr, OptFragmentInfo(), nullptr);
	}

	// Difference in tombstone is that the Optional is meaningful
	static inline DV getTombstoneKey() {
	return DV(nullptr, OptFragmentInfo({0, 0}), nullptr);
	}

	static unsigned getHashValue(const DV &D) {
	unsigned HV = 0;
	const OptFragmentInfo &Fragment = D.getFragment();
	if (Fragment)
	HV = DenseMapInfo<FragmentInfo>::getHashValue(*Fragment);

	return hash_combine(D.getVar(), HV, D.getInlinedAt());
	}

	static bool isEqual(const DV &A, const DV &B) { return A == B; }
	};

	} // namespace llvm

	//===----------------------------------------------------------------------===//
	// Implementation
	//===----------------------------------------------------------------------===//

	// Out-of-class definition of the sentinel fragment that
	// DebugVariable::getFragmentDefault() returns when no fragment is stored.
	// NOTE(review): the two initializers are presumably FragmentInfo's size and
	// offset, i.e. "maximum size at offset 0" -- confirm against DIExpression.
	const DIExpression::FragmentInfo
	LiveDebugValues::DebugVariable::DefaultFragment = {
	std::numeric_limits<uint64_t>::max(),
	std::numeric_limits<uint64_t>::min()};

	// Storage for the pass ID; it is passed to the MachineFunctionPass base
	// constructor.
	char LiveDebugValues::ID = 0;

	// Externally visible reference to the pass ID.
	char &llvm::LiveDebugValuesID = LiveDebugValues::ID;

	// Pass registration; DEBUG_TYPE is used as the pass argument string.
	INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
	false, false)

	/// Construct the pass with its ID and register it with the global pass
	/// registry.
	LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
	  PassRegistry &Registry = *PassRegistry::getPassRegistry();
	  initializeLiveDebugValuesPass(Registry);
	}

	/// Describe this pass to the pass manager: it preserves the CFG, and
	/// everything else is whatever the MachineFunctionPass base class declares.
	void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
	  // Mark the CFG as preserved before delegating to the base class.
	  AU.setPreservesCFG();
	  MachineFunctionPass::getAnalysisUsage(AU);
	}

	/// Remove \p Var from the open ranges, together with every fragment of the
	/// same variable that the overlap map lists as overlapping it.
	void LiveDebugValues::OpenRangesSet::erase(DebugVariable Var) {
	  // Drops one variable/fragment entry from both the map and the bit set;
	  // does nothing if the entry is not present.
	  auto EraseOne = [this](DebugVariable Victim) {
	    auto Pos = Vars.find(Victim);
	    if (Pos == Vars.end())
	      return;
	    VarLocs.reset(Pos->second);
	    Vars.erase(Pos);
	  };

	  // First the variable/fragment that ends here.
	  EraseOne(Var);

	  // A variable without an explicit fragment is treated as covering all
	  // possible bits, which is what the default fragment stands for.
	  FragmentInfo ThisFragment = Var.getFragmentDefault();

	  // Then everything the pre-computed overlap map lists for this fragment.
	  auto OverlapIt = OverlappingFragments.find({Var.getVar(), ThisFragment});
	  if (OverlapIt == OverlappingFragments.end())
	    return;

	  for (auto Fragment : OverlapIt->second) {
	    // The default fragment is stored in the map as "no fragment".
	    LiveDebugValues::OptFragmentInfo FragmentHolder;
	    if (!DebugVariable::isFragmentDefault(Fragment))
	      FragmentHolder = LiveDebugValues::OptFragmentInfo(Fragment);
	    EraseOne({Var.getVar(), FragmentHolder, Var.getInlinedAt()});
	  }
	}

	//===----------------------------------------------------------------------===//
	// Debug Range Extension Implementation
	//===----------------------------------------------------------------------===//

	#ifndef NDEBUG
	/// Print \p msg followed by, for every block of \p MF with a non-empty
	/// location set in \p V, the block number and each variable location it
	/// holds. Variable names go to \p Out; the instruction itself is printed
	/// through VarLoc::dump().
	void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
	                                       const VarLocInMBB &V,
	                                       const VarLocMap &VarLocIDs,
	                                       const char *msg,
	                                       raw_ostream &Out) const {
	  Out << '\n';
	  Out << msg;
	  Out << '\n';
	  for (const MachineBasicBlock &Block : MF) {
	    const VarLocSet &Locs = V.lookup(&Block);
	    if (!Locs.empty()) {
	      Out << "MBB: " << Block.getNumber() << ":\n";
	      for (unsigned LocID : Locs) {
	        const VarLoc &Entry = VarLocIDs[LocID];
	        Out << " Var: " << Entry.Var.getVar()->getName();
	        Out << " MI: ";
	        Entry.dump();
	      }
	    }
	  }
	  Out << "\n";
	}
	#endif

	/// Given an instruction with exactly one memory operand that refers to a
	/// fixed stack slot, return the base register and offset that
	/// TargetFrameLowering::getFrameIndexReference reports for that slot.
	LiveDebugValues::VarLoc::SpillLoc
	LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
	  assert(MI.hasOneMemOperand() &&
	         "Spill instruction does not have exactly one memory operand?");
	  auto MMOI = MI.memoperands_begin();
	  const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
	  assert(PVal->kind() == PseudoSourceValue::FixedStack &&
	         "Inconsistent memory operand in spill instruction");
	  int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
	  const MachineBasicBlock *MBB = MI.getParent();
	  unsigned Reg; // Filled in by getFrameIndexReference.
	  int Offset = TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
	  return {Reg, Offset};
	}

	/// End all previous ranges related to @MI and start a new range from @MI
	/// if it is a DBG_VALUE instr.
	///
	/// A new range is opened for register, immediate and single-memory-operand
	/// (stack spill) locations; for any other DBG_VALUE the variable is left
	/// without an open range.
	void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
	                                         OpenRangesSet &OpenRanges,
	                                         VarLocMap &VarLocIDs) {
	  if (!MI.isDebugValue())
	    return;
	  const DILocalVariable *Var = MI.getDebugVariable();
	  const DIExpression *Expr = MI.getDebugExpression();
	  const DILocation *DebugLoc = MI.getDebugLoc();
	  const DILocation *InlinedAt = DebugLoc->getInlinedAt();
	  assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
	         "Expected inlined-at fields to agree");

	  // End all previous ranges of Var.
	  DebugVariable V(Var, Expr, InlinedAt);
	  OpenRanges.erase(V);

	  // Add the VarLoc to OpenRanges from this DBG_VALUE.
	  unsigned ID;
	  if (isDbgValueDescribedByReg(MI) || MI.getOperand(0).isImm() ||
	      MI.getOperand(0).isFPImm() || MI.getOperand(0).isCImm()) {
	    // Use normal VarLoc constructor for registers and immediates.
	    VarLoc VL(MI, LS);
	    ID = VarLocIDs.insert(VL);
	    OpenRanges.insert(ID, VL.Var);
	  } else if (MI.hasOneMemOperand()) {
	    // It's a stack spill -- fetch spill base and offset.
	    VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
	    VarLoc VL(MI, SpillLocation.SpillBase, SpillLocation.SpillOffset, LS);
	    ID = VarLocIDs.insert(VL);
	    OpenRanges.insert(ID, VL.Var);
	  } else {
	    // This must be an undefined location. We should leave OpenRanges closed.
	    assert(MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == 0 &&
	           "Unexpected non-undef DBG_VALUE encountered");
	  }
	}

	/// For every VarLoc ID in \p KillSet that describes a parameter with a
	/// recorded candidate DBG_VALUE in \p DebugEntryVals, build a new DBG_VALUE
	/// whose expression is the candidate's expression prefixed with
	/// DIExpression::EntryValue. The new instruction is not inserted here; it is
	/// queued in \p Transfers paired with \p MI, and its location is opened in
	/// \p OpenRanges.
	void LiveDebugValues::emitEntryValues(MachineInstr &MI,
	                                      OpenRangesSet &OpenRanges,
	                                      VarLocMap &VarLocIDs,
	                                      TransferMap &Transfers,
	                                      DebugParamMap &DebugEntryVals,
	                                      SparseBitVector<> &KillSet) {
	  MachineFunction *MF = MI.getParent()->getParent();
	  for (unsigned KilledID : KillSet) {
	    // Only parameters are candidates for entry values.
	    if (!VarLocIDs[KilledID].Var.getVar()->isParameter())
	      continue;

	    const DILocalVariable *KilledVar =
	        VarLocIDs[KilledID].MI.getDebugVariable();

	    // If parameter's DBG_VALUE is not in the map that means we can't
	    // generate parameter's entry value.
	    if (!DebugEntryVals.count(KilledVar))
	      continue;

	    auto ParamDbgValue = DebugEntryVals[KilledVar];
	    const bool IsIndirect = ParamDbgValue->isIndirectDebugValue();
	    DIExpression *EntryExpr = DIExpression::prepend(
	        ParamDbgValue->getDebugExpression(), DIExpression::EntryValue);
	    MachineInstr *EntryValDbgMI =
	        BuildMI(*MF, ParamDbgValue->getDebugLoc(), ParamDbgValue->getDesc(),
	                IsIndirect, ParamDbgValue->getOperand(0).getReg(),
	                ParamDbgValue->getDebugVariable(), EntryExpr);

	    // An indirect DBG_VALUE keeps its immediate in operand 1; carry it over.
	    if (IsIndirect)
	      EntryValDbgMI->getOperand(1).setImm(
	          ParamDbgValue->getOperand(1).getImm());

	    Transfers.push_back({&MI, EntryValDbgMI});
	    VarLoc EntryLoc(*EntryValDbgMI, LS);
	    unsigned EntryValLocID = VarLocIDs.insert(EntryLoc);
	    OpenRanges.insert(EntryValLocID, EntryLoc.Var);
	  }
	}

	/// Create new TransferDebugPair and insert it in \p Transfers. The VarLoc
	/// with \p OldVarID should be deleted from \p OpenRanges and replaced with
	/// new VarLoc. If \p NewReg is different than default zero value then the
	/// new location will be register location created by the copy like instruction,
	/// otherwise it is variable's location on the stack.
	///
	/// \p Kind selects how the replacement DBG_VALUE is built: a copy reuses the
	/// old expression with \p NewReg, a spill prepends the spill offset to the old
	/// expression and uses the spill base register, and a restore describes
	/// \p NewReg with a fresh expression. The new instruction is only created
	/// here; it is queued in \p Transfers paired with \p MI.
	void LiveDebugValues::insertTransferDebugPair(
	MachineInstr &MI, OpenRangesSet &OpenRanges, TransferMap &Transfers,
	VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind,
	unsigned NewReg) {
	const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI;
	MachineFunction *MF = MI.getParent()->getParent();
	MachineInstr *NewDebugInstr;

	// Shared tail of all three cases: register the new VarLoc, swap it into
	// OpenRanges in place of the old one, and record the (MI, new DBG_VALUE) pair.
	auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr,
	&VarLocIDs](VarLoc &VL, MachineInstr *NewDebugInstr) {
	unsigned LocId = VarLocIDs.insert(VL);

	// Close this variable's previous location range.
	DebugVariable V(*DebugInstr);
	OpenRanges.erase(V);

	OpenRanges.insert(LocId, VL.Var);
	// The newly created DBG_VALUE instruction NewDebugInstr must be inserted
	// after MI. Keep track of the pairing.
	TransferDebugPair MIP = {&MI, NewDebugInstr};
	Transfers.push_back(MIP);
	};

	// End all previous ranges of Var.
	OpenRanges.erase(VarLocIDs[OldVarID].Var);
	switch (Kind) {
	case TransferKind::TransferCopy: {
	assert(NewReg &&
	"No register supplied when handling a copy of a debug value");
	// Create a DBG_VALUE instruction to describe the Var in its new
	// register location.
	NewDebugInstr = BuildMI(
	*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(),
	DebugInstr->isIndirectDebugValue(), NewReg,
	DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression());
	if (DebugInstr->isIndirectDebugValue())
	NewDebugInstr->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
	VarLoc VL(*NewDebugInstr, LS);
	ProcessVarLoc(VL, NewDebugInstr);
	LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: ";
	NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
	/*SkipOpers*/false, /*SkipDebugLoc*/false,
	/*AddNewLine*/true, TII));
	return;
	}
	case TransferKind::TransferSpill: {
	// Create a DBG_VALUE instruction to describe the Var in its spilled
	// location.
	VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
	auto *SpillExpr = DIExpression::prepend(DebugInstr->getDebugExpression(),
	DIExpression::ApplyOffset,
	SpillLocation.SpillOffset);
	NewDebugInstr = BuildMI(
	*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), true,
	SpillLocation.SpillBase, DebugInstr->getDebugVariable(), SpillExpr);
	VarLoc VL(*NewDebugInstr, SpillLocation.SpillBase,
	SpillLocation.SpillOffset, LS);
	ProcessVarLoc(VL, NewDebugInstr);
	LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
	NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
	/*SkipOpers*/false, /*SkipDebugLoc*/false,
	/*AddNewLine*/true, TII));
	return;
	}
	case TransferKind::TransferRestore: {
	assert(NewReg &&
	"No register supplied when handling a restore of a debug value");
	MachineFunction *MF = MI.getMF();
	DIBuilder DIB(*const_cast<Function &>(MF->getFunction()).getParent());
	+
	+ const DIExpression *NewExpr;
	+ if (auto Fragment = DebugInstr->getDebugExpression()->getFragmentInfo())
	+ NewExpr = *DIExpression::createFragmentExpression(DIB.createExpression(),
	+ Fragment->OffsetInBits, Fragment->SizeInBits);
	+ else
	+ NewExpr = DIB.createExpression();
	+
	NewDebugInstr =
	BuildMI(*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false,
	- NewReg, DebugInstr->getDebugVariable(), DIB.createExpression());
	+ NewReg, DebugInstr->getDebugVariable(), NewExpr);
	VarLoc VL(*NewDebugInstr, LS);
	ProcessVarLoc(VL, NewDebugInstr);
	LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: ";
	NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
	/*SkipOpers*/false, /*SkipDebugLoc*/false,
	/*AddNewLine*/true, TII));
	return;
	}
	}
	llvm_unreachable("Invalid transfer kind");
	}

	/// A definition of a register may mark the end of a range.
	///
	/// Collects the IDs of all open VarLocs whose register is redefined or
	/// clobbered by \p MI, erases them from \p OpenRanges, and, when the target
	/// options enable debug entry values, passes the killed set on to
	/// emitEntryValues().
	void LiveDebugValues::transferRegisterDef(
	    MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
	    TransferMap &Transfers, DebugParamMap &DebugEntryVals) {
	  MachineFunction *MF = MI.getMF();
	  const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
	  const unsigned StackPtr = TLI->getStackPointerRegisterToSaveRestore();

	  // Determine whether the operand is a register def. Assume that call
	  // instructions never clobber SP, because some backends (e.g., AArch64)
	  // never list SP in the regmask.
	  auto IsRelevantPhysDef = [&](const MachineOperand &Op) {
	    return Op.isReg() && Op.isDef() && Op.getReg() &&
	           TRI->isPhysicalRegister(Op.getReg()) &&
	           (!MI.isCall() || Op.getReg() != StackPtr);
	  };

	  SparseBitVector<> Killed;
	  for (const MachineOperand &Op : MI.operands()) {
	    if (IsRelevantPhysDef(Op)) {
	      // Remove ranges of all aliased registers.
	      for (MCRegAliasIterator Alias(Op.getReg(), TRI, true); Alias.isValid();
	           ++Alias) {
	        for (unsigned LocID : OpenRanges.getVarLocs()) {
	          if (VarLocIDs[LocID].isDescribedByReg() == *Alias)
	            Killed.set(LocID);
	        }
	      }
	    } else if (Op.isRegMask()) {
	      // Remove ranges of all clobbered registers. Register masks don't usually
	      // list SP as preserved. While the debug info may be off for an
	      // instruction or two around callee-cleanup calls, transferring the
	      // DEBUG_VALUE across the call is still a better user experience.
	      for (unsigned LocID : OpenRanges.getVarLocs()) {
	        const unsigned LocReg = VarLocIDs[LocID].isDescribedByReg();
	        if (LocReg && LocReg != StackPtr && Op.clobbersPhysReg(LocReg))
	          Killed.set(LocID);
	      }
	    }
	  }
	  OpenRanges.erase(Killed, VarLocIDs);

	  // Entry values are only produced when a TargetPassConfig is available and
	  // its target machine has the option switched on.
	  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
	  if (!TPC)
	    return;
	  auto &TM = TPC->getTM<TargetMachine>();
	  if (TM.Options.EnableDebugEntryValues)
	    emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals,
	                    Killed);
	}

	/// Decide if @MI is a spill instruction and return true if it is. We use 2
	/// criteria to make this decision:
	/// - Is this instruction a store to a spill slot?
	/// - Is there a register operand that is both used and killed?
	/// TODO: Store optimization can fold spills into other stores (including
	/// other spills). We do not handle this yet (more than one memory operand).
	///
	/// \param MF  Unused in this function; kept for interface compatibility.
	/// \param Reg Out-parameter. It is overwritten for every operand examined:
	///            with the operand's register when it is a register use, with 0
	///            otherwise. On a true return it holds the spilled register.
	bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
	                                         MachineFunction *MF, unsigned &Reg) {
	  // TODO: Handle multiple stores folded into one.
	  if (!MI.hasOneMemOperand())
	    return false;

	  if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII))
	    return false; // This is not a spill instruction, since no valid size was
	                  // returned from either function.

	  // Reports whether MO is a register use carrying the kill flag, and stores
	  // the used register (or 0 when MO is not a register use) into Reg.
	  auto isKilledReg = [&](const MachineOperand &MO, unsigned &Reg) {
	    if (!MO.isReg() || !MO.isUse()) {
	      Reg = 0;
	      return false;
	    }
	    Reg = MO.getReg();
	    return MO.isKill();
	  };

	  for (const MachineOperand &MO : MI.operands()) {
	    // In a spill instruction generated by the InlineSpiller the spilled
	    // register has its kill flag set.
	    if (isKilledReg(MO, Reg))
	      return true;
	    if (Reg != 0) {
	      // Check whether next instruction kills the spilled register.
	      // FIXME: Current solution does not cover search for killed register in
	      // bundles and instructions further down the chain.
	      auto NextI = std::next(MI.getIterator());
	      // Skip next instruction that points to basic block end iterator.
	      if (MI.getParent()->end() == NextI)
	        continue;
	      unsigned RegNext;
	      for (const MachineOperand &MONext : NextI->operands()) {
	        // Return true if we came across the register from the
	        // previous spill instruction that is killed in NextI.
	        if (isKilledReg(MONext, RegNext) && RegNext == Reg)
	          return true;
	      }
	    }
	  }
	  // Return false if we didn't find spilled register.
	  return false;
	}

	/// Decide whether \p MI restores a register from a spill slot.
	///
	/// Only instructions with exactly one memory operand for which
	/// getRestoreSize() yields a value qualify. In that case \p Reg receives the
	/// register of operand 0 and the spill location read by \p MI is returned;
	/// otherwise None is returned and \p Reg is left untouched. \p MF is unused
	/// in this function.
	Optional<LiveDebugValues::VarLoc::SpillLoc>
	LiveDebugValues::isRestoreInstruction(const MachineInstr &MI,
	                                      MachineFunction *MF, unsigned &Reg) {
	  if (!MI.hasOneMemOperand())
	    return None;

	  // FIXME: Handle folded restore instructions with more than one memory
	  // operand.
	  if (!MI.getRestoreSize(TII))
	    return None;

	  Reg = MI.getOperand(0).getReg();
	  return extractSpillBaseRegAndOffset(MI);
	}

	/// A spilled register may indicate that we have to end the current range of
	/// a variable and create a new one for the spill location.
	/// A restored register may indicate the reverse situation.
	/// We don't want to insert any instructions in process(), so we just create
	/// the DBG_VALUE without inserting it and keep track of it in \p Transfers.
	/// It will be inserted into the BB when we're done iterating over the
	/// instructions.
	///
	/// \p MI is first classified as a spill, else as a restore; if it is neither
	/// the function returns without touching any state. Only the first matching
	/// open VarLoc is transferred: the loop returns right after the call to
	/// insertTransferDebugPair().
	void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
	OpenRangesSet &OpenRanges,
	VarLocMap &VarLocIDs,
	TransferMap &Transfers) {
	MachineFunction *MF = MI.getMF();
	TransferKind TKind;
	// Reg is written by isSpillInstruction()/isRestoreInstruction() below.
	unsigned Reg;
	// Loc is only assigned on the restore path.
	Optional<VarLoc::SpillLoc> Loc;

	LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););

	if (isSpillInstruction(MI, MF, Reg)) {
	TKind = TransferKind::TransferSpill;
	LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump(););
	LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
	<< "\n");
	} else {
	// Neither a spill nor a restore: nothing to transfer.
	if (!(Loc = isRestoreInstruction(MI, MF, Reg)))
	return;
	TKind = TransferKind::TransferRestore;
	LLVM_DEBUG(dbgs() << "Recognized as restore: "; MI.dump(););
	LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
	<< "\n");
	}
	// Check if the register or spill location is the location of a debug value.
	+ // FIXME: Don't create a spill transfer if there is a complex expression,
	+ // because we currently cannot recover the original expression on restore.
	for (unsigned ID : OpenRanges.getVarLocs()) {
	+ const MachineInstr *DebugInstr = &VarLocIDs[ID].MI;
	+
	if (TKind == TransferKind::TransferSpill &&
	- VarLocIDs[ID].isDescribedByReg() == Reg) {
	+ VarLocIDs[ID].isDescribedByReg() == Reg &&
	+ !DebugInstr->getDebugExpression()->isComplex()) {
	LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
	<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
	// NOTE(review): Loc.SpillLocation is compared without a visible check of
	// which kind of location this VarLoc holds -- confirm against VarLoc.
	} else if (TKind == TransferKind::TransferRestore &&
	VarLocIDs[ID].Loc.SpillLocation == *Loc) {
	LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
	<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
	} else
	continue;
	// For a spill, Reg is the spilled register; for a restore, it is the
	// register of operand 0 of MI.
	insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID, TKind,
	Reg);
	return;
	}
	}

	/// If \p MI is a register copy instruction, that copies a previously tracked
	/// value from one register to another register that is callee saved, we
	/// create new DBG_VALUE instruction described with copy destination register.
	///
	/// The copy is only considered when its source operand is killed and its
	/// destination operand is a def. Only the first open VarLoc described by the
	/// source register is transferred.
	void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
	                                           OpenRangesSet &OpenRanges,
	                                           VarLocMap &VarLocIDs,
	                                           TransferMap &Transfers) {
	  // Both pointers are filled in by isCopyInstr() when it succeeds.
	  const MachineOperand *SrcRegOp = nullptr;
	  const MachineOperand *DestRegOp = nullptr;

	  if (!TII->isCopyInstr(MI, SrcRegOp, DestRegOp) || !SrcRegOp->isKill() ||
	      !DestRegOp->isDef())
	    return;

	  // A register counts as callee saved when it or any of its aliases is set
	  // in CalleeSavedRegs.
	  auto isCalleeSavedReg = [&](unsigned Reg) {
	    for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
	      if (CalleeSavedRegs.test(*RAI))
	        return true;
	    return false;
	  };

	  unsigned SrcReg = SrcRegOp->getReg();
	  unsigned DestReg = DestRegOp->getReg();

	  // We want to recognize instructions where destination register is callee
	  // saved register. If register that could be clobbered by the call is
	  // included, there would be a great chance that it is going to be clobbered
	  // soon. It is more likely that previous register location, which is callee
	  // saved, is going to stay unclobbered longer, even if it is killed.
	  if (!isCalleeSavedReg(DestReg))
	    return;

	  for (unsigned ID : OpenRanges.getVarLocs()) {
	    if (VarLocIDs[ID].isDescribedByReg() == SrcReg) {
	      insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID,
	                              TransferKind::TransferCopy, DestReg);
	      return;
	    }
	  }
	}

	/// Terminate all open ranges at the end of the current basic block.
	///
	/// Acts only when \p MI is a terminator or the last instruction of its block
	/// and \p OpenRanges is non-empty. The open locations are merged into the
	/// block's entry in \p OutLocs, and \p OpenRanges is cleared afterwards.
	/// \returns true if the merge changed the block's OutLocs set.
	bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
	                                             OpenRangesSet &OpenRanges,
	                                             VarLocInMBB &OutLocs,
	                                             const VarLocMap &VarLocIDs) {
	  bool Changed = false;
	  const MachineBasicBlock *CurMBB = MI.getParent();
	  if (!(MI.isTerminator() || (&MI == &CurMBB->back())))
	    return false;

	  if (OpenRanges.empty())
	    return false;

	  LLVM_DEBUG(for (unsigned ID
	                  : OpenRanges.getVarLocs()) {
	    // Copy OpenRanges to OutLocs, if not already present.
	    dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": ";
	    VarLocIDs[ID].dump();
	  });
	  VarLocSet &VLS = OutLocs[CurMBB];
	  // operator|= reports whether the union added anything to VLS.
	  Changed = VLS |= OpenRanges.getVarLocs();
	  // New OutLocs set may be different due to spill, restore or register
	  // copy instruction processing.
	  if (Changed)
	    VLS = OpenRanges.getVarLocs();
	  OpenRanges.clear();
	  return Changed;
	}

	/// Accumulate a mapping between each DILocalVariable fragment and other
	/// fragments of that DILocalVariable which overlap. This reduces work during
	/// the data-flow stage from "Find any overlapping fragments" to "Check if the
	/// known-to-overlap fragments are present".
	/// \param MI A previously unprocessed DEBUG_VALUE instruction to analyze for
	/// fragment usage.
	/// \param SeenFragments Map from DILocalVariable to all fragments of that
	/// Variable which are known to exist.
	/// \param OverlappingFragments The overlap map being constructed, from one
	/// Var/Fragment pair to a vector of fragments known to overlap.
	void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI,
	                                            VarToFragments &SeenFragments,
	                                            OverlapMap &OverlappingFragments) {
	  DebugVariable MIVar(MI);
	  FragmentInfo NewFragment = MIVar.getFragmentDefault();
	  auto Var = MIVar.getVar();

	  auto KnownFragmentsIt = SeenFragments.find(Var);
	  if (KnownFragmentsIt == SeenFragments.end()) {
	    // First sighting of this variable: nothing can overlap yet. Start its
	    // set of seen fragments and give the fragment an empty overlap list.
	    SmallSet<FragmentInfo, 4> Singleton;
	    Singleton.insert(NewFragment);
	    SeenFragments.insert({Var, Singleton});

	    OverlappingFragments.insert({{Var, NewFragment}, {}});
	    return;
	  }

	  // A Variable/Fragment pair that is already in the overlap map has been
	  // accounted for on an earlier call.
	  auto Inserted = OverlappingFragments.insert({{Var, NewFragment}, {}});
	  if (!Inserted.second)
	    return;

	  auto &OverlapsOfNew = Inserted.first->second;
	  auto &KnownFragments = KnownFragmentsIt->second;

	  // The fragment is new for this variable: compare it against every fragment
	  // seen before and record each overlap in both directions.
	  for (auto &Known : KnownFragments) {
	    if (!DIExpression::fragmentsOverlap(NewFragment, Known))
	      continue;

	    // The new fragment is overlapped by the known one ...
	    OverlapsOfNew.push_back(Known);
	    // ... and the known one is overlapped by the new fragment.
	    auto OverlapsOfKnown = OverlappingFragments.find({Var, Known});
	    assert(OverlapsOfKnown != OverlappingFragments.end() &&
	           "Previously seen var fragment has no vector of overlaps");
	    OverlapsOfKnown->second.push_back(NewFragment);
	  }

	  KnownFragments.insert(NewFragment);
	}

	/// This routine creates OpenRanges and OutLocs.
	///
	/// Runs the per-instruction transfer functions on \p MI in a fixed order:
	/// debug value, register def, then (only when \p transferChanges is set)
	/// register copy and spill/restore. When \p transferChanges is not set, a
	/// DBG_VALUE is instead fed to accumulateFragmentMap().
	/// \returns the result of transferTerminatorInst() for \p MI.
	bool LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
	                              VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
	                              TransferMap &Transfers,
	                              DebugParamMap &DebugEntryVals,
	                              bool transferChanges,
	                              OverlapMap &OverlapFragments,
	                              VarToFragments &SeenFragments) {
	  transferDebugValue(MI, OpenRanges, VarLocIDs);
	  transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals);

	  if (!transferChanges) {
	    // Build up a map of overlapping fragments on the first run through.
	    if (MI.isDebugValue())
	      accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
	  } else {
	    transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
	    transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
	  }

	  return transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
	}

	/// This routine joins the analysis results of all incoming edges in @MBB by
	/// inserting a new DBG_VALUE instruction at the start of the @MBB - if the same
	/// source variable in all the predecessors of @MBB reside in the same location.
	///
	/// Predecessors that are not in \p Visited are skipped. If any visited
	/// predecessor has no entry in \p OutLocs the join is abandoned. Unless
	/// \p MBB is in \p ArtificialBlocks, locations for which dominates(MBB) is
	/// false are filtered out of the result.
	/// \returns true if at least one DBG_VALUE was inserted into \p MBB.
	bool LiveDebugValues::join(
	    MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
	    const VarLocMap &VarLocIDs,
	    SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
	    SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks) {
	  LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
	  bool Changed = false;

	  VarLocSet InLocsT; // Temporary incoming locations.

	  // For all predecessors of this MBB, find the set of VarLocs that
	  // can be joined.
	  int NumVisited = 0;
	  for (auto p : MBB.predecessors()) {
	    // Ignore unvisited predecessor blocks. As we are processing
	    // the blocks in reverse post-order any unvisited block can
	    // be considered to not remove any incoming values.
	    if (!Visited.count(p)) {
	      LLVM_DEBUG(dbgs() << " ignoring unvisited pred MBB: " << p->getNumber()
	                        << "\n");
	      continue;
	    }
	    auto OL = OutLocs.find(p);
	    // Join is null in case of empty OutLocs from any of the pred.
	    if (OL == OutLocs.end())
	      return false;

	    // Just copy over the Out locs to incoming locs for the first visited
	    // predecessor, and for all other predecessors join the Out locs.
	    if (!NumVisited)
	      InLocsT = OL->second;
	    else
	      InLocsT &= OL->second;

	    LLVM_DEBUG({
	      if (!InLocsT.empty()) {
	        for (auto ID : InLocsT)
	          dbgs() << " gathered candidate incoming var: "
	                 << VarLocIDs[ID].Var.getVar()->getName() << "\n";
	      }
	    });

	    NumVisited++;
	  }

	  // Filter out DBG_VALUES that are out of scope.
	  VarLocSet KillSet;
	  bool IsArtificial = ArtificialBlocks.count(&MBB);
	  if (!IsArtificial) {
	    for (auto ID : InLocsT) {
	      if (!VarLocIDs[ID].dominates(MBB)) {
	        KillSet.set(ID);
	        LLVM_DEBUG({
	          auto Name = VarLocIDs[ID].Var.getVar()->getName();
	          dbgs() << " killing " << Name << ", it doesn't dominate MBB\n";
	        });
	      }
	    }
	  }
	  InLocsT.intersectWithComplement(KillSet);

	  // As we are processing blocks in reverse post-order we
	  // should have processed at least one predecessor, unless it
	  // is the entry block which has no predecessor.
	  assert((NumVisited || MBB.pred_empty()) &&
	         "Should have processed at least one predecessor");
	  if (InLocsT.empty())
	    return false;

	  VarLocSet &ILS = InLocs[&MBB];

	  // Insert DBG_VALUE instructions, if not already inserted.
	  VarLocSet Diff = InLocsT;
	  Diff.intersectWithComplement(ILS);
	  for (auto ID : Diff) {
	    // This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a
	    // new range is started for the var from the mbb's beginning by inserting
	    // a new DBG_VALUE. process() will end this range however appropriate.
	    const VarLoc &DiffIt = VarLocIDs[ID];
	    const MachineInstr *DebugInstr = &DiffIt.MI;
	    MachineInstr *MI = nullptr;
	    if (DiffIt.isConstant()) {
	      // Constant locations copy operand 0 as-is and are never indirect.
	      MachineOperand MO(DebugInstr->getOperand(0));
	      MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
	                   DebugInstr->getDesc(), false, MO,
	                   DebugInstr->getDebugVariable(),
	                   DebugInstr->getDebugExpression());
	    } else {
	      MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
	                   DebugInstr->getDesc(), DebugInstr->isIndirectDebugValue(),
	                   DebugInstr->getOperand(0).getReg(),
	                   DebugInstr->getDebugVariable(),
	                   DebugInstr->getDebugExpression());
	      if (DebugInstr->isIndirectDebugValue())
	        MI->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
	    }
	    LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
	    ILS.set(ID);
	    ++NumInserted;
	    Changed = true;
	  }
	  return Changed;
	}

	/// Calculate the liveness information for the given machine function and
	/// extend ranges across basic blocks.
	///
	/// Phases, in order:
	///  1. Collect entry-value candidate DBG_VALUEs from the first block.
	///  2. Run process() over every instruction without copy/spill transfers to
	///     initialize OutLocs and the fragment overlap map.
	///  3. Record which blocks are artificial.
	///  4. Iterate join() and process() over two priority worklists, ordered by
	///     reverse post-order number, until both are empty.
	/// \returns true if join() reported a change for any block.
	bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
	  LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");

	  bool Changed = false;
	  bool OLChanged = false;
	  bool MBBJoined = false;

	  VarLocMap VarLocIDs;         // Map VarLoc<>unique ID for use in bitvectors.
	  OverlapMap OverlapFragments; // Map of overlapping variable fragments
	  OpenRangesSet OpenRanges(OverlapFragments);
	                               // Ranges that are open until end of bb.
	  VarLocInMBB OutLocs;         // Ranges that exist beyond bb.
	  VarLocInMBB InLocs;          // Ranges that are incoming after joining.
	  TransferMap Transfers;       // DBG_VALUEs associated with spills.

	  VarToFragments SeenFragments;

	  // Blocks which are artificial, i.e. blocks which exclusively contain
	  // instructions without locations, or with line 0 locations.
	  SmallPtrSet<const MachineBasicBlock *, 16> ArtificialBlocks;

	  DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
	  DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
	  std::priority_queue<unsigned int, std::vector<unsigned int>,
	                      std::greater<unsigned int>>
	      Worklist;
	  std::priority_queue<unsigned int, std::vector<unsigned int>,
	                      std::greater<unsigned int>>
	      Pending;

	  enum : bool { dontTransferChanges = false, transferChanges = true };

	  // Besides parameter's modification, check whether a DBG_VALUE is inlined
	  // in order to deduce whether the variable that it tracks comes from
	  // a different function. If that is the case we can't track its entry value.
	  auto IsUnmodifiedFuncParam = [&](const MachineInstr &MI) {
	    auto *DIVar = MI.getDebugVariable();
	    return DIVar->isParameter() && DIVar->isNotModified() &&
	           !MI.getDebugLoc()->getInlinedAt();
	  };

	  const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
	  unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
	  unsigned FP = TRI->getFrameRegister(MF);
	  auto IsRegOtherThanSPAndFP = [&](const MachineOperand &Op) -> bool {
	    return Op.isReg() && Op.getReg() != SP && Op.getReg() != FP;
	  };

	  // Working set of currently collected debug variables mapped to DBG_VALUEs
	  // representing candidates for production of debug entry values.
	  DebugParamMap DebugEntryVals;

	  MachineBasicBlock &First_MBB = *(MF.begin());
	  // Only in the case of entry MBB collect DBG_VALUEs representing
	  // function parameters in order to generate debug entry values for them.
	  // Currently, we generate debug entry values only for parameters that are
	  // unmodified throughout the function and located in a register.
	  // TODO: Add support for parameters that are described as fragments.
	  // TODO: Add support for modified arguments that can be expressed
	  // by using its entry value.
	  // TODO: Add support for local variables that are expressed in terms of
	  // parameters entry values.
	  for (auto &MI : First_MBB)
	    if (MI.isDebugValue() && IsUnmodifiedFuncParam(MI) &&
	        !MI.isIndirectDebugValue() && IsRegOtherThanSPAndFP(MI.getOperand(0)) &&
	        !DebugEntryVals.count(MI.getDebugVariable()) &&
	        !MI.getDebugExpression()->isFragment())
	      DebugEntryVals[MI.getDebugVariable()] = &MI;

	  // Initialize every mbb with OutLocs.
	  // We are not looking at any spill instructions during the initial pass
	  // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
	  // instructions for spills of registers that are known to be user variables
	  // within the BB in which the spill occurs.
	  for (auto &MBB : MF) {
	    for (auto &MI : MBB) {
	      process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers, DebugEntryVals,
	              dontTransferChanges, OverlapFragments, SeenFragments);
	    }
	    // Add any entry DBG_VALUE instructions necessitated by parameter
	    // clobbering.
	    for (auto &TR : Transfers) {
	      MBB.insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
	                      TR.DebugInst);
	    }
	    Transfers.clear();
	  }

	  auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
	    if (const DebugLoc &DL = MI.getDebugLoc())
	      return DL.getLine() != 0;
	    return false;
	  };
	  for (auto &MBB : MF)
	    if (none_of(MBB.instrs(), hasNonArtificialLocation))
	      ArtificialBlocks.insert(&MBB);

	  LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
	                              "OutLocs after initialization", dbgs()));

	  // Number the blocks in reverse post-order and seed the worklist with all
	  // of them; the smallest number is popped first.
	  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
	  unsigned int RPONumber = 0;
	  for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) {
	    OrderToBB[RPONumber] = *RI;
	    BBToOrder[*RI] = RPONumber;
	    Worklist.push(RPONumber);
	    ++RPONumber;
	  }
	  // This is a standard "union of predecessor outs" dataflow problem.
	  // To solve it, we perform join() and process() using the two worklist method
	  // until the ranges converge.
	  // Ranges have converged when both worklists are empty.
	  SmallPtrSet<const MachineBasicBlock *, 16> Visited;
	  while (!Worklist.empty() || !Pending.empty()) {
	    // We track what is on the pending worklist to avoid inserting the same
	    // thing twice. We could avoid this with a custom priority queue, but this
	    // is probably not worth it.
	    SmallPtrSet<MachineBasicBlock *, 16> OnPending;
	    LLVM_DEBUG(dbgs() << "Processing Worklist\n");
	    while (!Worklist.empty()) {
	      MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
	      Worklist.pop();
	      MBBJoined =
	          join(*MBB, OutLocs, InLocs, VarLocIDs, Visited, ArtificialBlocks);
	      Visited.insert(MBB);
	      if (MBBJoined) {
	        MBBJoined = false;
	        Changed = true;
	        // Now that we have started to extend ranges across BBs we need to
	        // examine spill instructions to see whether they spill registers that
	        // correspond to user variables.
	        for (auto &MI : *MBB)
	          OLChanged |=
	              process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers,
	                      DebugEntryVals, transferChanges, OverlapFragments,
	                      SeenFragments);

	        // Add any DBG_VALUE instructions necessitated by spills.
	        for (auto &TR : Transfers)
	          MBB->insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
	                           TR.DebugInst);
	        Transfers.clear();

	        LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
	                                    "OutLocs after propagating", dbgs()));
	        LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
	                                    "InLocs after propagating", dbgs()));

	        // A changed OutLocs set means the successors must be re-joined.
	        if (OLChanged) {
	          OLChanged = false;
	          for (auto s : MBB->successors())
	            if (OnPending.insert(s).second) {
	              Pending.push(BBToOrder[s]);
	            }
	        }
	      }
	    }
	    Worklist.swap(Pending);
	    // At this point, pending must be empty, since it was just the empty
	    // worklist
	    assert(Pending.empty() && "Pending should be empty");
	  }

	  LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
	  LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
	  return Changed;
	}

	/// Pass entry point. Bails out for functions without a subprogram and for
	/// functions whose compile unit has emission kind NoDebug; otherwise caches
	/// the target hooks, computes the callee-saved register set, initializes the
	/// lexical scopes and runs ExtendRanges().
	/// \returns whatever ExtendRanges() reports, or false when bailing out.
	bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
	  auto *Subprogram = MF.getFunction().getSubprogram();
	  // LiveDebugValues will already have removed all DBG_VALUEs.
	  if (!Subprogram)
	    return false;

	  // Skip functions from NoDebug compilation units.
	  if (Subprogram->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
	    return false;

	  const auto &Subtarget = MF.getSubtarget();
	  TRI = Subtarget.getRegisterInfo();
	  TII = Subtarget.getInstrInfo();
	  TFI = Subtarget.getFrameLowering();
	  // The scavenger is a temporary that only lives for this one call.
	  TFI->determineCalleeSaves(MF, CalleeSavedRegs,
	                            make_unique<RegScavenger>().get());
	  LS.initialize(MF);

	  return ExtendRanges(MF);
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/MachineCSE.cpp (revision 351303)
	@@ -1,871 +1,896 @@
	//===- MachineCSE.cpp - Machine Common Subexpression Elimination Pass -----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass performs global common subexpression elimination on machine
	// instructions using a scoped hash table based value numbering scheme. It
	// must be run while the machine function is still in SSA form.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/ScopedHashTable.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/CFG.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
	#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/RecyclingAllocator.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>
	#include <iterator>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "machine-cse"

	// Counters for this pass, declared with the STATISTIC macro. Each entry
	// pairs a counter name with the description string shown for it.
	STATISTIC(NumCoalesces, "Number of copies coalesced");
	STATISTIC(NumCSEs, "Number of common subexpression eliminated");
	STATISTIC(NumPREs, "Number of partial redundant expression"
	" transformed to fully redundant");
	STATISTIC(NumPhysCSEs,
	"Number of physreg referencing common subexpr eliminated");
	STATISTIC(NumCrossBBCSEs,
	"Number of cross-MBB physreg referencing CS eliminated");
	STATISTIC(NumCommutes, "Number of copies coalesced after commuting");

	namespace {

	/// Machine function pass that runs a simple PRE step followed by common
	/// subexpression elimination over the machine dominator tree (see
	/// runOnMachineFunction).
	class MachineCSE : public MachineFunctionPass {
	// Target hooks and analyses; all are assigned in runOnMachineFunction.
	const TargetInstrInfo *TII;
	const TargetRegisterInfo *TRI;
	AliasAnalysis *AA;
	MachineDominatorTree *DT;
	MachineRegisterInfo *MRI;
	+ MachineBlockFrequencyInfo *MBFI;

	public:
	static char ID; // Pass identification

	MachineCSE() : MachineFunctionPass(ID) {
	initializeMachineCSEPass(*PassRegistry::getPassRegistry());
	}

	bool runOnMachineFunction(MachineFunction &MF) override;

	/// Declares the analyses this pass requires and the ones it preserves.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	AU.addRequired<AAResultsWrapperPass>();
	AU.addPreservedID(MachineLoopInfoID);
	AU.addRequired<MachineDominatorTree>();
	AU.addPreserved<MachineDominatorTree>();
	+ AU.addRequired<MachineBlockFrequencyInfo>();
	+ AU.addPreserved<MachineBlockFrequencyInfo>();
	}

	/// Drops the per-function scope, PRE and expression tables.
	void releaseMemory() override {
	ScopeMap.clear();
	PREMap.clear();
	Exps.clear();
	}

	private:
	using AllocatorTy = RecyclingAllocator<BumpPtrAllocator,
	ScopedHashTableVal<MachineInstr *, unsigned>>;
	using ScopedHTType =
	ScopedHashTable<MachineInstr *, unsigned, MachineInstrExpressionTrait,
	AllocatorTy>;
	using ScopeType = ScopedHTType::ScopeTy;
	// Pairs of (operand index, physical register); see hasLivePhysRegDefUses.
	using PhysDefVector = SmallVector<std::pair<unsigned, unsigned>, 2>;

	// Maximum number of instructions scanned by the look-ahead helpers.
	unsigned LookAheadLimit = 0;
	// Open value-numbering scope for each block currently being processed.
	DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
	// Block recorded for each expression seen by the PRE step.
	DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait>
	PREMap;
	// Scoped table mapping an expression to its value number.
	ScopedHTType VNT;
	// Exps[VN] is the instruction that was assigned value number VN.
	SmallVector<MachineInstr *, 64> Exps;
	unsigned CurrVN = 0;

	bool PerformTrivialCopyPropagation(MachineInstr *MI,
	MachineBasicBlock *MBB);
	bool isPhysDefTriviallyDead(unsigned Reg,
	MachineBasicBlock::const_iterator I,
	MachineBasicBlock::const_iterator E) const;
	bool hasLivePhysRegDefUses(const MachineInstr *MI,
	const MachineBasicBlock *MBB,
	SmallSet<unsigned, 8> &PhysRefs,
	PhysDefVector &PhysDefs, bool &PhysUseDef) const;
	bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
	SmallSet<unsigned, 8> &PhysRefs,
	PhysDefVector &PhysDefs, bool &NonLocal) const;
	bool isCSECandidate(MachineInstr *MI);
	bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
	MachineBasicBlock *CSBB, MachineInstr *MI);
	void EnterScope(MachineBasicBlock *MBB);
	void ExitScope(MachineBasicBlock *MBB);
	bool ProcessBlockCSE(MachineBasicBlock *MBB);
	void ExitScopeIfDone(MachineDomTreeNode *Node,
	DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
	bool PerformCSE(MachineDomTreeNode *Node);

	bool isPRECandidate(MachineInstr *MI);
	bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
	bool PerformSimplePRE(MachineDominatorTree *DT);
	+ /// Heuristics to see if it's beneficial to move common computations of MBB
	+ /// and MBB1 to CandidateBB.
	+ bool isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
	+ MachineBasicBlock *MBB,
	+ MachineBasicBlock *MBB1);
	};

	} // end anonymous namespace

	// Storage for the pass identifier declared in the class.
	char MachineCSE::ID = 0;

	// Exported reference to the pass identifier.
	char &llvm::MachineCSEID = MachineCSE::ID;

	// Pass registration, listing MachineDominatorTree and AAResultsWrapperPass
	// as dependencies.
	// NOTE(review): getAnalysisUsage also requires MachineBlockFrequencyInfo,
	// which is not listed here - confirm whether a dependency entry is needed.
	INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
	"Machine Common Subexpression Elimination", false, false)
	INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
	"Machine Common Subexpression Elimination", false, false)

	/// The source register of a COPY machine instruction can be propagated to all
	/// its users, and this propagation could increase the probability of finding
	/// common subexpressions. If the COPY has only one user, the COPY itself can
	/// be removed.
	///
	/// Returns true if any operand of MI was rewritten.
	bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
	MachineBasicBlock *MBB) {
	bool Changed = false;
	for (MachineOperand &MO : MI->operands()) {
	if (!MO.isReg() || !MO.isUse())
	continue;
	unsigned Reg = MO.getReg();
	if (!TargetRegisterInfo::isVirtualRegister(Reg))
	continue;
	// Query the use count before MO is rewritten below.
	bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg);
	MachineInstr *DefMI = MRI->getVRegDef(Reg);
	if (!DefMI->isCopy())
	continue;
	unsigned SrcReg = DefMI->getOperand(1).getReg();
	if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
	continue;
	if (DefMI->getOperand(0).getSubReg())
	continue;
	// FIXME: We should trivially coalesce subregister copies to expose CSE
	// opportunities on instructions with truncated operands (see
	// cse-add-with-overflow.ll). This can be done here as follows:
	// if (SrcSubReg)
	// RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
	// SrcSubReg);
	// MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
	//
	// The 2-addr pass has been updated to handle coalesced subregs. However,
	// some machine-specific code still can't handle it.
	// To handle it properly we also need a way find a constrained subregister
	// class given a super-reg class and subreg index.
	if (DefMI->getOperand(1).getSubReg())
	continue;
	if (!MRI->constrainRegAttrs(SrcReg, Reg))
	continue;
	LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
	LLVM_DEBUG(dbgs() << "*** to: " << *MI);

	// Update matching debug values.
	DefMI->changeDebugValuesDefReg(SrcReg);

	// Propagate SrcReg of copies to MI.
	MO.setReg(SrcReg);
	MRI->clearKillFlags(SrcReg);
	// Coalesce single use copies.
	if (OnlyOneUse) {
	DefMI->eraseFromParent();
	++NumCoalesces;
	}
	Changed = true;
	}

	return Changed;
	}

	/// Scan forward from I (at most LookAheadLimit non-debug instructions, never
	/// past E) and return true if a def or regmask clobber of Reg, or of a
	/// register overlapping it, is found before any use. Returns false when a use
	/// is found first, when E is reached, or when the look-ahead budget runs out.
	bool
	MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
	MachineBasicBlock::const_iterator I,
	MachineBasicBlock::const_iterator E) const {
	unsigned LookAheadLeft = LookAheadLimit;
	while (LookAheadLeft) {
	// Skip over dbg_value's.
	I = skipDebugInstructionsForward(I, E);

	if (I == E)
	// Reached end of block, we don't know if register is dead or not.
	return false;

	bool SeenDef = false;
	for (const MachineOperand &MO : I->operands()) {
	if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
	SeenDef = true;
	if (!MO.isReg() || !MO.getReg())
	continue;
	if (!TRI->regsOverlap(MO.getReg(), Reg))
	continue;
	if (MO.isUse())
	// Found a use!
	return false;
	SeenDef = true;
	}
	if (SeenDef)
	// See a def of Reg (or an alias) before encountering any use, it's
	// trivially dead.
	return true;

	--LookAheadLeft;
	++I;
	}
	return false;
	}

	/// Return true if Reg is reported as caller-preserved by TRI, or if the
	/// reserved registers are frozen and MRI reports Reg as a constant physreg.
	static bool isCallerPreservedOrConstPhysReg(unsigned Reg,
	const MachineFunction &MF,
	const TargetRegisterInfo &TRI) {
	// MachineRegisterInfo::isConstantPhysReg directly called by
	// MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the
	// reserved registers to be frozen. That doesn't cause a problem post-ISel as
	// most (if not all) targets freeze reserved registers right after ISel.
	//
	// It does cause issues mid-GlobalISel, however, hence the additional
	// reservedRegsFrozen check.
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	return TRI.isCallerPreservedPhysReg(Reg, MF) ||
	(MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg));
	}

	/// hasLivePhysRegDefUses - Return true if the specified instruction read/write
	/// physical registers (except for dead defs of physical registers). It also
	/// returns the physical register def by reference if it's the only one and the
	/// instruction does not use a physical register.
	///
	/// On return PhysRefs holds the aliases of every physreg read (other than
	/// caller-preserved or constant ones) and of every live physreg def; PhysDefs
	/// holds (operand index, register) for each live physreg def; PhysUseDef is
	/// set if a physreg def is also present in the collected uses.
	bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
	const MachineBasicBlock *MBB,
	SmallSet<unsigned, 8> &PhysRefs,
	PhysDefVector &PhysDefs,
	bool &PhysUseDef) const {
	// First, add all uses to PhysRefs.
	for (const MachineOperand &MO : MI->operands()) {
	if (!MO.isReg() || MO.isDef())
	continue;
	unsigned Reg = MO.getReg();
	if (!Reg)
	continue;
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	continue;
	// Reading either caller preserved or constant physregs is ok.
	if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI))
	for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	PhysRefs.insert(*AI);
	}

	// Next, collect all defs into PhysDefs. If any is already in PhysRefs
	// (which currently contains only uses), set the PhysUseDef flag.
	PhysUseDef = false;
	MachineBasicBlock::const_iterator I = MI; I = std::next(I);
	for (const auto &MOP : llvm::enumerate(MI->operands())) {
	const MachineOperand &MO = MOP.value();
	if (!MO.isReg() || !MO.isDef())
	continue;
	unsigned Reg = MO.getReg();
	if (!Reg)
	continue;
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	continue;
	// Check against PhysRefs even if the def is "dead".
	if (PhysRefs.count(Reg))
	PhysUseDef = true;
	// If the def is dead, it's ok. But the def may not marked "dead". That's
	// common since this pass is run before livevariables. We can scan
	// forward a few instructions and check if it is obviously dead.
	if (!MO.isDead() && !isPhysDefTriviallyDead(Reg, I, MBB->end()))
	PhysDefs.push_back(std::make_pair(MOP.index(), Reg));
	}

	// Finally, add all defs to PhysRefs as well.
	for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
	for (MCRegAliasIterator AI(PhysDefs[i].second, TRI, true); AI.isValid();
	++AI)
	PhysRefs.insert(*AI);

	return !PhysRefs.empty();
	}

	bool MachineCSE::PhysRegDefsReach(MachineInstr CSMI, MachineInstr MI,
	SmallSet<unsigned, 8> &PhysRefs,
	PhysDefVector &PhysDefs,
	bool &NonLocal) const {
	// For now conservatively returns false if the common subexpression is
	// not in the same basic block as the given instruction. The only exception
	// is if the common subexpression is in the sole predecessor block.
	const MachineBasicBlock *MBB = MI->getParent();
	const MachineBasicBlock *CSMBB = CSMI->getParent();

	bool CrossMBB = false;
	if (CSMBB != MBB) {
	if (MBB->pred_size() != 1 \|\| *MBB->pred_begin() != CSMBB)
	return false;

	for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
	if (MRI->isAllocatable(PhysDefs[i].second) \|\|
	MRI->isReserved(PhysDefs[i].second))
	// Avoid extending live range of physical registers if they are
	//allocatable or reserved.
	return false;
	}
	CrossMBB = true;
	}
	MachineBasicBlock::const_iterator I = CSMI; I = std::next(I);
	MachineBasicBlock::const_iterator E = MI;
	MachineBasicBlock::const_iterator EE = CSMBB->end();
	unsigned LookAheadLeft = LookAheadLimit;
	while (LookAheadLeft) {
	// Skip over dbg_value's.
	while (I != E && I != EE && I->isDebugInstr())
	++I;

	if (I == EE) {
	assert(CrossMBB && "Reaching end-of-MBB without finding MI?");
	(void)CrossMBB;
	CrossMBB = false;
	NonLocal = true;
	I = MBB->begin();
	EE = MBB->end();
	continue;
	}

	if (I == E)
	return true;

	for (const MachineOperand &MO : I->operands()) {
	// RegMasks go on instructions like calls that clobber lots of physregs.
	// Don't attempt to CSE across such an instruction.
	if (MO.isRegMask())
	return false;
	if (!MO.isReg() \|\| !MO.isDef())
	continue;
	unsigned MOReg = MO.getReg();
	if (TargetRegisterInfo::isVirtualRegister(MOReg))
	continue;
	if (PhysRefs.count(MOReg))
	return false;
	}

	--LookAheadLeft;
	++I;
	}

	return false;
	}

	/// Return true if MI is an instruction this pass may try to CSE. Position,
	/// PHI, implicit-def, kill, inline-asm, debug and copy-like instructions are
	/// rejected, as are stores, calls, terminators, instructions that may raise
	/// FP exceptions or have unmodeled side effects, loads that are not
	/// dereferenceable invariant loads, and LOAD_STACK_GUARD.
	bool MachineCSE::isCSECandidate(MachineInstr *MI) {
	if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() ||
	MI->isInlineAsm() || MI->isDebugInstr())
	return false;

	// Ignore copies.
	if (MI->isCopyLike())
	return false;

	// Ignore stuff that we obviously can't move.
	if (MI->mayStore() || MI->isCall() || MI->isTerminator() ||
	MI->mayRaiseFPException() || MI->hasUnmodeledSideEffects())
	return false;

	if (MI->mayLoad()) {
	// Okay, this instruction does a load. As a refinement, we allow the target
	// to decide whether the loaded value is actually a constant. If so, we can
	// actually use it as a load.
	if (!MI->isDereferenceableInvariantLoad(AA))
	// FIXME: we should be able to hoist loads with no other side effects if
	// there are no other instructions which can change memory in this loop.
	// This is a trivial form of alias analysis.
	return false;
	}

	// Ignore stack guard loads, otherwise the register that holds CSEed value may
	// be spilled and get loaded back with corrupted data.
	if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD)
	return false;

	return true;
	}

	/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
	/// common expression that defines Reg. CSBB is basic block where CSReg is
	/// defined.
	bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
	MachineBasicBlock CSBB, MachineInstr MI) {
	// FIXME: Heuristics that works around the lack the live range splitting.

	// If CSReg is used at all uses of Reg, CSE should not increase register
	// pressure of CSReg.
	bool MayIncreasePressure = true;
	if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
	TargetRegisterInfo::isVirtualRegister(Reg)) {
	MayIncreasePressure = false;
	SmallPtrSet<MachineInstr*, 8> CSUses;
	for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
	CSUses.insert(&MI);
	}
	for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
	if (!CSUses.count(&MI)) {
	MayIncreasePressure = true;
	break;
	}
	}
	}
	if (!MayIncreasePressure) return true;

	// Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
	// an immediate predecessor. We don't want to increase register pressure and
	// end up causing other computation to be spilled.
	if (TII->isAsCheapAsAMove(*MI)) {
	MachineBasicBlock *BB = MI->getParent();
	if (CSBB != BB && !CSBB->isSuccessor(BB))
	return false;
	}

	// Heuristics #2: If the expression doesn't not use a vr and the only use
	// of the redundant computation are copies, do not cse.
	bool HasVRegUse = false;
	for (const MachineOperand &MO : MI->operands()) {
	if (MO.isReg() && MO.isUse() &&
	TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
	HasVRegUse = true;
	break;
	}
	}
	if (!HasVRegUse) {
	bool HasNonCopyUse = false;
	for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
	// Ignore copies.
	if (!MI.isCopyLike()) {
	HasNonCopyUse = true;
	break;
	}
	}
	if (!HasNonCopyUse)
	return false;
	}

	// Heuristics #3: If the common subexpression is used by PHIs, do not reuse
	// it unless the defined value is already used in the BB of the new use.
	bool HasPHI = false;
	for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) {
	HasPHI \|= UseMI.isPHI();
	if (UseMI.getParent() == MI->getParent())
	return true;
	}

	return !HasPHI;
	}

	/// Open a new value-numbering scope on VNT for MBB and record it in ScopeMap.
	/// The scope is heap-allocated; ExitScope deletes it.
	void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
	LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
	ScopeMap[MBB] = new ScopeType(VNT);
	}

	/// Destroy the scope recorded for MBB and remove its ScopeMap entry. MBB must
	/// have an entry in ScopeMap.
	void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
	LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
	DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
	assert(SI != ScopeMap.end());
	delete SI->second;
	ScopeMap.erase(SI);
	}

	/// Run CSE over the instructions of MBB. Each candidate is looked up in the
	/// scoped value table VNT; a hit that passes the physreg-safety and
	/// profitability checks is erased and its defs are replaced by the defs of
	/// the earlier instruction, otherwise the candidate is given a new value
	/// number. Returns true if the block was modified.
	bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
	bool Changed = false;

	SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
	SmallVector<unsigned, 2> ImplicitDefsToUpdate;
	SmallVector<unsigned, 2> ImplicitDefs;
	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
	// Advance the iterator first: MI may be erased below.
	MachineInstr *MI = &*I;
	++I;

	if (!isCSECandidate(MI))
	continue;

	bool FoundCSE = VNT.count(MI);
	if (!FoundCSE) {
	// Using trivial copy propagation to find more CSE opportunities.
	if (PerformTrivialCopyPropagation(MI, MBB)) {
	Changed = true;

	// After coalescing MI itself may become a copy.
	if (MI->isCopyLike())
	continue;

	// Try again to see if CSE is possible.
	FoundCSE = VNT.count(MI);
	}
	}

	// Commute commutable instructions.
	bool Commuted = false;
	if (!FoundCSE && MI->isCommutable()) {
	if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
	Commuted = true;
	FoundCSE = VNT.count(NewMI);
	if (NewMI != MI) {
	// New instruction. It doesn't need to be kept.
	NewMI->eraseFromParent();
	Changed = true;
	} else if (!FoundCSE)
	// MI was changed but it didn't help, commute it back!
	(void)TII->commuteInstruction(*MI);
	}
	}

	// If the instruction defines physical registers and the values may be
	// used, then it's not safe to replace it with a common subexpression.
	// It's also not safe if the instruction uses physical registers.
	bool CrossMBBPhysDef = false;
	SmallSet<unsigned, 8> PhysRefs;
	PhysDefVector PhysDefs;
	bool PhysUseDef = false;
	if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
	PhysDefs, PhysUseDef)) {
	FoundCSE = false;

	// ... Unless the CS is local or is in the sole predecessor block
	// and it also defines the physical register which is not clobbered
	// in between and the physical register uses were not clobbered.
	// This can never be the case if the instruction both uses and
	// defines the same physical register, which was detected above.
	if (!PhysUseDef) {
	unsigned CSVN = VNT.lookup(MI);
	MachineInstr *CSMI = Exps[CSVN];
	if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
	FoundCSE = true;
	}
	}

	if (!FoundCSE) {
	VNT.insert(MI, CurrVN++);
	Exps.push_back(MI);
	continue;
	}

	// Found a common subexpression, eliminate it.
	unsigned CSVN = VNT.lookup(MI);
	MachineInstr *CSMI = Exps[CSVN];
	LLVM_DEBUG(dbgs() << "Examining: " << *MI);
	LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);

	// Check if it's profitable to perform this CSE.
	bool DoCSE = true;
	unsigned NumDefs = MI->getNumDefs();

	for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
	MachineOperand &MO = MI->getOperand(i);
	if (!MO.isReg() || !MO.isDef())
	continue;
	unsigned OldReg = MO.getReg();
	unsigned NewReg = CSMI->getOperand(i).getReg();

	// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
	// we should make sure it is not dead at CSMI.
	if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
	ImplicitDefsToUpdate.push_back(i);

	// Keep track of implicit defs of CSMI and MI, to clear possibly
	// made-redundant kill flags.
	if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
	ImplicitDefs.push_back(OldReg);

	if (OldReg == NewReg) {
	--NumDefs;
	continue;
	}

	assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
	TargetRegisterInfo::isVirtualRegister(NewReg) &&
	"Do not CSE physical register defs!");

	if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
	LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
	DoCSE = false;
	break;
	}

	// Don't perform CSE if the result of the new instruction cannot exist
	// within the constraints (register class, bank, or low-level type) of
	// the old instruction.
	if (!MRI->constrainRegAttrs(NewReg, OldReg)) {
	LLVM_DEBUG(
	dbgs() << "*** Not the same register constraints, avoid CSE!\n");
	DoCSE = false;
	break;
	}

	CSEPairs.push_back(std::make_pair(OldReg, NewReg));
	--NumDefs;
	}

	// Actually perform the elimination.
	if (DoCSE) {
	for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
	unsigned OldReg = CSEPair.first;
	unsigned NewReg = CSEPair.second;
	// OldReg may have been unused but is used now, clear the Dead flag
	MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
	assert(Def != nullptr && "CSEd register has no unique definition?");
	Def->clearRegisterDeads(NewReg);
	// Replace with NewReg and clear kill flags which may be wrong now.
	MRI->replaceRegWith(OldReg, NewReg);
	MRI->clearKillFlags(NewReg);
	}

	// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
	// we should make sure it is not dead at CSMI.
	for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
	CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
	for (auto PhysDef : PhysDefs)
	if (!MI->getOperand(PhysDef.first).isDead())
	CSMI->getOperand(PhysDef.first).setIsDead(false);

	// Go through implicit defs of CSMI and MI, and clear the kill flags on
	// their uses in all the instructions between CSMI and MI.
	// We might have made some of the kill flags redundant, consider:
	// subs ... implicit-def %nzcv <- CSMI
	// csinc ... implicit killed %nzcv <- this kill flag isn't valid anymore
	// subs ... implicit-def %nzcv <- MI, to be eliminated
	// csinc ... implicit killed %nzcv
	// Since we eliminated MI, and reused a register imp-def'd by CSMI
	// (here %nzcv), that register, if it was killed before MI, should have
	// that kill flag removed, because it's lifetime was extended.
	if (CSMI->getParent() == MI->getParent()) {
	for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
	for (auto ImplicitDef : ImplicitDefs)
	if (MachineOperand *MO = II->findRegisterUseOperand(
	ImplicitDef, /*isKill=*/true, TRI))
	MO->setIsKill(false);
	} else {
	// If the instructions aren't in the same BB, bail out and clear the
	// kill flag on all uses of the imp-def'd register.
	for (auto ImplicitDef : ImplicitDefs)
	MRI->clearKillFlags(ImplicitDef);
	}

	if (CrossMBBPhysDef) {
	// Add physical register defs now coming in from a predecessor to MBB
	// livein list.
	while (!PhysDefs.empty()) {
	auto LiveIn = PhysDefs.pop_back_val();
	if (!MBB->isLiveIn(LiveIn.second))
	MBB->addLiveIn(LiveIn.second);
	}
	++NumCrossBBCSEs;
	}

	MI->eraseFromParent();
	++NumCSEs;
	if (!PhysRefs.empty())
	++NumPhysCSEs;
	if (Commuted)
	++NumCommutes;
	Changed = true;
	} else {
	VNT.insert(MI, CurrVN++);
	Exps.push_back(MI);
	}
	// Reset the per-instruction scratch lists for the next candidate.
	CSEPairs.clear();
	ImplicitDefsToUpdate.clear();
	ImplicitDefs.clear();
	}

	return Changed;
	}

	/// ExitScopeIfDone - Close the scope of the block belonging to Node once it
	/// has no open children left (a leaf qualifies immediately), then climb the
	/// dominator tree closing every ancestor whose open-children count drops to
	/// zero as a result.
	void
	MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
	DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) {
	// Children are still pending: nothing to close yet.
	if (OpenChildren[Node] != 0)
	return;

	// This node is finished.
	ExitScope(Node->getBlock());

	// Each finished node releases one slot in its immediate dominator; stop at
	// the first ancestor that still has pending children.
	for (MachineDomTreeNode *Ancestor = Node->getIDom(); Ancestor;
	Ancestor = Ancestor->getIDom()) {
	if (--OpenChildren[Ancestor] != 0)
	break;
	ExitScope(Ancestor->getBlock());
	}
	}

	/// Run CSE on every block of the dominator subtree rooted at Node. Blocks are
	/// visited in a DFS order of the dominator tree; a scope is entered for each
	/// block and exited once the block and all of its dominator-tree children
	/// have been processed. Returns true if any block was changed.
	bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
	SmallVector<MachineDomTreeNode*, 32> Scopes;
	SmallVector<MachineDomTreeNode*, 8> WorkList;
	DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;

	CurrVN = 0;

	// Perform a DFS walk to determine the order of visit.
	WorkList.push_back(Node);
	do {
	Node = WorkList.pop_back_val();
	Scopes.push_back(Node);
	const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
	OpenChildren[Node] = Children.size();
	for (MachineDomTreeNode *Child : Children)
	WorkList.push_back(Child);
	} while (!WorkList.empty());

	// Now perform CSE.
	bool Changed = false;
	for (MachineDomTreeNode *Node : Scopes) {
	MachineBasicBlock *MBB = Node->getBlock();
	EnterScope(MBB);
	Changed |= ProcessBlockCSE(MBB);
	// If it's a leaf node, it's done. Traverse upwards to pop ancestors.
	ExitScopeIfDone(Node, OpenChildren);
	}

	return Changed;
	}

	// We use stronger checks for PRE candidate rather than for CSE ones to embrace
	// checks inside ProcessBlockCSE(), not only inside isCSECandidate(). This helps
	// to exclude instrs created by PRE that won't be CSEed later.
	//
	// Returns true only for a CSE candidate that is duplicable, does not load, is
	// not as cheap as a move, has exactly one def (which is explicit), and whose
	// register defs and uses are all virtual registers.
	bool MachineCSE::isPRECandidate(MachineInstr *MI) {
	if (!isCSECandidate(MI) ||
	MI->isNotDuplicable() ||
	MI->mayLoad() ||
	MI->isAsCheapAsAMove() ||
	MI->getNumDefs() != 1 ||
	MI->getNumExplicitDefs() != 1)
	return false;

	// Bind by reference: there is no need to copy each operand.
	for (const MachineOperand &def : MI->defs())
	if (!TRI->isVirtualRegister(def.getReg()))
	return false;

	for (const MachineOperand &use : MI->uses())
	if (use.isReg() && !TRI->isVirtualRegister(use.getReg()))
	return false;

	return true;
	}

	/// Look in MBB for PRE candidates that match an expression already recorded
	/// in PREMap for another block. When the two blocks are potentially reachable
	/// from one another and hoisting is legal, beneficial and profitable, a copy
	/// of the instruction defining a fresh virtual register is inserted before
	/// the first terminator of their nearest common dominator. Returns true if an
	/// instruction was inserted.
	bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
	MachineBasicBlock *MBB) {
	bool Changed = false;
	for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
	MachineInstr *MI = &*I;
	++I;

	if (!isPRECandidate(MI))
	continue;

	// First sighting of this expression: just remember where it lives.
	if (!PREMap.count(MI)) {
	PREMap[MI] = MBB;
	continue;
	}

	auto MBB1 = PREMap[MI];
	assert(
	!DT->properlyDominates(MBB, MBB1) &&
	"MBB cannot properly dominate MBB1 while DFS through dominators tree!");
	auto CMBB = DT->findNearestCommonDominator(MBB, MBB1);
	if (!CMBB->isLegalToHoistInto())
	continue;

	+ if (!isBeneficalToHoistInto(CMBB, MBB, MBB1))
	+ continue;
	+
	// Two instrs are partial redundant if their basic blocks are reachable
	// from one to another but one doesn't dominate another.
	if (CMBB != MBB1) {
	auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock();
	if (BB != nullptr && BB1 != nullptr &&
	(isPotentiallyReachable(BB1, BB) ||
	isPotentiallyReachable(BB, BB1))) {

	assert(MI->getOperand(0).isDef() &&
	"First operand of instr with one explicit def must be this def");
	unsigned VReg = MI->getOperand(0).getReg();
	unsigned NewReg = MRI->cloneVirtualRegister(VReg);
	if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
	continue;
	MachineInstr &NewMI =
	TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
	NewMI.getOperand(0).setReg(NewReg);

	PREMap[MI] = CMBB;
	++NumPREs;
	Changed = true;
	}
	}
	}
	return Changed;
	}

	// This simple PRE (partial redundancy elimination) pass doesn't actually
	// eliminate partial redundancy but transforms it to full redundancy,
	// anticipating that the next CSE step will eliminate this created redundancy.
	// If CSE doesn't eliminate this, then the created instruction will remain dead
	// and be eliminated later by the Remove Dead Machine Instructions pass.
	//
	// Walks the dominator tree from its root using an explicit stack and runs
	// ProcessBlockPRE on every block. Returns true if any block was changed.
	bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
	SmallVector<MachineDomTreeNode *, 32> BBs;

	PREMap.clear();
	bool Changed = false;
	BBs.push_back(DT->getRootNode());
	do {
	auto Node = BBs.pop_back_val();
	const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
	for (MachineDomTreeNode *Child : Children)
	BBs.push_back(Child);

	MachineBasicBlock *MBB = Node->getBlock();
	Changed |= ProcessBlockPRE(DT, MBB);

	} while (!BBs.empty());

	return Changed;
	}

	+// Returns true when the function containing CandidateBB has the minsize
	+// attribute, or otherwise when the block frequency of CandidateBB does not
	+// exceed the sum of the block frequencies of MBB and MBB1. CandidateBB is
	+// asserted to dominate both MBB and MBB1.
	+bool MachineCSE::isBeneficalToHoistInto(MachineBasicBlock *CandidateBB,
	+ MachineBasicBlock *MBB,
	+ MachineBasicBlock *MBB1) {
	+ // Block frequencies are not consulted when optimizing for minimum size.
	+ if (CandidateBB->getParent()->getFunction().hasMinSize())
	+ return true;
	+ assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB");
	+ assert(DT->dominates(CandidateBB, MBB1) &&
	+ "CandidateBB should dominate MBB1");
	+ return MBFI->getBlockFreq(CandidateBB) <=
	+ MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1);
	+}
	+
	/// Pass entry point: caches the target hooks and analyses for MF, then runs
	/// the simple PRE step followed by CSE from the dominator tree root. Returns
	/// true if either step changed the function; returns false without doing any
	/// work when skipFunction says the function should be skipped.
	bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
	if (skipFunction(MF.getFunction()))
	return false;

	TII = MF.getSubtarget().getInstrInfo();
	TRI = MF.getSubtarget().getRegisterInfo();
	MRI = &MF.getRegInfo();
	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
	DT = &getAnalysis<MachineDominatorTree>();
	+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
	LookAheadLimit = TII->getMachineCSELookAheadLimit();
	bool ChangedPRE, ChangedCSE;
	// PRE runs first so that the CSE step can remove the redundancy it creates.
	ChangedPRE = PerformSimplePRE(DT);
	ChangedCSE = PerformCSE(DT->getRootNode());
	return ChangedPRE || ChangedCSE;
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/MachineModuleInfo.cpp (revision 351303)
	@@ -1,329 +1,329 @@
	//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/TinyPtrVector.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Target/TargetLoweringObjectFile.h"
	#include "llvm/Target/TargetMachine.h"
	#include <algorithm>
	#include <cassert>
	#include <memory>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::dwarf;

	// Handle the Pass registration stuff necessary to use DataLayout's.
	INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
	"Machine Module Information", false, false)
	// Storage for the pass identifier.
	char MachineModuleInfo::ID = 0;

	// Out of line virtual method.
	MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;

	namespace llvm {

	/// Value handle that forwards deletion and RAUW notifications for the value
	/// it tracks to an MMIAddrLabelMap.
	class MMIAddrLabelMapCallbackPtr final : CallbackVH {
	// Map to notify; must be set with setMap() before a callback fires, since
	// the callbacks dereference it unconditionally.
	MMIAddrLabelMap *Map = nullptr;

	public:
	MMIAddrLabelMapCallbackPtr() = default;
	MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {}

	/// Retarget this handle at BB.
	void setPtr(BasicBlock *BB) {
	ValueHandleBase::operator=(BB);
	}

	void setMap(MMIAddrLabelMap *map) { Map = map; }

	void deleted() override;
	void allUsesReplacedWith(Value *V2) override;
	};

	/// Tracks the MCSymbols created for address-taken basic blocks and keeps the
	/// mapping up to date when such a block is deleted or RAUW'd.
	class MMIAddrLabelMap {
	MCContext &Context;
	struct AddrLabelSymEntry {
	/// The symbols for the label.
	TinyPtrVector<MCSymbol *> Symbols;

	Function *Fn; // The containing function of the BasicBlock.
	unsigned Index; // The index in BBCallbacks for the BasicBlock.
	};

	DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;

	/// Callbacks for the BasicBlock's that we have entries for. We use this so
	/// we get notified if a block is deleted or RAUWd.
	std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;

	/// This is a per-function list of symbols whose corresponding BasicBlock got
	/// deleted. These symbols need to be emitted at some point in the file, so
	/// AsmPrinter emits them after the function body.
	DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>
	DeletedAddrLabelsNeedingEmission;

	public:
	MMIAddrLabelMap(MCContext &context) : Context(context) {}

	~MMIAddrLabelMap() {
	assert(DeletedAddrLabelsNeedingEmission.empty() &&
	"Some labels for deleted blocks never got emitted");
	}

	/// Return the symbols for BB, creating one on first request.
	ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(BasicBlock *BB);

	/// Move any queued symbols of deleted blocks of F into Result.
	void takeDeletedSymbolsForFunction(Function *F,
	std::vector<MCSymbol*> &Result);

	void UpdateForDeletedBlock(BasicBlock *BB);
	void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
	};

	} // end namespace llvm

	/// Return the symbols associated with the address-taken block BB. On the
	/// first request a temporary symbol is created and a callback handle is
	/// registered so the map is notified if BB is deleted or RAUW'd.
	ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
	assert(BB->hasAddressTaken() &&
	"Shouldn't get label for block without address taken");
	AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];

	// If we already had an entry for this block, just return it.
	if (!Entry.Symbols.empty()) {
	assert(BB->getParent() == Entry.Fn && "Parent changed");
	return Entry.Symbols;
	}

	// Otherwise, this is a new entry, create a new symbol for it and add an
	// entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
	BBCallbacks.emplace_back(BB);
	BBCallbacks.back().setMap(this);
	Entry.Index = BBCallbacks.size() - 1;
	Entry.Fn = BB->getParent();
	- Entry.Symbols.push_back(Context.createTempSymbol());
	+ Entry.Symbols.push_back(Context.createTempSymbol(!BB->hasAddressTaken()));
	return Entry.Symbols;
	}

	/// If we have any deleted symbols for F, return them.
	void MMIAddrLabelMap::
	takeDeletedSymbolsForFunction(Function F, std::vector<MCSymbol> &Result) {
	DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>::iterator I =
	DeletedAddrLabelsNeedingEmission.find(F);

	// If there are no entries for the function, just return.
	if (I == DeletedAddrLabelsNeedingEmission.end()) return;

	// Otherwise, take the list.
	std::swap(Result, I->second);
	DeletedAddrLabelsNeedingEmission.erase(I);
	}

	/// Handle deletion of BB: drop its entry and callback, and queue its symbols
	/// for emission with the containing function, stopping at the first symbol
	/// that is already defined.
	void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
	// If the block got deleted, there is no need for the symbol. If the symbol
	// was already emitted, we can just forget about it, otherwise we need to
	// queue it up for later emission when the function is output.
	AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]);
	AddrLabelSymbols.erase(BB);
	assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?");
	BBCallbacks[Entry.Index] = nullptr; // Clear the callback.

	assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) &&
	"Block/parent mismatch");

	for (MCSymbol *Sym : Entry.Symbols) {
	if (Sym->isDefined())
	return;

	// If the block is not yet defined, we need to emit it at the end of the
	// function. Add the symbol to the DeletedAddrLabelsNeedingEmission list
	// for the containing Function. Since the block is being deleted, its
	// parent may already be removed, we have to get the function from 'Entry'.
	DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
	}
	}

	void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock Old, BasicBlock New) {
	// Get the entry for the RAUW'd block and remove it from our map.
	AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]);
	AddrLabelSymbols.erase(Old);
	assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?");

	AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];

	// If New is not address taken, just move our symbol over to it.
	if (NewEntry.Symbols.empty()) {
	BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback.
	NewEntry = std::move(OldEntry); // Set New's entry.
	return;
	}

	BBCallbacks[OldEntry.Index] = nullptr; // Update the callback.

	// Otherwise, we need to add the old symbols to the new block's set.
	NewEntry.Symbols.insert(NewEntry.Symbols.end(), OldEntry.Symbols.begin(),
	OldEntry.Symbols.end());
	}

	/// Value-handle hook: forward deletion of the tracked block to the owning map.
	void MMIAddrLabelMapCallbackPtr::deleted() {
	  auto *DeadBB = cast<BasicBlock>(getValPtr());
	  Map->UpdateForDeletedBlock(DeadBB);
	}

	/// Value-handle hook: tell the owning map that the tracked block has been
	/// replaced by \p V2, which must itself be a BasicBlock.
	void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
	  auto *OldBB = cast<BasicBlock>(getValPtr());
	  auto *NewBB = cast<BasicBlock>(V2);
	  Map->UpdateForRAUWBlock(OldBB, NewBB);
	}

	/// Construct the pass for target machine \p TM. The MCContext member is built
	/// from the target's MCAsmInfo, MCRegisterInfo and object-file lowering, and
	/// the pass is registered with the global PassRegistry.
	MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
	: ImmutablePass(ID), TM(*TM),
	Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
	TM->getObjFileLowering(), nullptr, false) {
	initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
	}

	// Nothing to release here; doFinalization() deletes the raw-pointer members.
	MachineModuleInfo::~MachineModuleInfo() = default;

	/// Reset the per-module state for \p M. Always returns false.
	bool MachineModuleInfo::doInitialization(Module &M) {
	  // Nothing is allocated yet for this module.
	  ObjFileMMI = nullptr;
	  CurCallSite = 0;

	  // Clear every module-level flag.
	  UsesMorestackAddr = false;
	  UsesMSVCFloatingPoint = false;
	  HasNosplitStack = false;
	  HasSplitStack = false;

	  // The address-label map is created lazily on first use.
	  AddrLabelSymbols = nullptr;

	  TheModule = &M;
	  const bool HasNoCompileUnits = llvm::empty(M.debug_compile_units());
	  DbgInfoAvailable = !HasNoCompileUnits;
	  return false;
	}

	/// Tear down the per-module state: forget personalities, free the
	/// address-label map, reset the MCContext and free the object-file specific
	/// info. Always returns false.
	bool MachineModuleInfo::doFinalization(Module &M) {
	Personalities.clear();

	// The label map was constructed from Context, so it is released before the
	// context is reset.
	delete AddrLabelSymbols;
	AddrLabelSymbols = nullptr;

	Context.reset();

	delete ObjFileMMI;
	ObjFileMMI = nullptr;

	return false;
	}

	//===- Address of Block Management ----------------------------------------===//

	/// Return the label symbols for address-taken block \p BB, creating the
	/// address-label map on first use.
	ArrayRef<MCSymbol *>
	MachineModuleInfo::getAddrLabelSymbolToEmit(const BasicBlock *BB) {
	  if (AddrLabelSymbols == nullptr)
	    AddrLabelSymbols = new MMIAddrLabelMap(Context);

	  // The map's interface takes a non-const block.
	  BasicBlock *MutableBB = const_cast<BasicBlock*>(BB);
	  return AddrLabelSymbols->getAddrLabelSymbolToEmit(MutableBB);
	}

	void MachineModuleInfo::
	takeDeletedSymbolsForFunction(const Function *F,
	std::vector<MCSymbol*> &Result) {
	// If no blocks have had their addresses taken, we're done.
	if (!AddrLabelSymbols) return;
	return AddrLabelSymbols->
	takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
	}

	/// \name Exception Handling
	/// \{

	/// Record \p Personality unless it is already in the list.
	void MachineModuleInfo::addPersonality(const Function *Personality) {
	  for (const auto &Known : Personalities) {
	    if (Known == Personality)
	      return;
	  }
	  Personalities.push_back(Personality);
	}

	/// \}

	/// Look up the MachineFunction recorded for \p F; returns nullptr when there
	/// is no map entry for it.
	MachineFunction *
	MachineModuleInfo::getMachineFunction(const Function &F) const {
	  auto Found = MachineFunctions.find(&F);
	  if (Found == MachineFunctions.end())
	    return nullptr;
	  return Found->second.get();
	}

	/// Return the MachineFunction for \p F, building it on first request. The
	/// most recent request/result pair is cached to make repeated queries cheap.
	MachineFunction &
	MachineModuleInfo::getOrCreateMachineFunction(const Function &F) {
	  // Fast path: consecutive MachineFunctionPasses usually ask for the same
	  // Function.
	  if (LastRequest == &F)
	    return *LastResult;

	  auto Inserted = MachineFunctions.insert(
	      std::make_pair(&F, std::unique_ptr<MachineFunction>()));
	  auto &Slot = Inserted.first->second;

	  // A fresh map entry is still empty; fill it with a new machine function.
	  if (Inserted.second) {
	    const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
	    Slot.reset(new MachineFunction(F, TM, STI, NextFnNum++, *this));
	  }

	  MachineFunction *MF = Slot.get();
	  LastRequest = &F;
	  LastResult = MF;
	  return *MF;
	}

	/// Drop the map entry for \p F and invalidate the one-entry lookup cache used
	/// by getOrCreateMachineFunction().
	void MachineModuleInfo::deleteMachineFunctionFor(Function &F) {
	MachineFunctions.erase(&F);
	// The cache is cleared unconditionally, whichever function it referred to.
	LastRequest = nullptr;
	LastResult = nullptr;
	}

	namespace {

	/// Function pass that asks MachineModuleInfo to drop the MachineFunction
	/// belonging to the Function it runs on.
	class FreeMachineFunction : public FunctionPass {
	public:
	  static char ID;

	  FreeMachineFunction() : FunctionPass(ID) {}

	  StringRef getPassName() const override { return "Free MachineFunction"; }

	  /// MachineModuleInfo is both needed and kept alive by this pass.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.addRequired<MachineModuleInfo>();
	    AU.addPreserved<MachineModuleInfo>();
	  }

	  /// Always reports a change.
	  bool runOnFunction(Function &F) override {
	    getAnalysis<MachineModuleInfo>().deleteMachineFunctionFor(F);
	    return true;
	  }
	};

	} // end anonymous namespace

	// Storage for the pass ID that the FreeMachineFunction constructor hands to
	// FunctionPass.
	char FreeMachineFunction::ID;

	/// Factory for the FreeMachineFunction pass. The pass is allocated with new
	/// and returned as a raw pointer.
	FunctionPass *llvm::createFreeMachineFunctionPass() {
	return new FreeMachineFunction();
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp (revision 351303)
	@@ -1,1023 +1,1033 @@
	//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements the ScheduleDAGSDNodes class, which is a base class used by
	// scheduling implementation classes.
	//
	//===----------------------------------------------------------------------===//

	#include "ScheduleDAGSDNodes.h"
	#include "InstrEmitter.h"
	#include "SDNodeDbgValue.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/Config/llvm-config.h"
	#include "llvm/MC/MCInstrItineraries.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	using namespace llvm;

	#define DEBUG_TYPE "pre-RA-sched"

	STATISTIC(LoadsClustered, "Number of loads clustered together");

	// This allows the latency-based scheduler to notice high latency instructions
	// without a target itinerary. The choice of number here has more to do with
	// balancing scheduler heuristics than with the actual machine latency.
	//
	// Note: the two description literals are concatenated by the compiler, so the
	// first one must end with a space.
	static cl::opt<int> HighLatencyCycles(
	  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
	  cl::desc("Roughly estimate the number of cycles that 'long latency' "
	           "instructions take for targets with no itinerary"));

	/// Construct a scheduler for \p mf. BB and DAG start out null and are set by
	/// Run(); the itinerary data is taken from the function's subtarget.
	ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
	: ScheduleDAG(mf), BB(nullptr), DAG(nullptr),
	InstrItins(mf.getSubtarget().getInstrItineraryData()) {}

	/// Run - perform scheduling.
	///
	void ScheduleDAGSDNodes::Run(SelectionDAG dag, MachineBasicBlock bb) {
	BB = bb;
	DAG = dag;

	// Clear the scheduler's SUnit DAG.
	ScheduleDAG::clearDAG();
	Sequence.clear();

	// Invoke the target's selection of scheduler.
	Schedule();
	}

	/// newSUnit - Create a new SUnit for \p N (which may be null) and return a
	/// pointer to it.
	///
	/// The SUnit is appended to SUnits; debug builds assert that the append did
	/// not move the vector's storage.
	SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) {
	#ifndef NDEBUG
	  const SUnit *Addr = nullptr;
	  if (!SUnits.empty())
	    Addr = &SUnits[0];
	#endif
	  SUnits.emplace_back(N, (unsigned)SUnits.size());
	  assert((Addr == nullptr || Addr == &SUnits[0]) &&
	         "SUnits std::vector reallocated on the fly!");
	  SUnit *SU = &SUnits.back();
	  SU->OrigNode = SU;

	  const TargetLowering &TLI = DAG->getTargetLoweringInfo();
	  // Null nodes and IMPLICIT_DEF get no scheduling preference; everything else
	  // asks the target.
	  if (!N ||
	      (N->isMachineOpcode() &&
	       N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF))
	    SU->SchedulingPref = Sched::None;
	  else
	    SU->SchedulingPref = TLI.getSchedulingPreference(N);
	  return SU;
	}

	/// Create a new SUnit for the same node as \p Old, copy \p Old's scheduling
	/// attributes into it, and mark \p Old as cloned.
	SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
	  SUnit *SU = newSUnit(Old->getNode());
	  // The clone shares the original's OrigNode rather than pointing at itself.
	  SU->OrigNode = Old->OrigNode;
	  SU->Latency = Old->Latency;
	  SU->isVRegCycle = Old->isVRegCycle;
	  SU->isCall = Old->isCall;
	  SU->isCallOp = Old->isCallOp;
	  SU->isTwoAddress = Old->isTwoAddress;
	  SU->isCommutable = Old->isCommutable;
	  SU->hasPhysRegDefs = Old->hasPhysRegDefs;
	  SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
	  SU->isScheduleHigh = Old->isScheduleHigh;
	  SU->isScheduleLow = Old->isScheduleLow;
	  SU->SchedulingPref = Old->SchedulingPref;
	  Old->isCloned = true;
	  return SU;
	}

	/// CheckForPhysRegDependency - Check if the dependency between def and use of
	/// a specified operand is a physical register dependency. If so, returns the
	/// register and the cost of copying the register.
	static void CheckForPhysRegDependency(SDNode Def, SDNode User, unsigned Op,
	const TargetRegisterInfo *TRI,
	const TargetInstrInfo *TII,
	unsigned &PhysReg, int &Cost) {
	if (Op != 2 \|\| User->getOpcode() != ISD::CopyToReg)
	return;

	unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	return;

	unsigned ResNo = User->getOperand(2).getResNo();
	if (Def->getOpcode() == ISD::CopyFromReg &&
	cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) {
	PhysReg = Reg;
	} else if (Def->isMachineOpcode()) {
	const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
	if (ResNo >= II.getNumDefs() &&
	II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
	PhysReg = Reg;
	}

	if (PhysReg != 0) {
	const TargetRegisterClass *RC =
	TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo));
	Cost = RC->getCopyCost();
	}
	}

	// Helper for AddGlue to clone node operands.
	static void CloneNodeWithValues(SDNode N, SelectionDAG DAG, ArrayRef<EVT> VTs,
	SDValue ExtraOper = SDValue()) {
	SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
	if (ExtraOper.getNode())
	Ops.push_back(ExtraOper);

	SDVTList VTList = DAG->getVTList(VTs);
	MachineSDNode *MN = dyn_cast<MachineSDNode>(N);

	// Store memory references.
	SmallVector<MachineMemOperand *, 2> MMOs;
	if (MN)
	MMOs.assign(MN->memoperands_begin(), MN->memoperands_end());

	DAG->MorphNodeTo(N, N->getOpcode(), VTList, Ops);

	// Reset the memory references
	if (MN)
	DAG->setNodeMemRefs(MN, MMOs);
	}

	static bool AddGlue(SDNode N, SDValue Glue, bool AddGlue, SelectionDAG DAG) {
	SDNode *GlueDestNode = Glue.getNode();

	// Don't add glue from a node to itself.
	if (GlueDestNode == N) return false;

	// Don't add a glue operand to something that already uses glue.
	if (GlueDestNode &&
	N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
	return false;
	}
	// Don't add glue to something that already has a glue value.
	if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;

	SmallVector<EVT, 4> VTs(N->value_begin(), N->value_end());
	if (AddGlue)
	VTs.push_back(MVT::Glue);

	CloneNodeWithValues(N, DAG, VTs, Glue);

	return true;
	}

	// Cleanup after unsuccessful AddGlue. Use the standard method of morphing the
	// node even though simply shrinking the value list is sufficient.
	static void RemoveUnusedGlue(SDNode N, SelectionDAG DAG) {
	assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue &&
	!N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
	"expected an unused glue value");

	CloneNodeWithValues(N, DAG,
	makeArrayRef(N->value_begin(), N->getNumValues() - 1));
	}

	/// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
	/// This function finds loads of the same base and different offsets. If the
	/// offsets are not far apart (target specific), it adds MVT::Glue inputs and
	/// outputs to ensure they are scheduled together and in order. This
	/// optimization may benefit some targets by improving cache locality.
	void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
	  // Only loads whose last operand is a chain are considered.
	  SDNode *Chain = nullptr;
	  unsigned NumOps = Node->getNumOperands();
	  if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
	    Chain = Node->getOperand(NumOps-1).getNode();
	  if (!Chain)
	    return;

	  // Skip any load instruction that has a tied input. There may be an additional
	  // dependency requiring a different order than by increasing offsets, and the
	  // added glue may introduce a cycle.
	  auto hasTiedInput = [this](const SDNode *N) {
	    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
	    for (unsigned I = 0; I != MCID.getNumOperands(); ++I) {
	      if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
	        return true;
	    }

	    return false;
	  };

	  // Look for other loads of the same chain. Find loads that are loading from
	  // the same base pointer and different offsets.
	  SmallPtrSet<SDNode*, 16> Visited;
	  SmallVector<int64_t, 4> Offsets;
	  DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.
	  bool Cluster = false;
	  SDNode *Base = Node;

	  if (hasTiedInput(Base))
	    return;

	  // This algorithm requires a reasonably low use count before finding a match
	  // to avoid uselessly blowing up compile time in large blocks.
	  unsigned UseCount = 0;
	  for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
	       I != E && UseCount < 100; ++I, ++UseCount) {
	    SDNode *User = *I;
	    if (User == Node || !Visited.insert(User).second)
	      continue;
	    int64_t Offset1, Offset2;
	    if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
	        Offset1 == Offset2 ||
	        hasTiedInput(User)) {
	      // FIXME: Should be ok if the addresses are identical. But earlier
	      // optimizations really should have eliminated one of the loads.
	      continue;
	    }
	    if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
	      Offsets.push_back(Offset1);
	    O2SMap.insert(std::make_pair(Offset2, User));
	    Offsets.push_back(Offset2);
	    // Keep the lowest-offset load seen so far as the comparison base.
	    if (Offset2 < Offset1)
	      Base = User;
	    Cluster = true;
	    // Reset UseCount to allow more matches.
	    UseCount = 0;
	  }

	  if (!Cluster)
	    return;

	  // Sort them in increasing order.
	  llvm::sort(Offsets);

	  // Check if the loads are close enough.
	  SmallVector<SDNode*, 4> Loads;
	  unsigned NumLoads = 0;
	  int64_t BaseOff = Offsets[0];
	  SDNode *BaseLoad = O2SMap[BaseOff];
	  Loads.push_back(BaseLoad);
	  for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
	    int64_t Offset = Offsets[i];
	    SDNode *Load = O2SMap[Offset];
	    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
	      break; // Stop right here. Ignore loads that are further away.
	    Loads.push_back(Load);
	    ++NumLoads;
	  }

	  if (NumLoads == 0)
	    return;

	  // Cluster loads by adding MVT::Glue outputs and inputs. This also
	  // ensure they are scheduled in order of increasing addresses.
	  SDNode *Lead = Loads[0];
	  SDValue InGlue = SDValue(nullptr, 0);
	  if (AddGlue(Lead, InGlue, true, DAG))
	    InGlue = SDValue(Lead, Lead->getNumValues() - 1);
	  for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
	    bool OutGlue = I < E - 1;
	    SDNode *Load = Loads[I];

	    // If AddGlue fails, the previous load could be left with an unused glue
	    // value. That is cleaned up below when this was the last load of the
	    // cluster.
	    if (AddGlue(Load, InGlue, OutGlue, DAG)) {
	      if (OutGlue)
	        InGlue = SDValue(Load, Load->getNumValues() - 1);

	      ++LoadsClustered;
	    }
	    else if (!OutGlue && InGlue.getNode())
	      RemoveUnusedGlue(InGlue.getNode(), DAG);
	  }
	}

	/// ClusterNodes - Cluster certain nodes which should be scheduled together.
	///
	/// Currently this only looks at machine nodes that may load.
	void ScheduleDAGSDNodes::ClusterNodes() {
	  for (SDNode &NI : DAG->allnodes()) {
	    // NI is a reference, so its address is never null; only the opcode kind
	    // needs checking.
	    SDNode *Node = &NI;
	    if (!Node->isMachineOpcode())
	      continue;

	    unsigned Opc = Node->getMachineOpcode();
	    const MCInstrDesc &MCID = TII->get(Opc);
	    if (MCID.mayLoad())
	      // Cluster loads from "near" addresses into combined SUnits.
	      ClusterNeighboringLoads(Node);
	  }
	}

	/// Create the SUnits for the current DAG. Starting from the DAG root, every
	/// non-passive node reachable through operands gets an SUnit; nodes connected
	/// by glue share one SUnit. Each node's NodeId is set to the index of its
	/// SUnit. Afterwards, the sources of CopyToReg nodes glued into call SUnits
	/// are flagged as call operands.
	void ScheduleDAGSDNodes::BuildSchedUnits() {
	// During scheduling, the NodeId field of SDNode is used to map SDNodes
	// to their associated SUnits by holding SUnits table indices. A value
	// of -1 means the SDNode does not yet have an associated SUnit.
	unsigned NumNodes = 0;
	for (SDNode &NI : DAG->allnodes()) {
	NI.setNodeId(-1);
	++NumNodes;
	}

	// Reserve entries in the vector for each of the SUnits we are creating. This
	// ensure that reallocation of the vector won't happen, so SUnit*'s won't get
	// invalidated.
	// FIXME: Multiply by 2 because we may clone nodes during scheduling.
	// This is a temporary workaround.
	SUnits.reserve(NumNodes * 2);

	// Add all nodes in depth first order.
	SmallVector<SDNode*, 64> Worklist;
	SmallPtrSet<SDNode*, 32> Visited;
	Worklist.push_back(DAG->getRoot().getNode());
	Visited.insert(DAG->getRoot().getNode());

	// SUnits found to contain a call; processed after the main walk.
	SmallVector<SUnit*, 8> CallSUnits;
	while (!Worklist.empty()) {
	SDNode *NI = Worklist.pop_back_val();

	// Add all operands to the worklist unless they've already been added.
	for (const SDValue &Op : NI->op_values())
	if (Visited.insert(Op.getNode()).second)
	Worklist.push_back(Op.getNode());

	if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate.
	continue;

	// If this node has already been processed, stop now.
	if (NI->getNodeId() != -1) continue;

	SUnit *NodeSUnit = newSUnit(NI);

	// See if anything is glued to this node, if so, add them to glued
	// nodes. Nodes can have at most one glue input and one glue output. Glue
	// is required to be the last operand and result of a node.

	// Scan up to find glued preds.
	SDNode *N = NI;
	while (N->getNumOperands() &&
	N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
	N = N->getOperand(N->getNumOperands()-1).getNode();
	assert(N->getNodeId() == -1 && "Node already inserted!");
	N->setNodeId(NodeSUnit->NodeNum);
	if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
	NodeSUnit->isCall = true;
	}

	// Scan down to find any glued succs.
	N = NI;
	while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
	SDValue GlueVal(N, N->getNumValues()-1);

	// There are either zero or one users of the Glue result.
	bool HasGlueUse = false;
	for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
	UI != E; ++UI)
	if (GlueVal.isOperandOf(*UI)) {
	HasGlueUse = true;
	assert(N->getNodeId() == -1 && "Node already inserted!");
	N->setNodeId(NodeSUnit->NodeNum);
	// Continue the downward scan from the glue user.
	N = *UI;
	if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
	NodeSUnit->isCall = true;
	break;
	}
	if (!HasGlueUse) break;
	}

	if (NodeSUnit->isCall)
	CallSUnits.push_back(NodeSUnit);

	// Schedule zero-latency TokenFactor below any nodes that may increase the
	// schedule height. Otherwise, ancestors of the TokenFactor may appear to
	// have false stalls.
	if (NI->getOpcode() == ISD::TokenFactor)
	NodeSUnit->isScheduleLow = true;

	// If there are glue operands involved, N is now the bottom-most node
	// of the sequence of nodes that are glued together.
	// Update the SUnit.
	NodeSUnit->setNode(N);
	assert(N->getNodeId() == -1 && "Node already inserted!");
	N->setNodeId(NodeSUnit->NodeNum);

	// Compute NumRegDefsLeft. This must be done before AddSchedEdges.
	InitNumRegDefsLeft(NodeSUnit);

	// Assign the Latency field of NodeSUnit using target-provided information.
	computeLatency(NodeSUnit);
	}

	// Find all call operands.
	while (!CallSUnits.empty()) {
	SUnit *SU = CallSUnits.pop_back_val();
	// Walk the SUnit's glue chain looking for CopyToReg nodes.
	for (const SDNode *SUNode = SU->getNode(); SUNode;
	SUNode = SUNode->getGluedNode()) {
	if (SUNode->getOpcode() != ISD::CopyToReg)
	continue;
	// Operand 2 of CopyToReg is the value being copied.
	SDNode *SrcN = SUNode->getOperand(2).getNode();
	if (isPassiveNode(SrcN)) continue; // Not scheduled.
	SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
	SrcSU->isCallOp = true;
	}
	}
	}

	/// Add predecessor edges between the SUnits built by BuildSchedUnits() and
	/// set the per-SUnit flags that depend on the instruction description
	/// (two-address, commutable, physreg defs/clobbers).
	void ScheduleDAGSDNodes::AddSchedEdges() {
	  const TargetSubtargetInfo &ST = MF.getSubtarget();

	  // Check to see if the scheduler cares about latencies.
	  bool UnitLatencies = forceUnitLatencies();

	  // Pass 2: add the preds, succs, etc.
	  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
	    SUnit *SU = &SUnits[su];
	    SDNode *MainNode = SU->getNode();

	    if (MainNode->isMachineOpcode()) {
	      unsigned Opc = MainNode->getMachineOpcode();
	      const MCInstrDesc &MCID = TII->get(Opc);
	      // Any tied operand makes this a two-address instruction.
	      for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
	        if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
	          SU->isTwoAddress = true;
	          break;
	        }
	      }
	      if (MCID.isCommutable())
	        SU->isCommutable = true;
	    }

	    // Find all predecessors and successors of the group.
	    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
	      if (N->isMachineOpcode() &&
	          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
	        SU->hasPhysRegClobbers = true;
	        unsigned NumUsed = InstrEmitter::CountResults(N);
	        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
	          --NumUsed;    // Skip over unused values at the end.
	        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
	          SU->hasPhysRegDefs = true;
	      }

	      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	        SDNode *OpN = N->getOperand(i).getNode();
	        if (isPassiveNode(OpN)) continue;   // Not scheduled.
	        SUnit *OpSU = &SUnits[OpN->getNodeId()];
	        assert(OpSU && "Node has no SUnit!");
	        if (OpSU == SU) continue;           // In the same group.

	        EVT OpVT = N->getOperand(i).getValueType();
	        assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
	        bool isChain = OpVT == MVT::Other;

	        unsigned PhysReg = 0;
	        int Cost = 1;
	        // Determine if this is a physical register dependency.
	        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
	        assert((PhysReg == 0 || !isChain) &&
	               "Chain dependence via physreg data?");
	        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
	        // emits a copy from the physical register to a virtual register unless
	        // it requires a cross class copy (cost < 0). That means we are only
	        // treating "expensive to copy" register dependency as physical register
	        // dependency. This may change in the future though.
	        if (Cost >= 0 && !StressSched)
	          PhysReg = 0;

	        // If this is a ctrl dep, latency is 1.
	        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
	        // Special-case TokenFactor chains as zero-latency.
	        if(isChain && OpN->getOpcode() == ISD::TokenFactor)
	          OpLatency = 0;

	        SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
	          : SDep(OpSU, SDep::Data, PhysReg);
	        Dep.setLatency(OpLatency);
	        if (!isChain && !UnitLatencies) {
	          computeOperandLatency(OpN, N, i, Dep);
	          ST.adjustSchedDependency(OpSU, SU, Dep);
	        }

	        if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
	          // Multiple register uses are combined in the same SUnit. For example,
	          // we could have a set of glued nodes with all their defs consumed by
	          // another set of glued nodes. Register pressure tracking sees this as
	          // a single use, so to keep pressure balanced we reduce the defs.
	          //
	          // We can't tell (without more book-keeping) if this results from
	          // glued nodes or duplicate operands. As long as we don't reduce
	          // NumRegDefsLeft to zero, we handle the common cases well.
	          --OpSU->NumRegDefsLeft;
	        }
	      }
	    }
	  }
	}

	/// BuildSchedGraph - Build the SUnit graph from the selection dag that we
	/// are input. This SUnit graph is similar to the SelectionDAG, but
	/// excludes nodes that aren't interesting to scheduling, and represents
	/// glued together nodes with a single SUnit.
	///
	/// The three phases run in a fixed order: clustering adds glue, which the
	/// SUnit construction then groups on, and edges are added last. \p AA is not
	/// used in this function body.
	void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
	// Cluster certain nodes which should be scheduled together.
	ClusterNodes();
	// Populate the SUnits array.
	BuildSchedUnits();
	// Compute all the scheduling dependencies between nodes.
	AddSchedEdges();
	}

	// Set NodeNumDefs for the node the iterator currently points at. DefIdx is
	// only rewound on the final path, i.e. for ordinary machine nodes.
	void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
	  // A null node is a phys reg copy; leave the counters alone.
	  if (!Node)
	    return;

	  // Among non-machine nodes only CopyFromReg defines a register.
	  if (!Node->isMachineOpcode()) {
	    NodeNumDefs = (Node->getOpcode() == ISD::CopyFromReg) ? 1 : 0;
	    return;
	  }

	  const unsigned POpc = Node->getMachineOpcode();

	  // No register need be allocated for an IMPLICIT_DEF.
	  const bool IsImplicitDef = POpc == TargetOpcode::IMPLICIT_DEF;
	  // PATCHPOINT is defined to have one result, but it might really have none
	  // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
	  // real definition.
	  const bool IsChainOnlyPatchpoint =
	      POpc == TargetOpcode::PATCHPOINT && Node->getValueType(0) == MVT::Other;
	  if (IsImplicitDef || IsChainOnlyPatchpoint) {
	    NodeNumDefs = 0;
	    return;
	  }

	  // Some instructions define regs that are not represented in the selection DAG
	  // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
	  unsigned NRegDefs = SchedDAG->TII->get(POpc).getNumDefs();
	  NodeNumDefs = std::min(Node->getNumValues(), NRegDefs);
	  DefIdx = 0;
	}

	// Construct a RegDefIter for this SUnit and find the first valid value.
	// The iterator starts at SU's main node; Advance() then moves it to the first
	// used register def, following the glue chain if needed.
	ScheduleDAGSDNodes::RegDefIter::RegDefIter(const SUnit *SU,
	const ScheduleDAGSDNodes *SD)
	: SchedDAG(SD), Node(SU->getNode()), DefIdx(0), NodeNumDefs(0) {
	InitNodeNumDefs();
	Advance();
	}

	// Move to the next register def of the SUnit that has at least one use.
	// On success ValueType holds its type and DefIdx points just past it; when
	// the glue chain is exhausted Node becomes null.
	void ScheduleDAGSDNodes::RegDefIter::Advance() {
	  while (Node) { // Visit all glued nodes.
	    while (DefIdx < NodeNumDefs) {
	      const unsigned Candidate = DefIdx++;
	      if (Node->hasAnyUseOfValue(Candidate)) {
	        ValueType = Node->getSimpleValueType(Candidate);
	        return; // Found a normal regdef.
	      }
	    }

	    // This node is done; continue with the next glued node, if any.
	    Node = Node->getGluedNode();
	    if (Node)
	      InitNodeNumDefs();
	  }
	}

	/// Count the register defs that RegDefIter yields for \p SU and store the
	/// total in SU->NumRegDefsLeft, which must be zero on entry.
	void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
	  assert(SU->NumRegDefsLeft == 0 && "expect a new node");

	  RegDefIter DefIt(SU, this);
	  while (DefIt.IsValid()) {
	    assert(SU->NumRegDefsLeft < USHRT_MAX && "overflow is ok but unexpected");
	    ++SU->NumRegDefsLeft;
	    DefIt.Advance();
	  }
	}

	/// Set SU->Latency. TokenFactor gets 0; unit-latency schedulers get 1;
	/// without itinerary data the result is 1 or HighLatencyCycles; otherwise it
	/// is the sum of the itinerary latencies of all machine nodes glued into
	/// \p SU.
	void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
	  SDNode *N = SU->getNode();

	  // TokenFactor operands are considered zero latency, and some schedulers
	  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
	  // whenever node latency is nonzero.
	  if (N && N->getOpcode() == ISD::TokenFactor) {
	    SU->Latency = 0;
	    return;
	  }

	  // Check to see if the scheduler cares about latencies.
	  if (forceUnitLatencies()) {
	    SU->Latency = 1;
	    return;
	  }

	  if (!InstrItins || InstrItins->isEmpty()) {
	    if (N && N->isMachineOpcode() &&
	        TII->isHighLatencyDef(N->getMachineOpcode()))
	      SU->Latency = HighLatencyCycles;
	    else
	      SU->Latency = 1;
	    return;
	  }

	  // Compute the latency for the node. We use the sum of the latencies for
	  // all nodes glued together into this SUnit.
	  SU->Latency = 0;
	  for (SDNode *Glued = N; Glued; Glued = Glued->getGluedNode())
	    if (Glued->isMachineOpcode())
	      SU->Latency += TII->getInstrLatency(InstrItins, Glued);
	}

	void ScheduleDAGSDNodes::computeOperandLatency(SDNode Def, SDNode Use,
	unsigned OpIdx, SDep& dep) const{
	// Check to see if the scheduler cares about latencies.
	if (forceUnitLatencies())
	return;

	if (dep.getKind() != SDep::Data)
	return;

	unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
	if (Use->isMachineOpcode())
	// Adjust the use operand index by num of defs.
	OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
	int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
	if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
	!BB->succ_empty()) {
	unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	// This copy is a liveout value. It is likely coalesced, so reduce the
	// latency so not to penalize the def.
	// FIXME: need target specific adjustment here?
	Latency = (Latency > 1) ? Latency - 1 : 1;
	}
	if (Latency >= 0)
	dep.setLatency(Latency);
	}

	void ScheduleDAGSDNodes::dumpNode(const SUnit &SU) const {
	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	dumpNodeName(SU);
	dbgs() << ": ";

	if (!SU.getNode()) {
	dbgs() << "PHYS REG COPY\n";
	return;
	}

	SU.getNode()->dump(DAG);
	dbgs() << "\n";
	SmallVector<SDNode *, 4> GluedNodes;
	for (SDNode *N = SU.getNode()->getGluedNode(); N; N = N->getGluedNode())
	GluedNodes.push_back(N);
	while (!GluedNodes.empty()) {
	dbgs() << " ";
	GluedNodes.back()->dump(DAG);
	dbgs() << "\n";
	GluedNodes.pop_back();
	}
	#endif
	}

	void ScheduleDAGSDNodes::dump() const {
	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	if (EntrySU.getNode() != nullptr)
	dumpNodeAll(EntrySU);
	for (const SUnit &SU : SUnits)
	dumpNodeAll(SU);
	if (ExitSU.getNode() != nullptr)
	dumpNodeAll(ExitSU);
	#endif
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void ScheduleDAGSDNodes::dumpSchedule() const {
	for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
	if (SUnit *SU = Sequence[i])
	dumpNode(*SU);
	else
	dbgs() << "** NOOP **\n";
	}
	}
	#endif

	#ifndef NDEBUG
	/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
	/// their state is consistent with the nodes listed in Sequence.
	///
	void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
	unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp);
	unsigned Noops = 0;
	for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
	if (!Sequence[i])
	++Noops;
	assert(Sequence.size() - Noops == ScheduledNodes &&
	"The number of nodes scheduled doesn't match the expected number!");
	}
	#endif // NDEBUG

	/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
	static void
	ProcessSDDbgValues(SDNode N, SelectionDAG DAG, InstrEmitter &Emitter,
	SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
	DenseMap<SDValue, unsigned> &VRBaseMap, unsigned Order) {
	if (!N->getHasDebugValue())
	return;

	// Opportunistically insert immediate dbg_value uses, i.e. those with the same
	// source order number as N.
	MachineBasicBlock *BB = Emitter.getBlock();
	MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
	for (auto DV : DAG->GetDbgValues(N)) {
	if (DV->isEmitted())
	continue;
	unsigned DVOrder = DV->getOrder();
	if (!Order \|\| DVOrder == Order) {
	MachineInstr *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap);
	if (DbgMI) {
	Orders.push_back({DVOrder, DbgMI});
	BB->insert(InsertPos, DbgMI);
	}
	}
	}
	}

	// ProcessSourceNode - Process nodes with source order numbers. These are added
	// to a vector which EmitSchedule uses to determine how to insert dbg_value
	// instructions in the right order.
	static void
	ProcessSourceNode(SDNode N, SelectionDAG DAG, InstrEmitter &Emitter,
	DenseMap<SDValue, unsigned> &VRBaseMap,
	SmallVectorImpl<std::pair<unsigned, MachineInstr *>> &Orders,
	SmallSet<unsigned, 8> &Seen, MachineInstr *NewInsn) {
	unsigned Order = N->getIROrder();
	if (!Order \|\| Seen.count(Order)) {
	// Process any valid SDDbgValues even if node does not have any order
	// assigned.
	ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);
	return;
	}

	// If a new instruction was generated for this Order number, record it.
	// Otherwise, leave this order number unseen: we will either find later
	// instructions for it, or leave it unseen if there were no instructions at
	// all.
	if (NewInsn) {
	Seen.insert(Order);
	Orders.push_back({Order, NewInsn});
	}

	// Even if no instruction was generated, a Value may have become defined via
	// earlier nodes. Try to process them now.
	ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
	}

	void ScheduleDAGSDNodes::
	EmitPhysRegCopy(SUnit SU, DenseMap<SUnit, unsigned> &VRBaseMap,
	MachineBasicBlock::iterator InsertPos) {
	for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
	I != E; ++I) {
	if (I->isCtrl()) continue; // ignore chain preds
	if (I->getSUnit()->CopyDstRC) {
	// Copy to physical register.
	DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
	assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
	// Find the destination physical register.
	unsigned Reg = 0;
	for (SUnit::const_succ_iterator II = SU->Succs.begin(),
	EE = SU->Succs.end(); II != EE; ++II) {
	if (II->isCtrl()) continue; // ignore chain preds
	if (II->getReg()) {
	Reg = II->getReg();
	break;
	}
	}
	BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
	.addReg(VRI->second);
	} else {
	// Copy from physical register.
	assert(I->getReg() && "Unknown physical register!");
	unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
	bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
	(void)isNew; // Silence compiler warning.
	assert(isNew && "Node emitted out of order - early");
	BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
	.addReg(I->getReg());
	}
	break;
	}
	}

	/// EmitSchedule - Emit the machine code in scheduled order. Return the new
	/// InsertPos and MachineBasicBlock that contains this insertion
	/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does
	/// not necessarily refer to returned BB. The emitter may split blocks.
	///
	/// When the DAG has debug values, the emitted instructions are also recorded
	/// by IR order so that DBG_VALUE and debug label instructions can be inserted
	/// in source order afterwards.
	MachineBasicBlock *ScheduleDAGSDNodes::
	EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
	  InstrEmitter Emitter(BB, InsertPos);
	  DenseMap<SDValue, unsigned> VRBaseMap;
	  DenseMap<SUnit*, unsigned> CopyVRBaseMap;
	  SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
	  SmallSet<unsigned, 8> Seen;
	  bool HasDbg = DAG->hasDebugValues();

	  // Emit a node, and determine where its first instruction is for debuginfo.
	  // Zero, one, or multiple instructions can be created when emitting a node.
	  auto EmitNode =
	      [&](SDNode *Node, bool IsClone, bool IsCloned,
	          DenseMap<SDValue, unsigned> &VRBaseMap) -> MachineInstr * {
	    // Fetch instruction prior to this, or end() if nonexistent.
	    auto GetPrevInsn = [&](MachineBasicBlock::iterator I) {
	      if (I == BB->begin())
	        return BB->end();
	      else
	        return std::prev(Emitter.getInsertPos());
	    };

	    MachineBasicBlock::iterator Before = GetPrevInsn(Emitter.getInsertPos());
	    Emitter.EmitNode(Node, IsClone, IsCloned, VRBaseMap);
	    MachineBasicBlock::iterator After = GetPrevInsn(Emitter.getInsertPos());

	    // If the iterator did not change, no instructions were inserted.
	    if (Before == After)
	      return nullptr;

	    MachineInstr *MI;
	    if (Before == BB->end()) {
	      // There were no prior instructions; the new ones must start at the
	      // beginning of the block.
	      MI = &Emitter.getBlock()->instr_front();
	    } else {
	      // Return first instruction after the pre-existing instructions.
	      MI = &*std::next(Before);
	    }

	    if (MI->isCall() && DAG->getTarget().Options.EnableDebugEntryValues)
	      MF.addCallArgsForwardingRegs(MI, DAG->getSDCallSiteInfo(Node));

	    return MI;
	  };

	  // If this is the first BB, emit byval parameter dbg_value's.
	  if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
	    SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
	    SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
	    for (; PDI != PDE; ++PDI) {
	      MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap);
	      if (DbgMI) {
	        BB->insert(InsertPos, DbgMI);
	        // We re-emit the dbg_value closer to its use, too, after instructions
	        // are emitted to the BB.
	        (*PDI)->clearIsEmitted();
	      }
	    }
	  }

	  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
	    SUnit *SU = Sequence[i];
	    if (!SU) {
	      // Null SUnit* is a noop.
	      TII->insertNoop(*Emitter.getBlock(), InsertPos);
	      continue;
	    }

	    // For pre-regalloc scheduling, create instructions corresponding to the
	    // SDNode and any glued SDNodes and append them to the block.
	    if (!SU->getNode()) {
	      // Emit a copy.
	      EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos);
	      continue;
	    }

	    // Collect the glued nodes, then emit them back to front so that the last
	    // node collected is emitted first and SU's own node is emitted last.
	    SmallVector<SDNode *, 4> GluedNodes;
	    for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
	      GluedNodes.push_back(N);
	    while (!GluedNodes.empty()) {
	      SDNode *N = GluedNodes.back();
	      auto NewInsn = EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
	      // Remember the source order of the inserted instruction.
	      if (HasDbg)
	        ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn);
	+
	+      if (MDNode *MD = DAG->getHeapAllocSite(N)) {
	+        if (NewInsn && NewInsn->isCall())
	+          MF.addCodeViewHeapAllocSite(NewInsn, MD);
	+      }
	+
	      GluedNodes.pop_back();
	    }
	    auto NewInsn =
	        EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap);
	    // Remember the source order of the inserted instruction.
	    if (HasDbg)
	      ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen,
	                        NewInsn);
	+    if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) {
	+      if (NewInsn && NewInsn->isCall())
	+        MF.addCodeViewHeapAllocSite(NewInsn, MD);
	+    }
	  }

	  // Insert all the dbg_values which have not already been inserted in source
	  // order sequence.
	  if (HasDbg) {
	    MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();

	    // Sort the source order instructions and use the order to insert debug
	    // values. Use stable_sort so that DBG_VALUEs are inserted in the same order
	    // regardless of the host's implementation of std::sort.
	    llvm::stable_sort(Orders, less_first());
	    std::stable_sort(DAG->DbgBegin(), DAG->DbgEnd(),
	                     [](const SDDbgValue *LHS, const SDDbgValue *RHS) {
	                       return LHS->getOrder() < RHS->getOrder();
	                     });

	    SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
	    SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
	    // Now emit the rest according to source order.
	    unsigned LastOrder = 0;
	    for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
	      unsigned Order = Orders[i].first;
	      MachineInstr *MI = Orders[i].second;
	      // Insert all SDDbgValue's whose order(s) are before "Order".
	      assert(MI);
	      for (; DI != DE; ++DI) {
	        if ((*DI)->getOrder() < LastOrder || (*DI)->getOrder() >= Order)
	          break;
	        if ((*DI)->isEmitted())
	          continue;

	        MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
	        if (DbgMI) {
	          if (!LastOrder)
	            // Insert to start of the BB (after PHIs).
	            BB->insert(BBBegin, DbgMI);
	          else {
	            // Insert at the instruction, which may be in a different
	            // block, if the block was split by a custom inserter.
	            MachineBasicBlock::iterator Pos = MI;
	            MI->getParent()->insert(Pos, DbgMI);
	          }
	        }
	      }
	      LastOrder = Order;
	    }
	    // Add trailing DbgValue's before the terminator. FIXME: May want to add
	    // some of them before one or more conditional branches?
	    SmallVector<MachineInstr*, 8> DbgMIs;
	    for (; DI != DE; ++DI) {
	      if ((*DI)->isEmitted())
	        continue;
	      assert((*DI)->getOrder() >= LastOrder &&
	             "emitting DBG_VALUE out of order");
	      if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap))
	        DbgMIs.push_back(DbgMI);
	    }

	    MachineBasicBlock *InsertBB = Emitter.getBlock();
	    MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
	    InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());

	    SDDbgInfo::DbgLabelIterator DLI = DAG->DbgLabelBegin();
	    SDDbgInfo::DbgLabelIterator DLE = DAG->DbgLabelEnd();
	    // Now emit the rest according to source order.
	    LastOrder = 0;
	    for (const auto &InstrOrder : Orders) {
	      unsigned Order = InstrOrder.first;
	      MachineInstr *MI = InstrOrder.second;
	      if (!MI)
	        continue;

	      // Insert all SDDbgLabel's whose order(s) are before "Order".
	      for (; DLI != DLE &&
	             (*DLI)->getOrder() >= LastOrder && (*DLI)->getOrder() < Order;
	           ++DLI) {
	        MachineInstr *DbgMI = Emitter.EmitDbgLabel(*DLI);
	        if (DbgMI) {
	          if (!LastOrder)
	            // Insert to start of the BB (after PHIs).
	            BB->insert(BBBegin, DbgMI);
	          else {
	            // Insert at the instruction, which may be in a different
	            // block, if the block was split by a custom inserter.
	            MachineBasicBlock::iterator Pos = MI;
	            MI->getParent()->insert(Pos, DbgMI);
	          }
	        }
	      }
	      if (DLI == DLE)
	        break;

	      LastOrder = Order;
	    }
	  }

	  InsertPos = Emitter.getInsertPos();
	  return Emitter.getBlock();
	}

	/// Build the name of this DAG: the fixed prefix "sunit-dag." followed by the
	/// full name of the basic block being scheduled.
	std::string ScheduleDAGSDNodes::getDAGName() const {
	  std::string Label("sunit-dag.");
	  Label += BB->getFullName();
	  return Label;
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (revision 351303)
	@@ -1,9594 +1,9595 @@
	//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements the SelectionDAG class.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/SelectionDAG.h"
	#include "SDNodeDbgValue.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/ManagedStatic.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/Mutex.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>
	#include <limits>
	#include <set>
	#include <string>
	#include <utility>
	#include <vector>

	using namespace llvm;

	/// makeVTList - Wrap a value-type array and its length in an SDVTList. The
	/// array is referenced, not copied.
	static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
	  return SDVTList{VTs, NumVTs};
	}

	// Default null implementations of the callbacks.
	void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode, SDNode) {}
	void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
	void SelectionDAG::DAGUpdateListener::NodeInserted(SDNode *) {}

	void SelectionDAG::DAGNodeDeletedListener::anchor() {}

	// Debug category name for this file.
	// NOTE(review): presumably consumed by the LLVM_DEBUG macro used below — confirm.
	#define DEBUG_TYPE "selectiondag"

	// Hidden boolean command-line option "enable-memcpy-dag-opt"; defaults to
	// true.
	static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
	cl::Hidden, cl::init(true),
	cl::desc("Gang up loads and stores generated by inlining of memcpy"));

	// Hidden integer command-line option "ldstmemcpy-glue-max"; defaults to 0.
	static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
	cl::desc("Number limit for gluing ld/st of memcpy."),
	cl::Hidden, cl::init(0));

	/// Emit Msg to the debug stream followed by a dump of the node behind V. G is
	/// handed to the node's dump() call. Everything is wrapped in LLVM_DEBUG.
	static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
	  LLVM_DEBUG({
	    dbgs() << Msg;
	    V.getNode()->dump(G);
	  });
	}

	//===----------------------------------------------------------------------===//
	// ConstantFPSDNode Class
	//===----------------------------------------------------------------------===//

	/// isExactlyValue - We don't rely on operator== working on double values, as
	/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
	/// As such, this method can be used to do an exact bit-for-bit comparison of
	/// two floating point values.
	bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
	return getValueAPF().bitwiseIsEqual(V);
	}

	/// Return true if Val can be converted to the floating point type VT
	/// (round-to-nearest, ties-to-even) without the conversion reporting a loss
	/// of information.
	bool ConstantFPSDNode::isValueValidForType(EVT VT,
	                                           const APFloat& Val) {
	  assert(VT.isFloatingPoint() && "Can only convert between FP types");

	  // The conversion rewrites its receiver, so run it on a scratch copy and
	  // leave the caller's value untouched.
	  APFloat Scratch(Val);
	  bool LostInfo;
	  const auto &TargetSemantics = SelectionDAG::EVTToAPFloatSemantics(VT);
	  (void)Scratch.convert(TargetSemantics, APFloat::rmNearestTiesToEven,
	                        &LostInfo);
	  return !LostInfo;
	}

	//===----------------------------------------------------------------------===//
	// ISD Namespace
	//===----------------------------------------------------------------------===//

	bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
	auto *BV = dyn_cast<BuildVectorSDNode>(N);
	if (!BV)
	return false;

	APInt SplatUndef;
	unsigned SplatBitSize;
	bool HasUndefs;
	unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
	return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs,
	EltSize) &&
	EltSize == SplatBitSize;
	}

	// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
	// specializations of the more general isConstantSplatVector()?

	bool ISD::isBuildVectorAllOnes(const SDNode *N) {
	// Look through a bit convert.
	while (N->getOpcode() == ISD::BITCAST)
	N = N->getOperand(0).getNode();

	if (N->getOpcode() != ISD::BUILD_VECTOR) return false;

	unsigned i = 0, e = N->getNumOperands();

	// Skip over all of the undef values.
	while (i != e && N->getOperand(i).isUndef())
	++i;

	// Do not accept an all-undef vector.
	if (i == e) return false;

	// Do not accept build_vectors that aren't all constants or which have non-~0
	// elements. We have to be a bit careful here, as the type of the constant
	// may not be the same as the type of the vector elements due to type
	// legalization (the elements are promoted to a legal type for the target and
	// a vector of a type may be legal when the base element type is not).
	// We only want to check enough bits to cover the vector elements, because
	// we care if the resultant vector is all ones, not whether the individual
	// constants are.
	SDValue NotZero = N->getOperand(i);
	unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
	if (CN->getAPIntValue().countTrailingOnes() < EltSize)
	return false;
	} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
	if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
	return false;
	} else
	return false;

	// Okay, we have at least one ~0 value, check to see if the rest match or are
	// undefs. Even with the above element type twiddling, this should be OK, as
	// the same type legalization should have applied to all the elements.
	for (++i; i != e; ++i)
	if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
	return false;
	return true;
	}

	bool ISD::isBuildVectorAllZeros(const SDNode *N) {
	// Look through a bit convert.
	while (N->getOpcode() == ISD::BITCAST)
	N = N->getOperand(0).getNode();

	if (N->getOpcode() != ISD::BUILD_VECTOR) return false;

	bool IsAllUndef = true;
	for (const SDValue &Op : N->op_values()) {
	if (Op.isUndef())
	continue;
	IsAllUndef = false;
	// Do not accept build_vectors that aren't all constants or which have non-0
	// elements. We have to be a bit careful here, as the type of the constant
	// may not be the same as the type of the vector elements due to type
	// legalization (the elements are promoted to a legal type for the target
	// and a vector of a type may be legal when the base element type is not).
	// We only want to check enough bits to cover the vector elements, because
	// we care if the resultant vector is all zeros, not whether the individual
	// constants are.
	unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
	if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
	if (CN->getAPIntValue().countTrailingZeros() < EltSize)
	return false;
	} else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
	if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
	return false;
	} else
	return false;
	}

	// Do not accept an all-undef vector.
	if (IsAllUndef)
	return false;
	return true;
	}

	bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
	if (N->getOpcode() != ISD::BUILD_VECTOR)
	return false;

	for (const SDValue &Op : N->op_values()) {
	if (Op.isUndef())
	continue;
	if (!isa<ConstantSDNode>(Op))
	return false;
	}
	return true;
	}

	bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
	if (N->getOpcode() != ISD::BUILD_VECTOR)
	return false;

	for (const SDValue &Op : N->op_values()) {
	if (Op.isUndef())
	continue;
	if (!isa<ConstantFPSDNode>(Op))
	return false;
	}
	return true;
	}

	bool ISD::allOperandsUndef(const SDNode *N) {
	// Return false if the node has no operands.
	// This is "logically inconsistent" with the definition of "all" but
	// is probably the desired behavior.
	if (N->getNumOperands() == 0)
	return false;
	return all_of(N->op_values(), [](SDValue Op) { return Op.isUndef(); });
	}

	/// Apply Match to a scalar constant, or to every element of a BUILD_VECTOR
	/// of constants, and return true only if it accepts all of them.
	///
	/// For a BUILD_VECTOR each element must be a ConstantSDNode whose value type
	/// equals the vector's scalar type. When AllowUndefs is set, undef elements
	/// are passed to Match as nullptr instead of being rejected outright.
	bool ISD::matchUnaryPredicate(SDValue Op,
	                              std::function<bool(ConstantSDNode *)> Match,
	                              bool AllowUndefs) {
	  // FIXME: Add support for scalar UNDEF cases?
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
	    return Match(Cst);

	  // FIXME: Add support for vector UNDEF cases?
	  if (ISD::BUILD_VECTOR != Op.getOpcode())
	    return false;

	  EVT SVT = Op.getValueType().getScalarType();
	  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	    if (AllowUndefs && Op.getOperand(i).isUndef()) {
	      // The predicate decides whether an undef element is acceptable.
	      if (!Match(nullptr))
	        return false;
	      continue;
	    }

	    auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
	    if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
	      return false;
	  }
	  return true;
	}

	/// Apply Match to a pair of scalar constants, or element-wise to two
	/// BUILD_VECTORs of constants, and return true only if it accepts every
	/// pair.
	///
	/// Unless AllowTypeMismatch is set, LHS and RHS must have the same value
	/// type, and each pair of vector elements must have the LHS scalar type.
	/// When AllowUndefs is set, an undef element is accepted and reaches Match
	/// as a null pointer.
	bool ISD::matchBinaryPredicate(
	    SDValue LHS, SDValue RHS,
	    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match,
	    bool AllowUndefs, bool AllowTypeMismatch) {
	  if (!AllowTypeMismatch && LHS.getValueType() != RHS.getValueType())
	    return false;

	  // TODO: Add support for scalar UNDEF cases?
	  if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
	    if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
	      return Match(LHSCst, RHSCst);

	  // TODO: Add support for vector UNDEF cases?
	  if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
	      ISD::BUILD_VECTOR != RHS.getOpcode())
	    return false;

	  EVT SVT = LHS.getValueType().getScalarType();
	  for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
	    SDValue LHSOp = LHS.getOperand(i);
	    SDValue RHSOp = RHS.getOperand(i);
	    bool LHSUndef = AllowUndefs && LHSOp.isUndef();
	    bool RHSUndef = AllowUndefs && RHSOp.isUndef();
	    auto *LHSCst = dyn_cast<ConstantSDNode>(LHSOp);
	    auto *RHSCst = dyn_cast<ConstantSDNode>(RHSOp);
	    // Each side must be a constant, or an undef that the caller allows.
	    if ((!LHSCst && !LHSUndef) || (!RHSCst && !RHSUndef))
	      return false;
	    if (!AllowTypeMismatch && (LHSOp.getValueType() != SVT ||
	                               LHSOp.getValueType() != RHSOp.getValueType()))
	      return false;
	    if (!Match(LHSCst, RHSCst))
	      return false;
	  }
	  return true;
	}

	/// Map a load extension kind to the matching extend opcode. EXTLOAD yields
	/// FP_EXTEND when IsFP is set and ANY_EXTEND otherwise. Any kind other than
	/// EXTLOAD, SEXTLOAD or ZEXTLOAD is treated as unreachable.
	ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
	  if (ExtType == ISD::EXTLOAD)
	    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
	  if (ExtType == ISD::SEXTLOAD)
	    return ISD::SIGN_EXTEND;
	  if (ExtType == ISD::ZEXTLOAD)
	    return ISD::ZERO_EXTEND;

	  llvm_unreachable("Invalid LoadExtType");
	}

	/// Return the condition code that gives the same result as Operation once
	/// the two comparison operands are swapped, i.e. (X op Y) == (Y op' X).
	ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
	  // To perform this operation, we just need to swap the L and G bits of the
	  // operation. L is bit 2 and G is bit 1 of the code.
	  unsigned OldL = (Operation >> 2) & 1;
	  unsigned OldG = (Operation >> 1) & 1;
	  return ISD::CondCode((Operation & ~6) |  // Keep the N, U, E bits
	                       (OldL << 1) |       // New G bit
	                       (OldG << 2));       // New L bit.
	}

	/// Return the logical inverse of the condition code Op. Integer comparisons
	/// flip the L, G and E bits only; floating point comparisons flip all four
	/// condition bits.
	ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
	  unsigned Inverted = Op;
	  // Mask 7 covers L, G and E (U is left alone); mask 15 covers every
	  // condition bit.
	  Inverted ^= isInteger ? 7 : 15;

	  // Don't let N and U bits get set.
	  if (Inverted > ISD::SETTRUE2)
	    Inverted &= ~8;

	  return ISD::CondCode(Inverted);
	}

	/// Classify an integer comparison by signedness: 0 for the sign-agnostic
	/// seteq/setne, 1 for the signed orderings, 2 for the unsigned orderings.
	/// Any other condition code is treated as unreachable.
	static int isSignedOp(ISD::CondCode Opcode) {
	  switch (Opcode) {
	  case ISD::SETEQ:
	  case ISD::SETNE:
	    return 0;
	  case ISD::SETLT:
	  case ISD::SETLE:
	  case ISD::SETGT:
	  case ISD::SETGE:
	    return 1;
	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    return 2;
	  default:
	    llvm_unreachable("Illegal integer setcc operation!");
	  }
	}

	/// Return the condition code equivalent to (X Op1 Y) | (X Op2 Y), or
	/// SETCC_INVALID when IsInteger is set and a signed comparison would have to
	/// be merged with an unsigned one.
	ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
	                                       bool IsInteger) {
	  // isSignedOp yields 1 for signed and 2 for unsigned, so an OR of 3 means
	  // one of each.
	  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
	    // Cannot fold a signed integer setcc with an unsigned integer setcc.
	    return ISD::SETCC_INVALID;

	  unsigned Op = Op1 | Op2;  // Combine all of the condition bits.

	  // If the N and U bits get set, then the resultant comparison DOES suddenly
	  // care about orderedness, and it is true when ordered.
	  if (Op > ISD::SETTRUE2)
	    Op &= ~16;     // Clear the U bit if the N bit is set.

	  // Canonicalize illegal integer setcc's.
	  if (IsInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT
	    Op = ISD::SETNE;

	  return ISD::CondCode(Op);
	}

	/// Return the condition code equivalent to (X Op1 Y) & (X Op2 Y), or
	/// SETCC_INVALID when IsInteger is set and a signed comparison would have to
	/// be merged with an unsigned one.
	ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
	                                        bool IsInteger) {
	  // isSignedOp yields 1 for signed and 2 for unsigned, so an OR of 3 means
	  // one of each.
	  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
	    // Cannot fold a signed setcc with an unsigned setcc.
	    return ISD::SETCC_INVALID;

	  // Combine all of the condition bits.
	  ISD::CondCode Result = ISD::CondCode(Op1 & Op2);

	  // Canonicalize illegal integer setcc's.
	  if (IsInteger) {
	    switch (Result) {
	    default: break;
	    case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT
	    case ISD::SETOEQ:                                 // SETEQ  & SETU[LG]E
	    case ISD::SETUEQ: Result = ISD::SETEQ   ; break;  // SETUGE & SETULE
	    case ISD::SETOLT: Result = ISD::SETULT  ; break;  // SETULT & SETNE
	    case ISD::SETOGT: Result = ISD::SETUGT  ; break;  // SETUGT & SETNE
	    }
	  }

	  return Result;
	}

	//===----------------------------------------------------------------------===//
	// SDNode Profile Support
	//===----------------------------------------------------------------------===//

	/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
	/// The opcode is appended to ID as a single integer.
	static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
	ID.AddInteger(OpC);
	}

	/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
	/// solely with their pointer.
	/// Only the VTs pointer of VTList is appended to ID; the element count is not.
	static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
	ID.AddPointer(VTList.VTs);
	}

	/// AddNodeIDOperands - Append every operand to the NodeID data, in order, as
	/// the pair (defining node pointer, result number). This overload takes the
	/// operands as SDValues.
	static void AddNodeIDOperands(FoldingSetNodeID &ID,
	                              ArrayRef<SDValue> Ops) {
	  for (const SDValue &Operand : Ops) {
	    SDNode *Producer = Operand.getNode();
	    unsigned ResultIdx = Operand.getResNo();
	    ID.AddPointer(Producer);
	    ID.AddInteger(ResultIdx);
	  }
	}

	/// AddNodeIDOperands - Append every operand to the NodeID data, in order, as
	/// the pair (defining node pointer, result number). This overload takes the
	/// operands as SDUses.
	static void AddNodeIDOperands(FoldingSetNodeID &ID,
	                              ArrayRef<SDUse> Ops) {
	  for (const SDUse &Use : Ops) {
	    SDNode *Producer = Use.getNode();
	    unsigned ResultIdx = Use.getResNo();
	    ID.AddPointer(Producer);
	    ID.AddInteger(ResultIdx);
	  }
	}

	/// AddNodeIDNode - Add the opcode, the value type list and the operands to
	/// the NodeID data, in that order. Use this overload when the node itself is
	/// not available and only its components are known.
	static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
	SDVTList VTList, ArrayRef<SDValue> OpList) {
	// The order of these calls determines the contents of ID; keep it fixed.
	AddNodeIDOpcode(ID, OpC);
	AddNodeIDValueTypes(ID, VTList);
	AddNodeIDOperands(ID, OpList);
	}

	/// If this is an SDNode with special info, add this info to the NodeID data.
	///
	/// The switch appends the fields specific to each node kind (constant value,
	/// global and offset, register, memory type and address space, shuffle mask,
	/// ...). Opcodes without a case add nothing. After the switch, target memory
	/// opcodes additionally contribute their address space.
	static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
	switch (N->getOpcode()) {
	// Symbol nodes are not expected here; reaching one is treated as a bug.
	case ISD::TargetExternalSymbol:
	case ISD::ExternalSymbol:
	case ISD::MCSymbol:
	llvm_unreachable("Should only be used on nodes with operands");
	default: break; // Normal nodes don't need extra info.
	// Integer constants: the constant's pointer plus the opaque flag.
	case ISD::TargetConstant:
	case ISD::Constant: {
	const ConstantSDNode *C = cast<ConstantSDNode>(N);
	ID.AddPointer(C->getConstantIntValue());
	ID.AddBoolean(C->isOpaque());
	break;
	}
	// Floating point constants: the constant's pointer.
	case ISD::TargetConstantFP:
	case ISD::ConstantFP:
	ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
	break;
	// Global addresses (plain and TLS): global, offset and target flags.
	case ISD::TargetGlobalAddress:
	case ISD::GlobalAddress:
	case ISD::TargetGlobalTLSAddress:
	case ISD::GlobalTLSAddress: {
	const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
	ID.AddPointer(GA->getGlobal());
	ID.AddInteger(GA->getOffset());
	ID.AddInteger(GA->getTargetFlags());
	break;
	}
	case ISD::BasicBlock:
	ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
	break;
	case ISD::Register:
	ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
	break;
	case ISD::RegisterMask:
	ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
	break;
	case ISD::SRCVALUE:
	ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
	break;
	case ISD::FrameIndex:
	case ISD::TargetFrameIndex:
	ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
	break;
	// Lifetime markers contribute size and offset only when an offset is
	// present.
	case ISD::LIFETIME_START:
	case ISD::LIFETIME_END:
	if (cast<LifetimeSDNode>(N)->hasOffset()) {
	ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
	ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset());
	}
	break;
	case ISD::JumpTable:
	case ISD::TargetJumpTable:
	ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
	ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
	break;
	// Constant pool entries: alignment, offset, the entry itself (machine
	// entries add their own CSE id, others the constant pointer) and the
	// target flags.
	case ISD::ConstantPool:
	case ISD::TargetConstantPool: {
	const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
	ID.AddInteger(CP->getAlignment());
	ID.AddInteger(CP->getOffset());
	if (CP->isMachineConstantPoolEntry())
	CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
	else
	ID.AddPointer(CP->getConstVal());
	ID.AddInteger(CP->getTargetFlags());
	break;
	}
	case ISD::TargetIndex: {
	const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
	ID.AddInteger(TI->getIndex());
	ID.AddInteger(TI->getOffset());
	ID.AddInteger(TI->getTargetFlags());
	break;
	}
	// The memory nodes below all contribute the same three fields: the raw
	// bits of the memory VT, the raw subclass data, and the address space of
	// the pointer info.
	case ISD::LOAD: {
	const LoadSDNode *LD = cast<LoadSDNode>(N);
	ID.AddInteger(LD->getMemoryVT().getRawBits());
	ID.AddInteger(LD->getRawSubclassData());
	ID.AddInteger(LD->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::STORE: {
	const StoreSDNode *ST = cast<StoreSDNode>(N);
	ID.AddInteger(ST->getMemoryVT().getRawBits());
	ID.AddInteger(ST->getRawSubclassData());
	ID.AddInteger(ST->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::MLOAD: {
	const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
	ID.AddInteger(MLD->getMemoryVT().getRawBits());
	ID.AddInteger(MLD->getRawSubclassData());
	ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::MSTORE: {
	const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
	ID.AddInteger(MST->getMemoryVT().getRawBits());
	ID.AddInteger(MST->getRawSubclassData());
	ID.AddInteger(MST->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::MGATHER: {
	const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
	ID.AddInteger(MG->getMemoryVT().getRawBits());
	ID.AddInteger(MG->getRawSubclassData());
	ID.AddInteger(MG->getPointerInfo().getAddrSpace());
	break;
	}
	case ISD::MSCATTER: {
	const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
	ID.AddInteger(MS->getMemoryVT().getRawBits());
	ID.AddInteger(MS->getRawSubclassData());
	ID.AddInteger(MS->getPointerInfo().getAddrSpace());
	break;
	}
	// All atomic opcodes share one case and the same three fields.
	case ISD::ATOMIC_CMP_SWAP:
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_CLR:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	case ISD::ATOMIC_LOAD:
	case ISD::ATOMIC_STORE: {
	const AtomicSDNode *AT = cast<AtomicSDNode>(N);
	ID.AddInteger(AT->getMemoryVT().getRawBits());
	ID.AddInteger(AT->getRawSubclassData());
	ID.AddInteger(AT->getPointerInfo().getAddrSpace());
	break;
	}
	// Prefetch contributes only the address space.
	case ISD::PREFETCH: {
	const MemSDNode *PF = cast<MemSDNode>(N);
	ID.AddInteger(PF->getPointerInfo().getAddrSpace());
	break;
	}
	// Shuffles contribute one mask element per element of result type 0.
	case ISD::VECTOR_SHUFFLE: {
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
	i != e; ++i)
	ID.AddInteger(SVN->getMaskElt(i));
	break;
	}
	case ISD::TargetBlockAddress:
	case ISD::BlockAddress: {
	const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
	ID.AddPointer(BA->getBlockAddress());
	ID.AddInteger(BA->getOffset());
	ID.AddInteger(BA->getTargetFlags());
	break;
	}
	} // end switch (N->getOpcode())

	// Target specific memory nodes could also have address spaces to check.
	if (N->isTargetMemoryOpcode())
	ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
	}

	/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
	/// data.
	/// The opcode, value type list, operands and any opcode-specific extra
	/// fields of N are appended to ID, in that order.
	static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
	AddNodeIDOpcode(ID, N->getOpcode());
	// Add the return value info.
	AddNodeIDValueTypes(ID, N->getVTList());
	// Add the operand info.
	AddNodeIDOperands(ID, N->ops());

	// Handle SDNode leafs with special info.
	AddNodeIDCustom(ID, N);
	}

	//===----------------------------------------------------------------------===//
	// SelectionDAG Class
	//===----------------------------------------------------------------------===//

	/// doNotCSE - Return true if N must be kept out of CSE: any of its results
	/// is a glue value, or it is a HANDLENODE or EH_LABEL.
	static bool doNotCSE(SDNode *N) {
	  // First result is glue: never CSE anything that produces a flag.
	  if (N->getValueType(0) == MVT::Glue)
	    return true;

	  // These opcodes are never CSE'd.
	  const unsigned Opc = N->getOpcode();
	  if (Opc == ISD::HANDLENODE || Opc == ISD::EH_LABEL)
	    return true;

	  // Any later result being glue disqualifies the node as well.
	  for (unsigned ResNo = 1, NumResults = N->getNumValues();
	       ResNo != NumResults; ++ResNo) {
	    if (N->getValueType(ResNo) == MVT::Glue)
	      return true;
	  }

	  return false;
	}

	/// RemoveDeadNodes - This method deletes all unreachable nodes in the
	/// SelectionDAG.
	void SelectionDAG::RemoveDeadNodes() {
	// Create a dummy node (which is not added to allnodes), that adds a reference
	// to the root node, preventing it from being deleted.
	HandleSDNode Dummy(getRoot());

	SmallVector<SDNode*, 128> DeadNodes;

	// Add all obviously-dead nodes to the DeadNodes worklist.
	for (SDNode &Node : allnodes())
	if (Node.use_empty())
	DeadNodes.push_back(&Node);

	RemoveDeadNodes(DeadNodes);

	// If the root changed (e.g. it was a dead load, update the root).
	setRoot(Dummy.getValue());
	}

	/// RemoveDeadNodes - This method deletes the unreachable nodes in the
	/// given list, and any nodes that become unreachable as a result.
	/// DeadNodes is used as the worklist and is empty when this returns.
	void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {

	// Process the worklist, deleting the nodes and adding their uses to the
	// worklist.
	while (!DeadNodes.empty()) {
	SDNode *N = DeadNodes.pop_back_val();
	// Skip to next node if we've already managed to delete the node. This could
	// happen if replacing a node causes a node previously added to the worklist
	// to be deleted.
	if (N->getOpcode() == ISD::DELETED_NODE)
	continue;

	// Notify every registered listener; the second argument is always null
	// here.
	for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
	DUL->NodeDeleted(N, nullptr);

	// Take the node out of the appropriate CSE map.
	RemoveNodeFromCSEMaps(N);

	// Next, brutally remove the operand list. This is safe to do, as there are
	// no cycles in the graph.
	for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
	// Advance the iterator before the use is cleared.
	SDUse &Use = *I++;
	SDNode *Operand = Use.getNode();
	Use.set(SDValue());

	// Now that we removed this operand, see if there are no uses of it left.
	if (Operand->use_empty())
	DeadNodes.push_back(Operand);
	}

	DeallocateNode(N);
	}
	}

	/// Delete the node N and any nodes that become unused once it is gone.
	void SelectionDAG::RemoveDeadNode(SDNode *N){
	  SmallVector<SDNode*, 16> Worklist;
	  Worklist.push_back(N);

	  // Hold a reference to the root through a dummy handle so that it is not
	  // deleted. (This matters if the root is an operand of the dead node.)
	  HandleSDNode RootGuard(getRoot());

	  RemoveDeadNodes(Worklist);
	}

	/// Delete the node N: remove it from the CSE maps, then hand it to
	/// DeleteNodeNotInCSEMaps for the actual teardown.
	void SelectionDAG::DeleteNode(SDNode *N) {
	// First take this out of the appropriate CSE map.
	RemoveNodeFromCSEMaps(N);

	// Finally, remove uses due to operands of this node, remove from the
	// AllNodes list, and delete the node.
	DeleteNodeNotInCSEMaps(N);
	}

	/// Delete \p N without touching the CSE maps (the caller has already
	/// removed it from them, or it was never in one).
	///
	/// Asserts that \p N is not the first node of AllNodes (the entry node) and
	/// that it has no remaining uses.
	void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
	assert(N->getIterator() != AllNodes.begin() &&
	"Cannot delete the entry node!");
	assert(N->use_empty() && "Cannot delete a node that is not dead!");

	// Drop all of the operands and decrement used node's use counts.
	N->DropOperands();

	DeallocateNode(N);
	}

	/// Forget every debug value recorded for \p Node: each one is flagged as
	/// invalidated and the map entry for the node is removed. Does nothing when
	/// the node has no entry.
	void SDDbgInfo::erase(const SDNode *Node) {
	  auto Entry = DbgValMap.find(Node);
	  if (Entry != DbgValMap.end()) {
	    for (auto &DbgVal : Entry->second)
	      DbgVal->setIsInvalidated();
	    DbgValMap.erase(Entry);
	  }
	}

	/// Release the storage of \p N: its operand list, its slot in AllNodes and
	/// any debug values that refer to it. The node's opcode is overwritten with
	/// ISD::DELETED_NODE afterwards.
	void SelectionDAG::DeallocateNode(SDNode *N) {
	// If we have operands, deallocate them.
	removeOperands(N);

	// Unlink the node from the AllNodes list and return it to the allocator.
	NodeAllocator.Deallocate(AllNodes.remove(N));

	// Set the opcode to DELETED_NODE to help catch bugs when node
	// memory is reallocated.
	// FIXME: There are places in SDag that have grown a dependency on the opcode
	// value in the released node.
	// The write below happens after Deallocate(), so the field is unpoisoned
	// for AddressSanitizer first.
	__asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
	N->NodeType = ISD::DELETED_NODE;

	// If any of the SDDbgValue nodes refer to this SDNode, invalidate
	// them and forget about that node.
	DbgInfo->erase(N);
	}

	#ifndef NDEBUG
	/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid.
	///
	/// Only BUILD_PAIR and BUILD_VECTOR nodes are inspected; every other opcode
	/// is accepted unchanged. The function is compiled only when NDEBUG is not
	/// defined, and all checks are expressed as assertions.
	static void VerifySDNode(SDNode *N) {
	  switch (N->getOpcode()) {
	  default:
	    break;
	  case ISD::BUILD_PAIR: {
	    // A BUILD_PAIR produces one scalar value from two identically typed
	    // operands, each half the width of the result.
	    EVT VT = N->getValueType(0);
	    assert(N->getNumValues() == 1 && "Too many results!");
	    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
	           "Wrong return type!");
	    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
	    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
	           "Mismatched operand types!");
	    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
	           "Wrong operand type!");
	    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
	           "Wrong return type size");
	    break;
	  }
	  case ISD::BUILD_VECTOR: {
	    // A BUILD_VECTOR has one operand per result element. Operands must all
	    // share one type, which is either the element type or a wider integer
	    // type when the element type is an integer.
	    assert(N->getNumValues() == 1 && "Too many results!");
	    assert(N->getValueType(0).isVector() && "Wrong return type!");
	    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
	           "Wrong number of operands!");
	    EVT EltVT = N->getValueType(0).getVectorElementType();
	    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
	      assert((I->getValueType() == EltVT ||
	              (EltVT.isInteger() && I->getValueType().isInteger() &&
	               EltVT.bitsLE(I->getValueType()))) &&
	             "Wrong operand type!");
	      assert(I->getValueType() == N->getOperand(0).getValueType() &&
	             "Operands must all have the same type");
	    }
	    break;
	  }
	  }
	}
	#endif // NDEBUG

	/// Insert a newly allocated node into the DAG.
	///
	/// Appends \p N to the AllNodes list and notifies every registered
	/// DAGUpdateListener through NodeInserted(). In builds without NDEBUG the
	/// node is also given the next persistent id and run through VerifySDNode().
	/// This function does not itself touch the CSE maps.
	void SelectionDAG::InsertNode(SDNode *N) {
	AllNodes.push_back(N);
	#ifndef NDEBUG
	N->PersistentId = NextPersistentId++;
	VerifySDNode(N);
	#endif
	for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
	DUL->NodeInserted(N);
	}

	/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
	/// correspond to it. This is useful when we're about to delete or repurpose
	/// the node. We don't want future request for structurally identical nodes
	/// to return N anymore.
	///
	/// \returns true if \p N was found in, and removed from, one of the maps.
	/// HANDLENODE nodes are never in a map and always yield false.
	bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
	bool Erased = false;
	// Several node kinds are uniqued in dedicated side tables rather than in
	// CSEMap; dispatch on the opcode to find the right one.
	switch (N->getOpcode()) {
	case ISD::HANDLENODE: return false; // noop.
	case ISD::CONDCODE:
	assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
	"Cond code doesn't exist!");
	Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
	CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
	break;
	case ISD::ExternalSymbol:
	Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
	break;
	case ISD::TargetExternalSymbol: {
	// Target external symbols are keyed by (symbol name, target flags).
	ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
	Erased = TargetExternalSymbols.erase(
	std::pair<std::string,unsigned char>(ESN->getSymbol(),
	ESN->getTargetFlags()));
	break;
	}
	case ISD::MCSymbol: {
	auto *MCSN = cast<MCSymbolSDNode>(N);
	Erased = MCSymbols.erase(MCSN->getMCSymbol());
	break;
	}
	case ISD::VALUETYPE: {
	// Extended types live in a map; simple types in a table indexed by SimpleTy.
	EVT VT = cast<VTSDNode>(N)->getVT();
	if (VT.isExtended()) {
	Erased = ExtendedValueTypeNodes.erase(VT);
	} else {
	Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
	ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
	}
	break;
	}
	default:
	// Remove it from the CSE Map.
	assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
	assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
	Erased = CSEMap.RemoveNode(N);
	break;
	}
	#ifndef NDEBUG
	// Verify that the node was actually in one of the CSE maps, unless it has a
	// flag result (which cannot be CSE'd) or is one of the special cases that are
	// not subject to CSE.
	if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
	!N->isMachineOpcode() && !doNotCSE(N)) {
	N->dump(this);
	dbgs() << "\n";
	llvm_unreachable("Node is not in map!");
	}
	#endif
	return Erased;
	}

	/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
	/// maps and modified in place. Add it back to the CSE maps, unless an identical
	/// node already exists, in which case transfer all its users to the existing
	/// node. This transfer can potentially trigger recursive merging.
	///
	/// Listeners receive NodeDeleted(N, Existing) when \p N is merged into an
	/// existing node, and NodeUpdated(N) otherwise. After a merge \p N has been
	/// deleted and must not be used by the caller.
	void
	SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
	// For node types that aren't CSE'd, just act as if no identical node
	// already exists.
	if (!doNotCSE(N)) {
	SDNode *Existing = CSEMap.GetOrInsertNode(N);
	if (Existing != N) {
	// If there was already an existing matching node, use ReplaceAllUsesWith
	// to replace the dead one with the existing one. This can cause
	// recursive merging of other unrelated nodes down the line.
	ReplaceAllUsesWith(N, Existing);

	// N is now dead. Inform the listeners and delete it.
	for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
	DUL->NodeDeleted(N, Existing);
	DeleteNodeNotInCSEMaps(N);
	return;
	}
	}

	// If the node doesn't already exist, we updated it. Inform listeners.
	for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
	DUL->NodeUpdated(N);
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
	/// were replaced with those specified. If this node is never memoized,
	/// return null, otherwise return a pointer to the slot it would take. If a
	/// node already exists with these operands, the slot will be non-null.
	///
	/// Single-operand overload.
	/// \param N         Node whose opcode, value types and custom id data are
	///                  used to build the lookup key.
	/// \param Op        The replacement operand.
	/// \param InsertPos Set by the CSE lookup to the insertion position.
	/// \returns the existing equivalent node, or nullptr if there is none or
	///          \p N is not subject to CSE. When a node is found, its flags are
	///          intersected with those of \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  SDValue Ops[] = { Op };
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
	/// were replaced with those specified. If this node is never memoized,
	/// return null, otherwise return a pointer to the slot it would take. If a
	/// node already exists with these operands, the slot will be non-null.
	///
	/// Two-operand overload.
	/// \param N         Node whose opcode, value types and custom id data are
	///                  used to build the lookup key.
	/// \param Op1       The first replacement operand.
	/// \param Op2       The second replacement operand.
	/// \param InsertPos Set by the CSE lookup to the insertion position.
	/// \returns the existing equivalent node, or nullptr if there is none or
	///          \p N is not subject to CSE. When a node is found, its flags are
	///          intersected with those of \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
	                                           SDValue Op1, SDValue Op2,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  SDValue Ops[] = { Op1, Op2 };
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
	/// were replaced with those specified. If this node is never memoized,
	/// return null, otherwise return a pointer to the slot it would take. If a
	/// node already exists with these operands, the slot will be non-null.
	///
	/// Operand-list overload.
	/// \param N         Node whose opcode, value types and custom id data are
	///                  used to build the lookup key.
	/// \param Ops       The replacement operand list.
	/// \param InsertPos Set by the CSE lookup to the insertion position.
	/// \returns the existing equivalent node, or nullptr if there is none or
	///          \p N is not subject to CSE. When a node is found, its flags are
	///          intersected with those of \p N.
	SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
	                                           void *&InsertPos) {
	  if (doNotCSE(N))
	    return nullptr;

	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
	  AddNodeIDCustom(ID, N);
	  SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
	  if (Node)
	    Node->intersectFlagsWith(N->getFlags());
	  return Node;
	}

	/// Return the ABI alignment, as reported by the data layout, of the IR type
	/// corresponding to \p VT. MVT::iPTR is mapped to an i8 pointer in address
	/// space 0.
	unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
	  Type *IRTy;
	  if (VT == MVT::iPTR)
	    IRTy = PointerType::get(Type::getInt8Ty(*getContext()), 0);
	  else
	    IRTy = VT.getTypeForEVT(*getContext());

	  return getDataLayout().getABITypeAlignment(IRTy);
	}

	// EntryNode could meaningfully have debug info if we can find it...
	/// Construct a DAG that contains only the entry token.
	///
	/// The entry node is built in place as a member, becomes the initial root,
	/// and is inserted into the node list. A fresh SDDbgInfo is heap-allocated
	/// here; the destructor deletes it.
	SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
	: TM(tm), OptLevel(OL),
	EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
	Root(getEntryNode()) {
	InsertNode(&EntryNode);
	DbgInfo = new SDDbgInfo();
	}

	void SelectionDAG::init(MachineFunction &NewMF,
	OptimizationRemarkEmitter &NewORE,
	Pass PassPtr, const TargetLibraryInfo LibraryInfo,
	LegacyDivergenceAnalysis * Divergence) {
	MF = &NewMF;
	SDAGISelPass = PassPtr;
	ORE = &NewORE;
	TLI = getSubtarget().getTargetLowering();
	TSI = getSubtarget().getSelectionDAGInfo();
	LibInfo = LibraryInfo;
	Context = &MF->getFunction().getContext();
	DA = Divergence;
	}

	/// Tear down the DAG: deallocate every node, release recycled operand
	/// storage and delete the debug-info table. All DAGUpdateListeners must have
	/// been unregistered by this point (asserted).
	SelectionDAG::~SelectionDAG() {
	assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
	allnodes_clear();
	OperandRecycler.clear(OperandAllocator);
	delete DbgInfo;
	}

	/// Empty the AllNodes list, deallocating every node except the entry node.
	void SelectionDAG::allnodes_clear() {
	// The entry node is a member of SelectionDAG, so it is only unlinked from
	// the list and never passed to DeallocateNode().
	assert(&*AllNodes.begin() == &EntryNode);
	AllNodes.remove(AllNodes.begin());
	while (!AllNodes.empty())
	DeallocateNode(&AllNodes.front());
	#ifndef NDEBUG
	// Restart persistent-id numbering for the next DAG.
	NextPersistentId = 0;
	#endif
	}

	/// Look \p ID up in the CSE map without a debug location.
	///
	/// \returns the matching node, or nullptr with \p InsertPos set by the map.
	/// Finding a Constant or ConstantFP node through this overload is a
	/// programming error; callers must use the overload that takes an SDLoc.
	SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
	                                          void *&InsertPos) {
	  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
	  if (Found && (Found->getOpcode() == ISD::Constant ||
	                Found->getOpcode() == ISD::ConstantFP))
	    llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
	                     "debug location. Use another overload.");
	  return Found;
	}

	/// Look \p ID up in the CSE map and reconcile the debug location of a hit
	/// with the new point of use \p DL.
	///
	/// \returns the matching node, or nullptr with \p InsertPos set by the map.
	SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
	                                          const SDLoc &DL, void *&InsertPos) {
	  SDNode *Found = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
	  if (!Found)
	    return nullptr;

	  const unsigned Opc = Found->getOpcode();
	  if (Opc == ISD::Constant || Opc == ISD::ConstantFP) {
	    // A constant shared between several places loses its debug location:
	    // propagating one location to every use would make single stepping in
	    // a debugger worse.
	    if (Found->getDebugLoc() != DL.getDebugLoc())
	      Found->setDebugLoc(DebugLoc());
	  } else if (DL.getIROrder() && DL.getIROrder() < Found->getIROrder()) {
	    // The new use sits earlier in the instruction sequence than the prior
	    // one, so move the node's debug info to that earlier location.
	    Found->setDebugLoc(DL.getDebugLoc());
	  }
	  return Found;
	}

	/// Reset the DAG to its freshly constructed state so it can be reused:
	/// all nodes are deallocated, every uniquing table is emptied, and the
	/// entry node is re-inserted and made the root again.
	void SelectionDAG::clear() {
	allnodes_clear();
	OperandRecycler.clear(OperandAllocator);
	OperandAllocator.Reset();
	CSEMap.clear();

	// Empty the side tables used to unique nodes that are not kept in CSEMap.
	ExtendedValueTypeNodes.clear();
	ExternalSymbols.clear();
	TargetExternalSymbols.clear();
	MCSymbols.clear();
	+ SDCallSiteDbgInfo.clear();
	// These two tables are indexed by enum value, so their entries are nulled
	// out rather than erased.
	std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
	static_cast<CondCodeSDNode*>(nullptr));
	std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
	static_cast<SDNode*>(nullptr));

	// The entry node survives as a member; clear its use list and put it back.
	EntryNode.UseList = nullptr;
	InsertNode(&EntryNode);
	Root = getEntryNode();
	DbgInfo->clear();
	}

	/// Convert \p Op to the floating-point type \p VT: FP_EXTEND when \p VT is
	/// strictly wider than the operand type, otherwise FP_ROUND with a zero
	/// pointer-sized constant as its second operand.
	SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::FP_EXTEND, DL, VT, Op);
	  return getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
	}

	/// Convert \p Op to \p VT: ANY_EXTEND when \p VT is strictly wider than the
	/// operand type, TRUNCATE otherwise.
	SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::ANY_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert \p Op to \p VT: SIGN_EXTEND when \p VT is strictly wider than the
	/// operand type, TRUNCATE otherwise.
	SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::SIGN_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert \p Op to \p VT: ZERO_EXTEND when \p VT is strictly wider than the
	/// operand type, TRUNCATE otherwise.
	SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	  if (VT.bitsGT(Op.getValueType()))
	    return getNode(ISD::ZERO_EXTEND, DL, VT, Op);
	  return getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	/// Convert the boolean value \p Op to \p VT. When \p VT is no wider than the
	/// operand type the value is truncated; otherwise it is extended with the
	/// extension opcode the target associates with the boolean contents of
	/// \p OpVT.
	SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
	                                        EVT OpVT) {
	  if (!VT.bitsLE(Op.getValueType())) {
	    const unsigned ExtOpc =
	        TLI->getExtendForContent(TLI->getBooleanContents(OpVT));
	    return getNode(ExtOpc, SL, VT, Op);
	  }

	  return getNode(ISD::TRUNCATE, SL, VT, Op);
	}

	/// Clear every bit of \p Op above the low VT.getSizeInBits() bits by ANDing
	/// it with a low-bits mask. \p VT must be a scalar type (asserted); \p Op is
	/// returned unchanged when its scalar type already equals \p VT.
	SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
	  assert(!VT.isVector() &&
	         "getZeroExtendInReg should use the vector element type instead of "
	         "the vector type!");

	  const EVT OpVT = Op.getValueType();
	  if (OpVT.getScalarType() == VT)
	    return Op;

	  // Mask keeping the low VT-sized bits of each scalar of the operand.
	  APInt LowMask = APInt::getLowBitsSet(Op.getScalarValueSizeInBits(),
	                                       VT.getSizeInBits());
	  return getNode(ISD::AND, DL, OpVT, Op, getConstant(LowMask, DL, OpVT));
	}

	/// Convert the pointer value \p Op to \p VT. Currently a plain forward to
	/// getZExtOrTrunc().
	SDValue SelectionDAG::getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
	// Only unsigned pointer semantics are supported right now. In the future this
	// might delegate to TLI to check pointer signedness.
	return getZExtOrTrunc(Op, DL, VT);
	}

	/// Extend the pointer value \p Op in-register from \p VT. Currently a plain
	/// forward to getZeroExtendInReg().
	SDValue SelectionDAG::getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
	// Only unsigned pointer semantics are supported right now. In the future this
	// might delegate to TLI to check pointer signedness.
	return getZeroExtendInReg(Op, DL, VT);
	}

	/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
	///
	/// The all-ones constant is sized from the scalar type of \p VT and created
	/// with type \p VT.
	SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
	  const APInt AllOnes =
	      APInt::getAllOnesValue(VT.getScalarType().getSizeInBits());
	  SDValue AllOnesNode = getConstant(AllOnes, DL, VT);
	  return getNode(ISD::XOR, DL, VT, Val, AllOnesNode);
	}

	/// Create a logical NOT of \p Val as (XOR Val, true), where "true" is the
	/// boolean constant getBoolConstant() produces for \p VT.
	SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
	  return getNode(ISD::XOR, DL, VT, Val, getBoolConstant(true, DL, VT, VT));
	}

	/// Create a constant of type \p VT representing the boolean \p V.
	///
	/// False is always 0. The encoding of true depends on the boolean contents
	/// the target reports for \p OpVT: 1 for zero-or-one and undefined contents,
	/// all ones for zero-or-negative-one contents.
	SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT,
	EVT OpVT) {
	if (!V)
	return getConstant(0, DL, VT);

	switch (TLI->getBooleanContents(OpVT)) {
	case TargetLowering::ZeroOrOneBooleanContent:
	case TargetLowering::UndefinedBooleanContent:
	return getConstant(1, DL, VT);
	case TargetLowering::ZeroOrNegativeOneBooleanContent:
	return getAllOnesConstant(DL, VT);
	}
	llvm_unreachable("Unexpected boolean content enum!");
	}

	/// Create an integer constant of type \p VT from a 64-bit value.
	///
	/// \param Val Value to materialise. For element types narrower than 64 bits
	///            it must fit when read as either a zero- or sign-extended
	///            value (asserted).
	/// \param isT Forwarded to the APInt overload; selects a target constant.
	/// \param isO Forwarded to the APInt overload; marks the constant opaque.
	SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
	                                  bool isT, bool isO) {
	  EVT EltVT = VT.getScalarType();
	  // After an arithmetic shift by the element width, the remaining bits must
	  // be all zeros (result 0) or all ones (result -1); adding 1 folds both
	  // accepted results into the range [0, 2).
	  assert((EltVT.getSizeInBits() >= 64 ||
	          (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
	         "getConstant with a uint64_t value that doesn't fit in the type!");
	  return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
	}

	/// Create an integer constant of type \p VT from an APInt by wrapping it in
	/// a ConstantInt from this DAG's LLVM context and forwarding to the
	/// ConstantInt overload.
	SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
	                                  bool isT, bool isO) {
	  // ConstantInt::get takes the context by reference and returns a pointer,
	  // while the callee takes the ConstantInt by reference.
	  return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
	}

	/// Create an integer constant node (or, for vector types, a splat of one)
	/// of type \p VT holding \p Val.
	///
	/// \param isT Selects ISD::TargetConstant instead of ISD::Constant.
	/// \param isO Passed to the ConstantSDNode and included in the CSE key.
	///
	/// Scalar constants are uniqued through the CSE map. For a vector \p VT the
	/// scalar element constant is created (or reused) and then splatted with
	/// getSplatBuildVector(); element types that the target promotes or expands
	/// get the special handling described in the comments below.
	SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
	EVT VT, bool isT, bool isO) {
	assert(VT.isInteger() && "Cannot create FP integer constant!");

	EVT EltVT = VT.getScalarType();
	const ConstantInt *Elt = &Val;

	// In some cases the vector type is legal but the element type is illegal and
	// needs to be promoted, for example v8i8 on ARM. In this case, promote the
	// inserted value (the type does not need to match the vector element type).
	// Any extra bits introduced will be truncated away.
	if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
	TargetLowering::TypePromoteInteger) {
	EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
	APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
	Elt = ConstantInt::get(*getContext(), NewVal);
	}
	// In other cases the element type is illegal and needs to be expanded, for
	// example v2i64 on MIPS32. In this case, find the nearest legal type, split
	// the value into n parts and use a vector type with n-times the elements.
	// Then bitcast to the type requested.
	// Legalizing constants too early makes the DAGCombiner's job harder so we
	// only legalize if the DAG tells us we must produce legal types.
	else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
	TLI->getTypeAction(*getContext(), EltVT) ==
	TargetLowering::TypeExpandInteger) {
	const APInt &NewVal = Elt->getValue();
	EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
	unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
	unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
	EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);

	// Check the temporary vector is the correct size. If this fails then
	// getTypeToTransformTo() probably returned a type whose size (in bits)
	// isn't a power-of-2 factor of the requested type size.
	assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());

	// Slice the value into ViaEltVT-sized pieces, lowest bits first.
	SmallVector<SDValue, 2> EltParts;
	for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
	EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
	.zextOrTrunc(ViaEltSizeInBits), DL,
	ViaEltVT, isT, isO));
	}

	// EltParts is currently in little endian order. If we actually want
	// big-endian order then reverse it now.
	if (getDataLayout().isBigEndian())
	std::reverse(EltParts.begin(), EltParts.end());

	// The elements must be reversed when the element order is different
	// to the endianness of the elements (because the BITCAST is itself a
	// vector shuffle in this situation). However, we do not need any code to
	// perform this reversal because getConstant() is producing a vector
	// splat.
	// This situation occurs in MIPS MSA.

	// Repeat the pieces once per requested vector element to form the splat.
	SmallVector<SDValue, 8> Ops;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
	Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());

	SDValue V = getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
	return V;
	}

	assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
	"APInt size does not match type size!");
	unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
	// CSE key: opcode, element type, the ConstantInt pointer and the isO flag.
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
	ID.AddPointer(Elt);
	ID.AddBoolean(isO);
	void *IP = nullptr;
	SDNode *N = nullptr;
	// A scalar hit can be returned directly; a vector request still needs the
	// splat built below from the scalar node.
	if ((N = FindNodeOrInsertPos(ID, DL, IP)))
	if (!VT.isVector())
	return SDValue(N, 0);

	if (!N) {
	N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
	CSEMap.InsertNode(N, IP);
	InsertNode(N);
	NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
	}

	SDValue Result(N, 0);
	if (VT.isVector())
	Result = getSplatBuildVector(VT, DL, Result);

	return Result;
	}

	/// Create a constant holding \p Val whose type is the pointer type the
	/// target lowering reports for the current data layout. \p isTarget is
	/// forwarded to getConstant().
	SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
	bool isTarget) {
	return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
	}

	/// Create a constant holding \p Val whose type is the shift-amount type the
	/// target lowering reports for shifting a value of type \p VT.
	/// \p LegalTypes is forwarded to the target's getShiftAmountTy() query.
	SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
	                                             const SDLoc &DL, bool LegalTypes) {
	  EVT AmountVT = TLI->getShiftAmountTy(VT, getDataLayout(), LegalTypes);
	  SDValue Amount = getConstant(Val, DL, AmountVT);
	  return Amount;
	}

	/// Create a floating-point constant of type \p VT from an APFloat by
	/// wrapping it in a ConstantFP from this DAG's LLVM context and forwarding
	/// to the ConstantFP overload.
	SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
	                                    bool isTarget) {
	  // ConstantFP::get takes the context by reference and returns a pointer,
	  // while the callee takes the ConstantFP by reference.
	  return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
	}

	/// Create a floating-point constant node (or, for vector types, a splat of
	/// one) of type \p VT holding \p V. \p isTarget selects
	/// ISD::TargetConstantFP instead of ISD::ConstantFP.
	SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
	                                    EVT VT, bool isTarget) {
	  assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");

	  const bool IsVector = VT.isVector();
	  EVT ScalarVT = VT.getScalarType();

	  // The lookup key uses the actual bit pattern of the value, which keeps
	  // 0.0 and -0.0 distinct and avoids trouble with signalling NaNs.
	  unsigned Opcode = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
	  FoldingSetNodeID Key;
	  AddNodeIDNode(Key, Opcode, getVTList(ScalarVT), None);
	  Key.AddPointer(&V);

	  void *InsertPos = nullptr;
	  SDNode *Scalar = FindNodeOrInsertPos(Key, DL, InsertPos);

	  // A scalar request that hit the CSE map is done; a vector request still
	  // has to wrap the scalar in a splat below.
	  if (Scalar && !IsVector)
	    return SDValue(Scalar, 0);

	  if (!Scalar) {
	    Scalar = newSDNode<ConstantFPSDNode>(isTarget, &V, ScalarVT);
	    CSEMap.InsertNode(Scalar, InsertPos);
	    InsertNode(Scalar);
	  }

	  SDValue Result(Scalar, 0);
	  if (IsVector)
	    Result = getSplatBuildVector(VT, DL, Result);
	  NewSDValueDbgMsg(Result, "Creating fp constant: ", this);
	  return Result;
	}

	/// Create a floating-point constant of type \p VT from a host double.
	///
	/// f32 and f64 element types are built directly (f32 by first casting
	/// \p Val to float). f80, f128, ppcf128 and f16 are built by converting the
	/// double to the element type's semantics with round-to-nearest-even; the
	/// "loses information" result of that conversion is ignored. Any other
	/// element type is unsupported.
	SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
	                                    bool isTarget) {
	  EVT EltVT = VT.getScalarType();
	  if (EltVT == MVT::f32)
	    return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
	  else if (EltVT == MVT::f64)
	    return getConstantFP(APFloat(Val), DL, VT, isTarget);
	  else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
	           EltVT == MVT::f16) {
	    bool Ignored;
	    APFloat APF = APFloat(Val);
	    APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
	                &Ignored);
	    return getConstantFP(APF, DL, VT, isTarget);
	  } else
	    llvm_unreachable("Unsupported type in getConstantFP");
	}

	/// Return a node for the address of global \p GV plus \p Offset, reusing an
	/// existing CSE'd node when one matches.
	///
	/// \param isTargetGA  Selects the Target* opcode variants.
	/// \param TargetFlags Must be 0 unless \p isTargetGA is set (asserted).
	///
	/// Thread-local globals use the GlobalTLSAddress opcodes, all others the
	/// GlobalAddress opcodes.
	SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
	                                       EVT VT, int64_t Offset, bool isTargetGA,
	                                       unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTargetGA) &&
	         "Cannot set target flags on target-independent globals");

	  // Truncate (with sign-extension) the offset value to the pointer size.
	  unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
	  if (BitWidth < 64)
	    Offset = SignExtend64(Offset, BitWidth);

	  unsigned Opc;
	  if (GV->isThreadLocal())
	    Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
	  else
	    Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;

	  // CSE key: opcode, result type, the global, the offset and the flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddPointer(GV);
	  ID.AddInteger(Offset);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<GlobalAddressSDNode>(
	      Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Return the frame-index node for \p FI with result type \p VT, reusing an
	/// existing CSE'd node when one matches. \p isTarget selects
	/// ISD::TargetFrameIndex instead of ISD::FrameIndex.
	SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
	  const unsigned Opcode = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;

	  // CSE key: opcode, result type and the frame index.
	  FoldingSetNodeID Key;
	  AddNodeIDNode(Key, Opcode, getVTList(VT), None);
	  Key.AddInteger(FI);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Key, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Created = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
	  CSEMap.InsertNode(Created, InsertPos);
	  InsertNode(Created);
	  return SDValue(Created, 0);
	}

	/// Return the jump-table node for index \p JTI with result type \p VT,
	/// reusing an existing CSE'd node when one matches.
	///
	/// \param isTarget    Selects ISD::TargetJumpTable instead of ISD::JumpTable.
	/// \param TargetFlags Must be 0 unless \p isTarget is set (asserted).
	SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
	                                   unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent jump tables");
	  unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
	  // CSE key: opcode, result type, the table index and the flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(JTI);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Return a constant-pool node referring to the IR constant \p C, reusing an
	/// existing CSE'd node when one matches.
	///
	/// \param Alignment   Requested alignment. 0 means "choose": the ABI
	///                    alignment of the constant's type when the function
	///                    has the opt-size attribute, its preferred alignment
	///                    otherwise.
	/// \param isTarget    Selects ISD::TargetConstantPool.
	/// \param TargetFlags Must be 0 unless \p isTarget is set (asserted).
	SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
	                                      unsigned Alignment, int Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent globals");
	  if (Alignment == 0)
	    Alignment = MF->getFunction().hasOptSize()
	                    ? getDataLayout().getABITypeAlignment(C->getType())
	                    : getDataLayout().getPrefTypeAlignment(C->getType());
	  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
	  // CSE key: opcode, result type, alignment, offset, the constant, flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(Alignment);
	  ID.AddInteger(Offset);
	  ID.AddPointer(C);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
	                                          TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Return a constant-pool node referring to the machine constant-pool value
	/// \p C, reusing an existing CSE'd node when one matches.
	///
	/// \param Alignment   Requested alignment. 0 means "use the preferred
	///                    alignment of the value's type".
	/// \param isTarget    Selects ISD::TargetConstantPool.
	/// \param TargetFlags Must be 0 unless \p isTarget is set (asserted).
	SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
	                                      unsigned Alignment, int Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  assert((TargetFlags == 0 || isTarget) &&
	         "Cannot set target flags on target-independent globals");
	  if (Alignment == 0)
	    Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
	  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
	  // CSE key: opcode, result type, alignment, offset, whatever the value
	  // itself contributes through addSelectionDAGCSEId(), and the flags.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opc, getVTList(VT), None);
	  ID.AddInteger(Alignment);
	  ID.AddInteger(Offset);
	  C->addSelectionDAGCSEId(ID);
	  ID.AddInteger(TargetFlags);
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
	                                          TargetFlags);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Return the ISD::TargetIndex node for (\p Index, \p Offset,
	/// \p TargetFlags) with result type \p VT, reusing an existing CSE'd node
	/// when one matches.
	SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
	                                     unsigned char TargetFlags) {
	  // CSE key: opcode, result type, index, offset and flags.
	  FoldingSetNodeID Key;
	  AddNodeIDNode(Key, ISD::TargetIndex, getVTList(VT), None);
	  Key.AddInteger(Index);
	  Key.AddInteger(Offset);
	  Key.AddInteger(TargetFlags);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Key, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Created = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
	  CSEMap.InsertNode(Created, InsertPos);
	  InsertNode(Created);
	  return SDValue(Created, 0);
	}

	/// Return the ISD::BasicBlock node that refers to \p MBB, reusing an
	/// existing CSE'd node when one matches.
	SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
	  // CSE key: opcode, the MVT::Other result type and the block pointer.
	  FoldingSetNodeID Key;
	  AddNodeIDNode(Key, ISD::BasicBlock, getVTList(MVT::Other), None);
	  Key.AddPointer(MBB);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Key, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Created = newSDNode<BasicBlockSDNode>(MBB);
	  CSEMap.InsertNode(Created, InsertPos);
	  InsertNode(Created);
	  return SDValue(Created, 0);
	}

	/// Return the VTSDNode that represents the value type \p VT, creating it on
	/// first request. These nodes are uniqued in dedicated tables rather than
	/// in the CSE map.
	SDValue SelectionDAG::getValueType(EVT VT) {
	// Grow the simple-type table first so that the slot taken below is in range.
	if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
	ValueTypeNodes.size())
	ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);

	// Extended types are kept in a map, simple types in a table indexed by
	// SimpleTy. N is a reference to the slot, so assigning it records the node.
	SDNode *&N = VT.isExtended() ?
	ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];

	if (N) return SDValue(N, 0);
	N = newSDNode<VTSDNode>(VT);
	InsertNode(N);
	return SDValue(N, 0);
	}

	/// Return the external-symbol node for \p Sym, creating it with result type
	/// \p VT on first request. Nodes are uniqued per symbol in ExternalSymbols.
	SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
	  // The map slot is taken by reference so a new node is recorded in place.
	  SDNode *&Slot = ExternalSymbols[Sym];
	  if (!Slot) {
	    Slot = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the MCSymbol node for \p Sym, creating it with result type \p VT
	/// on first request. Nodes are uniqued per symbol in MCSymbols.
	SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
	  // The map slot is taken by reference so a new node is recorded in place.
	  SDNode *&Slot = MCSymbols[Sym];
	  if (!Slot) {
	    Slot = newSDNode<MCSymbolSDNode>(Sym, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the target external-symbol node for (\p Sym, \p TargetFlags),
	/// creating it with result type \p VT on first request. Nodes are uniqued
	/// per (name, flags) pair in TargetExternalSymbols.
	SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
	                                              unsigned char TargetFlags) {
	  // The map slot is taken by reference so a new node is recorded in place.
	  SDNode *&Slot = TargetExternalSymbols[std::pair<std::string,unsigned char>(
	      Sym, TargetFlags)];
	  if (!Slot) {
	    Slot = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
	    InsertNode(Slot);
	  }
	  return SDValue(Slot, 0);
	}

	/// Return the CondCodeSDNode for \p Cond, creating it on first request.
	/// Nodes are uniqued in the CondCodeNodes table, indexed by the condition
	/// code's numeric value.
	SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
	// Grow the table on demand so that Cond is a valid index.
	if ((unsigned)Cond >= CondCodeNodes.size())
	CondCodeNodes.resize(Cond+1);

	if (!CondCodeNodes[Cond]) {
	auto *N = newSDNode<CondCodeSDNode>(Cond);
	CondCodeNodes[Cond] = N;
	InsertNode(N);
	}

	return SDValue(CondCodeNodes[Cond], 0);
	}

	/// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
	/// point at N1 to point at N2 and indices that point at N2 to point at N1.
	///
	/// Both operands and the mask are modified in place; the mask rewrite is
	/// delegated to ShuffleVectorSDNode::commuteMask().
	static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
	std::swap(N1, N2);
	ShuffleVectorSDNode::commuteMask(M);
	}

	/// Return a VECTOR_SHUFFLE of \p N1 and \p N2 with the given \p Mask, after
	/// canonicalising the operands and mask and folding trivial cases.
	///
	/// \param VT   Result type; must equal the type of both inputs and have as
	///             many elements as \p Mask (asserted).
	/// \param Mask One entry per result element: -1 for undef, [0, NElts) to
	///             select from \p N1, [NElts, 2*NElts) to select from \p N2
	///             (range asserted).
	///
	/// The result is not necessarily a shuffle node: depending on the inputs the
	/// function may return UNDEF, \p N1 itself, or a freshly built splat
	/// BUILD_VECTOR (possibly bitcast to \p VT).
	SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
	                                       SDValue N2, ArrayRef<int> Mask) {
	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Must have the same number of vector elements as mask elements!");
	  assert(VT == N1.getValueType() && VT == N2.getValueType() &&
	         "Invalid VECTOR_SHUFFLE");

	  // Canonicalize shuffle undef, undef -> undef
	  if (N1.isUndef() && N2.isUndef())
	    return getUNDEF(VT);

	  // Validate that all indices in Mask are within the range of the elements
	  // input to the shuffle.
	  int NElts = Mask.size();
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return M < (NElts * 2) && M >= -1; }) &&
	         "Index out of range");

	  // Copy the mask so we can do any needed cleanup.
	  SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());

	  // Canonicalize shuffle v, v -> v, undef
	  if (N1 == N2) {
	    N2 = getUNDEF(VT);
	    for (int i = 0; i != NElts; ++i)
	      if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
	  }

	  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
	  if (N1.isUndef())
	    commuteShuffle(N1, N2, MaskVec);

	  if (TLI->hasVectorBlend()) {
	    // If shuffling a splat, try to blend the splat instead. We do this here so
	    // that even when this arises during lowering we don't have to re-handle it.
	    // Offset is 0 for the first input and NElts for the second, i.e. the
	    // start of the mask-index range that selects from BV.
	    auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
	      BitVector UndefElements;
	      SDValue Splat = BV->getSplatValue(&UndefElements);
	      if (!Splat)
	        return;

	      for (int i = 0; i < NElts; ++i) {
	        // Skip lanes that do not read from this input.
	        if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
	          continue;

	        // If this input comes from undef, mark it as such.
	        if (UndefElements[MaskVec[i] - Offset]) {
	          MaskVec[i] = -1;
	          continue;
	        }

	        // If we can blend a non-undef lane, use that instead.
	        if (!UndefElements[i])
	          MaskVec[i] = i + Offset;
	      }
	    };
	    if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
	      BlendSplat(N1BV, 0);
	    if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
	      BlendSplat(N2BV, NElts);
	  }

	  // Canonicalize all index into lhs, -> shuffle lhs, undef
	  // Canonicalize all index into rhs, -> shuffle rhs, undef
	  bool AllLHS = true, AllRHS = true;
	  bool N2Undef = N2.isUndef();
	  for (int i = 0; i != NElts; ++i) {
	    if (MaskVec[i] >= NElts) {
	      if (N2Undef)
	        MaskVec[i] = -1;
	      else
	        AllLHS = false;
	    } else if (MaskVec[i] >= 0) {
	      AllRHS = false;
	    }
	  }
	  // Both flags still set means no lane reads a defined input element.
	  if (AllLHS && AllRHS)
	    return getUNDEF(VT);
	  if (AllLHS && !N2Undef)
	    N2 = getUNDEF(VT);
	  if (AllRHS) {
	    N1 = getUNDEF(VT);
	    commuteShuffle(N1, N2, MaskVec);
	  }
	  // Reset our undef status after accounting for the mask.
	  N2Undef = N2.isUndef();
	  // Re-check whether both sides ended up undef.
	  if (N1.isUndef() && N2Undef)
	    return getUNDEF(VT);

	  // If Identity shuffle return that node.
	  bool Identity = true, AllSame = true;
	  for (int i = 0; i != NElts; ++i) {
	    if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
	    if (MaskVec[i] != MaskVec[0]) AllSame = false;
	  }
	  if (Identity && NElts)
	    return N1;

	  // Shuffling a constant splat doesn't change the result.
	  if (N2Undef) {
	    SDValue V = N1;

	    // Look through any bitcasts. We check that these don't change the number
	    // (and size) of elements and just changes their types.
	    while (V.getOpcode() == ISD::BITCAST)
	      V = V->getOperand(0);

	    // A splat should always show up as a build vector node.
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
	      BitVector UndefElements;
	      SDValue Splat = BV->getSplatValue(&UndefElements);
	      // If this is a splat of an undef, shuffling it is also undef.
	      if (Splat && Splat.isUndef())
	        return getUNDEF(VT);

	      bool SameNumElts =
	          V.getValueType().getVectorNumElements() == VT.getVectorNumElements();

	      // We only have a splat which can skip shuffles if there is a splatted
	      // value and no undef lanes rearranged by the shuffle.
	      if (Splat && UndefElements.none()) {
	        // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
	        // number of elements match or the value splatted is a zero constant.
	        if (SameNumElts)
	          return N1;
	        if (auto *C = dyn_cast<ConstantSDNode>(Splat))
	          if (C->isNullValue())
	            return N1;
	      }

	      // If the shuffle itself creates a splat, build the vector directly.
	      if (AllSame && SameNumElts) {
	        EVT BuildVT = BV->getValueType(0);
	        const SDValue &Splatted = BV->getOperand(MaskVec[0]);
	        SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);

	        // We may have jumped through bitcasts, so the type of the
	        // BUILD_VECTOR may not match the type of the shuffle.
	        if (BuildVT != VT)
	          NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
	        return NewBV;
	      }
	    }
	  }

	  // No fold applied: look up or create the shuffle node. The CSE key covers
	  // the opcode, result type, both operands and every mask entry.
	  FoldingSetNodeID ID;
	  SDValue Ops[2] = { N1, N2 };
	  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
	  for (int i = 0; i != NElts; ++i)
	    ID.AddInteger(MaskVec[i]);

	  void* IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
	    return SDValue(E, 0);

	  // Allocate the mask array for the node out of the BumpPtrAllocator, since
	  // SDNode doesn't have access to it.  This memory will be "leaked" when
	  // the node is deallocated, but recovered when the NodeAllocator is released.
	  int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
	  llvm::copy(MaskVec, MaskAlloc);

	  auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
	                                           dl.getDebugLoc(), MaskAlloc);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V = SDValue(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Build the shuffle obtained from \p SV by exchanging its two vector operands
	/// and commuting its mask with ShuffleVectorSDNode::commuteMask.
	SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
	  // Commute a private copy of the mask; the node's own mask is not modified.
	  ArrayRef<int> OrigMask = SV.getMask();
	  SmallVector<int, 8> Commuted(OrigMask.begin(), OrigMask.end());
	  ShuffleVectorSDNode::commuteMask(Commuted);

	  const SDValue FirstOp = SV.getOperand(0);
	  const SDValue SecondOp = SV.getOperand(1);
	  // Note the swapped operand order in the new shuffle.
	  return getVectorShuffle(SV.getValueType(0), SDLoc(&SV), SecondOp, FirstOp,
	                          Commuted);
	}

	/// Return the ISD::Register node for register \p RegNo with value type \p VT.
	/// An already existing node with the same profile is reused; otherwise a new
	/// node is created and entered into the CSE map.
	SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
	  // Profile: opcode, value type list, then the register number.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::Register, getVTList(VT), None);
	  Profile.AddInteger(RegNo);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *RegNode = newSDNode<RegisterSDNode>(RegNo, VT);
	  // The divergence bit is obtained from the target lowering hook.
	  RegNode->SDNodeBits.IsDivergent =
	      TLI->isSDNodeSourceOfDivergence(RegNode, FLI, DA);
	  CSEMap.InsertNode(RegNode, InsertPos);
	  InsertNode(RegNode);
	  return SDValue(RegNode, 0);
	}

	/// Return the ISD::RegisterMask node (type MVT::Untyped) wrapping \p RegMask.
	/// Nodes are uniqued on the pointer value that is passed in, not on the mask
	/// contents.
	SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::RegisterMask, getVTList(MVT::Untyped), None);
	  Profile.AddPointer(RegMask);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing != nullptr)
	    return SDValue(Existing, 0);

	  auto *MaskNode = newSDNode<RegisterMaskSDNode>(RegMask);
	  CSEMap.InsertNode(MaskNode, InsertPos);
	  InsertNode(MaskNode);
	  return SDValue(MaskNode, 0);
	}

	/// Create an ISD::EH_LABEL node for \p Label with \p Root as its operand.
	/// Thin convenience wrapper around getLabelNode.
	SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
	                                 MCSymbol *Label) {
	  const unsigned LabelOpcode = ISD::EH_LABEL;
	  return getLabelNode(LabelOpcode, dl, Root, Label);
	}

	/// Return a label node with opcode \p Opcode whose single operand is \p Root
	/// and which refers to \p Label. The node has type MVT::Other. A node with the
	/// same profile (opcode, operand, label pointer) is reused if one exists.
	SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
	                                   SDValue Root, MCSymbol *Label) {
	  SDValue Operands[] = {Root};

	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opcode, getVTList(MVT::Other), Operands);
	  Profile.AddPointer(Label);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *LabelNode = newSDNode<LabelSDNode>(Opcode, dl.getIROrder(),
	                                           dl.getDebugLoc(), Label);
	  createOperands(LabelNode, Operands);

	  CSEMap.InsertNode(LabelNode, InsertPos);
	  InsertNode(LabelNode);
	  return SDValue(LabelNode, 0);
	}

	/// Return a block-address node for \p BA of type \p VT.
	///
	/// \param Offset      Folded into the node and into its CSE profile.
	/// \param isTarget    Selects ISD::TargetBlockAddress instead of
	///                    ISD::BlockAddress.
	/// \param TargetFlags Stored on the node and part of its CSE profile.
	SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
	                                      int64_t Offset,
	                                      bool isTarget,
	                                      unsigned char TargetFlags) {
	  unsigned NodeOpcode = ISD::BlockAddress;
	  if (isTarget)
	    NodeOpcode = ISD::TargetBlockAddress;

	  // Profile: opcode/type, then block address pointer, offset and flags.
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, NodeOpcode, getVTList(VT), None);
	  Profile.AddPointer(BA);
	  Profile.AddInteger(Offset);
	  Profile.AddInteger(TargetFlags);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *BANode =
	      newSDNode<BlockAddressSDNode>(NodeOpcode, VT, BA, Offset, TargetFlags);
	  CSEMap.InsertNode(BANode, InsertPos);
	  InsertNode(BANode);
	  return SDValue(BANode, 0);
	}

	/// Return the ISD::SRCVALUE node (type MVT::Other) wrapping the IR value
	/// \p V, creating it if no node with the same pointer exists yet.
	///
	/// \p V may be null; a non-null value must have pointer type (asserted).
	SDValue SelectionDAG::getSrcValue(const Value *V) {
	  assert((!V || V->getType()->isPointerTy()) &&
	         "SrcValue is not a pointer?");

	  // Nodes are uniqued on the Value pointer itself.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
	  ID.AddPointer(V);

	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, IP))
	    return SDValue(E, 0);

	  auto *N = newSDNode<SrcValueSDNode>(V);
	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Return the ISD::MDNODE_SDNODE node (type MVT::Other) wrapping \p MD.
	/// Nodes are uniqued on the metadata pointer.
	SDValue SelectionDAG::getMDNode(const MDNode *MD) {
	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
	  Profile.AddPointer(MD);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *MDWrapper = newSDNode<MDNodeSDNode>(MD);
	  CSEMap.InsertNode(MDWrapper, InsertPos);
	  InsertNode(MDWrapper);
	  return SDValue(MDWrapper, 0);
	}

	/// Return \p V reinterpreted as type \p VT. When \p V already has that type
	/// it is returned unchanged; otherwise an ISD::BITCAST node is requested at
	/// the location of \p V.
	SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
	  const bool NeedsCast = VT != V.getValueType();
	  if (!NeedsCast)
	    return V;
	  return getNode(ISD::BITCAST, SDLoc(V), VT, V);
	}

	/// Return an ISD::ADDRSPACECAST node of type \p VT converting \p Ptr from
	/// address space \p SrcAS to \p DestAS. Both address spaces are part of the
	/// CSE profile, so casts between different spaces are never merged.
	SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
	                                       unsigned SrcAS, unsigned DestAS) {
	  SDValue Operands[] = {Ptr};

	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, ISD::ADDRSPACECAST, getVTList(VT), Operands);
	  Profile.AddInteger(SrcAS);
	  Profile.AddInteger(DestAS);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(Profile, dl, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *CastNode = newSDNode<AddrSpaceCastSDNode>(
	      dl.getIROrder(), dl.getDebugLoc(), VT, SrcAS, DestAS);
	  createOperands(CastNode, Operands);

	  CSEMap.InsertNode(CastNode, InsertPos);
	  InsertNode(CastNode);
	  return SDValue(CastNode, 0);
	}

	/// getShiftAmountOperand - Return the specified value cast to the target's
	/// desired shift amount type.
	///
	/// \param LHSTy Type of the value being shifted; used to query the target for
	///              the shift amount type.
	/// \param Op    The shift amount.
	/// \returns \p Op unchanged when it already has the shift amount type or is
	/// a vector; otherwise \p Op zero-extended or truncated to that type.
	SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
	  EVT OpTy = Op.getValueType();
	  EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
	  // Vector shift amounts are passed through untouched.
	  if (OpTy == ShTy || OpTy.isVector())
	    return Op;

	  return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
	}

	/// Expand the VAARG node \p Node into explicit memory operations: load the
	/// current va_list pointer, round it up when the alignment requested in
	/// operand 3 exceeds the target's minimum stack argument alignment, store
	/// back the pointer advanced by the alloc size of the node's value type, and
	/// finally load the argument itself. Returns that final load.
	SDValue SelectionDAG::expandVAArg(SDNode *Node) {
	  SDLoc dl(Node);
	  const TargetLowering &TLI = getTargetLoweringInfo();
	  const Value *SrcVal = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
	  const EVT ArgVT = Node->getValueType(0);
	  const SDValue InChain = Node->getOperand(0);
	  const SDValue VAListPtr = Node->getOperand(1);
	  const unsigned Align = Node->getConstantOperandVal(3);

	  // Read the current va_list value.
	  SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, InChain,
	                               VAListPtr, MachinePointerInfo(SrcVal));
	  SDValue VAList = VAListLoad;
	  const EVT PtrVT = VAList.getValueType();

	  if (Align > TLI.getMinStackArgumentAlignment()) {
	    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");

	    // Round up to the requested alignment: (VAList + Align - 1) & -Align.
	    SDValue AlignMinusOne = getConstant(Align - 1, dl, PtrVT);
	    VAList = getNode(ISD::ADD, dl, PtrVT, VAList, AlignMinusOne);

	    SDValue AlignMask = getConstant(-(int64_t)Align, dl, PtrVT);
	    VAList = getNode(ISD::AND, dl, PtrVT, VAList, AlignMask);
	  }

	  // Advance the pointer past one argument of type ArgVT.
	  SDValue ArgSize = getConstant(
	      getDataLayout().getTypeAllocSize(ArgVT.getTypeForEVT(*getContext())),
	      dl, PtrVT);
	  SDValue NextVAList = getNode(ISD::ADD, dl, PtrVT, VAList, ArgSize);
	  // Write the advanced pointer back, sequenced after the initial load.
	  SDValue StoreChain = getStore(VAListLoad.getValue(1), dl, NextVAList,
	                                VAListPtr, MachinePointerInfo(SrcVal));
	  // Read the argument from the (possibly realigned) pre-increment address.
	  return getLoad(ArgVT, dl, StoreChain, VAList, MachinePointerInfo());
	}

	/// Expand the VACOPY node \p Node with the default strategy: load one
	/// pointer-sized value from the source (operand 2) and store it to the
	/// destination (operand 1). The returned store is the resulting chain.
	SDValue SelectionDAG::expandVACopy(SDNode *Node) {
	  SDLoc dl(Node);
	  const TargetLowering &TLI = getTargetLoweringInfo();

	  // Operands 3 and 4 supply the IR values used for the pointer info of the
	  // destination store and the source load respectively.
	  const Value *DstVal = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
	  const Value *SrcVal = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();

	  const auto PtrVT = TLI.getPointerTy(getDataLayout());
	  SDValue Loaded = getLoad(PtrVT, dl, Node->getOperand(0), Node->getOperand(2),
	                           MachinePointerInfo(SrcVal));
	  // Chain the store after the load via the load's second result.
	  return getStore(Loaded.getValue(1), dl, Loaded, Node->getOperand(1),
	                  MachinePointerInfo(DstVal));
	}

	/// Create a stack object able to hold a value of type \p VT and return a
	/// frame index node referring to it.
	///
	/// \param VT       Type whose store size determines the object size.
	/// \param minAlign Lower bound for the alignment; the object is aligned to
	///                 the larger of this and the preferred alignment of \p VT.
	SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
	  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	  unsigned ByteSize = VT.getStoreSize();
	  // getContext() yields a pointer while getTypeForEVT() takes a reference
	  // and returns a Type pointer (same usage as in expandVAArg).
	  Type *Ty = VT.getTypeForEVT(*getContext());
	  unsigned StackAlign =
	      std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);

	  int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign,
	                                       /*isSpillSlot=*/false);
	  return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
	}

	/// Create a stack object able to hold a value of either \p VT1 or \p VT2 and
	/// return a frame index node referring to it. Size is the larger of the two
	/// store sizes; alignment is the larger of the two preferred alignments.
	SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
	  unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
	  // getContext() yields a pointer while getTypeForEVT() takes a reference
	  // and returns a Type pointer (same usage as in expandVAArg).
	  Type *Ty1 = VT1.getTypeForEVT(*getContext());
	  Type *Ty2 = VT2.getTypeForEVT(*getContext());
	  const DataLayout &DL = getDataLayout();
	  unsigned Align =
	      std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));

	  MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	  int FrameIdx = MFI.CreateStackObject(Bytes, Align, /*isSpillSlot=*/false);
	  return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
	}

	/// Try to fold a SETCC of \p N1 and \p N2 under condition \p Cond.
	///
	/// \param VT   Result type of the comparison.
	/// \param N1   Left-hand operand; its value type is used as the operand type.
	/// \param N2   Right-hand operand.
	/// \param Cond Condition code to evaluate.
	/// \param dl   Location used for any node that gets created.
	/// \returns a boolean constant or UNDEF when the result is known, a new setcc
	/// with the FP constant moved to the RHS when only \p N1 is an FP constant,
	/// or a null SDValue when nothing could be folded.
	SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
	                                ISD::CondCode Cond, const SDLoc &dl) {
	  EVT OpVT = N1.getValueType();

	  // These setcc operations always fold.
	  switch (Cond) {
	  default: break;
	  case ISD::SETFALSE:
	  case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT);
	  case ISD::SETTRUE:
	  case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT);

	  case ISD::SETOEQ:
	  case ISD::SETOGT:
	  case ISD::SETOGE:
	  case ISD::SETOLT:
	  case ISD::SETOLE:
	  case ISD::SETONE:
	  case ISD::SETO:
	  case ISD::SETUO:
	  case ISD::SETUEQ:
	  case ISD::SETUNE:
	    // Explicitly ordered/unordered predicates are rejected for integers.
	    assert(!OpVT.isInteger() && "Illegal setcc for integer!");
	    break;
	  }

	  if (OpVT.isInteger()) {
	    // For EQ and NE, we can always pick a value for the undef to make the
	    // predicate pass or fail, so we can return undef.
	    // Matches behavior in llvm::ConstantFoldCompareInstruction.
	    // icmp eq/ne X, undef -> undef.
	    if ((N1.isUndef() || N2.isUndef()) &&
	        (Cond == ISD::SETEQ || Cond == ISD::SETNE))
	      return getUNDEF(VT);

	    // If both operands are undef, we can return undef for int comparison.
	    // icmp undef, undef -> undef.
	    if (N1.isUndef() && N2.isUndef())
	      return getUNDEF(VT);

	    // icmp X, X -> true/false
	    // icmp X, undef -> true/false because undef could be X.
	    if (N1 == N2)
	      return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT);
	  }

	  // Both operands are integer constants: evaluate the predicate directly.
	  if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
	    const APInt &C2 = N2C->getAPIntValue();
	    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
	      const APInt &C1 = N1C->getAPIntValue();

	      switch (Cond) {
	      default: llvm_unreachable("Unknown integer setcc!");
	      case ISD::SETEQ:  return getBoolConstant(C1 == C2, dl, VT, OpVT);
	      case ISD::SETNE:  return getBoolConstant(C1 != C2, dl, VT, OpVT);
	      case ISD::SETULT: return getBoolConstant(C1.ult(C2), dl, VT, OpVT);
	      case ISD::SETUGT: return getBoolConstant(C1.ugt(C2), dl, VT, OpVT);
	      case ISD::SETULE: return getBoolConstant(C1.ule(C2), dl, VT, OpVT);
	      case ISD::SETUGE: return getBoolConstant(C1.uge(C2), dl, VT, OpVT);
	      case ISD::SETLT:  return getBoolConstant(C1.slt(C2), dl, VT, OpVT);
	      case ISD::SETGT:  return getBoolConstant(C1.sgt(C2), dl, VT, OpVT);
	      case ISD::SETLE:  return getBoolConstant(C1.sle(C2), dl, VT, OpVT);
	      case ISD::SETGE:  return getBoolConstant(C1.sge(C2), dl, VT, OpVT);
	      }
	    }
	  }

	  auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	  auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2);

	  if (N1CFP && N2CFP) {
	    // Both operands are FP constants. The predicates without an explicit
	    // ordered/unordered flavor yield UNDEF on an unordered result and
	    // otherwise share the answer of their ordered counterpart.
	    APFloat::cmpResult R = N1CFP->getValueAPF().compare(N2CFP->getValueAPF());
	    switch (Cond) {
	    default: break;
	    case ISD::SETEQ:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOEQ:
	      return getBoolConstant(R == APFloat::cmpEqual, dl, VT, OpVT);
	    case ISD::SETNE:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETONE:
	      return getBoolConstant(R == APFloat::cmpGreaterThan ||
	                                 R == APFloat::cmpLessThan,
	                             dl, VT, OpVT);
	    case ISD::SETLT:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOLT:
	      return getBoolConstant(R == APFloat::cmpLessThan, dl, VT, OpVT);
	    case ISD::SETGT:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOGT:
	      return getBoolConstant(R == APFloat::cmpGreaterThan, dl, VT, OpVT);
	    case ISD::SETLE:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOLE:
	      return getBoolConstant(R == APFloat::cmpLessThan ||
	                                 R == APFloat::cmpEqual,
	                             dl, VT, OpVT);
	    case ISD::SETGE:
	      if (R == APFloat::cmpUnordered)
	        return getUNDEF(VT);
	      LLVM_FALLTHROUGH;
	    case ISD::SETOGE:
	      return getBoolConstant(R == APFloat::cmpGreaterThan ||
	                                 R == APFloat::cmpEqual,
	                             dl, VT, OpVT);
	    case ISD::SETO:
	      return getBoolConstant(R != APFloat::cmpUnordered, dl, VT, OpVT);
	    case ISD::SETUO:
	      return getBoolConstant(R == APFloat::cmpUnordered, dl, VT, OpVT);
	    case ISD::SETUEQ:
	      return getBoolConstant(R == APFloat::cmpUnordered ||
	                                 R == APFloat::cmpEqual,
	                             dl, VT, OpVT);
	    case ISD::SETUNE:
	      return getBoolConstant(R != APFloat::cmpEqual, dl, VT, OpVT);
	    case ISD::SETULT:
	      return getBoolConstant(R == APFloat::cmpUnordered ||
	                                 R == APFloat::cmpLessThan,
	                             dl, VT, OpVT);
	    case ISD::SETUGT:
	      return getBoolConstant(R == APFloat::cmpGreaterThan ||
	                                 R == APFloat::cmpUnordered,
	                             dl, VT, OpVT);
	    case ISD::SETULE:
	      return getBoolConstant(R != APFloat::cmpGreaterThan, dl, VT, OpVT);
	    case ISD::SETUGE:
	      return getBoolConstant(R != APFloat::cmpLessThan, dl, VT, OpVT);
	    }
	  } else if (N1CFP && OpVT.isSimple() && !N2.isUndef()) {
	    // Ensure that the constant occurs on the RHS.
	    ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
	    // Give up rather than emit a condition code the target does not accept.
	    if (!TLI->isCondCodeLegal(SwappedCond, OpVT.getSimpleVT()))
	      return SDValue();
	    return getSetCC(dl, VT, N2, N1, SwappedCond);
	  } else if ((N2CFP && N2CFP->getValueAPF().isNaN()) ||
	             (OpVT.isFloatingPoint() && (N1.isUndef() || N2.isUndef()))) {
	    // If an operand is known to be a nan (or undef that could be a nan), we
	    // can fold it.
	    // Choosing NaN for the undef will always make unordered comparison succeed
	    // and ordered comparison fails.
	    // Matches behavior in llvm::ConstantFoldCompareInstruction.
	    switch (ISD::getUnorderedFlavor(Cond)) {
	    default:
	      llvm_unreachable("Unknown flavor!");
	    case 0: // Known false.
	      return getBoolConstant(false, dl, VT, OpVT);
	    case 1: // Known true.
	      return getBoolConstant(true, dl, VT, OpVT);
	    case 2: // Undefined.
	      return getUNDEF(VT);
	    }
	  }

	  // Could not fold it.
	  return SDValue();
	}

	/// Try to simplify \p V given that only the bits in \p DemandedBits are used.
	/// Forwards to the element-aware overload with every vector element demanded
	/// (or a single one-bit "element" mask for scalars).
	/// TODO: really we should be making this into the DAG equivalent of
	/// SimplifyMultipleUseDemandedBits and not generate any new nodes.
	SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits) {
	  const EVT ValueVT = V.getValueType();
	  if (!ValueVT.isVector())
	    return GetDemandedBits(V, DemandedBits, APInt(1, 1));

	  const APInt AllElts = APInt::getAllOnesValue(ValueVT.getVectorNumElements());
	  return GetDemandedBits(V, DemandedBits, AllElts);
	}

	/// See if the specified operand can be simplified with the knowledge that only
	/// the bits specified by DemandedBits are used in the elements specified by
	/// DemandedElts.
	///
	/// \returns the simplified value, or a null SDValue when no simplification
	/// was found. Note that the nested queries below use the overloads without
	/// an element mask, i.e. they look at all elements of the operand.
	/// TODO: really we should be making this into the DAG equivalent of
	/// SimplifyMultipleUseDemandedBits and not generate any new nodes.
	SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
	                                      const APInt &DemandedElts) {
	  switch (V.getOpcode()) {
	  default:
	    break;
	  case ISD::Constant: {
	    // Clear the constant bits that nobody looks at.
	    auto *CV = cast<ConstantSDNode>(V.getNode());
	    assert(CV && "Const value should be ConstSDNode.");
	    const APInt &CVal = CV->getAPIntValue();
	    APInt NewVal = CVal & DemandedBits;
	    if (NewVal != CVal)
	      return getConstant(NewVal, SDLoc(V), V.getValueType());
	    break;
	  }
	  case ISD::OR:
	  case ISD::XOR:
	    // If the LHS or RHS don't contribute bits to the or, drop them.
	    if (MaskedValueIsZero(V.getOperand(0), DemandedBits))
	      return V.getOperand(1);
	    if (MaskedValueIsZero(V.getOperand(1), DemandedBits))
	      return V.getOperand(0);
	    break;
	  case ISD::SRL:
	    // Only look at single-use SRLs.
	    if (!V.getNode()->hasOneUse())
	      break;
	    if (auto *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
	      // See if we can recursively simplify the LHS.
	      unsigned Amt = RHSC->getZExtValue();

	      // Watch out for shift count overflow though.
	      if (Amt >= DemandedBits.getBitWidth())
	        break;
	      // Bits demanded of the result map to bits Amt positions higher in the
	      // shifted source.
	      APInt SrcDemandedBits = DemandedBits << Amt;
	      if (SDValue SimplifyLHS =
	              GetDemandedBits(V.getOperand(0), SrcDemandedBits))
	        return getNode(ISD::SRL, SDLoc(V), V.getValueType(), SimplifyLHS,
	                       V.getOperand(1));
	    }
	    break;
	  case ISD::AND: {
	    // X & -1 -> X (ignoring bits which aren't demanded).
	    // Also handle the case where masked out bits in X are known to be zero.
	    if (ConstantSDNode *RHSC = isConstOrConstSplat(V.getOperand(1))) {
	      const APInt &AndVal = RHSC->getAPIntValue();
	      if (DemandedBits.isSubsetOf(AndVal) ||
	          DemandedBits.isSubsetOf(computeKnownBits(V.getOperand(0)).Zero |
	                                  AndVal))
	        return V.getOperand(0);
	    }
	    break;
	  }
	  case ISD::ANY_EXTEND: {
	    SDValue Src = V.getOperand(0);
	    unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
	    // Being conservative here - only peek through if we only demand bits in
	    // the non-extended source (even though the extended bits are technically
	    // undef).
	    if (DemandedBits.getActiveBits() > SrcBitWidth)
	      break;
	    APInt SrcDemandedBits = DemandedBits.trunc(SrcBitWidth);
	    if (SDValue DemandedSrc = GetDemandedBits(Src, SrcDemandedBits))
	      return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
	    break;
	  }
	  case ISD::SIGN_EXTEND_INREG: {
	    // Scoped so that a case label added after this one cannot jump past the
	    // initialization of these locals.
	    EVT ExVT = cast<VTSDNode>(V.getOperand(1))->getVT();
	    unsigned ExVTBits = ExVT.getScalarSizeInBits();

	    // If none of the extended bits are demanded, eliminate the sextinreg.
	    if (DemandedBits.getActiveBits() <= ExVTBits)
	      return V.getOperand(0);

	    break;
	  }
	  }
	  return SDValue();
	}

	/// SignBitIsZero - Return true when the sign bit of \p Op is known to be
	/// zero. Implemented as a MaskedValueIsZero query with a mask that contains
	/// only the sign bit of the scalar width of \p Op.
	bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
	  const APInt SignMask = APInt::getSignMask(Op.getScalarValueSizeInBits());
	  return MaskedValueIsZero(Op, SignMask, Depth);
	}

	/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. Mask is
	/// known to be zero for bits that V cannot have. This overload demands every
	/// element of a vector \p V (scalars use a single one-bit element mask) and
	/// forwards to the element-aware overload.
	bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
	                                     unsigned Depth) const {
	  const EVT ValueVT = V.getValueType();
	  if (ValueVT.isVector()) {
	    const APInt AllElts =
	        APInt::getAllOnesValue(ValueVT.getVectorNumElements());
	    return MaskedValueIsZero(V, Mask, AllElts, Depth);
	  }
	  return MaskedValueIsZero(V, Mask, APInt(1, 1), Depth);
	}

	/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero in the
	/// elements selected by \p DemandedElts, i.e. every bit of \p Mask is among
	/// the known-zero bits computed for those elements. Mask is known to be zero
	/// for bits that V cannot have.
	bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
	                                     const APInt &DemandedElts,
	                                     unsigned Depth) const {
	  const KnownBits Known = computeKnownBits(V, DemandedElts, Depth);
	  return Mask.isSubsetOf(Known.Zero);
	}

	/// MaskedValueIsAllOnes - Return true if '(V & Mask) == Mask' is known to
	/// hold, i.e. every bit of \p Mask is among the known-one bits of \p V.
	bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
	                                        unsigned Depth) const {
	  const KnownBits Known = computeKnownBits(V, Depth);
	  return Mask.isSubsetOf(Known.One);
	}

	/// isSplatValue - Return true if the vector V has the same value
	/// across all DemandedElts.
	///
	/// \param V            Vector value to inspect (asserted to be a vector).
	/// \param DemandedElts One bit per element of \p V; an all-zero mask yields
	///                     false without touching \p UndefElts.
	/// \param UndefElts    Output. Reset to zero and then populated by the
	///                     opcode-specific cases with the elements found to be
	///                     undef; only meaningful when true is returned.
	bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
	                                APInt &UndefElts) {
	  if (!DemandedElts)
	    return false; // No demanded elts, better to assume we don't know anything.

	  EVT VT = V.getValueType();
	  assert(VT.isVector() && "Vector type expected");

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
	  UndefElts = APInt::getNullValue(NumElts);

	  switch (V.getOpcode()) {
	  case ISD::BUILD_VECTOR: {
	    // All demanded, non-undef operands must be the same value.
	    SDValue Scl;
	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue Op = V.getOperand(i);
	      if (Op.isUndef()) {
	        UndefElts.setBit(i);
	        continue;
	      }
	      if (!DemandedElts[i])
	        continue;
	      if (Scl && Scl != Op)
	        return false;
	      Scl = Op;
	    }
	    return true;
	  }
	  case ISD::VECTOR_SHUFFLE: {
	    // Check if this is a shuffle node doing a splat.
	    // TODO: Do we need to handle shuffle(splat, undef, mask)?
	    int SplatIndex = -1;
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask();
	    for (int i = 0; i != (int)NumElts; ++i) {
	      int M = Mask[i];
	      // Negative mask entries denote undef result elements.
	      if (M < 0) {
	        UndefElts.setBit(i);
	        continue;
	      }
	      if (!DemandedElts[i])
	        continue;
	      if (0 <= SplatIndex && SplatIndex != M)
	        return false;
	      SplatIndex = M;
	    }
	    return true;
	  }
	  case ISD::EXTRACT_SUBVECTOR: {
	    SDValue Src = V.getOperand(0);
	    ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
	    unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	    // Only handle constant indices that keep the subvector inside the source.
	    if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	      // Offset the demanded elts by the subvector index.
	      uint64_t Idx = SubIdx->getZExtValue();
	      APInt UndefSrcElts;
	      APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
	      if (isSplatValue(Src, DemandedSrc, UndefSrcElts)) {
	        UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
	        return true;
	      }
	    }
	    break;
	  }
	  case ISD::ADD:
	  case ISD::SUB:
	  case ISD::AND: {
	    // A binop of two splats is a splat; an element is reported undef when it
	    // is undef in either operand.
	    APInt UndefLHS, UndefRHS;
	    SDValue LHS = V.getOperand(0);
	    SDValue RHS = V.getOperand(1);
	    if (isSplatValue(LHS, DemandedElts, UndefLHS) &&
	        isSplatValue(RHS, DemandedElts, UndefRHS)) {
	      UndefElts = UndefLHS | UndefRHS;
	      return true;
	    }
	    break;
	  }
	  }

	  return false;
	}

	/// Helper wrapper to main isSplatValue function: demands every element of the
	/// vector \p V.
	///
	/// \param AllowUndefs When false, a splat for which the main function
	///                    reported any undef element is rejected.
	bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
	  EVT VT = V.getValueType();
	  assert(VT.isVector() && "Vector type expected");
	  unsigned NumElts = VT.getVectorNumElements();

	  APInt UndefElts;
	  APInt DemandedElts = APInt::getAllOnesValue(NumElts);
	  return isSplatValue(V, DemandedElts, UndefElts) &&
	         (AllowUndefs || !UndefElts);
	}

	/// If \p V (after peeking through extract-subvector nodes) is a splat, return
	/// the vector the splatted element lives in and store that element's index in
	/// \p SplatIdx. Returns a null SDValue, leaving \p SplatIdx untouched, when
	/// \p V is not recognised as a splat.
	SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) {
	  V = peekThroughExtractSubvectors(V);
	  const EVT VecVT = V.getValueType();

	  if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    // Check if this is a shuffle node doing a splat.
	    // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
	    // getTargetVShiftNode currently struggles without the splat source.
	    auto *Shuffle = cast<ShuffleVectorSDNode>(V);
	    if (!Shuffle->isSplat())
	      return SDValue();
	    // The splat index addresses the concatenation of both shuffle operands;
	    // split it into operand number and element within that operand.
	    const int MaskIdx = Shuffle->getSplatIndex();
	    const int EltsPerOperand = V.getValueType().getVectorNumElements();
	    SplatIdx = MaskIdx % EltsPerOperand;
	    return V.getOperand(MaskIdx / EltsPerOperand);
	  }

	  // Every other opcode: fall back to the generic splat analysis with all
	  // elements demanded.
	  APInt UndefElts;
	  APInt DemandedElts = APInt::getAllOnesValue(VecVT.getVectorNumElements());
	  if (!isSplatValue(V, DemandedElts, UndefElts))
	    return SDValue();

	  // Handle case where all demanded elements are UNDEF.
	  if (DemandedElts.isSubsetOf(UndefElts)) {
	    SplatIdx = 0;
	    return getUNDEF(VecVT);
	  }

	  // Pick the first element that is not undef.
	  SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
	  return V;
	}

	/// If \p V is a splat according to getSplatSourceVector, return an
	/// EXTRACT_VECTOR_ELT of the splatted element from its source vector;
	/// otherwise return a null SDValue.
	SDValue SelectionDAG::getSplatValue(SDValue V) {
	  int SplatIdx;
	  SDValue SrcVector = getSplatSourceVector(V, SplatIdx);
	  if (!SrcVector)
	    return SDValue();

	  // SplatIdx has been set by getSplatSourceVector at this point.
	  SDValue EltIdx = getIntPtrConstant(SplatIdx, SDLoc(V));
	  const EVT EltVT = SrcVector.getValueType().getScalarType();
	  return getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), EltVT, SrcVector, EltIdx);
	}

	/// If a SHL/SRA/SRL node has a constant or splat constant shift amount that
	/// is less than the element bit-width of the shift node, return a pointer to
	/// it; otherwise return nullptr. The amount is read from operand 1 of \p V.
	static const APInt *getValidShiftAmountConstant(SDValue V) {
	  ConstantSDNode *ShiftC = isConstOrConstSplat(V.getOperand(1));
	  if (!ShiftC)
	    return nullptr;

	  // Shifting more than the bitwidth is not valid.
	  const APInt &Amount = ShiftC->getAPIntValue();
	  return Amount.ult(V.getScalarValueSizeInBits()) ? &Amount : nullptr;
	}

	/// Determine which bits of Op are known to be either zero or one and return
	/// them. For vectors, the known bits are those that are shared by every
	/// vector element: this overload demands all elements (scalars use a single
	/// one-bit element mask) and forwards to the element-aware overload.
	KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
	  const EVT OpVT = Op.getValueType();
	  if (!OpVT.isVector())
	    return computeKnownBits(Op, APInt(1, 1), Depth);

	  const APInt AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());
	  return computeKnownBits(Op, AllElts, Depth);
	}

	/// Determine which bits of Op are known to be either zero or one and return
	/// them in Known. The DemandedElts argument allows us to only collect the known
	/// bits that are shared by the requested vector elements.
	KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
	unsigned Depth) const {
	unsigned BitWidth = Op.getScalarValueSizeInBits();

	KnownBits Known(BitWidth); // Don't know anything.

	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	// We know all of the bits for a constant!
	Known.One = C->getAPIntValue();
	Known.Zero = ~Known.One;
	return Known;
	}
	if (auto *C = dyn_cast<ConstantFPSDNode>(Op)) {
	// We know all of the bits for a constant fp!
	Known.One = C->getValueAPF().bitcastToAPInt();
	Known.Zero = ~Known.One;
	return Known;
	}

	if (Depth == 6)
	return Known; // Limit search depth.

	KnownBits Known2;
	unsigned NumElts = DemandedElts.getBitWidth();
	assert((!Op.getValueType().isVector() \|\|
	NumElts == Op.getValueType().getVectorNumElements()) &&
	"Unexpected vector size");

	if (!DemandedElts)
	return Known; // No demanded elts, better to assume we don't know anything.

	unsigned Opcode = Op.getOpcode();
	switch (Opcode) {
	case ISD::BUILD_VECTOR:
	// Collect the known bits that are shared by every demanded vector element.
	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	if (!DemandedElts[i])
	continue;

	SDValue SrcOp = Op.getOperand(i);
	Known2 = computeKnownBits(SrcOp, Depth + 1);

	// BUILD_VECTOR can implicitly truncate sources, we must handle this.
	if (SrcOp.getValueSizeInBits() != BitWidth) {
	assert(SrcOp.getValueSizeInBits() > BitWidth &&
	"Expected BUILD_VECTOR implicit truncation");
	Known2 = Known2.trunc(BitWidth);
	}

	// Known bits are the values that are shared by every demanded element.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;

	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	break;
	case ISD::VECTOR_SHUFFLE: {
	// Collect the known bits that are shared by every vector element referenced
	// by the shuffle.
	APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
	Known.Zero.setAllBits(); Known.One.setAllBits();
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
	assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;

	int M = SVN->getMaskElt(i);
	if (M < 0) {
	// For UNDEF elements, we don't know anything about the common state of
	// the shuffle result.
	Known.resetAll();
	DemandedLHS.clearAllBits();
	DemandedRHS.clearAllBits();
	break;
	}

	if ((unsigned)M < NumElts)
	DemandedLHS.setBit((unsigned)M % NumElts);
	else
	DemandedRHS.setBit((unsigned)M % NumElts);
	}
	// Known bits are the values that are shared by every demanded element.
	if (!!DemandedLHS) {
	SDValue LHS = Op.getOperand(0);
	Known2 = computeKnownBits(LHS, DemandedLHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	if (!!DemandedRHS) {
	SDValue RHS = Op.getOperand(1);
	Known2 = computeKnownBits(RHS, DemandedRHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	break;
	}
	case ISD::CONCAT_VECTORS: {
	// Split DemandedElts and test each of the demanded subvectors.
	Known.Zero.setAllBits(); Known.One.setAllBits();
	EVT SubVectorVT = Op.getOperand(0).getValueType();
	unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
	unsigned NumSubVectors = Op.getNumOperands();
	for (unsigned i = 0; i != NumSubVectors; ++i) {
	APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
	DemandedSub = DemandedSub.trunc(NumSubVectorElts);
	if (!!DemandedSub) {
	SDValue Sub = Op.getOperand(i);
	Known2 = computeKnownBits(Sub, DemandedSub, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	break;
	}
	case ISD::INSERT_SUBVECTOR: {
	// If we know the element index, demand any elements from the subvector and
	// the remainder from the src its inserted into, otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	SDValue Sub = Op.getOperand(1);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
	Known.One.setAllBits();
	Known.Zero.setAllBits();
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
	if (!!DemandedSubElts) {
	Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
	if (Known.isUnknown())
	break; // early-out.
	}
	APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
	APInt DemandedSrcElts = DemandedElts & ~SubMask;
	if (!!DemandedSrcElts) {
	Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	} else {
	Known = computeKnownBits(Sub, Depth + 1);
	if (Known.isUnknown())
	break; // early-out.
	Known2 = computeKnownBits(Src, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	break;
	}
	case ISD::EXTRACT_SUBVECTOR: {
	// If we know the element index, just demand that subvector elements,
	// otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	// Offset the demanded elts by the subvector index.
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
	Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
	} else {
	Known = computeKnownBits(Src, Depth + 1);
	}
	break;
	}
	case ISD::SCALAR_TO_VECTOR: {
	// We know about scalar_to_vector as much as we know about it source,
	// which becomes the first element of otherwise unknown vector.
	if (DemandedElts != 1)
	break;

	SDValue N0 = Op.getOperand(0);
	Known = computeKnownBits(N0, Depth + 1);
	if (N0.getValueSizeInBits() != BitWidth)
	Known = Known.trunc(BitWidth);

	break;
	}
	case ISD::BITCAST: {
	SDValue N0 = Op.getOperand(0);
	EVT SubVT = N0.getValueType();
	unsigned SubBitWidth = SubVT.getScalarSizeInBits();

	// Ignore bitcasts from unsupported types.
	if (!(SubVT.isInteger() \|\| SubVT.isFloatingPoint()))
	break;

	// Fast handling of 'identity' bitcasts.
	if (BitWidth == SubBitWidth) {
	Known = computeKnownBits(N0, DemandedElts, Depth + 1);
	break;
	}

	bool IsLE = getDataLayout().isLittleEndian();

	// Bitcast 'small element' vector to 'large element' scalar/vector.
	if ((BitWidth % SubBitWidth) == 0) {
	assert(N0.getValueType().isVector() && "Expected bitcast from vector");

	// Collect known bits for the (larger) output by collecting the known
	// bits from each set of sub elements and shift these into place.
	// We need to separately call computeKnownBits for each set of
	// sub elements as the knownbits for each is likely to be different.
	unsigned SubScale = BitWidth / SubBitWidth;
	APInt SubDemandedElts(NumElts * SubScale, 0);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SubDemandedElts.setBit(i * SubScale);

	for (unsigned i = 0; i != SubScale; ++i) {
	Known2 = computeKnownBits(N0, SubDemandedElts.shl(i),
	Depth + 1);
	unsigned Shifts = IsLE ? i : SubScale - 1 - i;
	Known.One \|= Known2.One.zext(BitWidth).shl(SubBitWidth * Shifts);
	Known.Zero \|= Known2.Zero.zext(BitWidth).shl(SubBitWidth * Shifts);
	}
	}

	// Bitcast 'large element' scalar/vector to 'small element' vector.
	if ((SubBitWidth % BitWidth) == 0) {
	assert(Op.getValueType().isVector() && "Expected bitcast to vector");

	// Collect known bits for the (smaller) output by collecting the known
	// bits from the overlapping larger input elements and extracting the
	// sub sections we actually care about.
	unsigned SubScale = SubBitWidth / BitWidth;
	APInt SubDemandedElts(NumElts / SubScale, 0);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SubDemandedElts.setBit(i / SubScale);

	Known2 = computeKnownBits(N0, SubDemandedElts, Depth + 1);

	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	unsigned Shifts = IsLE ? i : NumElts - 1 - i;
	unsigned Offset = (Shifts % SubScale) * BitWidth;
	Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
	Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	}
	}
	break;
	}
	case ISD::AND:
	// If either the LHS or the RHS are Zero, the result is zero.
	Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// Output known-1 bits are only known if set in both the LHS & RHS.
	Known.One &= Known2.One;
	// Output known-0 are known to be clear if zero in either the LHS \| RHS.
	Known.Zero \|= Known2.Zero;
	break;
	case ISD::OR:
	Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// Output known-0 bits are only known if clear in both the LHS & RHS.
	Known.Zero &= Known2.Zero;
	// Output known-1 are known to be set if set in either the LHS \| RHS.
	Known.One \|= Known2.One;
	break;
	case ISD::XOR: {
	Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// Output known-0 bits are known if clear or set in both the LHS & RHS.
	APInt KnownZeroOut = (Known.Zero & Known2.Zero) \| (Known.One & Known2.One);
	// Output known-1 are known to be set if set in only one of the LHS, RHS.
	Known.One = (Known.Zero & Known2.One) \| (Known.One & Known2.Zero);
	Known.Zero = KnownZeroOut;
	break;
	}
	case ISD::MUL: {
	Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// If low bits are zero in either operand, output low known-0 bits.
	// Also compute a conservative estimate for high known-0 bits.
	// More trickiness is possible, but this is sufficient for the
	// interesting case of alignment computation.
	unsigned TrailZ = Known.countMinTrailingZeros() +
	Known2.countMinTrailingZeros();
	unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
	Known2.countMinLeadingZeros(),
	BitWidth) - BitWidth;

	Known.resetAll();
	Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
	Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
	break;
	}
	case ISD::UDIV: {
	// For the purposes of computing leading zeros we can conservatively
	// treat a udiv as a logical right shift by the power of 2 known to
	// be less than the denominator.
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	unsigned LeadZ = Known2.countMinLeadingZeros();

	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
	if (RHSMaxLeadingZeros != BitWidth)
	LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);

	Known.Zero.setHighBits(LeadZ);
	break;
	}
	case ISD::SELECT:
	case ISD::VSELECT:
	Known = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SELECT_CC:
	Known = computeKnownBits(Op.getOperand(3), DemandedElts, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SMULO:
	case ISD::UMULO:
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	if (Op.getResNo() != 1)
	break;
	// The boolean result conforms to getBooleanContents.
	// If we know the result of a setcc has the top bits zero, use this info.
	// We know that we have an integer-based boolean since these operations
	// are only available for integer.
	if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::SETCC:
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::SHL:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	Known.Zero <<= Shift;
	Known.One <<= Shift;
	// Low bits are known zero.
	Known.Zero.setLowBits(Shift);
	}
	break;
	case ISD::SRL:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	Known.Zero.lshrInPlace(Shift);
	Known.One.lshrInPlace(Shift);
	// High bits are known zero.
	Known.Zero.setHighBits(Shift);
	} else if (auto *BV = dyn_cast<BuildVectorSDNode>(Op.getOperand(1))) {
	// If the shift amount is a vector of constants see if we can bound
	// the number of upper zero bits.
	unsigned ShiftAmountMin = BitWidth;
	for (unsigned i = 0; i != BV->getNumOperands(); ++i) {
	if (auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
	const APInt &ShAmt = C->getAPIntValue();
	if (ShAmt.ult(BitWidth)) {
	ShiftAmountMin = std::min<unsigned>(ShiftAmountMin,
	ShAmt.getZExtValue());
	continue;
	}
	}
	// Don't know anything.
	ShiftAmountMin = 0;
	break;
	}

	Known.Zero.setHighBits(ShiftAmountMin);
	}
	break;
	case ISD::SRA:
	if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	unsigned Shift = ShAmt->getZExtValue();
	// Sign extend known zero/one bit (else is unknown).
	Known.Zero.ashrInPlace(Shift);
	Known.One.ashrInPlace(Shift);
	}
	break;
	case ISD::FSHL:
	case ISD::FSHR:
	if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) {
	unsigned Amt = C->getAPIntValue().urem(BitWidth);

	// For fshl, 0-shift returns the 1st arg.
	// For fshr, 0-shift returns the 2nd arg.
	if (Amt == 0) {
	Known = computeKnownBits(Op.getOperand(Opcode == ISD::FSHL ? 0 : 1),
	DemandedElts, Depth + 1);
	break;
	}

	// fshl: (X << (Z % BW)) \| (Y >> (BW - (Z % BW)))
	// fshr: (X << (BW - (Z % BW))) \| (Y >> (Z % BW))
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	if (Opcode == ISD::FSHL) {
	Known.One <<= Amt;
	Known.Zero <<= Amt;
	Known2.One.lshrInPlace(BitWidth - Amt);
	Known2.Zero.lshrInPlace(BitWidth - Amt);
	} else {
	Known.One <<= BitWidth - Amt;
	Known.Zero <<= BitWidth - Amt;
	Known2.One.lshrInPlace(Amt);
	Known2.Zero.lshrInPlace(Amt);
	}
	Known.One \|= Known2.One;
	Known.Zero \|= Known2.Zero;
	}
	break;
	case ISD::SIGN_EXTEND_INREG: {
	EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	unsigned EBits = EVT.getScalarSizeInBits();

	// Sign extension. Compute the demanded bits in the result that are not
	// present in the input.
	APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);

	APInt InSignMask = APInt::getSignMask(EBits);
	APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);

	// If the sign extended bits are demanded, we know that the sign
	// bit is demanded.
	InSignMask = InSignMask.zext(BitWidth);
	if (NewBits.getBoolValue())
	InputDemandedBits \|= InSignMask;

	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known.One &= InputDemandedBits;
	Known.Zero &= InputDemandedBits;

	// If the sign bit of the input is known set or clear, then we know the
	// top bits of the result.
	if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear
	Known.Zero \|= NewBits;
	Known.One &= ~NewBits;
	} else if (Known.One.intersects(InSignMask)) { // Input sign bit known set
	Known.One \|= NewBits;
	Known.Zero &= ~NewBits;
	} else { // Input sign bit unknown
	Known.Zero &= ~NewBits;
	Known.One &= ~NewBits;
	}
	break;
	}
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleTZ = Known2.countMaxTrailingZeros();
	unsigned LowBits = Log2_32(PossibleTZ) + 1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleLZ = Known2.countMaxLeadingZeros();
	unsigned LowBits = Log2_32(PossibleLZ) + 1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case ISD::CTPOP: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	// If we know some of the bits are zero, they can't be one.
	unsigned PossibleOnes = Known2.countMaxPopulation();
	Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
	break;
	}
	case ISD::LOAD: {
	LoadSDNode *LD = cast<LoadSDNode>(Op);
	const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
	if (ISD::isNON_EXTLoad(LD) && Cst) {
	// Determine any common known bits from the loaded constant pool value.
	Type *CstTy = Cst->getType();
	if ((NumElts * BitWidth) == CstTy->getPrimitiveSizeInBits()) {
	// If its a vector splat, then we can (quickly) reuse the scalar path.
	// NOTE: We assume all elements match and none are UNDEF.
	if (CstTy->isVectorTy()) {
	if (const Constant *Splat = Cst->getSplatValue()) {
	Cst = Splat;
	CstTy = Cst->getType();
	}
	}
	// TODO - do we need to handle different bitwidths?
	if (CstTy->isVectorTy() && BitWidth == CstTy->getScalarSizeInBits()) {
	// Iterate across all vector elements finding common known bits.
	Known.One.setAllBits();
	Known.Zero.setAllBits();
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;
	if (Constant *Elt = Cst->getAggregateElement(i)) {
	if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
	const APInt &Value = CInt->getValue();
	Known.One &= Value;
	Known.Zero &= ~Value;
	continue;
	}
	if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
	APInt Value = CFP->getValueAPF().bitcastToAPInt();
	Known.One &= Value;
	Known.Zero &= ~Value;
	continue;
	}
	}
	Known.One.clearAllBits();
	Known.Zero.clearAllBits();
	break;
	}
	} else if (BitWidth == CstTy->getPrimitiveSizeInBits()) {
	if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	const APInt &Value = CInt->getValue();
	Known.One = Value;
	Known.Zero = ~Value;
	} else if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	APInt Value = CFP->getValueAPF().bitcastToAPInt();
	Known.One = Value;
	Known.Zero = ~Value;
	}
	}
	}
	} else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
	// If this is a ZEXTLoad and we are looking at the loaded value.
	EVT VT = LD->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero.setBitsFrom(MemBits);
	} else if (const MDNode *Ranges = LD->getRanges()) {
	if (LD->getExtensionType() == ISD::NON_EXTLOAD)
	computeKnownBitsFromRangeMetadata(*Ranges, Known);
	}
	break;
	}
	case ISD::ZERO_EXTEND_VECTOR_INREG: {
	EVT InVT = Op.getOperand(0).getValueType();
	APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
	Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
	Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
	break;
	}
	case ISD::ZERO_EXTEND: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
	break;
	}
	case ISD::SIGN_EXTEND_VECTOR_INREG: {
	EVT InVT = Op.getOperand(0).getValueType();
	APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
	Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
	// If the sign bit is known to be zero or one, then sext will extend
	// it to the top bits, else it will just zext.
	Known = Known.sext(BitWidth);
	break;
	}
	case ISD::SIGN_EXTEND: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	// If the sign bit is known to be zero or one, then sext will extend
	// it to the top bits, else it will just zext.
	Known = Known.sext(BitWidth);
	break;
	}
	case ISD::ANY_EXTEND: {
	Known = computeKnownBits(Op.getOperand(0), Depth+1);
	Known = Known.zext(BitWidth, false /* ExtendedBitsAreKnownZero */);
	break;
	}
	case ISD::TRUNCATE: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known = Known.trunc(BitWidth);
	break;
	}
	case ISD::AssertZext: {
	EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
	Known = computeKnownBits(Op.getOperand(0), Depth+1);
	Known.Zero \|= (~InMask);
	Known.One &= (~Known.Zero);
	break;
	}
	case ISD::FGETSIGN:
	// All bits are zero except the low bit.
	Known.Zero.setBitsFrom(1);
	break;
	case ISD::USUBO:
	case ISD::SSUBO:
	if (Op.getResNo() == 1) {
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::SUBC: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known = KnownBits::computeForAddSub(/* Add / false, / NSW */ false,
	Known, Known2);
	break;
	}
	case ISD::UADDO:
	case ISD::SADDO:
	case ISD::ADDCARRY:
	if (Op.getResNo() == 1) {
	// If we know the result of a setcc has the top bits zero, use this info.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	}
	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::ADDC:
	case ISD::ADDE: {
	assert(Op.getResNo() == 0 && "We only compute knownbits for the sum here.");

	// With ADDE and ADDCARRY, a carry bit may be added in.
	KnownBits Carry(1);
	if (Opcode == ISD::ADDE)
	// Can't track carry from glue, set carry to unknown.
	Carry.resetAll();
	else if (Opcode == ISD::ADDCARRY)
	// TODO: Compute known bits for the carry operand. Not sure if it is worth
	// the trouble (how often will we find a known carry bit). And I haven't
	// tested this very much yet, but something like this might work:
	// Carry = computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
	// Carry = Carry.zextOrTrunc(1, false);
	Carry.resetAll();
	else
	Carry.setAllZero();

	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
	break;
	}
	case ISD::SREM:
	if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
	const APInt &RA = Rem->getAPIntValue().abs();
	if (RA.isPowerOf2()) {
	APInt LowBits = RA - 1;
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// The low bits of the first operand are unchanged by the srem.
	Known.Zero = Known2.Zero & LowBits;
	Known.One = Known2.One & LowBits;

	// If the first operand is non-negative or has all low bits zero, then
	// the upper bits are all zero.
	if (Known2.Zero[BitWidth-1] \|\| ((Known2.Zero & LowBits) == LowBits))
	Known.Zero \|= ~LowBits;

	// If the first operand is negative and not all low bits are zero, then
	// the upper bits are all one.
	if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
	Known.One \|= ~LowBits;
	assert((Known.Zero & Known.One) == 0&&"Bits known to be one AND zero?");
	}
	}
	break;
	case ISD::UREM: {
	if (ConstantSDNode *Rem = isConstOrConstSplat(Op.getOperand(1))) {
	const APInt &RA = Rem->getAPIntValue();
	if (RA.isPowerOf2()) {
	APInt LowBits = (RA - 1);
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// The upper bits are all zero, the lower ones are unchanged.
	Known.Zero = Known2.Zero \| ~LowBits;
	Known.One = Known2.One & LowBits;
	break;
	}
	}

	// Since the result is less than or equal to either operand, any leading
	// zero bits in either operand must also exist in the result.
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

	uint32_t Leaders =
	std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
	Known.resetAll();
	Known.Zero.setHighBits(Leaders);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	Known = computeKnownBits(Op.getOperand(0), Depth+1);
	const unsigned Index = Op.getConstantOperandVal(1);
	const unsigned EltBitWidth = Op.getValueSizeInBits();

	// Remove low part of known bits mask
	Known.Zero = Known.Zero.getHiBits(Known.getBitWidth() - Index * EltBitWidth);
	Known.One = Known.One.getHiBits(Known.getBitWidth() - Index * EltBitWidth);

	// Remove high part of known bit mask
	Known = Known.trunc(EltBitWidth);
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue EltNo = Op.getOperand(1);
	EVT VecVT = InVec.getValueType();
	const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
	const unsigned NumSrcElts = VecVT.getVectorNumElements();
	// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
	// anything about the extended bits.
	if (BitWidth > EltBitWidth)
	Known = Known.trunc(EltBitWidth);
	ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
	// If we know the element index, just demand that vector element.
	unsigned Idx = ConstEltNo->getZExtValue();
	APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
	Known = computeKnownBits(InVec, DemandedElt, Depth + 1);
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	Known = computeKnownBits(InVec, Depth + 1);
	}
	if (BitWidth > EltBitWidth)
	Known = Known.zext(BitWidth, false /* => any extend */);
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue InVal = Op.getOperand(1);
	SDValue EltNo = Op.getOperand(2);

	ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
	// If we know the element index, split the demand between the
	// source vector and the inserted element.
	Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
	unsigned EltIdx = CEltNo->getZExtValue();

	// If we demand the inserted element then add its common known bits.
	if (DemandedElts[EltIdx]) {
	Known2 = computeKnownBits(InVal, Depth + 1);
	Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
	Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
	}

	// If we demand the source vector then add its common known bits, ensuring
	// that we don't demand the inserted element.
	APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
	if (!!VectorElts) {
	Known2 = computeKnownBits(InVec, VectorElts, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	Known = computeKnownBits(InVec, Depth + 1);
	Known2 = computeKnownBits(InVal, Depth + 1);
	Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
	Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
	}
	break;
	}
	case ISD::BITREVERSE: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known.Zero = Known2.Zero.reverseBits();
	Known.One = Known2.One.reverseBits();
	break;
	}
	case ISD::BSWAP: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known.Zero = Known2.Zero.byteSwap();
	Known.One = Known2.One.byteSwap();
	break;
	}
	case ISD::ABS: {
	Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// If the source's MSB is zero then we know the rest of the bits already.
	if (Known2.isNonNegative()) {
	Known.Zero = Known2.Zero;
	Known.One = Known2.One;
	break;
	}

	// We only know that the absolute values's MSB will be zero iff there is
	// a set bit that isn't the sign bit (otherwise it could be INT_MIN).
	Known2.One.clearSignBit();
	if (Known2.One.getBoolValue()) {
	Known.Zero = APInt::getSignMask(BitWidth);
	break;
	}
	break;
	}
	case ISD::UMIN: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

	// UMIN - we know that the result will have the maximum of the
	// known zero leading bits of the inputs.
	unsigned LeadZero = Known.countMinLeadingZeros();
	LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());

	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	Known.Zero.setHighBits(LeadZero);
	break;
	}
	case ISD::UMAX: {
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);

	// UMAX - we know that the result will have the maximum of the
	// known one leading bits of the inputs.
	unsigned LeadOne = Known.countMinLeadingOnes();
	LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());

	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	Known.One.setHighBits(LeadOne);
	break;
	}
	case ISD::SMIN:
	case ISD::SMAX: {
	// If we have a clamp pattern, we know that the number of sign bits will be
	// the minimum of the clamp min/max range.
	bool IsMax = (Opcode == ISD::SMAX);
	ConstantSDNode CstLow = nullptr, CstHigh = nullptr;
	if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
	if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
	CstHigh =
	isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
	if (CstLow && CstHigh) {
	if (!IsMax)
	std::swap(CstLow, CstHigh);

	const APInt &ValueLow = CstLow->getAPIntValue();
	const APInt &ValueHigh = CstHigh->getAPIntValue();
	if (ValueLow.sle(ValueHigh)) {
	unsigned LowSignBits = ValueLow.getNumSignBits();
	unsigned HighSignBits = ValueHigh.getNumSignBits();
	unsigned MinSignBits = std::min(LowSignBits, HighSignBits);
	if (ValueLow.isNegative() && ValueHigh.isNegative()) {
	Known.One.setHighBits(MinSignBits);
	break;
	}
	if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) {
	Known.Zero.setHighBits(MinSignBits);
	break;
	}
	}
	}

	// Fallback - just get the shared known bits of the operands.
	Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	if (Known.isUnknown()) break; // Early-out
	Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	break;
	}
	case ISD::FrameIndex:
	case ISD::TargetFrameIndex:
	TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
	break;

	default:
	if (Opcode < ISD::BUILTIN_OP_END)
	break;
	LLVM_FALLTHROUGH;
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_W_CHAIN:
	case ISD::INTRINSIC_VOID:
	// Allow the target to implement this method for its nodes.
	TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
	break;
	}

	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	return Known;
	}

	/// Classify whether the unsigned addition N0 + N1 can overflow.
	///
	/// Returns OFK_Never when one of the checks below proves the sum cannot
	/// wrap, and OFK_Sometime otherwise.
	SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
	                                                             SDValue N1) const {
	  // Adding a constant zero can never overflow.
	  if (isNullConstant(N1))
	    return OFK_Never;

	  // True when no bit other than bit 0 may be set, i.e. the value is 0 or 1.
	  // ~Zero is the largest value compatible with the known-zero bits.
	  auto IsAtMostOne = [](const KnownBits &Bits) {
	    return (~Bits.Zero & 0x01) == ~Bits.Zero;
	  };

	  const KnownBits RHSKnown = computeKnownBits(N1);

	  // Only bother analysing N0 if at least one bit of N1 is known to be zero.
	  if (!RHSKnown.Zero.isNullValue()) {
	    const KnownBits LHSKnown = computeKnownBits(N0);

	    // Add the largest possible values of both operands; if even that sum
	    // does not wrap, no actual pair of values can.
	    bool Wrapped;
	    (void)(~LHSKnown.Zero).uadd_ov(~RHSKnown.Zero, Wrapped);
	    if (!Wrapped)
	      return OFK_Never;
	  }

	  // mulhi + 1 never overflow: high half of UMUL_LOHI on the left, and a
	  // right-hand side that is at most one.
	  const bool LHSIsMulHi =
	      N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1;
	  if (LHSIsMulHi && IsAtMostOne(RHSKnown))
	    return OFK_Never;

	  // Same pattern with the operands swapped.
	  const bool RHSIsMulHi =
	      N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1;
	  if (RHSIsMulHi && IsAtMostOne(computeKnownBits(N0)))
	    return OFK_Never;

	  return OFK_Sometime;
	}

	/// Return true if \p Val can be shown to have exactly one bit set.
	///
	/// A few cheap structural patterns are tried first (constants, shifts of a
	/// single-bit constant, constant build vectors); if none of them applies the
	/// answer is derived from computeKnownBits.
	bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
	  const unsigned BitWidth = Val.getValueType().getScalarSizeInBits();

	  // True if V is a ConstantSDNode whose value, resized to the scalar width,
	  // is a power of two.
	  auto IsConstantPow2 = [BitWidth](SDValue V) {
	    if (auto *CN = dyn_cast<ConstantSDNode>(V))
	      return CN->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
	    return false;
	  };

	  // A plain constant is decided directly, whichever way it goes.
	  if (isa<ConstantSDNode>(Val))
	    return IsConstantPow2(Val);

	  switch (Val.getOpcode()) {
	  case ISD::SHL: {
	    // A left-shift of a constant one will have exactly one bit set because
	    // shifting the bit off the end is undefined.
	    ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
	    if (Shifted && Shifted->getAPIntValue() == 1)
	      return true;
	    break;
	  }
	  case ISD::SRL: {
	    // Likewise, a logical right-shift of a constant sign-bit keeps exactly
	    // one bit set.
	    ConstantSDNode *Shifted = isConstOrConstSplat(Val.getOperand(0));
	    if (Shifted && Shifted->getAPIntValue().isSignMask())
	      return true;
	    break;
	  }
	  case ISD::BUILD_VECTOR:
	    // Every operand of the build vector must be a constant power of two.
	    if (llvm::all_of(Val->ops(), IsConstantPow2))
	      return true;
	    break;
	  default:
	    break;
	  }

	  // More could be done here, though the above checks are enough
	  // to handle some common cases.

	  // Fall back to computeKnownBits: exactly one bit is set when both the
	  // maximum and the minimum possible population counts are one.
	  const KnownBits Bits = computeKnownBits(Val);
	  return Bits.countMaxPopulation() == 1 && Bits.countMinPopulation() == 1;
	}

	/// Convenience overload that demands every element of \p Op and forwards to
	/// the DemandedElts-aware ComputeNumSignBits.
	unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
	  const EVT OpVT = Op.getValueType();

	  // Non-vector values use a one-bit mask with that bit set.
	  if (!OpVT.isVector())
	    return ComputeNumSignBits(Op, APInt(1, 1), Depth);

	  // Vectors demand all lanes: one mask bit per vector element.
	  const APInt AllLanes = APInt::getAllOnesValue(OpVT.getVectorNumElements());
	  return ComputeNumSignBits(Op, AllLanes, Depth);
	}

	unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
	unsigned Depth) const {
	EVT VT = Op.getValueType();
	assert((VT.isInteger() \|\| VT.isFloatingPoint()) && "Invalid VT!");
	unsigned VTBits = VT.getScalarSizeInBits();
	unsigned NumElts = DemandedElts.getBitWidth();
	unsigned Tmp, Tmp2;
	unsigned FirstAnswer = 1;

	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	const APInt &Val = C->getAPIntValue();
	return Val.getNumSignBits();
	}

	if (Depth == 6)
	return 1; // Limit search depth.

	if (!DemandedElts)
	return 1; // No demanded elts, better to assume we don't know anything.

	unsigned Opcode = Op.getOpcode();
	switch (Opcode) {
	default: break;
	case ISD::AssertSext:
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
	return VTBits-Tmp+1;
	case ISD::AssertZext:
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
	return VTBits-Tmp;

	case ISD::BUILD_VECTOR:
	Tmp = VTBits;
	for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
	if (!DemandedElts[i])
	continue;

	SDValue SrcOp = Op.getOperand(i);
	Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);

	// BUILD_VECTOR can implicitly truncate sources, we must handle this.
	if (SrcOp.getValueSizeInBits() != VTBits) {
	assert(SrcOp.getValueSizeInBits() > VTBits &&
	"Expected BUILD_VECTOR implicit truncation");
	unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
	Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
	}
	Tmp = std::min(Tmp, Tmp2);
	}
	return Tmp;

	case ISD::VECTOR_SHUFFLE: {
	// Collect the minimum number of sign bits that are shared by every vector
	// element referenced by the shuffle.
	APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
	const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
	assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
	for (unsigned i = 0; i != NumElts; ++i) {
	int M = SVN->getMaskElt(i);
	if (!DemandedElts[i])
	continue;
	// For UNDEF elements, we don't know anything about the common state of
	// the shuffle result.
	if (M < 0)
	return 1;
	if ((unsigned)M < NumElts)
	DemandedLHS.setBit((unsigned)M % NumElts);
	else
	DemandedRHS.setBit((unsigned)M % NumElts);
	}
	Tmp = std::numeric_limits<unsigned>::max();
	if (!!DemandedLHS)
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
	if (!!DemandedRHS) {
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	// If we don't know anything, early out and try computeKnownBits fall-back.
	if (Tmp == 1)
	break;
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}

	case ISD::BITCAST: {
	SDValue N0 = Op.getOperand(0);
	EVT SrcVT = N0.getValueType();
	unsigned SrcBits = SrcVT.getScalarSizeInBits();

	// Ignore bitcasts from unsupported types..
	if (!(SrcVT.isInteger() \|\| SrcVT.isFloatingPoint()))
	break;

	// Fast handling of 'identity' bitcasts.
	if (VTBits == SrcBits)
	return ComputeNumSignBits(N0, DemandedElts, Depth + 1);

	bool IsLE = getDataLayout().isLittleEndian();

	// Bitcast 'large element' scalar/vector to 'small element' vector.
	if ((SrcBits % VTBits) == 0) {
	assert(VT.isVector() && "Expected bitcast to vector");

	unsigned Scale = SrcBits / VTBits;
	APInt SrcDemandedElts(NumElts / Scale, 0);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SrcDemandedElts.setBit(i / Scale);

	// Fast case - sign splat can be simply split across the small elements.
	Tmp = ComputeNumSignBits(N0, SrcDemandedElts, Depth + 1);
	if (Tmp == SrcBits)
	return VTBits;

	// Slow case - determine how far the sign extends into each sub-element.
	Tmp2 = VTBits;
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	unsigned SubOffset = i % Scale;
	SubOffset = (IsLE ? ((Scale - 1) - SubOffset) : SubOffset);
	SubOffset = SubOffset * VTBits;
	if (Tmp <= SubOffset)
	return 1;
	Tmp2 = std::min(Tmp2, Tmp - SubOffset);
	}
	return Tmp2;
	}
	break;
	}

	case ISD::SIGN_EXTEND:
	Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
	return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
	case ISD::SIGN_EXTEND_INREG:
	// Max of the input and what this extends.
	Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
	Tmp = VTBits-Tmp+1;
	Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	return std::max(Tmp, Tmp2);
	case ISD::SIGN_EXTEND_VECTOR_INREG: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedSrcElts = DemandedElts.zextOrSelf(SrcVT.getVectorNumElements());
	Tmp = VTBits - SrcVT.getScalarSizeInBits();
	return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp;
	}

	case ISD::SRA:
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	// SRA X, C -> adds C sign bits.
	if (ConstantSDNode *C =
	isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
	APInt ShiftVal = C->getAPIntValue();
	ShiftVal += Tmp;
	Tmp = ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
	}
	return Tmp;
	case ISD::SHL:
	if (ConstantSDNode *C =
	isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
	// shl destroys sign bits.
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	if (C->getAPIntValue().uge(VTBits) \|\| // Bad shift.
	C->getAPIntValue().uge(Tmp)) break; // Shifted all sign bits out.
	return Tmp - C->getZExtValue();
	}
	break;
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: // NOT is handled here.
	// Logical binary ops preserve the number of sign bits at the worst.
	Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
	if (Tmp != 1) {
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
	FirstAnswer = std::min(Tmp, Tmp2);
	// We computed what we know about the sign bits as our first
	// answer. Now proceed to the generic code that uses
	// computeKnownBits, and pick whichever answer is better.
	}
	break;

	case ISD::SELECT:
	case ISD::VSELECT:
	Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
	if (Tmp == 1) return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
	return std::min(Tmp, Tmp2);
	case ISD::SELECT_CC:
	Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
	if (Tmp == 1) return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1);
	return std::min(Tmp, Tmp2);

	case ISD::SMIN:
	case ISD::SMAX: {
	// If we have a clamp pattern, we know that the number of sign bits will be
	// the minimum of the clamp min/max range.
	bool IsMax = (Opcode == ISD::SMAX);
	ConstantSDNode CstLow = nullptr, CstHigh = nullptr;
	if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
	if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
	CstHigh =
	isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
	if (CstLow && CstHigh) {
	if (!IsMax)
	std::swap(CstLow, CstHigh);
	if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) {
	Tmp = CstLow->getAPIntValue().getNumSignBits();
	Tmp2 = CstHigh->getAPIntValue().getNumSignBits();
	return std::min(Tmp, Tmp2);
	}
	}

	// Fallback - just get the minimum number of sign bits of the operands.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	if (Tmp == 1)
	return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	return std::min(Tmp, Tmp2);
	}
	case ISD::UMIN:
	case ISD::UMAX:
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	if (Tmp == 1)
	return 1; // Early out.
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	return std::min(Tmp, Tmp2);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO:
	if (Op.getResNo() != 1)
	break;
	// The boolean result conforms to getBooleanContents. Fall through.
	// If setcc returns 0/-1, all bits are sign bits.
	// We know that we have an integer-based boolean since these operations
	// are only available for integer.
	if (TLI->getBooleanContents(VT.isVector(), false) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent)
	return VTBits;
	break;
	case ISD::SETCC:
	// If setcc returns 0/-1, all bits are sign bits.
	if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent)
	return VTBits;
	break;
	case ISD::ROTL:
	case ISD::ROTR:
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	unsigned RotAmt = C->getAPIntValue().urem(VTBits);

	// Handle rotate right by N like a rotate left by 32-N.
	if (Opcode == ISD::ROTR)
	RotAmt = (VTBits - RotAmt) % VTBits;

	// If we aren't rotating out all of the known-in sign bits, return the
	// number that are left. This handles rotl(sext(x), 1) for example.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
	}
	break;
	case ISD::ADD:
	case ISD::ADDC:
	// Add can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp == 1) return 1; // Early out.

	// Special case decrementing a value (ADD X, -1):
	if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
	if (CRHS->isAllOnesValue()) {
	KnownBits Known = computeKnownBits(Op.getOperand(0), Depth+1);

	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return VTBits;

	// If we are subtracting one from a positive number, there is no carry
	// out of the result.
	if (Known.isNonNegative())
	return Tmp;
	}

	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
	if (Tmp2 == 1) return 1;
	return std::min(Tmp, Tmp2)-1;

	case ISD::SUB:
	Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
	if (Tmp2 == 1) return 1;

	// Handle NEG.
	if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
	if (CLHS->isNullValue()) {
	KnownBits Known = computeKnownBits(Op.getOperand(1), Depth+1);
	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return VTBits;

	// If the input is known to be positive (the sign bit is known clear),
	// the output of the NEG has the same number of sign bits as the input.
	if (Known.isNonNegative())
	return Tmp2;

	// Otherwise, we treat this like a SUB.
	}

	// Sub can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	if (Tmp == 1) return 1; // Early out.
	return std::min(Tmp, Tmp2)-1;
	case ISD::TRUNCATE: {
	// Check if the sign bits of source go down as far as the truncated value.
	unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
	unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	if (NumSrcSignBits > (NumSrcBits - VTBits))
	return NumSrcSignBits - (NumSrcBits - VTBits);
	break;
	}
	case ISD::EXTRACT_ELEMENT: {
	const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
	const int BitWidth = Op.getValueSizeInBits();
	const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;

	// Get reverse index (starting from 1), Op1 value indexes elements from
	// little end. Sign starts at big end.
	const int rIndex = Items - 1 - Op.getConstantOperandVal(1);

	// If the sign portion ends in our element the subtraction gives correct
	// result. Otherwise it gives either negative or > bitwidth result
	return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue InVal = Op.getOperand(1);
	SDValue EltNo = Op.getOperand(2);

	ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
	// If we know the element index, split the demand between the
	// source vector and the inserted element.
	unsigned EltIdx = CEltNo->getZExtValue();

	// If we demand the inserted element then get its sign bits.
	Tmp = std::numeric_limits<unsigned>::max();
	if (DemandedElts[EltIdx]) {
	// TODO - handle implicit truncation of inserted elements.
	if (InVal.getScalarValueSizeInBits() != VTBits)
	break;
	Tmp = ComputeNumSignBits(InVal, Depth + 1);
	}

	// If we demand the source vector then get its sign bits, and determine
	// the minimum.
	APInt VectorElts = DemandedElts;
	VectorElts.clearBit(EltIdx);
	if (!!VectorElts) {
	Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	} else {
	// Unknown element index, so ignore DemandedElts and demand them all.
	Tmp = ComputeNumSignBits(InVec, Depth + 1);
	Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	SDValue InVec = Op.getOperand(0);
	SDValue EltNo = Op.getOperand(1);
	EVT VecVT = InVec.getValueType();
	const unsigned BitWidth = Op.getValueSizeInBits();
	const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
	const unsigned NumSrcElts = VecVT.getVectorNumElements();

	// If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
	// anything about sign bits. But if the sizes match we can derive knowledge
	// about sign bits from the vector operand.
	if (BitWidth != EltBitWidth)
	break;

	// If we know the element index, just demand that vector element, else for
	// an unknown element index, ignore DemandedElts and demand them all.
	APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
	ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
	if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
	DemandedSrcElts =
	APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());

	return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
	}
	case ISD::EXTRACT_SUBVECTOR: {
	// If we know the element index, just demand that subvector elements,
	// otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	// Offset the demanded elts by the subvector index.
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
	return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
	}
	return ComputeNumSignBits(Src, Depth + 1);
	}
	case ISD::CONCAT_VECTORS: {
	// Determine the minimum number of sign bits across all demanded
	// elts of the input vectors. Early out if the result is already 1.
	Tmp = std::numeric_limits<unsigned>::max();
	EVT SubVectorVT = Op.getOperand(0).getValueType();
	unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
	unsigned NumSubVectors = Op.getNumOperands();
	for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
	APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
	DemandedSub = DemandedSub.trunc(NumSubVectorElts);
	if (!DemandedSub)
	continue;
	Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}
	case ISD::INSERT_SUBVECTOR: {
	// If we know the element index, demand any elements from the subvector and
	// the remainder from the src its inserted into, otherwise demand them all.
	SDValue Src = Op.getOperand(0);
	SDValue Sub = Op.getOperand(1);
	auto *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
	Tmp = std::numeric_limits<unsigned>::max();
	uint64_t Idx = SubIdx->getZExtValue();
	APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
	if (!!DemandedSubElts) {
	Tmp = ComputeNumSignBits(Sub, DemandedSubElts, Depth + 1);
	if (Tmp == 1) return 1; // early-out
	}
	APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
	APInt DemandedSrcElts = DemandedElts & ~SubMask;
	if (!!DemandedSrcElts) {
	Tmp2 = ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	}
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}

	// Not able to determine the index so just assume worst case.
	Tmp = ComputeNumSignBits(Sub, Depth + 1);
	if (Tmp == 1) return 1; // early-out
	Tmp2 = ComputeNumSignBits(Src, Depth + 1);
	Tmp = std::min(Tmp, Tmp2);
	assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
	return Tmp;
	}
	}

	// If we are looking at the loaded value of the SDNode.
	if (Op.getResNo() == 0) {
	// Handle LOADX separately here. EXTLOAD case will fallthrough.
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
	unsigned ExtType = LD->getExtensionType();
	switch (ExtType) {
	default: break;
	case ISD::SEXTLOAD: // e.g. i16->i32 = '17' bits known.
	Tmp = LD->getMemoryVT().getScalarSizeInBits();
	return VTBits - Tmp + 1;
	case ISD::ZEXTLOAD: // e.g. i16->i32 = '16' bits known.
	Tmp = LD->getMemoryVT().getScalarSizeInBits();
	return VTBits - Tmp;
	case ISD::NON_EXTLOAD:
	if (const Constant *Cst = TLI->getTargetConstantFromLoad(LD)) {
	// We only need to handle vectors - computeKnownBits should handle
	// scalar cases.
	Type *CstTy = Cst->getType();
	if (CstTy->isVectorTy() &&
	(NumElts * VTBits) == CstTy->getPrimitiveSizeInBits()) {
	Tmp = VTBits;
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;
	if (Constant *Elt = Cst->getAggregateElement(i)) {
	if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
	const APInt &Value = CInt->getValue();
	Tmp = std::min(Tmp, Value.getNumSignBits());
	continue;
	}
	if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
	APInt Value = CFP->getValueAPF().bitcastToAPInt();
	Tmp = std::min(Tmp, Value.getNumSignBits());
	continue;
	}
	}
	// Unknown type. Conservatively assume no bits match sign bit.
	return 1;
	}
	return Tmp;
	}
	}
	break;
	}
	}
	}

	// Allow the target to implement this method for its nodes.
	if (Opcode >= ISD::BUILTIN_OP_END \|\|
	Opcode == ISD::INTRINSIC_WO_CHAIN \|\|
	Opcode == ISD::INTRINSIC_W_CHAIN \|\|
	Opcode == ISD::INTRINSIC_VOID) {
	unsigned NumBits =
	TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
	if (NumBits > 1)
	FirstAnswer = std::max(FirstAnswer, NumBits);
	}

	// Finally, if we can prove that the top bits of the result are 0's or 1's,
	// use this information.
	KnownBits Known = computeKnownBits(Op, DemandedElts, Depth);

	APInt Mask;
	if (Known.isNonNegative()) { // sign bit is 0
	Mask = Known.Zero;
	} else if (Known.isNegative()) { // sign bit is 1;
	Mask = Known.One;
	} else {
	// Nothing known.
	return FirstAnswer;
	}

	// Okay, we know that the sign bit in Mask is set. Use CLZ to determine
	// the number of identical bits in the top of the input value.
	Mask = ~Mask;
	Mask <<= Mask.getBitWidth()-VTBits;
	// Return # leading zeros. We use 'min' here in case Val was zero before
	// shifting. We don't want to return '64' as for an i32 "0".
	return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
	}

	/// Return true if \p Op is an ADD whose second operand is a constant, or an OR
	/// whose second operand is a constant for which MaskedValueIsZero holds on the
	/// first operand.
	bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
	  const unsigned Opc = Op.getOpcode();
	  const bool IsAdd = Opc == ISD::ADD;
	  const bool IsOr = Opc == ISD::OR;

	  // Only ADD and OR are candidates; check the opcode before looking at
	  // operand 1.
	  if (!IsAdd && !IsOr)
	    return false;
	  if (!isa<ConstantSDNode>(Op.getOperand(1)))
	    return false;

	  // ADD with a constant second operand needs no further checks.
	  if (IsAdd)
	    return true;

	  // For OR, the constant is used as the mask queried against operand 0.
	  return MaskedValueIsZero(Op.getOperand(0), Op.getConstantOperandAPInt(1));
	}

	/// Return true if \p Op can be shown never to be a NaN. When \p SNaN is set the
	/// query is relaxed to "never a signaling NaN". \p Depth is the current
	/// recursion depth; the search gives up once it reaches 6.
	bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const {
	  // Global or per-node "no NaNs" flags settle the question immediately.
	  if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
	    return true;

	  // Limit search depth.
	  if (Depth == 6)
	    return false;

	  // TODO: Handle vectors.
	  // A floating-point constant can be inspected directly.
	  if (const auto *FPConst = dyn_cast<ConstantFPSDNode>(Op)) {
	    const auto &Val = FPConst->getValueAPF();
	    if (!Val.isNaN())
	      return true;
	    return SNaN && !Val.isSignaling();
	  }

	  // Recursive query on operand Idx with the caller's SNaN setting.
	  auto OperandNeverNaN = [&](unsigned Idx) {
	    return isKnownNeverNaN(Op.getOperand(Idx), SNaN, Depth + 1);
	  };

	  const unsigned Opcode = Op.getOpcode();
	  switch (Opcode) {
	  // For these nodes the answer is "yes" only for the signaling-NaN query;
	  // nothing is proven about quiet NaNs.
	  case ISD::FADD:
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FDIV:
	  case ISD::FREM:
	  case ISD::FSIN:
	  case ISD::FCOS:
	    // TODO: Need isKnownNeverInfinity
	  case ISD::FSQRT: // Need is known positive
	  case ISD::FLOG:
	  case ISD::FLOG2:
	  case ISD::FLOG10:
	  case ISD::FPOWI:
	  case ISD::FPOW:
	    // TODO: Refine on operand
	    return SNaN;

	  // Signaling-NaN query: yes. Otherwise the answer is that of operand 0.
	  case ISD::FCANONICALIZE:
	  case ISD::FEXP:
	  case ISD::FEXP2:
	  case ISD::FTRUNC:
	  case ISD::FFLOOR:
	  case ISD::FCEIL:
	  case ISD::FROUND:
	  case ISD::FRINT:
	  case ISD::FNEARBYINT:
	  case ISD::FP_EXTEND:
	  case ISD::FP_ROUND:
	    return SNaN || OperandNeverNaN(0);

	  // The answer is exactly that of operand 0, for either kind of query.
	  case ISD::FABS:
	  case ISD::FNEG:
	  case ISD::FCOPYSIGN:
	  case ISD::EXTRACT_VECTOR_ELT:
	    return OperandNeverNaN(0);

	  // Both selectable values (operands 1 and 2) must qualify.
	  case ISD::SELECT:
	    return OperandNeverNaN(1) && OperandNeverNaN(2);

	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	    return true;

	  case ISD::FMA:
	  case ISD::FMAD:
	    return SNaN ||
	           (OperandNeverNaN(0) && OperandNeverNaN(1) && OperandNeverNaN(2));

	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	    // Only one needs to be known not-nan, since it will be returned if the
	    // other ends up being one.
	    return OperandNeverNaN(0) || OperandNeverNaN(1);

	  case ISD::FMINNUM_IEEE:
	  case ISD::FMAXNUM_IEEE: {
	    // This can return a NaN if either operand is an sNaN, or if both operands
	    // are NaN. So one operand must never be any NaN while the other must
	    // never be a signaling NaN.
	    auto PairIsSafe = [&](unsigned NoNaNIdx, unsigned NoSNaNIdx) {
	      return isKnownNeverNaN(Op.getOperand(NoNaNIdx), false, Depth + 1) &&
	             isKnownNeverSNaN(Op.getOperand(NoSNaNIdx), Depth + 1);
	    };
	    return SNaN || PairIsSafe(0, 1) || PairIsSafe(1, 0);
	  }

	  case ISD::FMINIMUM:
	  case ISD::FMAXIMUM:
	    // TODO: Does this quiet or return the original NaN as-is?
	    return OperandNeverNaN(0) && OperandNeverNaN(1);

	  default: {
	    // Target-specific nodes and intrinsics are delegated to the target hook.
	    const bool IsTargetOrIntrinsic = Opcode >= ISD::BUILTIN_OP_END ||
	                                     Opcode == ISD::INTRINSIC_WO_CHAIN ||
	                                     Opcode == ISD::INTRINSIC_W_CHAIN ||
	                                     Opcode == ISD::INTRINSIC_VOID;
	    if (!IsTargetOrIntrinsic)
	      return false;
	    return TLI->isKnownNeverNaNForTargetNode(Op, *this, SNaN, Depth);
	  }
	  }
	}

	/// Return true if the floating-point value \p Op is a ConstantFPSDNode that is
	/// not zero. Every other kind of node yields false.
	bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
	  assert(Op.getValueType().isFloatingPoint() &&
	         "Floating point type expected");

	  // Only a floating-point constant is inspected at the moment.
	  // TODO: Add BuildVector support.
	  const auto *FPConst = dyn_cast<ConstantFPSDNode>(Op);
	  return FPConst && !FPConst->isZero();
	}

	/// Return true if the non-floating-point value \p Op can be shown to be
	/// non-zero: either it matches the "constant is not null" predicate, or it is
	/// an OR with at least one operand that is itself known never zero.
	bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
	  assert(!Op.getValueType().isFloatingPoint() &&
	         "Floating point types unsupported - use isKnownNeverZeroFloat");

	  // Constant case: let matchUnaryPredicate apply the non-null test.
	  auto IsNonNullConstant = [](ConstantSDNode *CN) {
	    return !CN->isNullValue();
	  };
	  if (ISD::matchUnaryPredicate(Op, IsNonNullConstant))
	    return true;

	  // TODO: Recognize more cases here.
	  if (Op.getOpcode() != ISD::OR)
	    return false;

	  // Operand 1 is queried first, then operand 0.
	  return isKnownNeverZero(Op.getOperand(1)) ||
	         isKnownNeverZero(Op.getOperand(0));
	}

	/// Return true if \p A and \p B are the same value, or if both are
	/// floating-point constants for which isZero() holds. A false result means the
	/// values may still be equal; it only says equality was not shown.
	bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
	  // Identical values are trivially equal.
	  if (A == B)
	    return true;

	  // Check for negative and positive zero: two FP constants that are both zero
	  // are treated as equal.
	  const auto *FPA = dyn_cast<ConstantFPSDNode>(A);
	  if (!FPA)
	    return false;
	  const auto *FPB = dyn_cast<ConstantFPSDNode>(B);
	  if (!FPB)
	    return false;

	  return FPA->isZero() && FPB->isZero();
	}

	// FIXME: unify with llvm::haveNoCommonBitsSet.
	// FIXME: could also handle masked merge pattern (X & ~M) op (Y & M)
	/// Return true if every bit position is known to be zero in at least one of
	/// \p A and \p B, according to computeKnownBits. Both values must have the
	/// same type.
	bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
	  assert(A.getValueType() == B.getValueType() &&
	         "Values must have the same type");

	  const KnownBits KnownA = computeKnownBits(A);
	  const KnownBits KnownB = computeKnownBits(B);

	  // Union of the known-zero masks; all ones means no bit can be set in both.
	  const APInt ZeroInEither = KnownA.Zero | KnownB.Zero;
	  return ZeroInEither.isAllOnesValue();
	}

	/// Try to simplify a BUILD_VECTOR of \p Ops with result type \p VT.
	///
	/// Returns UNDEF when every operand is undef, the common source vector when
	/// operand i is EXTRACT_VECTOR_ELT(Src, i) for one source Src of type \p VT,
	/// and an empty SDValue when neither simplification applies.
	static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
	                                ArrayRef<SDValue> Ops,
	                                SelectionDAG &DAG) {
	  int NumOps = Ops.size();
	  assert(NumOps != 0 && "Can't build an empty vector!");
	  assert(VT.getVectorNumElements() == (unsigned)NumOps &&
	         "Incorrect element count in BUILD_VECTOR!");

	  // BUILD_VECTOR of UNDEFs is UNDEF.
	  auto IsUndefOp = [](SDValue Op) { return Op.isUndef(); };
	  if (llvm::all_of(Ops, IsUndefOp))
	    return DAG.getUNDEF(VT);

	  // BUILD_VECTOR of seq extract/insert from the same vector + type is Identity.
	  // Bail out with an empty SDValue at the first operand that breaks the
	  // pattern.
	  SDValue IdentitySrc;
	  for (int Idx = 0; Idx != NumOps; ++Idx) {
	    const SDValue &Elt = Ops[Idx];
	    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // The extract must read from a vector of the result type, and from the
	    // same vector as all earlier operands.
	    SDValue Vec = Elt.getOperand(0);
	    if (Vec.getValueType() != VT)
	      return SDValue();
	    if (IdentitySrc && Vec != IdentitySrc)
	      return SDValue();

	    // The extract index must be the constant matching this operand's slot.
	    auto *CIdx = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
	    if (!CIdx || CIdx->getAPIntValue() != Idx)
	      return SDValue();

	    IdentitySrc = Vec;
	  }

	  return IdentitySrc;
	}

	/// Try to simplify vector concatenation to an input value, undef, or build
	/// vector.
	///
	/// Returns the single operand when there is only one, UNDEF when all operands
	/// are undef, the common source when the operands are in-order
	/// EXTRACT_SUBVECTORs of one vector of type \p VT, a new BUILD_VECTOR when all
	/// operands are UNDEF/BUILD_VECTOR, and an empty SDValue otherwise.
	static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
	                                  ArrayRef<SDValue> Ops,
	                                  SelectionDAG &DAG) {
	  assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
	  assert(llvm::all_of(Ops,
	                      [Ops](SDValue Op) {
	                        return Ops[0].getValueType() == Op.getValueType();
	                      }) &&
	         "Concatenation of vectors with inconsistent value types!");
	  assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
	             VT.getVectorNumElements() &&
	         "Incorrect element count in vector concatenation!");

	  const unsigned NumOps = Ops.size();
	  if (NumOps == 1)
	    return Ops[0];

	  // Concat of UNDEFs is UNDEF.
	  auto IsUndefOp = [](SDValue Op) { return Op.isUndef(); };
	  if (llvm::all_of(Ops, IsUndefOp))
	    return DAG.getUNDEF(VT);

	  // Scan the operands and look for extract operations from a single source
	  // that correspond to insertion at the same location via this concatenation:
	  // concat (extract X, 0*subvec_elts), (extract X, 1*subvec_elts), ...
	  // NumMatched counts the leading operands that fit the pattern.
	  SDValue IdentitySrc;
	  unsigned NumMatched = 0;
	  for (; NumMatched != NumOps; ++NumMatched) {
	    SDValue Op = Ops[NumMatched];
	    unsigned ExpectedIdx =
	        NumMatched * Op.getValueType().getVectorNumElements();

	    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      break;

	    // Same source vector as before, and of the concatenated type.
	    SDValue Src = Op.getOperand(0);
	    if (Src.getValueType() != VT || (IdentitySrc && Src != IdentitySrc))
	      break;

	    // The subvector must come from the position it is being placed at.
	    if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
	        Op.getConstantOperandVal(1) != ExpectedIdx)
	      break;

	    assert((!IdentitySrc || IdentitySrc == Op.getOperand(0)) &&
	           "Unexpected identity source vector for concat of extracts");
	    IdentitySrc = Src;
	  }
	  if (NumMatched == NumOps) {
	    assert(IdentitySrc && "Failed to set source vector of extracts");
	    return IdentitySrc;
	  }

	  // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
	  // simplified to one big BUILD_VECTOR.
	  // FIXME: Add support for SCALAR_TO_VECTOR as well.
	  EVT SVT = VT.getScalarType();
	  SmallVector<SDValue, 16> Elts;
	  for (SDValue Op : Ops) {
	    EVT OpVT = Op.getValueType();
	    if (Op.isUndef()) {
	      // An undef operand contributes one undef scalar per element.
	      Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
	      continue;
	    }
	    if (Op.getOpcode() != ISD::BUILD_VECTOR)
	      return SDValue();
	    Elts.append(Op->op_begin(), Op->op_end());
	  }

	  // BUILD_VECTOR requires all inputs to be of the same type, find the
	  // maximum type and extend them all.
	  for (SDValue Elt : Elts) {
	    EVT EltVT = Elt.getValueType();
	    if (SVT.bitsLT(EltVT))
	      SVT = EltVT;
	  }

	  if (SVT.bitsGT(VT.getScalarType())) {
	    const auto &TLI = DAG.getTargetLoweringInfo();
	    for (SDValue &Elt : Elts) {
	      // Use zero-extension when the target reports it as free, otherwise
	      // sign-extension.
	      if (TLI.isZExtFree(Elt.getValueType(), SVT))
	        Elt = DAG.getZExtOrTrunc(Elt, DL, SVT);
	      else
	        Elt = DAG.getSExtOrTrunc(Elt, DL, SVT);
	    }
	  }

	  SDValue V = DAG.getBuildVector(VT, DL, Elts);
	  NewSDValueDbgMsg(V, "New node fold concat vectors: ", &DAG);
	  return V;
	}

	/// Gets or creates the specified node.
	///
	/// Looks up an operand-less node with opcode \p Opcode and result type \p VT
	/// in the CSE map. If none is found, a new node is created, inserted into the
	/// CSE map and registered via InsertNode. Returns result 0 of that node.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
	  const SDVTList VTs = getVTList(VT);

	  // Build the folding-set profile: opcode + value types, no operands.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, Opcode, VTs, None);

	  void *InsertPos = nullptr;
	  SDNode *Node = FindNodeOrInsertPos(ID, DL, InsertPos);
	  if (!Node) {
	    // No existing node: create one and record it at the position reported by
	    // the failed lookup.
	    Node = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	    CSEMap.InsertNode(Node, InsertPos);

	    InsertNode(Node);
	    NewSDValueDbgMsg(SDValue(Node, 0), "Creating new node: ", this);
	  }

	  return SDValue(Node, 0);
	}

	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue Operand, const SDNodeFlags Flags) {
	// Constant fold unary operations with an integer constant operand. Even
	// opaque constant will be folded, because the folding of unary operations
	// doesn't create new constants with different values. Nevertheless, the
	// opaque flag is preserved during folding to prevent future folding with
	// other constants.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
	const APInt &Val = C->getAPIntValue();
	switch (Opcode) {
	default: break;
	case ISD::SIGN_EXTEND:
	return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
	C->isTargetOpcode(), C->isOpaque());
	case ISD::TRUNCATE:
	if (C->isOpaque())
	break;
	LLVM_FALLTHROUGH;
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND:
	return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
	C->isTargetOpcode(), C->isOpaque());
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP: {
	APFloat apf(EVTToAPFloatSemantics(VT),
	APInt::getNullValue(VT.getSizeInBits()));
	(void)apf.convertFromAPInt(Val,
	Opcode==ISD::SINT_TO_FP,
	APFloat::rmNearestTiesToEven);
	return getConstantFP(apf, DL, VT);
	}
	case ISD::BITCAST:
	if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
	return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
	if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
	return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
	if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
	return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
	if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
	return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
	break;
	case ISD::ABS:
	return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::BITREVERSE:
	return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::BSWAP:
	return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTPOP:
	return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
	C->isOpaque());
	case ISD::FP16_TO_FP: {
	bool Ignored;
	APFloat FPV(APFloat::IEEEhalf(),
	(Val.getBitWidth() == 16) ? Val : Val.trunc(16));

	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)FPV.convert(EVTToAPFloatSemantics(VT),
	APFloat::rmNearestTiesToEven, &Ignored);
	return getConstantFP(FPV, DL, VT);
	}
	}
	}

	// Constant fold unary operations with a floating point constant operand.
	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
	APFloat V = C->getValueAPF(); // make copy
	switch (Opcode) {
	case ISD::FNEG:
	V.changeSign();
	return getConstantFP(V, DL, VT);
	case ISD::FABS:
	V.clearSign();
	return getConstantFP(V, DL, VT);
	case ISD::FCEIL: {
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FTRUNC: {
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FFLOOR: {
	APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
	if (fs == APFloat::opOK \|\| fs == APFloat::opInexact)
	return getConstantFP(V, DL, VT);
	break;
	}
	case ISD::FP_EXTEND: {
	bool ignored;
	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)V.convert(EVTToAPFloatSemantics(VT),
	APFloat::rmNearestTiesToEven, &ignored);
	return getConstantFP(V, DL, VT);
	}
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	bool ignored;
	APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
	// FIXME need to be more flexible about rounding mode.
	APFloat::opStatus s =
	V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
	if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
	break;
	return getConstant(IntVal, DL, VT);
	}
	case ISD::BITCAST:
	if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
	return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
	else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
	return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
	else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
	return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
	break;
	case ISD::FP_TO_FP16: {
	bool Ignored;
	// This can return overflow, underflow, or inexact; we don't care.
	// FIXME need to be more flexible about rounding mode.
	(void)V.convert(APFloat::IEEEhalf(),
	APFloat::rmNearestTiesToEven, &Ignored);
	return getConstant(V.bitcastToAPInt(), DL, VT);
	}
	}
	}

	// Constant fold unary operations with a vector integer or float operand.
	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
	if (BV->isConstant()) {
	switch (Opcode) {
	default:
	// FIXME: Entirely reasonable to perform folding of other unary
	// operations here as the need arises.
	break;
	case ISD::FNEG:
	case ISD::FABS:
	case ISD::FCEIL:
	case ISD::FTRUNC:
	case ISD::FFLOOR:
	case ISD::FP_EXTEND:
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	case ISD::TRUNCATE:
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::SIGN_EXTEND:
	case ISD::UINT_TO_FP:
	case ISD::SINT_TO_FP:
	case ISD::ABS:
	case ISD::BITREVERSE:
	case ISD::BSWAP:
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF:
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF:
	case ISD::CTPOP: {
	SDValue Ops = { Operand };
	if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
	return Fold;
	}
	}
	}
	}

	unsigned OpOpcode = Operand.getNode()->getOpcode();
	switch (Opcode) {
	case ISD::TokenFactor:
	case ISD::MERGE_VALUES:
	case ISD::CONCAT_VECTORS:
	return Operand; // Factor, merge or concat of one node? No need.
	case ISD::BUILD_VECTOR: {
	// Attempt to simplify BUILD_VECTOR.
	SDValue Ops[] = {Operand};
	if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
	case ISD::FP_EXTEND:
	assert(VT.isFloatingPoint() &&
	Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
	if (Operand.getValueType() == VT) return Operand; // noop conversion.
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid fpext node, dst < src!");
	if (Operand.isUndef())
	return getUNDEF(VT);
	break;
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	if (Operand.isUndef())
	return getUNDEF(VT);
	break;
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	// [us]itofp(undef) = 0, because the result value is bounded.
	if (Operand.isUndef())
	return getConstantFP(0.0, DL, VT);
	break;
	case ISD::SIGN_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid SIGN_EXTEND!");
	assert(VT.isVector() == Operand.getValueType().isVector() &&
	"SIGN_EXTEND result type type should be vector iff the operand "
	"type is vector!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid sext node, dst < src!");
	if (OpOpcode == ISD::SIGN_EXTEND \|\| OpOpcode == ISD::ZERO_EXTEND)
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	// sext(undef) = 0, because the top bits will all be the same.
	return getConstant(0, DL, VT);
	break;
	case ISD::ZERO_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid ZERO_EXTEND!");
	assert(VT.isVector() == Operand.getValueType().isVector() &&
	"ZERO_EXTEND result type type should be vector iff the operand "
	"type is vector!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid zext node, dst < src!");
	if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
	return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	// zext(undef) = 0, because the top bits will be zero.
	return getConstant(0, DL, VT);
	break;
	case ISD::ANY_EXTEND:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid ANY_EXTEND!");
	assert(VT.isVector() == Operand.getValueType().isVector() &&
	"ANY_EXTEND result type type should be vector iff the operand "
	"type is vector!");
	if (Operand.getValueType() == VT) return Operand; // noop extension
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsLT(VT) &&
	"Invalid anyext node, dst < src!");

	if (OpOpcode == ISD::ZERO_EXTEND \|\| OpOpcode == ISD::SIGN_EXTEND \|\|
	OpOpcode == ISD::ANY_EXTEND)
	// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	else if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);

	// (ext (trunc x)) -> x
	if (OpOpcode == ISD::TRUNCATE) {
	SDValue OpOp = Operand.getOperand(0);
	if (OpOp.getValueType() == VT) {
	transferDbgValues(Operand, OpOp);
	return OpOp;
	}
	}
	break;
	case ISD::TRUNCATE:
	assert(VT.isInteger() && Operand.getValueType().isInteger() &&
	"Invalid TRUNCATE!");
	assert(VT.isVector() == Operand.getValueType().isVector() &&
	"TRUNCATE result type type should be vector iff the operand "
	"type is vector!");
	if (Operand.getValueType() == VT) return Operand; // noop truncate
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() ==
	Operand.getValueType().getVectorNumElements()) &&
	"Vector element count mismatch!");
	assert(Operand.getValueType().bitsGT(VT) &&
	"Invalid truncate node, src < dst!");
	if (OpOpcode == ISD::TRUNCATE)
	return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
	if (OpOpcode == ISD::ZERO_EXTEND \|\| OpOpcode == ISD::SIGN_EXTEND \|\|
	OpOpcode == ISD::ANY_EXTEND) {
	// If the source is smaller than the dest, we still need an extend.
	if (Operand.getOperand(0).getValueType().getScalarType()
	.bitsLT(VT.getScalarType()))
	return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
	if (Operand.getOperand(0).getValueType().bitsGT(VT))
	return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
	return Operand.getOperand(0);
	}
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::ANY_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	assert(VT.isVector() && "This DAG node is restricted to vector types.");
	assert(Operand.getValueType().bitsLE(VT) &&
	"The input must be the same size or smaller than the result.");
	assert(VT.getVectorNumElements() <
	Operand.getValueType().getVectorNumElements() &&
	"The destination vector type must have fewer lanes than the input.");
	break;
	case ISD::ABS:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid ABS!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BSWAP:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid BSWAP!");
	assert((VT.getScalarSizeInBits() % 16 == 0) &&
	"BSWAP types must be a multiple of 16 bits!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BITREVERSE:
	assert(VT.isInteger() && VT == Operand.getValueType() &&
	"Invalid BITREVERSE!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::BITCAST:
	// Basic sanity checking.
	assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
	"Cannot BITCAST between types of different sizes!");
	if (VT == Operand.getValueType()) return Operand; // noop conversion.
	if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
	return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	break;
	case ISD::SCALAR_TO_VECTOR:
	assert(VT.isVector() && !Operand.getValueType().isVector() &&
	(VT.getVectorElementType() == Operand.getValueType() \|\|
	(VT.getVectorElementType().isInteger() &&
	Operand.getValueType().isInteger() &&
	VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
	"Illegal SCALAR_TO_VECTOR node!");
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);
	// scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
	if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(Operand.getOperand(1)) &&
	Operand.getConstantOperandVal(1) == 0 &&
	Operand.getOperand(0).getValueType() == VT)
	return Operand.getOperand(0);
	break;
	case ISD::FNEG:
	// Negation of an unknown bag of bits is still completely undefined.
	if (OpOpcode == ISD::UNDEF)
	return getUNDEF(VT);

	// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
	if ((getTarget().Options.UnsafeFPMath \|\| Flags.hasNoSignedZeros()) &&
	OpOpcode == ISD::FSUB)
	return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
	Operand.getOperand(0), Flags);
	if (OpOpcode == ISD::FNEG) // --X -> X
	return Operand.getOperand(0);
	break;
	case ISD::FABS:
	if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
	return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
	break;
	}

	SDNode *N;
	SDVTList VTs = getVTList(VT);
	SDValue Ops[] = {Operand};
	if (VT != MVT::Glue) { // Don't CSE flag producing nodes
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTs, Ops);
	void *IP = nullptr;
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
	E->intersectFlagsWith(Flags);
	return SDValue(E, 0);
	}

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	N->setFlags(Flags);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	createOperands(N, Ops);
	}

	InsertNode(N);
	SDValue V = SDValue(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
	const APInt &C2) {
	switch (Opcode) {
	case ISD::ADD: return std::make_pair(C1 + C2, true);
	case ISD::SUB: return std::make_pair(C1 - C2, true);
	case ISD::MUL: return std::make_pair(C1 * C2, true);
	case ISD::AND: return std::make_pair(C1 & C2, true);
	case ISD::OR: return std::make_pair(C1 \| C2, true);
	case ISD::XOR: return std::make_pair(C1 ^ C2, true);
	case ISD::SHL: return std::make_pair(C1 << C2, true);
	case ISD::SRL: return std::make_pair(C1.lshr(C2), true);
	case ISD::SRA: return std::make_pair(C1.ashr(C2), true);
	case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
	case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
	case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
	case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
	case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
	case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
	case ISD::SADDSAT: return std::make_pair(C1.sadd_sat(C2), true);
	case ISD::UADDSAT: return std::make_pair(C1.uadd_sat(C2), true);
	case ISD::SSUBSAT: return std::make_pair(C1.ssub_sat(C2), true);
	case ISD::USUBSAT: return std::make_pair(C1.usub_sat(C2), true);
	case ISD::UDIV:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.udiv(C2), true);
	case ISD::UREM:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.urem(C2), true);
	case ISD::SDIV:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.sdiv(C2), true);
	case ISD::SREM:
	if (!C2.getBoolValue())
	break;
	return std::make_pair(C1.srem(C2), true);
	}
	return std::make_pair(APInt(1, 0), false);
	}

	/// Fold the integer binary operation \p Opcode over two constant nodes.
	///
	/// \returns a constant of type \p VT holding the folded value, or an empty
	/// SDValue when either operand is opaque or FoldValue could not fold the
	/// operation.
	SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
	                                             EVT VT, const ConstantSDNode *C1,
	                                             const ConstantSDNode *C2) {
	  // Opaque constants are not folded here.
	  const bool AnyOpaque = C1->isOpaque() || C2->isOpaque();
	  if (AnyOpaque)
	    return SDValue();

	  const std::pair<APInt, bool> Result =
	      FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
	  if (Result.second)
	    return getConstant(Result.first, DL, VT);
	  return SDValue();
	}

	/// Fold `GA + C` or `GA - C` into a single global-address node whose offset
	/// absorbs the constant.
	///
	/// \returns an empty SDValue when \p GA is not a plain GlobalAddress, the
	/// target reports offset folding as not legal for it, \p N2 is not a
	/// constant, or \p Opcode is neither ADD nor SUB.
	SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
	                                       const GlobalAddressSDNode *GA,
	                                       const SDNode *N2) {
	  const bool Eligible = GA->getOpcode() == ISD::GlobalAddress &&
	                        TLI->isOffsetFoldingLegal(GA);
	  if (!Eligible)
	    return SDValue();

	  const auto *Addend = dyn_cast<ConstantSDNode>(N2);
	  if (Addend == nullptr)
	    return SDValue();

	  // Do the arithmetic on uint64_t so that negation and addition wrap
	  // instead of overflowing a signed type.
	  uint64_t Delta = static_cast<uint64_t>(Addend->getSExtValue());
	  if (Opcode == ISD::SUB)
	    Delta = -Delta;
	  else if (Opcode != ISD::ADD)
	    return SDValue();

	  return getGlobalAddress(GA->getGlobal(), SDLoc(Addend), VT,
	                          GA->getOffset() + Delta);
	}

	bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
	switch (Opcode) {
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM: {
	// If a divisor is zero/undef or any element of a divisor vector is
	// zero/undef, the whole op is undef.
	assert(Ops.size() == 2 && "Div/rem should have 2 operands");
	SDValue Divisor = Ops[1];
	if (Divisor.isUndef() \|\| isNullConstant(Divisor))
	return true;

	return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
	llvm::any_of(Divisor->op_values(),
	[](SDValue V) { return V.isUndef() \|\|
	isNullConstant(V); });
	// TODO: Handle signed overflow.
	}
	// TODO: Handle oversized shifts.
	default:
	return false;
	}
	}

	/// Try to constant-fold the binary operation \p Opcode over \p N1 and \p N2.
	///
	/// The following are attempted in order:
	///  - operations that isUndef() reports as undefined fold to UNDEF;
	///  - two scalar integer constants are folded directly;
	///  - a global address combined with a constant offset is folded via
	///    FoldSymbolOffset;
	///  - BUILD_VECTOR / undef operands are folded element by element.
	///
	/// \returns the folded value, or an empty SDValue if nothing was folded.
	SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
	                                             EVT VT, SDNode *N1, SDNode *N2) {
	  // If the opcode is a target-specific ISD node, there's nothing we can
	  // do here and the operand rules may not line up with the below, so
	  // bail early.
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return SDValue();

	  if (isUndef(Opcode, {SDValue(N1, 0), SDValue(N2, 0)}))
	    return getUNDEF(VT);

	  // Handle the case of two scalars.
	  if (auto *C1 = dyn_cast<ConstantSDNode>(N1)) {
	    if (auto *C2 = dyn_cast<ConstantSDNode>(N2)) {
	      SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, C1, C2);
	      assert((!Folded || !VT.isVector()) &&
	             "Can't fold vectors ops with scalar operands");
	      return Folded;
	    }
	  }

	  // fold (add Sym, c) -> Sym+c
	  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N1))
	    return FoldSymbolOffset(Opcode, VT, GA, N2);
	  if (TLI->isCommutativeBinOp(Opcode))
	    if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N2))
	      return FoldSymbolOffset(Opcode, VT, GA, N1);

	  // For vectors, extract each constant element and fold them individually.
	  // Either input may be an undef value.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
	  if (!BV1 && !N1->isUndef())
	    return SDValue();
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
	  if (!BV2 && !N2->isUndef())
	    return SDValue();
	  // If both operands are undef, that's handled the same way as scalars.
	  if (!BV1 && !BV2)
	    return SDValue();

	  assert((!BV1 || !BV2 || BV1->getNumOperands() == BV2->getNumOperands()) &&
	         "Vector binop with different number of elements in operands?");

	  // When only legal types may be created, results are widened to LegalSVT;
	  // give up if the transformed type would be narrower than the element type.
	  EVT SVT = VT.getScalarType();
	  EVT LegalSVT = SVT;
	  if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
	    LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
	    if (LegalSVT.bitsLT(SVT))
	      return SDValue();
	  }

	  SmallVector<SDValue, 4> Outputs;
	  unsigned NumOps = BV1 ? BV1->getNumOperands() : BV2->getNumOperands();
	  for (unsigned I = 0; I != NumOps; ++I) {
	    // A missing build vector means that operand is undef in every lane.
	    SDValue V1 = BV1 ? BV1->getOperand(I) : getUNDEF(SVT);
	    SDValue V2 = BV2 ? BV2->getOperand(I) : getUNDEF(SVT);

	    // Build-vector operands may be wider than the element type; truncate
	    // them explicitly before folding.
	    if (SVT.isInteger()) {
	      if (V1->getValueType(0).bitsGT(SVT))
	        V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
	      if (V2->getValueType(0).bitsGT(SVT))
	        V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
	    }

	    if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
	      return SDValue();

	    // Fold one vector element.
	    SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2);
	    if (LegalSVT != SVT)
	      ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);

	    // Scalar folding only succeeded if the result is a constant or UNDEF.
	    if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
	        ScalarResult.getOpcode() != ISD::ConstantFP)
	      return SDValue();
	    Outputs.push_back(ScalarResult);
	  }

	  assert(VT.getVectorNumElements() == Outputs.size() &&
	         "Vector size mismatch!");

	  // We may have a vector type but a scalar result. Create a splat.
	  Outputs.resize(VT.getVectorNumElements(), Outputs.back());

	  // Build a big vector out of the scalar elements we generated.
	  // NOTE(review): this uses a default SDLoc() rather than DL; kept as is.
	  return getBuildVector(VT, SDLoc(), Outputs);
	}

	// TODO: Merge with FoldConstantArithmetic
	/// Constant-fold \p Opcode lane by lane over the operands in \p Ops.
	///
	/// Every operand must be UNDEF, a CONDCODE, or a constant BUILD_VECTOR, and
	/// must be either a scalar or a vector with as many elements as \p VT.
	/// Each lane is folded through the scalar getNode(); the fold is abandoned
	/// as soon as one lane does not become a constant or UNDEF.
	///
	/// \returns a BUILD_VECTOR of the folded lanes, UNDEF when isUndef() says
	/// the operation is undefined, or an empty SDValue when folding fails.
	SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
	                                                   const SDLoc &DL, EVT VT,
	                                                   ArrayRef<SDValue> Ops,
	                                                   const SDNodeFlags Flags) {
	  // If the opcode is a target-specific ISD node, there's nothing we can
	  // do here and the operand rules may not line up with the below, so
	  // bail early.
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return SDValue();

	  if (isUndef(Opcode, Ops))
	    return getUNDEF(VT);

	  // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
	  if (!VT.isVector())
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();

	  auto IsScalarOrSameVectorSize = [&](const SDValue &Op) -> bool {
	    EVT OpVT = Op.getValueType();
	    return !OpVT.isVector() || OpVT.getVectorNumElements() == NumElts;
	  };

	  auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) -> bool {
	    if (Op.isUndef() || Op.getOpcode() == ISD::CONDCODE)
	      return true;
	    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
	    return BV && BV->isConstant();
	  };

	  // All operands must be vector types with the same number of elements as
	  // the result type and must be either UNDEF or a build vector of constant
	  // or UNDEF scalars.
	  if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) ||
	      !llvm::all_of(Ops, IsScalarOrSameVectorSize))
	    return SDValue();

	  // If we are comparing vectors, then the result needs to be a i1 boolean
	  // that is then sign-extended back to the legal result type.
	  EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType());

	  // Find legal integer scalar type for constant promotion and
	  // ensure that its scalar size is at least as large as source.
	  EVT LegalSVT = VT.getScalarType();
	  if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
	    LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
	    if (LegalSVT.bitsLT(VT.getScalarType()))
	      return SDValue();
	  }

	  // Constant fold each scalar lane separately.
	  SmallVector<SDValue, 4> ScalarResults;
	  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
	    // Gather this lane's scalar operand from every input.
	    SmallVector<SDValue, 4> LaneOps;
	    for (SDValue Op : Ops) {
	      EVT InSVT = Op.getValueType().getScalarType();
	      BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op);
	      if (InBV) {
	        SDValue Elt = InBV->getOperand(Lane);
	        EVT EltVT = Elt.getValueType();

	        // Build vector (integer) scalar operands may need implicit
	        // truncation - do this before constant folding.
	        if (EltVT.isInteger() && EltVT.bitsGT(InSVT))
	          Elt = getNode(ISD::TRUNCATE, DL, InSVT, Elt);

	        LaneOps.push_back(Elt);
	      } else {
	        // We've checked that this is UNDEF or a constant of some kind.
	        LaneOps.push_back(Op.isUndef() ? getUNDEF(InSVT) : Op);
	      }
	    }

	    // Constant fold the scalar operands.
	    SDValue ScalarResult = getNode(Opcode, DL, SVT, LaneOps, Flags);

	    // Legalize the (integer) scalar constant if necessary.
	    if (LegalSVT != SVT)
	      ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);

	    // Scalar folding only succeeded if the result is a constant or UNDEF.
	    if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
	        ScalarResult.getOpcode() != ISD::ConstantFP)
	      return SDValue();
	    ScalarResults.push_back(ScalarResult);
	  }

	  SDValue V = getBuildVector(VT, DL, ScalarResults);
	  NewSDValueDbgMsg(V, "New node fold constant vector: ", this);
	  return V;
	}

	/// Constant-fold a floating-point operation over \p N1 and \p N2.
	///
	/// Handles FADD/FSUB/FMUL/FDIV/FREM/FCOPYSIGN of two FP constants, FP_ROUND
	/// of an FP constant, and the arithmetic opcodes when an operand is undef.
	/// All rounding uses round-to-nearest-ties-to-even.
	///
	/// \returns the folded value, or an empty SDValue if nothing applies.
	SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
	                                         EVT VT, SDValue N1, SDValue N2) {
	  // TODO: We don't do any constant folding for strict FP opcodes here, but we
	  // should. That will require dealing with a potentially non-default
	  // rounding mode, checking the "opStatus" return value from the APFloat
	  // math calculations, and possibly other variations.
	  auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
	  auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
	  if (N1CFP && N2CFP) {
	    // The LHS is copied because the APFloat operations update it in place;
	    // the RHS is only read, so a reference is enough.
	    APFloat Result = N1CFP->getValueAPF();
	    const APFloat &RHS = N2CFP->getValueAPF();
	    switch (Opcode) {
	    case ISD::FADD:
	      Result.add(RHS, APFloat::rmNearestTiesToEven);
	      return getConstantFP(Result, DL, VT);
	    case ISD::FSUB:
	      Result.subtract(RHS, APFloat::rmNearestTiesToEven);
	      return getConstantFP(Result, DL, VT);
	    case ISD::FMUL:
	      Result.multiply(RHS, APFloat::rmNearestTiesToEven);
	      return getConstantFP(Result, DL, VT);
	    case ISD::FDIV:
	      Result.divide(RHS, APFloat::rmNearestTiesToEven);
	      return getConstantFP(Result, DL, VT);
	    case ISD::FREM:
	      Result.mod(RHS);
	      return getConstantFP(Result, DL, VT);
	    case ISD::FCOPYSIGN:
	      Result.copySign(RHS);
	      return getConstantFP(Result, DL, VT);
	    default: break;
	    }
	  }
	  if (N1CFP && Opcode == ISD::FP_ROUND) {
	    APFloat Rounded = N1CFP->getValueAPF(); // make copy
	    bool Unused;
	    // This can return overflow, underflow, or inexact; we don't care.
	    // FIXME need to be more flexible about rounding mode.
	    (void) Rounded.convert(EVTToAPFloatSemantics(VT),
	                           APFloat::rmNearestTiesToEven, &Unused);
	    return getConstantFP(Rounded, DL, VT);
	  }

	  switch (Opcode) {
	  case ISD::FADD:
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FDIV:
	  case ISD::FREM: {
	    // If both operands are undef, the result is undef. If 1 operand is undef,
	    // the result is NaN. This should match the behavior of the IR optimizer.
	    const bool LHSUndef = N1.isUndef();
	    const bool RHSUndef = N2.isUndef();
	    if (LHSUndef && RHSUndef)
	      return getUNDEF(VT);
	    if (LHSUndef || RHSUndef)
	      return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
	  }
	  }
	  return SDValue();
	}

	/// Build, fold, or reuse a two-operand node for \p Opcode with result type
	/// \p VT.
	///
	/// The steps run in this order:
	///  1. For commutative opcodes, a lone integer or FP constant operand is
	///     swapped into N2.
	///  2. Opcode-specific sanity asserts and trivial simplifications.
	///  3. FoldConstantArithmetic, then foldConstantFPMath.
	///  4. Folds for undef operands.
	///  5. Unless VT is Glue, an identical existing node is looked up and
	///     returned with its flags intersected with \p Flags; otherwise a new
	///     node is created and inserted.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue N1, SDValue N2, const SDNodeFlags Flags) {
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
	ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);

	// Canonicalize constant to RHS if commutative.
	// The cached constant pointers are swapped together with the operands so
	// that they keep describing N1 and N2.
	if (TLI->isCommutativeBinOp(Opcode)) {
	if (N1C && !N2C) {
	std::swap(N1C, N2C);
	std::swap(N1, N2);
	} else if (N1CFP && !N2CFP) {
	std::swap(N1CFP, N2CFP);
	std::swap(N1, N2);
	}
	}

	// Per-opcode operand checks and simplifications that need no general
	// constant folding.
	switch (Opcode) {
	default: break;
	case ISD::TokenFactor:
	assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
	N2.getValueType() == MVT::Other && "Invalid token factor!");
	// Fold trivial token factors.
	if (N1.getOpcode() == ISD::EntryToken) return N2;
	if (N2.getOpcode() == ISD::EntryToken) return N1;
	if (N1 == N2) return N1;
	break;
	case ISD::BUILD_VECTOR: {
	// Attempt to simplify BUILD_VECTOR.
	SDValue Ops[] = {N1, N2};
	if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::CONCAT_VECTORS: {
	// Attempt to simplify CONCAT_VECTORS.
	SDValue Ops[] = {N1, N2};
	if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::AND:
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
	// worth handling here.
	if (N2C && N2C->isNullValue())
	return N2;
	if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
	return N1;
	break;
	case ISD::OR:
	case ISD::XOR:
	case ISD::ADD:
	case ISD::SUB:
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	// (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
	// it's worth handling here.
	if (N2C && N2C->isNullValue())
	return N1;
	break;
	case ISD::UDIV:
	case ISD::UREM:
	case ISD::MULHU:
	case ISD::MULHS:
	case ISD::MUL:
	case ISD::SDIV:
	case ISD::SREM:
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX:
	case ISD::SADDSAT:
	case ISD::SSUBSAT:
	case ISD::UADDSAT:
	case ISD::USUBSAT:
	// Type checks only; no simplification is done for these opcodes here.
	assert(VT.isInteger() && "This operator does not apply to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM:
	assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
	assert(N1.getValueType() == N2.getValueType() &&
	N1.getValueType() == VT && "Binary operator types must match!");
	if (SDValue V = simplifyFPBinop(Opcode, N1, N2))
	return V;
	break;
	case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
	assert(N1.getValueType() == VT &&
	N1.getValueType().isFloatingPoint() &&
	N2.getValueType().isFloatingPoint() &&
	"Invalid FCOPYSIGN!");
	break;
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL:
	if (SDValue V = simplifyShift(N1, N2))
	return V;
	// Shifts share the operand checks and zero-amount fold with rotates.
	LLVM_FALLTHROUGH;
	case ISD::ROTL:
	case ISD::ROTR:
	assert(VT == N1.getValueType() &&
	"Shift operators return type must be the same as their first arg");
	assert(VT.isInteger() && N2.getValueType().isInteger() &&
	"Shifts only work on integers");
	assert((!VT.isVector() \|\| VT == N2.getValueType()) &&
	"Vector shift amounts must be in the same as their first arg");
	// Verify that the shift amount VT is big enough to hold valid shift
	// amounts. This catches things like trying to shift an i1024 value by an
	// i8, which is easy to fall into in generic code that uses
	// TLI.getShiftAmount().
	assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
	"Invalid use of small shift amount with oversized value!");

	// Always fold shifts of i1 values so the code generator doesn't need to
	// handle them. Since we know the size of the shift has to be less than the
	// size of the value, the shift/rotate count is guaranteed to be zero.
	if (VT == MVT::i1)
	return N1;
	if (N2C && N2C->isNullValue())
	return N1;
	break;
	case ISD::FP_ROUND_INREG: {
	// N2 is a VTSDNode carrying the type to round to. Note that the local
	// variable is itself named EVT.
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg round!");
	assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
	"Cannot FP_ROUND_INREG integer types");
	assert(EVT.isVector() == VT.isVector() &&
	"FP_ROUND_INREG type should be vector iff the operand "
	"type is vector!");
	assert((!EVT.isVector() \|\|
	EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
	"Vector element counts must match in FP_ROUND_INREG");
	assert(EVT.bitsLE(VT) && "Not rounding down!");
	// EVT is otherwise only used inside asserts.
	(void)EVT;
	if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
	break;
	}
	case ISD::FP_ROUND:
	assert(VT.isFloatingPoint() &&
	N1.getValueType().isFloatingPoint() &&
	VT.bitsLE(N1.getValueType()) &&
	N2C && (N2C->getZExtValue() == 0 \|\| N2C->getZExtValue() == 1) &&
	"Invalid FP_ROUND!");
	if (N1.getValueType() == VT) return N1; // noop conversion.
	break;
	case ISD::AssertSext:
	case ISD::AssertZext: {
	// N2 is a VTSDNode carrying the asserted (scalar) type.
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg extend!");
	assert(VT.isInteger() && EVT.isInteger() &&
	"Cannot *_EXTEND_INREG FP types");
	assert(!EVT.isVector() &&
	"AssertSExt/AssertZExt type should be the vector element type "
	"rather than the vector type!");
	assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
	if (VT.getScalarType() == EVT) return N1; // noop assertion.
	break;
	}
	case ISD::SIGN_EXTEND_INREG: {
	// N2 is a VTSDNode carrying the type to extend from. The local variable
	// EVT shadows the type name, hence the qualified llvm::EVT below.
	EVT EVT = cast<VTSDNode>(N2)->getVT();
	assert(VT == N1.getValueType() && "Not an inreg extend!");
	assert(VT.isInteger() && EVT.isInteger() &&
	"Cannot *_EXTEND_INREG FP types");
	assert(EVT.isVector() == VT.isVector() &&
	"SIGN_EXTEND_INREG type should be vector iff the operand "
	"type is vector!");
	assert((!EVT.isVector() \|\|
	EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
	"Vector element counts must match in SIGN_EXTEND_INREG");
	assert(EVT.bitsLE(VT) && "Not extending!");
	if (EVT == VT) return N1; // Not actually extending

	// Sign-extend the low FromBits bits of Val by shifting them to the top
	// and arithmetic-shifting back down.
	auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
	unsigned FromBits = EVT.getScalarSizeInBits();
	Val <<= Val.getBitWidth() - FromBits;
	Val.ashrInPlace(Val.getBitWidth() - FromBits);
	return getConstant(Val, DL, ConstantVT);
	};

	// Scalar constant operand.
	if (N1C) {
	const APInt &Val = N1C->getAPIntValue();
	return SignExtendInReg(Val, VT);
	}
	// Build vector of constants: fold each element, keeping undef elements.
	if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
	SmallVector<SDValue, 8> Ops;
	llvm::EVT OpVT = N1.getOperand(0).getValueType();
	for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	SDValue Op = N1.getOperand(i);
	if (Op.isUndef()) {
	Ops.push_back(getUNDEF(OpVT));
	continue;
	}
	ConstantSDNode *C = cast<ConstantSDNode>(Op);
	APInt Val = C->getAPIntValue();
	Ops.push_back(SignExtendInReg(Val, OpVT));
	}
	return getBuildVector(VT, DL, Ops);
	}
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT:
	assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() &&
	"The result of EXTRACT_VECTOR_ELT must be at least as wide as the \
	element type of the vector.");

	// EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
	if (N1.isUndef())
	return getUNDEF(VT);

	// EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
	if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
	return getUNDEF(VT);

	// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
	// expanding copies of large vectors from registers.
	if (N2C &&
	N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N1.getNumOperands() > 0) {
	// Factor is the element count of the first concat operand; the index is
	// split into (operand number, index within that operand).
	unsigned Factor =
	N1.getOperand(0).getValueType().getVectorNumElements();
	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	N1.getOperand(N2C->getZExtValue() / Factor),
	getConstant(N2C->getZExtValue() % Factor, DL,
	N2.getValueType()));
	}

	// EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
	// expanding large vector constants.
	if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
	SDValue Elt = N1.getOperand(N2C->getZExtValue());

	if (VT != Elt.getValueType())
	// If the vector element type is not legal, the BUILD_VECTOR operands
	// are promoted and implicitly truncated, and the result implicitly
	// extended. Make that explicit here.
	Elt = getAnyExtOrTrunc(Elt, DL, VT);

	return Elt;
	}

	// EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
	// operations are lowered to scalars.
	if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
	// If the indices are the same, return the inserted element else
	// if the indices are known different, extract the element from
	// the original vector.
	SDValue N1Op2 = N1.getOperand(2);
	ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);

	if (N1Op2C && N2C) {
	if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
	if (VT == N1.getOperand(1).getValueType())
	return N1.getOperand(1);
	else
	return getSExtOrTrunc(N1.getOperand(1), DL, VT);
	}

	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
	}
	}

	// EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
	// when vector types are scalarized and v1iX is legal.
	// vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
	if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N1.getValueType().getVectorNumElements() == 1) {
	return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
	N1.getOperand(1));
	}
	break;
	case ISD::EXTRACT_ELEMENT:
	assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
	assert(!N1.getValueType().isVector() && !VT.isVector() &&
	(N1.getValueType().isInteger() == VT.isInteger()) &&
	N1.getValueType() != VT &&
	"Wrong types for EXTRACT_ELEMENT!");

	// EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
	// 64-bit integers into 32-bit parts. Instead of building the extract of
	// the BUILD_PAIR, only to have legalize rip it apart, just do it now.
	if (N1.getOpcode() == ISD::BUILD_PAIR)
	return N1.getOperand(N2C->getZExtValue());

	// EXTRACT_ELEMENT of a constant int is also very common.
	if (N1C) {
	// Shift the requested part down to bit 0, then truncate to the part size.
	unsigned ElementSize = VT.getSizeInBits();
	unsigned Shift = ElementSize * N2C->getZExtValue();
	APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
	return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
	}
	break;
	case ISD::EXTRACT_SUBVECTOR:
	// The checks and folds below are only applied when both types are simple.
	if (VT.isSimple() && N1.getValueType().isSimple()) {
	assert(VT.isVector() && N1.getValueType().isVector() &&
	"Extract subvector VTs must be a vectors!");
	assert(VT.getVectorElementType() ==
	N1.getValueType().getVectorElementType() &&
	"Extract subvector VTs must have the same element type!");
	assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
	"Extract subvector must be from larger vector to smaller vector!");

	if (N2C) {
	assert((VT.getVectorNumElements() + N2C->getZExtValue()
	<= N1.getValueType().getVectorNumElements())
	&& "Extract subvector overflow!");
	}

	// Trivial extraction.
	if (VT.getSimpleVT() == N1.getSimpleValueType())
	return N1;

	// EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
	if (N1.isUndef())
	return getUNDEF(VT);

	// EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
	// the concat have the same type as the extract.
	if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N1.getNumOperands() > 0 &&
	VT == N1.getOperand(0).getValueType()) {
	unsigned Factor = VT.getVectorNumElements();
	return N1.getOperand(N2C->getZExtValue() / Factor);
	}

	// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
	// during shuffle legalization.
	if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
	VT == N1.getOperand(1).getValueType())
	return N1.getOperand(1);
	}
	break;
	}

	// Perform trivial constant folding.
	if (SDValue SV =
	FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
	return SV;

	if (SDValue V = foldConstantFPMath(Opcode, DL, VT, N1, N2))
	return V;

	// Canonicalize an UNDEF to the RHS, even over a constant.
	if (N1.isUndef()) {
	if (TLI->isCommutativeBinOp(Opcode)) {
	std::swap(N1, N2);
	} else {
	// Non-commutative opcodes with an undef LHS are folded directly.
	switch (Opcode) {
	case ISD::FP_ROUND_INREG:
	case ISD::SIGN_EXTEND_INREG:
	case ISD::SUB:
	return getUNDEF(VT); // fold op(undef, arg2) -> undef
	case ISD::UDIV:
	case ISD::SDIV:
	case ISD::UREM:
	case ISD::SREM:
	case ISD::SSUBSAT:
	case ISD::USUBSAT:
	return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
	}
	}
	}

	// Fold a bunch of operators when the RHS is undef.
	if (N2.isUndef()) {
	switch (Opcode) {
	case ISD::XOR:
	if (N1.isUndef())
	// Handle undef ^ undef -> 0 special case. This is a common
	// idiom (misuse).
	return getConstant(0, DL, VT);
	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::UDIV:
	case ISD::SDIV:
	case ISD::UREM:
	case ISD::SREM:
	return getUNDEF(VT); // fold op(arg1, undef) -> undef
	case ISD::MUL:
	case ISD::AND:
	case ISD::SSUBSAT:
	case ISD::USUBSAT:
	return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
	case ISD::OR:
	case ISD::SADDSAT:
	case ISD::UADDSAT:
	return getAllOnesConstant(DL, VT); // fold op(arg1, undef) -> all ones
	}
	}

	// Memoize this node if possible.
	SDNode *N;
	SDVTList VTs = getVTList(VT);
	SDValue Ops[] = {N1, N2};
	// Nodes whose result type is Glue are never looked up in or added to the
	// CSE map.
	if (VT != MVT::Glue) {
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTs, Ops);
	void *IP = nullptr;
	// Reuse an identical existing node, keeping only the flags that both the
	// existing node and this request have.
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
	E->intersectFlagsWith(Flags);
	return SDValue(E, 0);
	}

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	N->setFlags(Flags);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	// Flags are not set on this path.
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	createOperands(N, Ops);
	}

	InsertNode(N);
	SDValue V = SDValue(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	SDValue N1, SDValue N2, SDValue N3,
	const SDNodeFlags Flags) {
	// Perform various simplifications.
	switch (Opcode) {
	case ISD::FMA: {
	assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
	assert(N1.getValueType() == VT && N2.getValueType() == VT &&
	N3.getValueType() == VT && "FMA types must match!");
	ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
	ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
	if (N1CFP && N2CFP && N3CFP) {
	APFloat V1 = N1CFP->getValueAPF();
	const APFloat &V2 = N2CFP->getValueAPF();
	const APFloat &V3 = N3CFP->getValueAPF();
	V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
	return getConstantFP(V1, DL, VT);
	}
	break;
	}
	case ISD::BUILD_VECTOR: {
	// Attempt to simplify BUILD_VECTOR.
	SDValue Ops[] = {N1, N2, N3};
	if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::CONCAT_VECTORS: {
	SDValue Ops[] = {N1, N2, N3};
	if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
	return V;
	break;
	}
	case ISD::SETCC: {
	assert(VT.isInteger() && "SETCC result type must be an integer!");
	assert(N1.getValueType() == N2.getValueType() &&
	"SETCC operands must have the same type!");
	assert(VT.isVector() == N1.getValueType().isVector() &&
	"SETCC type should be vector iff the operand type is vector!");
	assert((!VT.isVector() \|\|
	VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
	"SETCC vector element counts must match!");
	// Use FoldSetCC to simplify SETCC's.
	if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
	return V;
	// Vector constant folding.
	SDValue Ops[] = {N1, N2, N3};
	if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops)) {
	NewSDValueDbgMsg(V, "New node vector constant folding: ", this);
	return V;
	}
	break;
	}
	case ISD::SELECT:
	case ISD::VSELECT:
	if (SDValue V = simplifySelect(N1, N2, N3))
	return V;
	break;
	case ISD::VECTOR_SHUFFLE:
	llvm_unreachable("should use getVectorShuffle constructor!");
	case ISD::INSERT_VECTOR_ELT: {
	ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
	// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
	if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
	return getUNDEF(VT);
	break;
	}
	case ISD::INSERT_SUBVECTOR: {
	// Inserting undef into undef is still undef.
	if (N1.isUndef() && N2.isUndef())
	return getUNDEF(VT);
	SDValue Index = N3;
	if (VT.isSimple() && N1.getValueType().isSimple()
	&& N2.getValueType().isSimple()) {
	assert(VT.isVector() && N1.getValueType().isVector() &&
	N2.getValueType().isVector() &&
	"Insert subvector VTs must be a vectors");
	assert(VT == N1.getValueType() &&
	"Dest and insert subvector source types must match!");
	assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
	"Insert subvector must be from smaller vector to larger vector!");
	if (isa<ConstantSDNode>(Index)) {
	assert((N2.getValueType().getVectorNumElements() +
	cast<ConstantSDNode>(Index)->getZExtValue()
	<= VT.getVectorNumElements())
	&& "Insert subvector overflow!");
	}

	// Trivial insertion.
	if (VT.getSimpleVT() == N2.getSimpleValueType())
	return N2;

	// If this is an insert of an extracted vector into an undef vector, we
	// can just use the input to the extract.
	if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
	return N2.getOperand(0);
	}
	break;
	}
	case ISD::BITCAST:
	// Fold bit_convert nodes from a type to themselves.
	if (N1.getValueType() == VT)
	return N1;
	break;
	}

	// Memoize node if it doesn't produce a flag.
	SDNode *N;
	SDVTList VTs = getVTList(VT);
	SDValue Ops[] = {N1, N2, N3};
	if (VT != MVT::Glue) {
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTs, Ops);
	void *IP = nullptr;
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
	E->intersectFlagsWith(Flags);
	return SDValue(E, 0);
	}

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	N->setFlags(Flags);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	createOperands(N, Ops);
	}

	InsertNode(N);
	SDValue V = SDValue(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	/// Four-operand convenience form: collects N1..N4 into an operand array and
	/// hands it to the operand-list overload of getNode.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
	  const SDValue Operands[] = {N1, N2, N3, N4};
	  return getNode(Opcode, DL, VT, ArrayRef<SDValue>(Operands));
	}

	/// Five-operand convenience form: collects N1..N5 into an operand array and
	/// hands it to the operand-list overload of getNode.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
	                              SDValue N5) {
	  const SDValue Operands[] = {N1, N2, N3, N4, N5};
	  return getNode(Opcode, DL, VT, ArrayRef<SDValue>(Operands));
	}

	/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
	/// the incoming stack arguments to be loaded from the stack.
	///
	/// \param Chain Chain value placed first in the resulting TokenFactor.
	/// \returns A TokenFactor over \p Chain and the chain result (value #1) of
	/// every load that uses the entry node and whose base pointer is a frame
	/// index with a negative index.
	SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
	  SmallVector<SDValue, 8> ArgChains;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument.
	  for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
	                            UE = getEntryNode().getNode()->use_end();
	       U != UE; ++U) {
	    // Dereference the use iterator to reach the using node; the dyn_cast
	    // result must be held in a pointer so it can be tested for null.
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        // Only loads based on a negative frame index are collected.
	        if (FI->getIndex() < 0)
	          ArgChains.push_back(SDValue(L, 1));
	  }

	  // Build a tokenfactor for all the chains.
	  return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// getMemsetValue - Vectorized representation of the memset value
	/// operand.
	///
	/// \param Value The fill value; must not be undef. When it is not a constant
	/// it is asserted to have type i8.
	/// \param VT Type of the store that will consume the result.
	/// \param DAG DAG in which new nodes are created.
	/// \param dl Location attached to the new nodes.
	/// \returns A value of type \p VT whose scalar elements each consist of the
	/// fill byte repeated across their full width.
	static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl) {
	  assert(!Value.isUndef());

	  unsigned NumBits = VT.getScalarSizeInBits();
	  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
	    // Constant fill byte: splat it at compile time.
	    assert(C->getAPIntValue().getBitWidth() == 8);
	    APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
	    if (VT.isInteger()) {
	      // The constant is marked opaque when it is wider than 64 bits or the
	      // target reports the byte is not a legal store immediate.
	      bool IsOpaque = VT.getSizeInBits() > 64 ||
	          !DAG.getTargetLoweringInfo().isLegalStoreImmediate(C->getSExtValue());
	      return DAG.getConstant(Val, dl, VT, false, IsOpaque);
	    }
	    return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
	                             VT);
	  }

	  assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");
	  // Do the widening in an integer type of the scalar's width.
	  EVT IntVT = VT.getScalarType();
	  if (!IntVT.isInteger())
	    IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());

	  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
	  if (NumBits > 8) {
	    // Use a multiplication with 0x010101... to extend the input to the
	    // required length.
	    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
	    Value = DAG.getNode(ISD::MUL, dl, IntVT, Value,
	                        DAG.getConstant(Magic, dl, IntVT));
	  }

	  // Convert back to the requested scalar type, then splat if VT is a vector.
	  if (VT != Value.getValueType() && !VT.isInteger())
	    Value = DAG.getBitcast(VT.getScalarType(), Value);
	  if (VT != Value.getValueType())
	    Value = DAG.getSplatBuildVector(VT, dl, Value);

	  return Value;
	}

	/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
	/// used when a memcpy is turned into a memset when the source is a constant
	/// string ptr.
	///
	/// \param VT Type of the value to materialize.
	/// \param dl Location attached to the new nodes.
	/// \param DAG DAG in which new nodes are created.
	/// \param TLI Target hooks, queried for whether an immediate is worthwhile.
	/// \param Slice Source bytes; a null Slice.Array means all zeros.
	/// \returns The constant to store, or an empty SDValue when the target does
	/// not want the load converted into an integer immediate.
	static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
	                                  const TargetLowering &TLI,
	                                  const ConstantDataArraySlice &Slice) {
	  // Handle vector with all elements zero.
	  if (Slice.Array == nullptr) {
	    if (VT.isInteger())
	      return DAG.getConstant(0, dl, VT);
	    else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
	      return DAG.getConstantFP(0.0, dl, VT);
	    else if (VT.isVector()) {
	      // Build an integer zero vector and bitcast it to the requested type.
	      unsigned NumElts = VT.getVectorNumElements();
	      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
	      return DAG.getNode(ISD::BITCAST, dl, VT,
	                         DAG.getConstant(0, dl,
	                                         EVT::getVectorVT(*DAG.getContext(),
	                                                          EltVT, NumElts)));
	    } else
	      llvm_unreachable("Expected type!");
	  }

	  assert(!VT.isVector() && "Can't handle vector type here!");
	  unsigned NumVTBits = VT.getSizeInBits();
	  unsigned NumVTBytes = NumVTBits / 8;
	  // Never read past the end of the slice; missing bytes stay zero.
	  unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));

	  // Assemble the bytes into an integer in the target's byte order.
	  // NOTE(review): each byte is shifted as a uint64_t, which presumably
	  // relies on NumVTBits <= 64 - confirm against callers.
	  APInt Val(NumVTBits, 0);
	  if (DAG.getDataLayout().isLittleEndian()) {
	    for (unsigned i = 0; i != NumBytes; ++i)
	      Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
	  } else {
	    for (unsigned i = 0; i != NumBytes; ++i)
	      Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
	  }

	  // If the "cost" of materializing the integer immediate is less than the cost
	  // of a load, then it is cost effective to turn the load into the immediate.
	  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
	  if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
	    return DAG.getConstant(Val, dl, VT);
	  return SDValue(nullptr, 0);
	}

	/// Produce an ADD of \p Base and the constant \p Offset, both expressed in
	/// the value type of \p Base.
	SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
	                                           const SDLoc &DL) {
	  const EVT PtrVT = Base.getValueType();
	  SDValue OffsetNode = getConstant(Offset, DL, PtrVT);
	  return getNode(ISD::ADD, DL, PtrVT, Base, OffsetNode);
	}

	/// Returns true if memcpy source is constant data.
	///
	/// Recognizes a bare GlobalAddress or an ADD of a GlobalAddress and a
	/// Constant. On a match, \p Slice is filled in by getConstantDataArrayInfo
	/// for the global at the combined offset, and that call's result is returned.
	static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
	  GlobalAddressSDNode *GA = nullptr;
	  uint64_t ConstOffset = 0;

	  const unsigned Opc = Src.getOpcode();
	  if (Opc == ISD::GlobalAddress) {
	    GA = cast<GlobalAddressSDNode>(Src);
	  } else if (Opc == ISD::ADD) {
	    SDValue BasePart = Src.getOperand(0);
	    SDValue OffsetPart = Src.getOperand(1);
	    if (BasePart.getOpcode() == ISD::GlobalAddress &&
	        OffsetPart.getOpcode() == ISD::Constant) {
	      GA = cast<GlobalAddressSDNode>(BasePart);
	      ConstOffset = cast<ConstantSDNode>(OffsetPart)->getZExtValue();
	    }
	  }

	  // Neither recognized shape: not a constant source.
	  if (!GA)
	    return false;

	  return getConstantDataArrayInfo(GA->getGlobal(), Slice, 8,
	                                  ConstOffset + GA->getOffset());
	}

	/// Decide whether memory-function lowering should favor code size for \p MF.
	static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
	  const auto &F = MF.getFunction();
	  // On Darwin, -Os means optimize for size without hurting performance, so
	  // only really optimize for size when -Oz (MinSize) is used.
	  const bool IsDarwin = MF.getTarget().getTargetTriple().isOSDarwin();
	  return IsDarwin ? F.hasMinSize() : F.hasOptSize();
	}

	/// Group the loads and stores with indices in [From, To) so that every store
	/// in the range is chained after all loads in the range.
	///
	/// The load chains are appended to \p OutChains and joined by a TokenFactor;
	/// each store is then re-created with that TokenFactor as its chain and
	/// appended to \p OutChains as well.
	///
	/// \param DAG DAG in which new nodes are created.
	/// \param dl Location attached to the new nodes.
	/// \param OutChains Receives the load chains and the rebuilt stores.
	/// \param From First index (inclusive) to process.
	/// \param To Last index (exclusive) to process.
	/// \param OutLoadChains Chain results of the loads; must be non-empty.
	/// \param OutStoreChains The original stores; must be non-empty and every
	/// element in the range must be a StoreSDNode.
	static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
	                                         SmallVector<SDValue, 32> &OutChains, unsigned From,
	                                         unsigned To, SmallVector<SDValue, 16> &OutLoadChains,
	                                         SmallVector<SDValue, 16> &OutStoreChains) {
	  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
	  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
	  SmallVector<SDValue, 16> GluedLoadChains;
	  for (unsigned i = From; i < To; ++i) {
	    OutChains.push_back(OutLoadChains[i]);
	    GluedLoadChains.push_back(OutLoadChains[i]);
	  }

	  // Chain for all loads.
	  SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                                  GluedLoadChains);

	  for (unsigned i = From; i < To; ++i) {
	    // The result is dereferenced unconditionally, so use cast<> (which
	    // asserts on a type mismatch) rather than dyn_cast<> (which would
	    // silently yield a null pointer).
	    StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]);
	    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
	                                         ST->getBasePtr(), ST->getMemoryVT(),
	                                         ST->getMemOperand());
	    OutChains.push_back(NewStore);
	  }
	}

	/// Lower a memcpy of a known size into a sequence of loads and stores.
	///
	/// \param DAG Selection DAG where lowered code is placed.
	/// \param dl Location attached to the new nodes.
	/// \param Chain Incoming chain.
	/// \param Dst Destination pointer.
	/// \param Src Source pointer.
	/// \param Size Number of bytes to copy.
	/// \param Align Alignment passed by the caller; may be raised locally when
	/// the destination is a non-fixed stack object.
	/// \param isVol True if the accesses must be marked volatile.
	/// \param AlwaysInline When true, the target's store-count limit is ignored.
	/// \param DstPtrInfo Pointer info for the destination.
	/// \param SrcPtrInfo Pointer info for the source.
	/// \returns \p Chain unchanged if \p Src is undef, an empty SDValue if
	/// findOptimalMemOpLowering declines, otherwise a TokenFactor over the
	/// emitted operations.
	static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
	                                       SDValue Chain, SDValue Dst, SDValue Src,
	                                       uint64_t Size, unsigned Align,
	                                       bool isVol, bool AlwaysInline,
	                                       MachinePointerInfo DstPtrInfo,
	                                       MachinePointerInfo SrcPtrInfo) {
	  // Turn a memcpy of undef to nop.
	  // FIXME: We need to honor volatile even if Src is undef.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memcpy to a series of load and store ops if the size operand falls
	  // below a certain threshold.
	  // TODO: In the AlwaysInline case, if the size is big then generate a loop
	  // rather than maybe a humongous number of loads and stores.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  LLVMContext &C = *DAG.getContext();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed frame object can have its alignment
	  // raised below.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  if (Align > SrcAlign)
	    SrcAlign = Align;
	  ConstantDataArraySlice Slice;
	  bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
	  bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
	  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);

	  if (!TLI.findOptimalMemOpLowering(
	          MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align),
	          (isZeroConstant ? 0 : SrcAlign), /*IsMemset=*/false,
	          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant,
	          /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(),
	          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(C);
	    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);

	    // Don't promote to an alignment that would require dynamic stack
	    // realignment.
	    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
	    if (!TRI->needsStackRealignment(MF))
	      while (NewAlign > Align &&
	             DL.exceedsNaturalStackAlignment(NewAlign))
	        NewAlign /= 2;

	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  MachineMemOperand::Flags MMOFlags =
	      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
	  SmallVector<SDValue, 16> OutLoadChains;
	  SmallVector<SDValue, 16> OutStoreChains;
	  SmallVector<SDValue, 32> OutChains;
	  unsigned NumMemOps = MemOps.size();
	  uint64_t SrcOff = 0, DstOff = 0;
	  for (unsigned i = 0; i != NumMemOps; ++i) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    SDValue Value, Store;

	    if (VTSize > Size) {
	      // Issuing an unaligned load / store pair that overlaps with the previous
	      // pair. Adjust the offset accordingly.
	      assert(i == NumMemOps-1 && i != 0);
	      SrcOff -= VTSize - Size;
	      DstOff -= VTSize - Size;
	    }

	    if (CopyFromConstant &&
	        (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
	      // It's unlikely a store of a vector immediate can be done in a single
	      // instruction. It would require a load from a constantpool first.
	      // We only handle zero vectors here.
	      // FIXME: Handle other cases where store of vector immediate is done in
	      // a single instruction.
	      ConstantDataArraySlice SubSlice;
	      if (SrcOff < Slice.Length) {
	        SubSlice = Slice;
	        SubSlice.move(SrcOff);
	      } else {
	        // This is an out-of-bounds access and hence UB. Pretend we read zero.
	        SubSlice.Array = nullptr;
	        SubSlice.Offset = 0;
	        SubSlice.Length = VTSize;
	      }
	      Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
	      if (Value.getNode()) {
	        Store = DAG.getStore(Chain, dl, Value,
	                             DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	                             DstPtrInfo.getWithOffset(DstOff), Align,
	                             MMOFlags);
	        OutChains.push_back(Store);
	      }
	    }

	    if (!Store.getNode()) {
	      // The type might not be legal for the target. This should only happen
	      // if the type is smaller than a legal type, as on PPC, so the right
	      // thing to do is generate a LoadExt/StoreTrunc pair. These simplify
	      // to Load/Store if NVT==VT.
	      // FIXME does the case above also need this?
	      EVT NVT = TLI.getTypeToTransformTo(C, VT);
	      assert(NVT.bitsGE(VT));

	      bool isDereferenceable =
	          SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
	      MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
	      if (isDereferenceable)
	        SrcMMOFlags |= MachineMemOperand::MODereferenceable;

	      Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
	                             DAG.getMemBasePlusOffset(Src, SrcOff, dl),
	                             SrcPtrInfo.getWithOffset(SrcOff), VT,
	                             MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
	      OutLoadChains.push_back(Value.getValue(1));

	      Store = DAG.getTruncStore(
	          Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	          DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
	      OutStoreChains.push_back(Store);
	    }
	    SrcOff += VTSize;
	    DstOff += VTSize;
	    Size -= VTSize;
	  }

	  // A zero MaxLdStGlue defers to the target's limit.
	  unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
	                                TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
	  unsigned NumLdStInMemcpy = OutStoreChains.size();

	  if (NumLdStInMemcpy) {
	    // It may be that memcpy might be converted to memset if it's memcpy
	    // of constants. In such a case, we won't have loads and stores, but
	    // just stores. In the absence of loads, there is nothing to gang up.
	    if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
	      // If target does not care, just leave as it.
	      for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
	        OutChains.push_back(OutLoadChains[i]);
	        OutChains.push_back(OutStoreChains[i]);
	      }
	    } else {
	      // Ld/St less than/equal limit set by target.
	      if (NumLdStInMemcpy <= GluedLdStLimit) {
	        chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
	                                     NumLdStInMemcpy, OutLoadChains,
	                                     OutStoreChains);
	      } else {
	        unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
	        unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
	        unsigned GlueIter = 0;

	        // Full groups are formed from the end of the list towards the front.
	        for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
	          unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
	          unsigned IndexTo = NumLdStInMemcpy - GlueIter;

	          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
	                                       OutLoadChains, OutStoreChains);
	          GlueIter += GluedLdStLimit;
	        }

	        // Residual ld/st.
	        if (RemainingLdStInMemcpy) {
	          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
	                                       RemainingLdStInMemcpy, OutLoadChains,
	                                       OutStoreChains);
	        }
	      }
	    }
	  }
	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// Lower a memmove of a known size into loads followed by stores.
	///
	/// All loads are emitted first and joined by a TokenFactor; every store is
	/// chained after that TokenFactor.
	///
	/// \param DAG Selection DAG where lowered code is placed.
	/// \param dl Location attached to the new nodes.
	/// \param Chain Incoming chain.
	/// \param Dst Destination pointer.
	/// \param Src Source pointer.
	/// \param Size Number of bytes to move.
	/// \param Align Alignment passed by the caller; may be raised locally when
	/// the destination is a non-fixed stack object.
	/// \param isVol True if the accesses must be marked volatile.
	/// \param AlwaysInline When true, the target's store-count limit is ignored.
	/// \param DstPtrInfo Pointer info for the destination.
	/// \param SrcPtrInfo Pointer info for the source.
	/// \returns \p Chain unchanged if \p Src is undef, an empty SDValue if
	/// findOptimalMemOpLowering declines, otherwise a TokenFactor over the
	/// emitted stores.
	static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
	                                        SDValue Chain, SDValue Dst, SDValue Src,
	                                        uint64_t Size, unsigned Align,
	                                        bool isVol, bool AlwaysInline,
	                                        MachinePointerInfo DstPtrInfo,
	                                        MachinePointerInfo SrcPtrInfo) {
	  // Turn a memmove of undef to nop.
	  // FIXME: We need to honor volatile even if Src is undef.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memmove to a series of load and store ops if the size operand falls
	  // below a certain threshold.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  LLVMContext &C = *DAG.getContext();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed frame object can have its alignment
	  // raised below.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  if (Align > SrcAlign)
	    SrcAlign = Align;
	  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
	  // FIXME: `AllowOverlap` should really be `!isVol` but there is a bug in
	  // findOptimalMemOpLowering. Meanwhile, setting it to `false` produces the
	  // correct code.
	  bool AllowOverlap = false;
	  if (!TLI.findOptimalMemOpLowering(
	          MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign,
	          /*IsMemset=*/false, /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
	          AllowOverlap, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
	          MF.getFunction().getAttributes()))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(C);
	    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  MachineMemOperand::Flags MMOFlags =
	      isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
	  uint64_t SrcOff = 0, DstOff = 0;
	  SmallVector<SDValue, 8> LoadValues;
	  SmallVector<SDValue, 8> LoadChains;
	  SmallVector<SDValue, 8> OutChains;
	  unsigned NumMemOps = MemOps.size();

	  // Phase 1: issue every load off the incoming chain.
	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;

	    bool isDereferenceable =
	        SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
	    MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
	    if (isDereferenceable)
	      SrcMMOFlags |= MachineMemOperand::MODereferenceable;

	    SDValue Value =
	        DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
	                    SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags);
	    LoadValues.push_back(Value);
	    LoadChains.push_back(Value.getValue(1));
	    SrcOff += VTSize;
	  }

	  // Phase 2: chain every store after the TokenFactor of all loads.
	  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;

	    SDValue Store = DAG.getStore(Chain, dl, LoadValues[i],
	                                 DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	                                 DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
	    OutChains.push_back(Store);
	    DstOff += VTSize;
	  }

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// Lower the call to 'memset' intrinsic function into a series of store
	/// operations.
	///
	/// \param DAG Selection DAG where lowered code is placed.
	/// \param dl Link to corresponding IR location.
	/// \param Chain Control flow dependency.
	/// \param Dst Pointer to destination memory location.
	/// \param Src Value of byte to write into the memory.
	/// \param Size Number of bytes to write.
	/// \param Align Alignment of the destination in bytes.
	/// \param isVol True if destination is volatile.
	/// \param DstPtrInfo IR information on the memory pointer.
	/// \returns New head in the control flow, if lowering was successful, empty
	/// SDValue otherwise.
	///
	/// The function tries to replace 'llvm.memset' intrinsic with several store
	/// operations and value calculation code. This is usually profitable for small
	/// memory size.
	static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
	                               SDValue Chain, SDValue Dst, SDValue Src,
	                               uint64_t Size, unsigned Align, bool isVol,
	                               MachinePointerInfo DstPtrInfo) {
	  // Turn a memset of undef to nop.
	  // FIXME: We need to honor volatile even if Src is undef.
	  if (Src.isUndef())
	    return Chain;

	  // Expand memset to a series of load/store ops if the size operand
	  // falls below a certain threshold.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  std::vector<EVT> MemOps;
	  bool DstAlignCanChange = false;
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool OptSize = shouldLowerMemFuncForSize(MF);
	  // A destination that is a non-fixed frame object can have its alignment
	  // raised below.
	  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
	  if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
	    DstAlignCanChange = true;
	  bool IsZeroVal =
	      isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
	  if (!TLI.findOptimalMemOpLowering(
	          MemOps, TLI.getMaxStoresPerMemset(OptSize), Size,
	          (DstAlignCanChange ? 0 : Align), 0, /*IsMemset=*/true,
	          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
	          /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(), ~0u,
	          MF.getFunction().getAttributes()))
	    return SDValue();

	  if (DstAlignCanChange) {
	    Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
	    unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
	    if (NewAlign > Align) {
	      // Give the stack frame object a larger alignment if needed.
	      if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
	        MFI.setObjectAlignment(FI->getIndex(), NewAlign);
	      Align = NewAlign;
	    }
	  }

	  SmallVector<SDValue, 8> OutChains;
	  uint64_t DstOff = 0;
	  unsigned NumMemOps = MemOps.size();

	  // Find the largest store and generate the bit pattern for it.
	  EVT LargestVT = MemOps[0];
	  for (unsigned i = 1; i < NumMemOps; i++)
	    if (MemOps[i].bitsGT(LargestVT))
	      LargestVT = MemOps[i];
	  SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);

	  for (unsigned i = 0; i < NumMemOps; i++) {
	    EVT VT = MemOps[i];
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    if (VTSize > Size) {
	      // Issuing an unaligned load / store pair that overlaps with the previous
	      // pair. Adjust the offset accordingly.
	      assert(i == NumMemOps-1 && i != 0);
	      DstOff -= VTSize - Size;
	    }

	    // If this store is smaller than the largest store see whether we can get
	    // the smaller value for free with a truncate.
	    SDValue Value = MemSetValue;
	    if (VT.bitsLT(LargestVT)) {
	      if (!LargestVT.isVector() && !VT.isVector() &&
	          TLI.isTruncateFree(LargestVT, VT))
	        Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
	      else
	        Value = getMemsetValue(Src, VT, DAG, dl);
	    }
	    assert(Value.getValueType() == VT && "Value with wrong type.");
	    SDValue Store = DAG.getStore(
	        Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
	        DstPtrInfo.getWithOffset(DstOff), Align,
	        isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
	    OutChains.push_back(Store);
	    DstOff += VTSize;
	    Size -= VTSize;
	  }

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	}

	/// Abort with a fatal error unless address space \p AS is 0 or the target
	/// reports that casting from \p AS to address space 0 is a no-op.
	static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
	                                            unsigned AS) {
	  // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
	  // pointer operands can be losslessly bitcasted to pointers of address space 0
	  if (AS == 0 || TLI->isNoopAddrSpaceCast(AS, 0))
	    return;

	  report_fatal_error("cannot lower memory intrinsic in address space " +
	                     Twine(AS));
	}

	/// Lower a memcpy. Strategies are tried in order: inline loads/stores within
	/// the target's limits (constant size only), target-specific code, forced
	/// inline expansion when \p AlwaysInline is set, and finally a call to the
	/// MEMCPY libcall.
	///
	/// \returns The resulting chain.
	SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                SDValue Src, SDValue Size, unsigned Align,
	                                bool isVol, bool AlwaysInline, bool isTailCall,
	                                MachinePointerInfo DstPtrInfo,
	                                MachinePointerInfo SrcPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memcpy to loads and stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  ConstantSDNode *SizeC = dyn_cast<ConstantSDNode>(Size);
	  if (SizeC) {
	    // Memcpy with size zero? Just return the original chain.
	    if (SizeC->isNullValue())
	      return Chain;

	    // First attempt honors the target's store limit (AlwaysInline = false).
	    if (SDValue Inlined = getMemcpyLoadsAndStores(
	            *this, dl, Chain, Dst, Src, SizeC->getZExtValue(), Align, isVol,
	            false, DstPtrInfo, SrcPtrInfo))
	      return Inlined;
	  }

	  // Then check to see if we should lower the memcpy with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    if (SDValue Custom = TSI->EmitTargetCodeForMemcpy(
	            *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
	            DstPtrInfo, SrcPtrInfo))
	      return Custom;
	  }

	  // If we really need inline code and the target declined to provide it,
	  // use a (potentially long) sequence of loads and stores.
	  if (AlwaysInline) {
	    assert(SizeC && "AlwaysInline requires a constant size!");
	    return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
	                                   SizeC->getZExtValue(), Align, isVol,
	                                   true, DstPtrInfo, SrcPtrInfo);
	  }

	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
	  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

	  // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
	  // memcpy is not guaranteed to be safe. libc memcpys aren't required to
	  // respect volatile, so they may do things like read or write memory
	  // beyond the given memory regions. But fixing this isn't easy, and most
	  // people don't care.

	  // Emit a library call: memcpy(i8* dst, i8* src, intptr size).
	  TargetLowering::ArgListTy Args;
	  auto AppendArg = [&Args](SDValue Node, Type *Ty) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = Node;
	    Entry.Ty = Ty;
	    Args.push_back(Entry);
	  };
	  Type *BytePtrTy = Type::getInt8PtrTy(*getContext());
	  AppendArg(Dst, BytePtrTy);
	  AppendArg(Src, BytePtrTy);
	  AppendArg(Size, getDataLayout().getIntPtrType(*getContext()));

	  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
	                                     TLI->getPointerTy(getDataLayout()));
	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl);
	  CLI.setChain(Chain);
	  CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
	                   Dst.getValueType().getTypeForEVT(*getContext()), Callee,
	                   std::move(Args));
	  CLI.setDiscardResult();
	  CLI.setTailCall(isTailCall);

	  // The second element of the lowering result is the output chain.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Lower an element-wise unordered-atomic memcpy to the libcall selected by
	/// \p ElemSz. Reports a fatal error when no libcall exists for that size.
	///
	/// \returns The chain produced by the call.
	SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
	                                      SDValue Dst, unsigned DstAlign,
	                                      SDValue Src, unsigned SrcAlign,
	                                      SDValue Size, Type *SizeTy,
	                                      unsigned ElemSz, bool isTailCall,
	                                      MachinePointerInfo DstPtrInfo,
	                                      MachinePointerInfo SrcPtrInfo) {
	  // Emit a library call. Both pointers are typed as pointer-sized integers.
	  TargetLowering::ArgListTy Args;
	  auto AppendArg = [&Args](SDValue Node, Type *Ty) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = Node;
	    Entry.Ty = Ty;
	    Args.push_back(Entry);
	  };
	  Type *PtrIntTy = getDataLayout().getIntPtrType(*getContext());
	  AppendArg(Dst, PtrIntTy);
	  AppendArg(Src, PtrIntTy);
	  AppendArg(Size, SizeTy);

	  const RTLIB::Libcall LC = RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
	  if (LC == RTLIB::UNKNOWN_LIBCALL)
	    report_fatal_error("Unsupported element size");

	  SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
	                                     TLI->getPointerTy(getDataLayout()));
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl);
	  CLI.setChain(Chain);
	  CLI.setLibCallee(TLI->getLibcallCallingConv(LC),
	                   Type::getVoidTy(*getContext()), Callee, std::move(Args));
	  CLI.setDiscardResult();
	  CLI.setTailCall(isTailCall);

	  // The second element of the lowering result is the output chain.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Lower a memmove. Strategies are tried in order: inline loads/stores within
	/// the target's limits (constant size only), target-specific code, and
	/// finally a call to the MEMMOVE libcall.
	///
	/// \returns The resulting chain.
	SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                 SDValue Src, SDValue Size, unsigned Align,
	                                 bool isVol, bool isTailCall,
	                                 MachinePointerInfo DstPtrInfo,
	                                 MachinePointerInfo SrcPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memmove to loads and stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  if (ConstantSDNode *SizeC = dyn_cast<ConstantSDNode>(Size)) {
	    // Memmove with size zero? Just return the original chain.
	    if (SizeC->isNullValue())
	      return Chain;

	    if (SDValue Inlined = getMemmoveLoadsAndStores(
	            *this, dl, Chain, Dst, Src, SizeC->getZExtValue(), Align, isVol,
	            false, DstPtrInfo, SrcPtrInfo))
	      return Inlined;
	  }

	  // Then check to see if we should lower the memmove with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    if (SDValue Custom = TSI->EmitTargetCodeForMemmove(
	            *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo,
	            SrcPtrInfo))
	      return Custom;
	  }

	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
	  checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());

	  // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
	  // not be safe. See memcpy above for more details.

	  // Emit a library call: memmove(i8* dst, i8* src, intptr size).
	  TargetLowering::ArgListTy Args;
	  auto AppendArg = [&Args](SDValue Node, Type *Ty) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = Node;
	    Entry.Ty = Ty;
	    Args.push_back(Entry);
	  };
	  Type *BytePtrTy = Type::getInt8PtrTy(*getContext());
	  AppendArg(Dst, BytePtrTy);
	  AppendArg(Src, BytePtrTy);
	  AppendArg(Size, getDataLayout().getIntPtrType(*getContext()));

	  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
	                                     TLI->getPointerTy(getDataLayout()));
	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl);
	  CLI.setChain(Chain);
	  CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
	                   Dst.getValueType().getTypeForEVT(*getContext()), Callee,
	                   std::move(Args));
	  CLI.setDiscardResult();
	  CLI.setTailCall(isTailCall);

	  // The second element of the lowering result is the output chain.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Lower an element-wise unordered-atomic memmove to the libcall selected by
	/// \p ElemSz. Reports a fatal error when no libcall exists for that size.
	///
	/// \returns The chain produced by the call.
	SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
	                                       SDValue Dst, unsigned DstAlign,
	                                       SDValue Src, unsigned SrcAlign,
	                                       SDValue Size, Type *SizeTy,
	                                       unsigned ElemSz, bool isTailCall,
	                                       MachinePointerInfo DstPtrInfo,
	                                       MachinePointerInfo SrcPtrInfo) {
	  // Emit a library call. Both pointers are typed as pointer-sized integers.
	  TargetLowering::ArgListTy Args;
	  auto AppendArg = [&Args](SDValue Node, Type *Ty) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = Node;
	    Entry.Ty = Ty;
	    Args.push_back(Entry);
	  };
	  Type *PtrIntTy = getDataLayout().getIntPtrType(*getContext());
	  AppendArg(Dst, PtrIntTy);
	  AppendArg(Src, PtrIntTy);
	  AppendArg(Size, SizeTy);

	  const RTLIB::Libcall LC = RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
	  if (LC == RTLIB::UNKNOWN_LIBCALL)
	    report_fatal_error("Unsupported element size");

	  SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
	                                     TLI->getPointerTy(getDataLayout()));
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl);
	  CLI.setChain(Chain);
	  CLI.setLibCallee(TLI->getLibcallCallingConv(LC),
	                   Type::getVoidTy(*getContext()), Callee, std::move(Args));
	  CLI.setDiscardResult();
	  CLI.setTailCall(isTailCall);

	  // The second element of the lowering result is the output chain.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Lower a memset. Strategies are tried in order: inline stores within the
	/// target's limits (constant size only), target-specific code, and finally a
	/// call to the MEMSET libcall.
	///
	/// \returns The resulting chain.
	SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
	                                SDValue Src, SDValue Size, unsigned Align,
	                                bool isVol, bool isTailCall,
	                                MachinePointerInfo DstPtrInfo) {
	  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");

	  // Check to see if we should lower the memset to stores first.
	  // For cases within the target-specified limits, this is the best choice.
	  if (ConstantSDNode *SizeC = dyn_cast<ConstantSDNode>(Size)) {
	    // Memset with size zero? Just return the original chain.
	    if (SizeC->isNullValue())
	      return Chain;

	    if (SDValue Inlined =
	            getMemsetStores(*this, dl, Chain, Dst, Src, SizeC->getZExtValue(),
	                            Align, isVol, DstPtrInfo))
	      return Inlined;
	  }

	  // Then check to see if we should lower the memset with target-specific
	  // code. If the target chooses to do this, this is the next best.
	  if (TSI) {
	    if (SDValue Custom = TSI->EmitTargetCodeForMemset(
	            *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo))
	      return Custom;
	  }

	  checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());

	  // Emit a library call: memset(i8* dst, <type of Src> value, intptr size).
	  TargetLowering::ArgListTy Args;
	  auto AppendArg = [&Args](SDValue Node, Type *Ty) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = Node;
	    Entry.Ty = Ty;
	    Args.push_back(Entry);
	  };
	  AppendArg(Dst, Type::getInt8PtrTy(*getContext()));
	  AppendArg(Src, Src.getValueType().getTypeForEVT(*getContext()));
	  AppendArg(Size, getDataLayout().getIntPtrType(*getContext()));

	  SDValue Callee = getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
	                                     TLI->getPointerTy(getDataLayout()));
	  // FIXME: pass in SDLoc
	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl);
	  CLI.setChain(Chain);
	  CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
	                   Dst.getValueType().getTypeForEVT(*getContext()), Callee,
	                   std::move(Args));
	  CLI.setDiscardResult();
	  CLI.setTailCall(isTailCall);

	  // The second element of the lowering result is the output chain.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Emit a call to the element-wise unordered-atomic memset library routine
	/// selected by \p ElemSz. Reports a fatal error when no routine exists for
	/// that element size.
	SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
	                                      SDValue Dst, unsigned DstAlign,
	                                      SDValue Value, SDValue Size, Type *SizeTy,
	                                      unsigned ElemSz, bool isTailCall,
	                                      MachinePointerInfo DstPtrInfo) {
	  // Library call arguments, in order: destination, value, size.
	  TargetLowering::ArgListTy Args;
	  auto AddArg = [&Args](Type *Ty, SDValue Node) {
	    TargetLowering::ArgListEntry Entry;
	    Entry.Ty = Ty;
	    Entry.Node = Node;
	    Args.push_back(Entry);
	  };
	  AddArg(getDataLayout().getIntPtrType(*getContext()), Dst);
	  AddArg(Type::getInt8Ty(*getContext()), Value);
	  AddArg(SizeTy, Size);

	  const RTLIB::Libcall LibraryCall =
	      RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
	  if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
	    report_fatal_error("Unsupported element size");

	  TargetLowering::CallLoweringInfo CLI(*this);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
	                    Type::getVoidTy(*getContext()),
	                    getExternalSymbol(TLI->getLibcallName(LibraryCall),
	                                      TLI->getPointerTy(getDataLayout())),
	                    std::move(Args))
	      .setDiscardResult()
	      .setTailCall(isTailCall);

	  // Only the second element of the lowered call result is handed back.
	  return TLI->LowerCallTo(CLI).second;
	}

	/// Create (or reuse through the CSE map) an AtomicSDNode with the given
	/// opcode, result types, operands and memory operand.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                SDVTList VTList, ArrayRef<SDValue> Ops,
	                                MachineMemOperand *MMO) {
	  // CSE key: memory VT, then the generic node identity, then address space.
	  FoldingSetNodeID NodeID;
	  NodeID.AddInteger(MemVT.getRawBits());
	  AddNodeIDNode(NodeID, Opcode, VTList, Ops);
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<AtomicSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(),
	                                       dl.getDebugLoc(), VTList, MemVT, MMO);
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  return SDValue(Node, 0);
	}

	/// Build an atomic compare-and-swap node.
	///
	/// \p Opcode must be ISD::ATOMIC_CMP_SWAP or
	/// ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, and \p Cmp and \p Swp must have the
	/// same value type. The operands are passed on as {Chain, Ptr, Cmp, Swp}.
	SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
	                                       EVT MemVT, SDVTList VTs, SDValue Chain,
	                                       SDValue Ptr, SDValue Cmp, SDValue Swp,
	                                       MachineMemOperand *MMO) {
	  assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
	         Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
	  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");

	  SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
	  return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
	}

	/// Build an atomic read-modify-write, swap or store node taking a single
	/// value operand.
	///
	/// ISD::ATOMIC_STORE produces only a chain result (MVT::Other); every other
	/// accepted opcode produces a value of \p Val's type plus a chain.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                SDValue Chain, SDValue Ptr, SDValue Val,
	                                MachineMemOperand *MMO) {
	  assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
	          Opcode == ISD::ATOMIC_LOAD_SUB ||
	          Opcode == ISD::ATOMIC_LOAD_AND ||
	          Opcode == ISD::ATOMIC_LOAD_CLR ||
	          Opcode == ISD::ATOMIC_LOAD_OR ||
	          Opcode == ISD::ATOMIC_LOAD_XOR ||
	          Opcode == ISD::ATOMIC_LOAD_NAND ||
	          Opcode == ISD::ATOMIC_LOAD_MIN ||
	          Opcode == ISD::ATOMIC_LOAD_MAX ||
	          Opcode == ISD::ATOMIC_LOAD_UMIN ||
	          Opcode == ISD::ATOMIC_LOAD_UMAX ||
	          Opcode == ISD::ATOMIC_LOAD_FADD ||
	          Opcode == ISD::ATOMIC_LOAD_FSUB ||
	          Opcode == ISD::ATOMIC_SWAP ||
	          Opcode == ISD::ATOMIC_STORE) &&
	         "Invalid Atomic Op");

	  EVT VT = Val.getValueType();

	  // A store has no value result; everything else returns the value type too.
	  SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
	                                               getVTList(VT, MVT::Other);
	  SDValue Ops[] = {Chain, Ptr, Val};
	  return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
	}

	/// Build an ISD::ATOMIC_LOAD node producing a value of type \p VT and a
	/// chain. No other opcode is accepted.
	SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
	                                EVT VT, SDValue Chain, SDValue Ptr,
	                                MachineMemOperand *MMO) {
	  assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");

	  SDValue LoadOps[] = {Chain, Ptr};
	  SDVTList ResultVTs = getVTList(VT, MVT::Other);
	  return getAtomic(Opcode, dl, MemVT, ResultVTs, LoadOps, MMO);
	}

	/// Create a MERGE_VALUES node whose results mirror the given operands.
	/// A single operand is returned as-is without creating a node.
	SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
	  if (Ops.size() == 1)
	    return Ops[0];

	  // One result type per operand, in operand order.
	  SmallVector<EVT, 4> ResultVTs;
	  ResultVTs.reserve(Ops.size());
	  for (const SDValue &Op : Ops)
	    ResultVTs.push_back(Op.getValueType());

	  return getNode(ISD::MERGE_VALUES, dl, getVTList(ResultVTs), Ops);
	}

	/// Convenience overload that builds the MachineMemOperand from its parts
	/// and forwards to the MMO-taking overload.
	///
	/// A zero \p Align is replaced by getEVTAlignment(MemVT); a zero \p Size is
	/// replaced by MemVT's store size.
	SDValue SelectionDAG::getMemIntrinsicNode(
	    unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
	    EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
	    MachineMemOperand::Flags Flags, unsigned Size, const AAMDNodes &AAInfo) {
	  // Codegen must never observe an alignment of 0.
	  if (Align == 0)
	    Align = getEVTAlignment(MemVT);

	  if (Size == 0)
	    Size = MemVT.getStoreSize();

	  MachineMemOperand *MemOp = getMachineFunction().getMachineMemOperand(
	      PtrInfo, Flags, Size, Align, AAInfo);

	  return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MemOp);
	}

	/// Create a MemIntrinsicSDNode for a memory-accessing opcode.
	///
	/// Accepted opcodes are INTRINSIC_VOID, INTRINSIC_W_CHAIN, PREFETCH,
	/// LIFETIME_START, LIFETIME_END and anything at or above
	/// ISD::FIRST_TARGET_MEMORY_OPCODE. The node goes through the CSE map
	/// unless its last result type is MVT::Glue.
	SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
	                                          SDVTList VTList,
	                                          ArrayRef<SDValue> Ops, EVT MemVT,
	                                          MachineMemOperand *MMO) {
	  assert((Opcode == ISD::INTRINSIC_VOID ||
	          Opcode == ISD::INTRINSIC_W_CHAIN ||
	          Opcode == ISD::PREFETCH ||
	          Opcode == ISD::LIFETIME_START ||
	          Opcode == ISD::LIFETIME_END ||
	          ((int)Opcode <= std::numeric_limits<int>::max() &&
	           (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
	         "Opcode is not a memory-accessing opcode!");

	  // Memoize the node unless it returns a flag.
	  MemIntrinsicSDNode *N;
	  if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opcode, VTList, Ops);
	    ID.AddInteger(getSyntheticNodeSubclassData<MemIntrinsicSDNode>(
	        Opcode, dl.getIROrder(), VTList, MemVT, MMO));
	    ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	    void *IP = nullptr;
	    if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	      // Reuse the existing node, letting it pick up alignment info from MMO.
	      cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
	      return SDValue(E, 0);
	    }

	    N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
	                                      VTList, MemVT, MMO);
	    createOperands(N, Ops);

	    CSEMap.InsertNode(N, IP);
	  } else {
	    // Glue-producing nodes are created fresh and never entered in the CSE map.
	    N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
	                                      VTList, MemVT, MMO);
	    createOperands(N, Ops);
	  }
	  InsertNode(N);
	  return SDValue(N, 0);
	}

	/// Create (or reuse) a LIFETIME_START / LIFETIME_END node for the stack
	/// object \p FrameIndex, recording \p Size and \p Offset on the node.
	SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
	                                      SDValue Chain, int FrameIndex,
	                                      int64_t Size, int64_t Offset) {
	  const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
	  const auto VTs = getVTList(MVT::Other);

	  // Operands are the chain and the frame index, requested with 'true' as
	  // the third getFrameIndex argument.
	  SDValue Ops[2] = {
	      Chain,
	      getFrameIndex(FrameIndex,
	                    getTargetLoweringInfo().getFrameIndexTy(getDataLayout()),
	                    true)};

	  // The CSE key also covers the frame index, size and offset.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, Opcode, VTs, Ops);
	  NodeID.AddInteger(FrameIndex);
	  NodeID.AddInteger(Size);
	  NodeID.AddInteger(Offset);

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  LifetimeSDNode *Node = newSDNode<LifetimeSDNode>(
	      Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset);
	  createOperands(Node, Ops);
	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);

	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
	/// MachinePointerInfo record from it. This is particularly useful because the
	/// code generator has many cases where it doesn't bother passing in a
	/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
	///
	/// Returns \p Info unchanged when \p Ptr is neither a frame index nor an ADD
	/// of a frame index (operand 0) and a constant (operand 1).
	static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
	                                           SelectionDAG &DAG, SDValue Ptr,
	                                           int64_t Offset = 0) {
	  // If this is FI+Offset, we can model it.
	  if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
	    return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
	                                             FI->getIndex(), Offset);

	  // If this is (FI+Offset1)+Offset2, we can model it.
	  if (Ptr.getOpcode() != ISD::ADD ||
	      !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
	      !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
	    return Info;

	  int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	  return MachinePointerInfo::getFixedStack(
	      DAG.getMachineFunction(), FI,
	      Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
	}

	/// SDValue-offset flavour of InferPointerInfo: try to derive a
	/// MachinePointerInfo for a "frame index + constant" address so callers of
	/// getLoad / getStore need not supply one themselves.
	///
	/// A constant offset is forwarded as its sign-extended value, an undef
	/// offset is treated as no offset, and anything else leaves \p Info as is.
	static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
	                                           SelectionDAG &DAG, SDValue Ptr,
	                                           SDValue OffsetOp) {
	  ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(OffsetOp);
	  if (ConstOffset)
	    return InferPointerInfo(Info, DAG, Ptr, ConstOffset->getSExtValue());

	  // A non-constant offset can only be handled when it is undef.
	  if (!OffsetOp.isUndef())
	    return Info;

	  return InferPointerInfo(Info, DAG, Ptr);
	}

	/// Fully general load builder that constructs the MachineMemOperand itself
	/// and forwards to the MMO-taking overload.
	///
	/// A zero \p Alignment is replaced by getEVTAlignment(MemVT). MOLoad is
	/// added to \p MMOFlags, which must not contain MOStore. When \p PtrInfo
	/// carries no value, InferPointerInfo is used to try to derive one from
	/// \p Ptr and \p Offset.
	SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
	                              EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, SDValue Offset,
	                              MachinePointerInfo PtrInfo, EVT MemVT,
	                              unsigned Alignment,
	                              MachineMemOperand::Flags MMOFlags,
	                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(MemVT);

	  MMOFlags |= MachineMemOperand::MOLoad;
	  assert((MMOFlags & MachineMemOperand::MOStore) == 0);
	  // If we don't have a PtrInfo, infer the trivial frame index case to simplify
	  // clients.
	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
	  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
	}

	/// Create (or reuse through the CSE map) a LoadSDNode.
	///
	/// When \p VT equals \p MemVT the extension type is forced to NON_EXTLOAD.
	/// Extending loads must widen the scalar type without switching between
	/// integer and FP, between scalar and vector, or changing the vector
	/// element count. An unindexed load must have an undef \p Offset. Indexed
	/// loads produce an extra result of \p Ptr's type.
	SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
	                              EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, SDValue Offset, EVT MemVT,
	                              MachineMemOperand *MMO) {
	  if (VT == MemVT) {
	    ExtType = ISD::NON_EXTLOAD;
	  } else if (ExtType == ISD::NON_EXTLOAD) {
	    assert(VT == MemVT && "Non-extending load from different memory type!");
	  } else {
	    // Extending load.
	    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
	           "Should only be an extending load, not truncating!");
	    assert(VT.isInteger() == MemVT.isInteger() &&
	           "Cannot convert from FP to Int or Int -> FP!");
	    assert(VT.isVector() == MemVT.isVector() &&
	           "Cannot use an ext load to convert to or from a vector!");
	    assert((!VT.isVector() ||
	            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
	           "Cannot use an ext load to change the number of vector elements!");
	  }

	  bool Indexed = AM != ISD::UNINDEXED;
	  assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");

	  SDVTList VTs = Indexed ?
	    getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
	  SDValue Ops[] = { Chain, Ptr, Offset };
	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
	  ID.AddInteger(MemVT.getRawBits());
	  ID.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
	      dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO));
	  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<LoadSDNode>(E)->refineAlignment(MMO);
	    return SDValue(E, 0);
	  }
	  auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
	                                  ExtType, MemVT, MMO);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Plain load: unindexed, non-extending, with an undef offset. The memory
	/// type is the same as the result type \p VT.
	SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, MachinePointerInfo PtrInfo,
	                              unsigned Alignment,
	                              MachineMemOperand::Flags MMOFlags,
	                              const AAMDNodes &AAInfo, const MDNode *Ranges) {
	  const SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
	                 NoOffset, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
	}

	/// Plain load using a caller-provided memory operand: unindexed,
	/// non-extending, with an undef offset and memory type \p VT.
	SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                              SDValue Ptr, MachineMemOperand *MMO) {
	  const SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr,
	                 NoOffset, VT, MMO);
	}

	/// Unindexed load of \p MemVT with the requested extension kind, producing
	/// \p VT. The memory operand is built by the forwarded-to getLoad overload.
	SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
	                                 EVT VT, SDValue Chain, SDValue Ptr,
	                                 MachinePointerInfo PtrInfo, EVT MemVT,
	                                 unsigned Alignment,
	                                 MachineMemOperand::Flags MMOFlags,
	                                 const AAMDNodes &AAInfo) {
	  const SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset,
	                 PtrInfo, MemVT, Alignment, MMOFlags, AAInfo);
	}

	/// Unindexed load of \p MemVT with the requested extension kind, producing
	/// \p VT and using a caller-provided memory operand.
	SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
	                                 EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
	                                 MachineMemOperand *MMO) {
	  const SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, NoOffset, MemVT,
	                 MMO);
	}

	/// Build an indexed version of the existing load \p OrigLoad using the
	/// given base, offset and addressing mode.
	///
	/// \p OrigLoad must be an unindexed LoadSDNode (undef offset). Chain,
	/// extension type, pointer info, memory VT, alignment and AA info are taken
	/// from it; the MOInvariant and MODereferenceable flags are dropped.
	SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
	                                     SDValue Base, SDValue Offset,
	                                     ISD::MemIndexedMode AM) {
	  LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
	  assert(LD->getOffset().isUndef() && "Load is already a indexed load!");
	  // Don't propagate the invariant or dereferenceable flags.
	  auto MMOFlags =
	      LD->getMemOperand()->getFlags() &
	      ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
	  return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
	                 LD->getChain(), Base, Offset, LD->getPointerInfo(),
	                 LD->getMemoryVT(), LD->getAlignment(), MMOFlags,
	                 LD->getAAInfo());
	}

	/// Store builder that constructs the MachineMemOperand itself and forwards
	/// to the MMO-taking overload.
	///
	/// A zero \p Alignment is replaced by the alignment of \p Val's type.
	/// MOStore is added to \p MMOFlags, which must not contain MOLoad. When
	/// \p PtrInfo carries no value, InferPointerInfo is used to try to derive
	/// one from \p Ptr.
	SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                               SDValue Ptr, MachinePointerInfo PtrInfo,
	                               unsigned Alignment,
	                               MachineMemOperand::Flags MMOFlags,
	                               const AAMDNodes &AAInfo) {
	  assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(Val.getValueType());

	  MMOFlags |= MachineMemOperand::MOStore;
	  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
	  return getStore(Chain, dl, Val, Ptr, MMO);
	}

	/// Create (or reuse through the CSE map) an unindexed, non-truncating
	/// StoreSDNode of \p Val to \p Ptr. The offset operand is undef.
	SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                               SDValue Ptr, MachineMemOperand *MMO) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  EVT StoredVT = Val.getValueType();
	  SDVTList ResultVTs = getVTList(MVT::Other);
	  SDValue NoOffset = getUNDEF(Ptr.getValueType());
	  SDValue Ops[] = { Chain, Val, Ptr, NoOffset };

	  // CSE key: node identity, stored VT, subclass data, address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::STORE, ResultVTs, Ops);
	  NodeID.AddInteger(StoredVT.getRawBits());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
	      dl.getIROrder(), ResultVTs, ISD::UNINDEXED, false, StoredVT, MMO));
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<StoreSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node =
	      newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), ResultVTs,
	                             ISD::UNINDEXED, false, StoredVT, MMO);
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Truncating-store builder that constructs the MachineMemOperand itself
	/// and forwards to the MMO-taking overload.
	///
	/// A zero \p Alignment is replaced by getEVTAlignment(SVT). MOStore is added
	/// to \p MMOFlags, which must not contain MOLoad. When \p PtrInfo carries no
	/// value, InferPointerInfo is used to try to derive one from \p Ptr. The
	/// memory operand's size is the store size of \p SVT.
	SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                                    SDValue Ptr, MachinePointerInfo PtrInfo,
	                                    EVT SVT, unsigned Alignment,
	                                    MachineMemOperand::Flags MMOFlags,
	                                    const AAMDNodes &AAInfo) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  if (Alignment == 0) // Ensure that codegen never sees alignment 0
	    Alignment = getEVTAlignment(SVT);

	  MMOFlags |= MachineMemOperand::MOStore;
	  assert((MMOFlags & MachineMemOperand::MOLoad) == 0);

	  if (PtrInfo.V.isNull())
	    PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);

	  MachineFunction &MF = getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
	  return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
	}

	/// Create (or reuse through the CSE map) an unindexed truncating store of
	/// \p Val to memory type \p SVT.
	///
	/// When \p SVT equals \p Val's type this degenerates to a regular getStore.
	/// Otherwise \p SVT must be strictly narrower (scalar-wise) and must agree
	/// with the value type on integer-ness, vector-ness and element count.
	SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
	                                    SDValue Ptr, EVT SVT,
	                                    MachineMemOperand *MMO) {
	  EVT VT = Val.getValueType();

	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  if (VT == SVT)
	    return getStore(Chain, dl, Val, Ptr, MMO);

	  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
	         "Should only be a truncating store, not extending!");
	  assert(VT.isInteger() == SVT.isInteger() &&
	         "Can't do FP-INT conversion!");
	  assert(VT.isVector() == SVT.isVector() &&
	         "Cannot use trunc store to convert to or from a vector!");
	  assert((!VT.isVector() ||
	          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
	         "Cannot use trunc store to change the number of vector elements!");

	  SDVTList VTs = getVTList(MVT::Other);
	  SDValue Undef = getUNDEF(Ptr.getValueType());
	  SDValue Ops[] = { Chain, Val, Ptr, Undef };
	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID ID;
	  AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
	  ID.AddInteger(SVT.getRawBits());
	  ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
	      dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO));
	  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
	  void *IP = nullptr;
	  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<StoreSDNode>(E)->refineAlignment(MMO);
	    return SDValue(E, 0);
	  }
	  auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
	                                   ISD::UNINDEXED, true, SVT, MMO);
	  createOperands(N, Ops);

	  CSEMap.InsertNode(N, IP);
	  InsertNode(N);
	  SDValue V(N, 0);
	  NewSDValueDbgMsg(V, "Creating new node: ", this);
	  return V;
	}

	/// Build an indexed version of the existing store \p OrigStore using the
	/// given base, offset and addressing mode.
	///
	/// \p OrigStore must be an unindexed StoreSDNode (undef offset). The new
	/// node produces a value of \p Base's type plus a chain, and reuses the
	/// original's truncation flag, memory VT and memory operand.
	SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
	                                      SDValue Base, SDValue Offset,
	                                      ISD::MemIndexedMode AM) {
	  StoreSDNode *Orig = cast<StoreSDNode>(OrigStore);
	  assert(Orig->getOffset().isUndef() && "Store is already a indexed store!");

	  SDVTList ResultVTs = getVTList(Base.getValueType(), MVT::Other);
	  SDValue Ops[] = { Orig->getChain(), Orig->getValue(), Base, Offset };

	  // CSE key: node identity, memory VT, the original's raw subclass data,
	  // and the address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::STORE, ResultVTs, Ops);
	  NodeID.AddInteger(Orig->getMemoryVT().getRawBits());
	  NodeID.AddInteger(Orig->getRawSubclassData());
	  NodeID.AddInteger(Orig->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing)
	    return SDValue(Existing, 0);

	  auto *Node = newSDNode<StoreSDNode>(
	      dl.getIROrder(), dl.getDebugLoc(), ResultVTs, AM,
	      Orig->isTruncatingStore(), Orig->getMemoryVT(), Orig->getMemOperand());
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse through the CSE map) a MaskedLoadSDNode with operands
	/// {Chain, Ptr, Mask, PassThru}, producing \p VT and a chain.
	SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
	                                    SDValue Ptr, SDValue Mask, SDValue PassThru,
	                                    EVT MemVT, MachineMemOperand *MMO,
	                                    ISD::LoadExtType ExtTy, bool isExpanding) {
	  SDVTList ResultVTs = getVTList(VT, MVT::Other);
	  SDValue Ops[] = { Chain, Ptr, Mask, PassThru };

	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::MLOAD, ResultVTs, Ops);
	  NodeID.AddInteger(MemVT.getRawBits());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
	      dl.getIROrder(), ResultVTs, ExtTy, isExpanding, MemVT, MMO));
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<MaskedLoadSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node =
	      newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), ResultVTs,
	                                  ExtTy, isExpanding, MemVT, MMO);
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse through the CSE map) a MaskedStoreSDNode with operands
	/// {Chain, Val, Ptr, Mask}, producing only a chain.
	SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
	                                     SDValue Val, SDValue Ptr, SDValue Mask,
	                                     EVT MemVT, MachineMemOperand *MMO,
	                                     bool IsTruncating, bool IsCompressing) {
	  assert(Chain.getValueType() == MVT::Other &&
	         "Invalid chain type");
	  SDVTList ResultVTs = getVTList(MVT::Other);
	  SDValue Ops[] = { Chain, Val, Ptr, Mask };

	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::MSTORE, ResultVTs, Ops);
	  NodeID.AddInteger(MemVT.getRawBits());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
	      dl.getIROrder(), ResultVTs, IsTruncating, IsCompressing, MemVT, MMO));
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<MaskedStoreSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *Node = newSDNode<MaskedStoreSDNode>(
	      dl.getIROrder(), dl.getDebugLoc(), ResultVTs, IsTruncating,
	      IsCompressing, MemVT, MMO);
	  createOperands(Node, Ops);

	  CSEMap.InsertNode(Node, InsertPos);
	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse through the CSE map) a MaskedGatherSDNode from exactly
	/// six operands.
	///
	/// For a newly created node the asserts check that the pass-through value
	/// has the result type, the mask has as many elements as the result, the
	/// index has at least that many, and the scale is a constant power of 2.
	SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
	                                      ArrayRef<SDValue> Ops,
	                                      MachineMemOperand *MMO) {
	  assert(Ops.size() == 6 && "Incompatible number of operands");

	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::MGATHER, VTs, Ops);
	  NodeID.AddInteger(VT.getRawBits());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
	      dl.getIROrder(), VTs, VT, MMO));
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<MaskedGatherSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
	                                          VTs, VT, MMO);
	  createOperands(N, Ops);

	  // Sanity-check the operands now that they are attached to the node.
	  assert(N->getPassThru().getValueType() == N->getValueType(0) &&
	         "Incompatible type of the PassThru value in MaskedGatherSDNode");
	  assert(N->getMask().getValueType().getVectorNumElements() ==
	             N->getValueType(0).getVectorNumElements() &&
	         "Vector width mismatch between mask and data");
	  assert(N->getIndex().getValueType().getVectorNumElements() >=
	             N->getValueType(0).getVectorNumElements() &&
	         "Vector width mismatch between index and data");
	  assert(isa<ConstantSDNode>(N->getScale()) &&
	         cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
	         "Scale should be a constant power of 2");

	  CSEMap.InsertNode(N, InsertPos);
	  InsertNode(N);
	  SDValue Result(N, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Create (or reuse through the CSE map) a MaskedScatterSDNode from exactly
	/// six operands.
	///
	/// For a newly created node the asserts check that the mask has as many
	/// elements as the stored value, the index has at least that many, and the
	/// scale is a constant power of 2.
	SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
	                                       ArrayRef<SDValue> Ops,
	                                       MachineMemOperand *MMO) {
	  assert(Ops.size() == 6 && "Incompatible number of operands");

	  // CSE key: node identity, memory VT, subclass data, address space.
	  FoldingSetNodeID NodeID;
	  AddNodeIDNode(NodeID, ISD::MSCATTER, VTs, Ops);
	  NodeID.AddInteger(VT.getRawBits());
	  NodeID.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
	      dl.getIROrder(), VTs, VT, MMO));
	  NodeID.AddInteger(MMO->getPointerInfo().getAddrSpace());

	  void *InsertPos = nullptr;
	  SDNode *Existing = FindNodeOrInsertPos(NodeID, dl, InsertPos);
	  if (Existing) {
	    // Reuse the existing node, letting it pick up alignment info from MMO.
	    cast<MaskedScatterSDNode>(Existing)->refineAlignment(MMO);
	    return SDValue(Existing, 0);
	  }

	  auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
	                                           VTs, VT, MMO);
	  createOperands(N, Ops);

	  // Sanity-check the operands now that they are attached to the node.
	  assert(N->getMask().getValueType().getVectorNumElements() ==
	             N->getValue().getValueType().getVectorNumElements() &&
	         "Vector width mismatch between mask and data");
	  assert(N->getIndex().getValueType().getVectorNumElements() >=
	             N->getValue().getValueType().getVectorNumElements() &&
	         "Vector width mismatch between index and data");
	  assert(isa<ConstantSDNode>(N->getScale()) &&
	         cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
	         "Scale should be a constant power of 2");

	  CSEMap.InsertNode(N, InsertPos);
	  InsertNode(N);
	  SDValue Result(N, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// Try to fold a select of \p T / \p F on \p Cond to one of its operands.
	/// Returns an empty SDValue when none of the rules below applies.
	SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
	  // select undef, T, F --> T when T is a constant, otherwise F
	  if (Cond.isUndef()) {
	    if (isConstantValueOfAnyType(T))
	      return T;
	    return F;
	  }

	  // select ?, undef, F --> F
	  if (T.isUndef())
	    return F;

	  // select ?, T, undef --> T
	  if (F.isUndef())
	    return T;

	  // select true, T, F --> T
	  // select false, T, F --> F
	  if (auto *ConstCond = dyn_cast<ConstantSDNode>(Cond)) {
	    if (ConstCond->isNullValue())
	      return F;
	    return T;
	  }

	  // TODO: This should simplify VSELECT with constant condition using something
	  // like this (but check boolean contents to be complete?):
	  // if (ISD::isBuildVectorAllOnes(Cond.getNode()))
	  // return T;
	  // if (ISD::isBuildVectorAllZeros(Cond.getNode()))
	  // return F;

	  // select ?, T, T --> T
	  return T == F ? T : SDValue();
	}

	/// Try to fold a shift of \p X by \p Y. Returns an empty SDValue when none
	/// of the rules below applies.
	SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
	  // shift undef, Y --> 0 (can always assume that the undef value is 0)
	  if (X.isUndef())
	    return getConstant(0, SDLoc(X.getNode()), X.getValueType());
	  // shift X, undef --> undef (because it may shift by the bitwidth)
	  if (Y.isUndef())
	    return getUNDEF(X.getValueType());

	  // shift 0, Y --> 0
	  // shift X, 0 --> X
	  if (isNullOrNullSplat(X) || isNullOrNullSplat(Y))
	    return X;

	  // shift X, C >= bitwidth(X) --> undef
	  // All vector elements must be too big (or undef) to avoid partial undefs.
	  // A null Val is treated as too big by the predicate.
	  auto isShiftTooBig = [X](ConstantSDNode *Val) {
	    return !Val || Val->getAPIntValue().uge(X.getScalarValueSizeInBits());
	  };
	  if (ISD::matchUnaryPredicate(Y, isShiftTooBig, true))
	    return getUNDEF(X.getValueType());

	  return SDValue();
	}

	/// Try to fold an FP binary operation whose right-hand operand \p Y is a
	/// constant or constant splat (undef elements allowed) to \p X.
	///
	/// Handles X + -0.0, X - +0.0, X * 1.0 and X / 1.0. Returns an empty
	/// SDValue when \p Y is not such a constant or no rule applies.
	// TODO: Use fast-math-flags to enable more simplifications.
	SDValue SelectionDAG::simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y) {
	  // YC is a pointer: it is null-tested and dereferenced below.
	  ConstantFPSDNode *YC = isConstOrConstSplatFP(Y, /* AllowUndefs */ true);
	  if (!YC)
	    return SDValue();

	  // X + -0.0 --> X
	  if (Opcode == ISD::FADD)
	    if (YC->getValueAPF().isNegZero())
	      return X;

	  // X - +0.0 --> X
	  if (Opcode == ISD::FSUB)
	    if (YC->getValueAPF().isPosZero())
	      return X;

	  // X * 1.0 --> X
	  // X / 1.0 --> X
	  if (Opcode == ISD::FMUL || Opcode == ISD::FDIV)
	    if (YC->getValueAPF().isExactlyValue(1.0))
	      return X;

	  return SDValue();
	}

	/// Create an ISD::VAARG node producing \p VT and a chain. The alignment is
	/// passed as an i32 target constant in the last operand.
	SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
	                               SDValue Ptr, SDValue SV, unsigned Align) {
	  SDValue AlignOp = getTargetConstant(Align, dl, MVT::i32);
	  SDValue VAArgOps[] = { Chain, Ptr, SV, AlignOp };
	  SDVTList ResultVTs = getVTList(VT, MVT::Other);
	  return getNode(ISD::VAARG, dl, ResultVTs, VAArgOps);
	}

	/// getNode overload taking operands as SDUse. Up to three operands are
	/// dispatched to the fixed-arity overloads; larger lists are copied into an
	/// SDValue array first.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              ArrayRef<SDUse> Ops) {
	  const auto OpCount = Ops.size();
	  if (OpCount == 0)
	    return getNode(Opcode, DL, VT);
	  if (OpCount == 1)
	    return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
	  if (OpCount == 2)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
	  if (OpCount == 3)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);

	  // General case: convert the SDUse range to SDValues so the regular
	  // getNode logic can be used.
	  SmallVector<SDValue, 8> ValueOps(Ops.begin(), Ops.end());
	  return getNode(Opcode, DL, VT, ValueOps);
	}

	/// General single-result getNode taking an operand list and node flags.
	///
	/// Lists of up to three operands are dispatched to the fixed-arity
	/// overloads. For longer lists BUILD_VECTOR and CONCAT_VECTORS are offered
	/// to their folders, SELECT_CC and BR_CC are sanity-checked, and the node
	/// is then created through the CSE map unless \p VT is MVT::Glue.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
	  const unsigned NumOps = Ops.size();
	  if (NumOps == 0)
	    return getNode(Opcode, DL, VT);
	  if (NumOps == 1)
	    return getNode(Opcode, DL, VT, Ops[0], Flags);
	  if (NumOps == 2)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
	  if (NumOps == 3)
	    return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2], Flags);

	  switch (Opcode) {
	  case ISD::BUILD_VECTOR:
	    // Attempt to simplify BUILD_VECTOR.
	    if (SDValue Folded = FoldBUILD_VECTOR(DL, VT, Ops, *this))
	      return Folded;
	    break;
	  case ISD::CONCAT_VECTORS:
	    if (SDValue Folded = foldCONCAT_VECTORS(DL, VT, Ops, *this))
	      return Folded;
	    break;
	  case ISD::SELECT_CC:
	    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
	    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
	           "LHS and RHS of condition must have same type!");
	    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
	           "True and False arms of SelectCC must have same type!");
	    assert(Ops[2].getValueType() == VT &&
	           "select_cc node must be of same type as true and false value!");
	    break;
	  case ISD::BR_CC:
	    assert(NumOps == 5 && "BR_CC takes 5 operands!");
	    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
	           "LHS/RHS of comparison should match types!");
	    break;
	  default:
	    break;
	  }

	  SDNode *Node;
	  SDVTList ResultVTs = getVTList(VT);

	  if (VT == MVT::Glue) {
	    // Glue-producing nodes are created fresh and never memoized.
	    Node = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
	                             ResultVTs);
	    createOperands(Node, Ops);
	  } else {
	    // Memoize: return an existing identical node when there is one.
	    FoldingSetNodeID NodeID;
	    AddNodeIDNode(NodeID, Opcode, ResultVTs, Ops);
	    void *InsertPos = nullptr;

	    if (SDNode *Existing = FindNodeOrInsertPos(NodeID, DL, InsertPos))
	      return SDValue(Existing, 0);

	    Node = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
	                             ResultVTs);
	    createOperands(Node, Ops);

	    CSEMap.InsertNode(Node, InsertPos);
	  }

	  InsertNode(Node);
	  SDValue Result(Node, 0);
	  NewSDValueDbgMsg(Result, "Creating new node: ", this);
	  return Result;
	}

	/// getNode overload taking the result types as a list of EVTs; converts
	/// them to an SDVTList and forwards.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
	                              ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(ResultTys);
	  return getNode(Opcode, DL, ResultVTs, Ops);
	}

	/// General multi-result getNode taking an SDVTList and an operand list.
	///
	/// A single-entry VT list is forwarded to the EVT-taking overload. Otherwise
	/// the node is created through the CSE map, unless the last result type is
	/// MVT::Glue, in which case a fresh node is always created.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	ArrayRef<SDValue> Ops) {
	if (VTList.NumVTs == 1)
	return getNode(Opcode, DL, VTList.VTs[0], Ops);

	// NOTE(review): the region below is compiled out. It refers to N1, N2, N3
	// and VT, none of which are declared in this function.
	#if 0
	switch (Opcode) {
	// FIXME: figure out how to safely handle things like
	// int foo(int x) { return 1 << (x & 255); }
	// int bar() { return foo(256); }
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS:
	case ISD::SHL_PARTS:
	if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
	cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
	return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
	else if (N3.getOpcode() == ISD::AND)
	if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
	// If the and is only masking out bits that cannot effect the shift,
	// eliminate the and.
	unsigned NumBits = VT.getScalarSizeInBits()*2;
	if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
	return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
	}
	break;
	}
	#endif

	// Memoize the node unless it returns a flag.
	SDNode *N;
	if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
	FoldingSetNodeID ID;
	AddNodeIDNode(ID, Opcode, VTList, Ops);
	void *IP = nullptr;
	// Reuse an existing identical node when the CSE lookup finds one.
	if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
	return SDValue(E, 0);

	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
	createOperands(N, Ops);
	CSEMap.InsertNode(N, IP);
	} else {
	// Glue-producing nodes are not entered in the CSE map.
	N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
	createOperands(N, Ops);
	}
	InsertNode(N);
	SDValue V(N, 0);
	NewSDValueDbgMsg(V, "Creating new node: ", this);
	return V;
	}

	/// getNode overload for a multi-result node without operands; forwards an
	/// empty operand list.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
	                              SDVTList VTList) {
	  ArrayRef<SDValue> NoOperands = None;
	  return getNode(Opcode, DL, VTList, NoOperands);
	}

	/// getNode overload for a multi-result node with one operand; packs it
	/// into an array and forwards to the operand-list overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1) {
	  SDValue Operands[] = { N1 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with two operands; packs them
	/// into an array and forwards to the operand-list overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2) {
	  SDValue Operands[] = { N1, N2 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with three operands; packs
	/// them into an array and forwards to the operand-list overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3) {
	  SDValue Operands[] = { N1, N2, N3 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with four operands; packs
	/// them into an array and forwards to the operand-list overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
	  SDValue Operands[] = { N1, N2, N3, N4 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// getNode overload for a multi-result node with five operands; packs
	/// them into an array and forwards to the operand-list overload.
	SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
	                              SDValue N1, SDValue N2, SDValue N3, SDValue N4,
	                              SDValue N5) {
	  SDValue Operands[] = { N1, N2, N3, N4, N5 };
	  return getNode(Opcode, DL, VTList, Operands);
	}

	/// Return the one-element SDVTList for \p VT, built on the storage
	/// returned by SDNode::getValueTypeList.
	SDVTList SelectionDAG::getVTList(EVT VT) {
	  const EVT *SingleVT = SDNode::getValueTypeList(VT);
	  return makeVTList(SingleVT, 1);
	}

	/// Return the uniqued two-element SDVTList {VT1, VT2}, creating and caching
	/// it in VTListMap on first use.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
	  // Lookup key: element count followed by the raw bits of each type.
	  FoldingSetNodeID Key;
	  Key.AddInteger(2U);
	  Key.AddInteger(VT1.getRawBits());
	  Key.AddInteger(VT2.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
	    return Cached->getSDVTList();

	  // Not cached yet: allocate the type array and the list node.
	  EVT *Storage = Allocator.Allocate<EVT>(2);
	  Storage[0] = VT1;
	  Storage[1] = VT2;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, 2);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the uniqued three-element SDVTList {VT1, VT2, VT3}, creating and
	/// caching it in VTListMap on first use.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
	  // Lookup key: element count followed by the raw bits of each type.
	  FoldingSetNodeID Key;
	  Key.AddInteger(3U);
	  Key.AddInteger(VT1.getRawBits());
	  Key.AddInteger(VT2.getRawBits());
	  Key.AddInteger(VT3.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Key, InsertPos))
	    return Cached->getSDVTList();

	  // Not cached yet: allocate the type array and the list node.
	  EVT *Storage = Allocator.Allocate<EVT>(3);
	  Storage[0] = VT1;
	  Storage[1] = VT2;
	  Storage[2] = VT3;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(Key.Intern(Allocator), Storage, 3);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the value-type list {VT1, VT2, VT3, VT4}, reusing the entry in
	/// VTListMap when one with the same profile is already present.
	SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
	  // Profile: element count first, then the raw bits of each type in order.
	  FoldingSetNodeID Profile;
	  Profile.AddInteger(4U);
	  Profile.AddInteger(VT1.getRawBits());
	  Profile.AddInteger(VT2.getRawBits());
	  Profile.AddInteger(VT3.getRawBits());
	  Profile.AddInteger(VT4.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Profile, InsertPos))
	    return Cached->getSDVTList();

	  // Not present yet: allocate the type array and register a new list node.
	  EVT *Types = Allocator.Allocate<EVT>(4);
	  Types[0] = VT1;
	  Types[1] = VT2;
	  Types[2] = VT3;
	  Types[3] = VT4;
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(Profile.Intern(Allocator), Types, 4);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}

	/// Return the value-type list holding the types in VTs (in order), reusing
	/// the entry in VTListMap when one with the same profile is already present.
	SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
	  const unsigned Count = VTs.size();

	  // Profile: element count first, then the raw bits of each type in order.
	  FoldingSetNodeID Profile;
	  Profile.AddInteger(Count);
	  for (const EVT &VT : VTs)
	    Profile.AddInteger(VT.getRawBits());

	  void *InsertPos = nullptr;
	  if (SDVTListNode *Cached = VTListMap.FindNodeOrInsertPos(Profile, InsertPos))
	    return Cached->getSDVTList();

	  // Not present yet: copy the types into allocator-owned storage and register
	  // a new list node.
	  EVT *Types = Allocator.Allocate<EVT>(Count);
	  llvm::copy(VTs, Types);
	  SDVTListNode *Fresh =
	      new (Allocator) SDVTListNode(Profile.Intern(Allocator), Types, Count);
	  VTListMap.InsertNode(Fresh, InsertPos);
	  return Fresh->getSDVTList();
	}


	/// UpdateNodeOperands - Mutate the specified node in-place to have the
	/// specified operands. If the resultant node already exists in the DAG,
	/// this does not modify the specified node, instead it returns the node that
	/// already exists. If the resultant node does not exist in the DAG, the
	/// input node is returned. As a degenerate case, if you specify the same
	/// input operands as the node already has, the input node is returned.
	///
	/// This overload handles nodes with exactly one operand.
	SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
	  assert(N->getNumOperands() == 1 && "Update with wrong number of operands");

	  // Check to see if there is no change.
	  if (Op == N->getOperand(0))
	    return N;

	  // See if the modified node already exists.
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
	    return Existing;

	  // Nope it doesn't. Remove the node from its current place in the maps.
	  // If the removal reports failure, forget the insert position so the node
	  // is not inserted into the CSE map below.
	  if (InsertPos)
	    if (!RemoveNodeFromCSEMaps(N))
	      InsertPos = nullptr;

	  // Now we update the operands.
	  N->OperandList[0].set(Op);

	  updateDivergence(N);
	  // If this gets put into a CSE map, add it.
	  if (InsertPos)
	    CSEMap.InsertNode(N, InsertPos);
	  return N;
	}

	/// Two-operand form of UpdateNodeOperands: mutate N in place to use Op1 and
	/// Op2. Returns N itself when nothing changes or when N was updated, or the
	/// already-existing equivalent node when FindModifiedNodeSlot finds one.
	SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
	  assert(N->getNumOperands() == 2 && "Update with wrong number of operands");

	  // Check to see if there is no change.
	  if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
	    return N; // No operands changed, just return the input node.

	  // See if the modified node already exists.
	  void *InsertPos = nullptr;
	  if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
	    return Existing;

	  // Nope it doesn't. Remove the node from its current place in the maps.
	  // If the removal reports failure, forget the insert position so the node
	  // is not inserted into the CSE map below.
	  if (InsertPos)
	    if (!RemoveNodeFromCSEMaps(N))
	      InsertPos = nullptr;

	  // Now we update the operands, touching only those that actually differ.
	  if (N->OperandList[0] != Op1)
	    N->OperandList[0].set(Op1);
	  if (N->OperandList[1] != Op2)
	    N->OperandList[1].set(Op2);

	  updateDivergence(N);
	  // If this gets put into a CSE map, add it.
	  if (InsertPos)
	    CSEMap.InsertNode(N, InsertPos);
	  return N;
	}

	/// Three-operand convenience form; forwards to the operand-list overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
	  SDValue NewOps[] = {Op1, Op2, Op3};
	  return UpdateNodeOperands(N, NewOps);
	}

	/// Four-operand convenience form; forwards to the operand-list overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                   SDValue Op3, SDValue Op4) {
	  SDValue NewOps[] = {Op1, Op2, Op3, Op4};
	  return UpdateNodeOperands(N, NewOps);
	}

	/// Five-operand convenience form; forwards to the operand-list overload.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
	                   SDValue Op3, SDValue Op4, SDValue Op5) {
	  SDValue NewOps[] = {Op1, Op2, Op3, Op4, Op5};
	  return UpdateNodeOperands(N, NewOps);
	}

	/// Operand-list form of UpdateNodeOperands: mutate N in place so that its
	/// operands equal Ops. Returns N when nothing changes or N was updated, or
	/// the already-existing equivalent node when FindModifiedNodeSlot finds one.
	SDNode *SelectionDAG::
	UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
	  unsigned OperandCount = Ops.size();
	  assert(N->getNumOperands() == OperandCount &&
	         "Update with wrong number of operands");

	  // Nothing to do when every requested operand already matches.
	  const bool Unchanged = std::equal(Ops.begin(), Ops.end(), N->op_begin());
	  if (Unchanged)
	    return N;

	  // Prefer an existing node that already has the requested operands.
	  void *CSESlot = nullptr;
	  SDNode *Duplicate = FindModifiedNodeSlot(N, Ops, CSESlot);
	  if (Duplicate)
	    return Duplicate;

	  // Take N out of the CSE maps before mutating it; when that reports failure,
	  // clear the slot so N is not inserted again afterwards.
	  if (CSESlot && !RemoveNodeFromCSEMaps(N))
	    CSESlot = nullptr;

	  // Rewrite only the operands that differ.
	  for (unsigned Idx = 0; Idx != OperandCount; ++Idx) {
	    if (N->OperandList[Idx] != Ops[Idx])
	      N->OperandList[Idx].set(Ops[Idx]);
	  }

	  updateDivergence(N);

	  // Put the mutated node back into the CSE map if it has a slot there.
	  if (CSESlot)
	    CSEMap.InsertNode(N, CSESlot);
	  return N;
	}

	/// DropOperands - Release the operands and set this node to have
	/// zero operands.
	void SDNode::DropOperands() {
	// Unlike the code in MorphNodeTo that does this, we don't need to
	// watch for dead nodes here.
	for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
	SDUse &Use = *I++;
	Use.set(SDValue());
	}
	}

	/// Replace the memory references recorded on machine node N with
	/// NewMemRefs. An empty list clears them, a single reference is stored
	/// directly, and longer lists are copied into allocator-owned storage.
	void SelectionDAG::setNodeMemRefs(MachineSDNode *N,
	                                  ArrayRef<MachineMemOperand *> NewMemRefs) {
	  const auto RefCount = NewMemRefs.size();

	  if (RefCount == 0) {
	    N->clearMemRefs();
	    return;
	  }

	  // A single reference can be stored inline, avoiding an allocation.
	  if (RefCount == 1) {
	    N->MemRefs = NewMemRefs[0];
	    N->NumMemRefs = 1;
	    return;
	  }

	  // Two or more references: copy them into a freshly allocated buffer.
	  MachineMemOperand **Storage =
	      Allocator.template Allocate<MachineMemOperand *>(RefCount);
	  llvm::copy(NewMemRefs, Storage);
	  N->MemRefs = Storage;
	  N->NumMemRefs = static_cast<int>(RefCount);
	}

	/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
	/// machine opcode.
	///
	/// This form selects N to MachineOpc with a single result type and no
	/// operands.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT) {
	  SDVTList VTs = getVTList(VT);
	  return SelectNodeTo(N, MachineOpc, VTs, None);
	}

	/// SelectNodeTo form with a single result type and one operand; forwards to
	/// the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with a single result type and two operands; forwards to
	/// the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1,
	                                   SDValue Op2) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1, Op2 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with a single result type and three operands; forwards
	/// to the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, SDValue Op1,
	                                   SDValue Op2, SDValue Op3) {
	  SDVTList VTs = getVTList(VT);
	  SDValue Ops[] = { Op1, Op2, Op3 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with a single result type and an operand list; forwards
	/// to the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT, ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and an operand list; forwards to
	/// the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and no operands; forwards to the
	/// SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  return SelectNodeTo(N, MachineOpc, VTs, None);
	}

	/// SelectNodeTo form with three result types and an operand list; forwards
	/// to the SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2, EVT VT3,
	                                   ArrayRef<SDValue> Ops) {
	  SDVTList VTs = getVTList(VT1, VT2, VT3);
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// SelectNodeTo form with two result types and two operands; forwards to the
	/// SDVTList/operand-list overload.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   EVT VT1, EVT VT2,
	                                   SDValue Op1, SDValue Op2) {
	  SDVTList VTs = getVTList(VT1, VT2);
	  SDValue Ops[] = { Op1, Op2 };
	  return SelectNodeTo(N, MachineOpc, VTs, Ops);
	}

	/// Core SelectNodeTo: morph N into a node with opcode ~MachineOpc, result
	/// types VTs and operands Ops via MorphNodeTo. If MorphNodeTo hands back a
	/// different node, all uses of N are redirected to it and N is removed.
	/// Returns the resulting node, whose node id has been reset to -1.
	SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
	                                   SDVTList VTs, ArrayRef<SDValue> Ops) {
	  // The opcode handed to MorphNodeTo is the bitwise complement of MachineOpc.
	  SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
	  // Reset the NodeID to -1.
	  New->setNodeId(-1);
	  if (New != N) {
	    ReplaceAllUsesWith(N, New);
	    RemoveDeadNode(N);
	  }
	  return New;
	}

	/// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away
	/// the line number information on the merged node since it is not possible to
	/// preserve the information that operation is associated with multiple lines.
	/// This will make the debugger work better at -O0, where there is a higher
	/// probability of having other instructions associated with that line.
	///
	/// For IROrder, we keep the smaller of the two.
	///
	/// Returns N after the update.
	SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
	  DebugLoc NLoc = N->getDebugLoc();
	  // Only drop the location when N has one, we are at CodeGenOpt::None, and
	  // the two locations actually disagree.
	  if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
	    N->setDebugLoc(DebugLoc());
	  }
	  unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
	  N->setIROrder(Order);
	  return N;
	}

	/// MorphNodeTo - This mutates the specified node to have the specified
	/// return type, opcode, and operands.
	///
	/// Note that MorphNodeTo returns the resultant node. If there is already a
	/// node of the specified opcode and operands, it returns that node instead of
	/// the current one. Note that the SDLoc need not be the same.
	///
	/// Using MorphNodeTo is faster than creating a new node and swapping it in
	/// with ReplaceAllUsesWith both because it often avoids allocating a new
	/// node, and because it doesn't require CSE recalculation for any of
	/// the node's users.
	///
	/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
	/// As a consequence it isn't appropriate to use from within the DAG combiner or
	/// the legalizer which maintain worklists that would need to be updated when
	/// deleting things.
	SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
	                                  SDVTList VTs, ArrayRef<SDValue> Ops) {
	  // If an identical node already exists, use it. The lookup is skipped when
	  // the last result type is Glue.
	  void *IP = nullptr;
	  if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
	    FoldingSetNodeID ID;
	    AddNodeIDNode(ID, Opc, VTs, Ops);
	    if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
	      return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
	  }

	  // When the removal reports failure, clear IP so N is not memoized below.
	  if (!RemoveNodeFromCSEMaps(N))
	    IP = nullptr;

	  // Start the morphing.
	  N->NodeType = Opc;
	  N->ValueList = VTs.VTs;
	  N->NumValues = VTs.NumVTs;

	  // Clear the operands list, updating used nodes to remove this from their
	  // use list. Keep track of any operands that become dead as a result.
	  SmallPtrSet<SDNode*, 16> DeadNodeSet;
	  for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
	    SDUse &Use = *I++;
	    SDNode *Used = Use.getNode();
	    Use.set(SDValue());
	    if (Used->use_empty())
	      DeadNodeSet.insert(Used);
	  }

	  // For MachineNode, initialize the memory references information.
	  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
	    MN->clearMemRefs();

	  // Swap for an appropriately sized array from the recycler.
	  removeOperands(N);
	  createOperands(N, Ops);

	  // Delete any nodes that are still dead after adding the uses for the
	  // new operands.
	  if (!DeadNodeSet.empty()) {
	    SmallVector<SDNode *, 16> DeadNodes;
	    // Use a distinct name here so the morphed node N is not shadowed.
	    for (SDNode *Candidate : DeadNodeSet)
	      if (Candidate->use_empty())
	        DeadNodes.push_back(Candidate);
	    RemoveDeadNodes(DeadNodes);
	  }

	  if (IP)
	    CSEMap.InsertNode(N, IP);   // Memoize the new node.
	  return N;
	}

	/// Morph a STRICT_* floating-point node into its non-strict counterpart.
	/// The node's chain result is first rewired to its input chain, then the
	/// node is morphed to the plain opcode with only its non-chain operands and
	/// its first result type. Returns the resulting node, which may be an
	/// already-existing node rather than Node itself.
	SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
	  const unsigned StrictOpc = Node->getOpcode();
	  unsigned PlainOpc;
	  switch (StrictOpc) {
	  default:
	    llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
	  case ISD::STRICT_FADD:       PlainOpc = ISD::FADD;       break;
	  case ISD::STRICT_FSUB:       PlainOpc = ISD::FSUB;       break;
	  case ISD::STRICT_FMUL:       PlainOpc = ISD::FMUL;       break;
	  case ISD::STRICT_FDIV:       PlainOpc = ISD::FDIV;       break;
	  case ISD::STRICT_FREM:       PlainOpc = ISD::FREM;       break;
	  case ISD::STRICT_FMA:        PlainOpc = ISD::FMA;        break;
	  case ISD::STRICT_FSQRT:      PlainOpc = ISD::FSQRT;      break;
	  case ISD::STRICT_FPOW:       PlainOpc = ISD::FPOW;       break;
	  case ISD::STRICT_FPOWI:      PlainOpc = ISD::FPOWI;      break;
	  case ISD::STRICT_FSIN:       PlainOpc = ISD::FSIN;       break;
	  case ISD::STRICT_FCOS:       PlainOpc = ISD::FCOS;       break;
	  case ISD::STRICT_FEXP:       PlainOpc = ISD::FEXP;       break;
	  case ISD::STRICT_FEXP2:      PlainOpc = ISD::FEXP2;      break;
	  case ISD::STRICT_FLOG:       PlainOpc = ISD::FLOG;       break;
	  case ISD::STRICT_FLOG10:     PlainOpc = ISD::FLOG10;     break;
	  case ISD::STRICT_FLOG2:      PlainOpc = ISD::FLOG2;      break;
	  case ISD::STRICT_FRINT:      PlainOpc = ISD::FRINT;      break;
	  case ISD::STRICT_FNEARBYINT: PlainOpc = ISD::FNEARBYINT; break;
	  case ISD::STRICT_FMAXNUM:    PlainOpc = ISD::FMAXNUM;    break;
	  case ISD::STRICT_FMINNUM:    PlainOpc = ISD::FMINNUM;    break;
	  case ISD::STRICT_FCEIL:      PlainOpc = ISD::FCEIL;      break;
	  case ISD::STRICT_FFLOOR:     PlainOpc = ISD::FFLOOR;     break;
	  case ISD::STRICT_FROUND:     PlainOpc = ISD::FROUND;     break;
	  case ISD::STRICT_FTRUNC:     PlainOpc = ISD::FTRUNC;     break;
	  case ISD::STRICT_FP_ROUND:   PlainOpc = ISD::FP_ROUND;   break;
	  case ISD::STRICT_FP_EXTEND:  PlainOpc = ISD::FP_EXTEND;  break;
	  }

	  assert(Node->getNumValues() == 2 && "Unexpected number of results!");

	  // The node leaves the chain: users of its chain result (value 1) are
	  // redirected to its incoming chain (operand 0).
	  SDValue ChainIn = Node->getOperand(0);
	  SDValue ChainOut = SDValue(Node, 1);
	  ReplaceAllUsesOfValueWith(ChainOut, ChainIn);

	  // Gather every operand except the chain.
	  SmallVector<SDValue, 3> NonChainOps;
	  const unsigned OperandCount = Node->getNumOperands();
	  for (unsigned Idx = 1; Idx != OperandCount; ++Idx)
	    NonChainOps.push_back(Node->getOperand(Idx));

	  SDVTList ResultVTs = getVTList(Node->getValueType(0));
	  SDNode *Morphed = MorphNodeTo(Node, PlainOpc, ResultVTs, NonChainOps);

	  // MorphNodeTo either updates Node in place or returns an existing node
	  // that already has the requested opcode and operands.
	  if (Morphed != Node) {
	    // An existing node was returned: move the uses over and drop Node.
	    ReplaceAllUsesWith(Node, Morphed);
	    RemoveDeadNode(Node);
	  } else {
	    // Updated in place: reset the node ID. To the isel, this should be just
	    // like a newly allocated machine node.
	    Morphed->setNodeId(-1);
	  }

	  return Morphed;
	}

	/// getMachineNode - These are used for target selectors to create a new node
	/// with specified return type(s), MachineInstr opcode, and operands.
	///
	/// Note that getMachineNode returns the resultant node. If there is already a
	/// node of the specified opcode and operands, it returns that node instead of
	/// the current one.
	///
	/// This form takes a single result type and no operands.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT) {
	  SDVTList ResultVTs = getVTList(VT);
	  return getMachineNode(Opcode, dl, ResultVTs, None);
	}

	/// getMachineNode form with one result type and one operand; forwards to the
	/// SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = {Op1};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with one result type and two operands; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1, SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = {Op1, Op2};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with one result type and three operands; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, SDValue Op1, SDValue Op2,
	                                            SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT);
	  SDValue Operands[] = {Op1, Op2, Op3};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with one result type and an operand list; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT, ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form with two result types and two operands; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, SDValue Op1,
	                                            SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  SDValue Operands[] = {Op1, Op2};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with two result types and three operands; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, SDValue Op1,
	                                            SDValue Op2, SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  SDValue Operands[] = {Op1, Op2, Op3};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with two result types and an operand list; forwards
	/// to the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT1, VT2);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form with three result types and two operands; forwards to
	/// the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            SDValue Op1, SDValue Op2) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  SDValue Operands[] = {Op1, Op2};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with three result types and three operands; forwards
	/// to the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            SDValue Op1, SDValue Op2,
	                                            SDValue Op3) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  SDValue Operands[] = {Op1, Op2, Op3};
	  return getMachineNode(Opcode, dl, ResultVTs, Operands);
	}

	/// getMachineNode form with three result types and an operand list; forwards
	/// to the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            EVT VT1, EVT VT2, EVT VT3,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(VT1, VT2, VT3);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// getMachineNode form taking the result types and the operands as lists;
	/// forwards to the SDVTList/operand-list overload.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
	                                            ArrayRef<EVT> ResultTys,
	                                            ArrayRef<SDValue> Ops) {
	  SDVTList ResultVTs = getVTList(ResultTys);
	  return getMachineNode(Opcode, dl, ResultVTs, Ops);
	}

	/// Core getMachineNode: return a MachineSDNode with opcode ~Opcode, result
	/// types VTs and operands Ops. Unless the last result type is Glue, an
	/// existing identical node is returned when one is found (after merging its
	/// location with DL); otherwise a new node is allocated and inserted.
	MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
	                                            SDVTList VTs,
	                                            ArrayRef<SDValue> Ops) {
	  // CSE is skipped when the final result type is Glue.
	  const bool UseCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
	  void *InsertPos = nullptr;

	  if (UseCSE) {
	    FoldingSetNodeID Profile;
	    AddNodeIDNode(Profile, ~Opcode, VTs, Ops);
	    SDNode *Existing = FindNodeOrInsertPos(Profile, DL, InsertPos);
	    if (Existing)
	      return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(Existing, DL));
	  }

	  // Allocate a new MachineSDNode.
	  MachineSDNode *Created = newSDNode<MachineSDNode>(
	      ~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
	  createOperands(Created, Ops);

	  if (UseCSE)
	    CSEMap.InsertNode(Created, InsertPos);

	  InsertNode(Created);
	  return Created;
	}

	/// getTargetExtractSubreg - A convenience function for creating
	/// TargetOpcode::EXTRACT_SUBREG nodes. The subregister index is passed as an
	/// i32 target constant; result 0 of the machine node is returned.
	SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	                                             SDValue Operand) {
	  SDValue IndexConst = getTargetConstant(SRIdx, DL, MVT::i32);
	  SDNode *ExtractNode = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
	                                       VT, Operand, IndexConst);
	  return SDValue(ExtractNode, 0);
	}

	/// getTargetInsertSubreg - A convenience function for creating
	/// TargetOpcode::INSERT_SUBREG nodes. The subregister index is passed as an
	/// i32 target constant; result 0 of the machine node is returned.
	SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
	                                            SDValue Operand, SDValue Subreg) {
	  SDValue IndexConst = getTargetConstant(SRIdx, DL, MVT::i32);
	  SDNode *InsertNode = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
	                                      VT, Operand, Subreg, IndexConst);
	  return SDValue(InsertNode, 0);
	}

	/// getNodeIfExists - Get the specified node if it's already available, or
	/// else return NULL. When a node is found, its flags are intersected with
	/// Flags before it is returned. Lists whose last type is Glue are never
	/// looked up.
	SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
	                                      ArrayRef<SDValue> Ops,
	                                      const SDNodeFlags Flags) {
	  const bool EndsInGlue = !(VTList.VTs[VTList.NumVTs - 1] != MVT::Glue);
	  if (EndsInGlue)
	    return nullptr;

	  FoldingSetNodeID Profile;
	  AddNodeIDNode(Profile, Opcode, VTList, Ops);
	  void *InsertPos = nullptr;
	  SDNode *Found = FindNodeOrInsertPos(Profile, SDLoc(), InsertPos);
	  if (!Found)
	    return nullptr;

	  Found->intersectFlagsWith(Flags);
	  return Found;
	}

	/// getDbgValue - Creates a SDDbgValue node.
	///
	/// SDNode variant: the new SDDbgValue is constructed from node N and result
	/// number R, and is allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr,
	                                      SDNode *N, unsigned R, bool IsIndirect,
	                                      const DebugLoc &DL, unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc())
	      SDDbgValue(Var, Expr, N, R, IsIndirect, DL, O);
	}

	/// Constant variant: creates a SDDbgValue constructed from the constant
	/// value C, allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var,
	                                              DIExpression *Expr,
	                                              const Value *C,
	                                              const DebugLoc &DL, unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, DL, O);
	}

	/// FrameIndex variant: creates a SDDbgValue of kind SDDbgValue::FRAMEIX
	/// constructed from frame index FI, allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
	                                                DIExpression *Expr, unsigned FI,
	                                                bool IsIndirect,
	                                                const DebugLoc &DL,
	                                                unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc())
	      SDDbgValue(Var, Expr, FI, IsIndirect, DL, O, SDDbgValue::FRAMEIX);
	}

	/// VReg variant: creates a SDDbgValue of kind SDDbgValue::VREG constructed
	/// from register VReg, allocated from DbgInfo's allocator.
	SDDbgValue *SelectionDAG::getVRegDbgValue(DIVariable *Var,
	                                          DIExpression *Expr,
	                                          unsigned VReg, bool IsIndirect,
	                                          const DebugLoc &DL, unsigned O) {
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc())
	      SDDbgValue(Var, Expr, VReg, IsIndirect, DL, O, SDDbgValue::VREG);
	}

	/// Clone the SDNODE-kind debug values attached to From onto To. When
	/// SizeInBits is non-zero, each clone's expression is narrowed to the
	/// fragment [OffsetInBits, OffsetInBits + SizeInBits). When InvalidateDbg is
	/// set, the originals are marked invalidated and emitted so they are not
	/// emitted again. Does nothing if From and To are the same value or share a
	/// node, or if From's node has no debug values.
	void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
	                                     unsigned OffsetInBits, unsigned SizeInBits,
	                                     bool InvalidateDbg) {
	  SDNode *FromNode = From.getNode();
	  SDNode *ToNode = To.getNode();
	  assert(FromNode && ToNode && "Can't modify dbg values");

	  // PR35338
	  // TODO: assert(From != To && "Redundant dbg value transfer");
	  // TODO: assert(FromNode != ToNode && "Intranode dbg value transfer");
	  if (From == To || FromNode == ToNode)
	    return;

	  if (!FromNode->getHasDebugValue())
	    return;

	  // Clones are collected first and attached after the loop, so the list
	  // being iterated is not modified while it is walked.
	  SmallVector<SDDbgValue *, 2> ClonedDVs;
	  for (SDDbgValue *Dbg : GetDbgValues(FromNode)) {
	    if (Dbg->getKind() != SDDbgValue::SDNODE || Dbg->isInvalidated())
	      continue;

	    // TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value");

	    // Just transfer the dbg value attached to From.
	    if (Dbg->getResNo() != From.getResNo())
	      continue;

	    DIVariable *Var = Dbg->getVariable();
	    auto *Expr = Dbg->getExpression();
	    // If a fragment is requested, update the expression.
	    if (SizeInBits) {
	      // When splitting a larger (e.g., sign-extended) value whose
	      // lower bits are described with an SDDbgValue, do not attempt
	      // to transfer the SDDbgValue to the upper bits.
	      if (auto FI = Expr->getFragmentInfo())
	        if (OffsetInBits + SizeInBits > FI->SizeInBits)
	          continue;
	      auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits,
	                                                             SizeInBits);
	      if (!Fragment)
	        continue;
	      Expr = *Fragment;
	    }
	    // Clone the SDDbgValue and move it to To.
	    SDDbgValue *Clone =
	        getDbgValue(Var, Expr, ToNode, To.getResNo(), Dbg->isIndirect(),
	                    Dbg->getDebugLoc(), Dbg->getOrder());
	    ClonedDVs.push_back(Clone);

	    if (InvalidateDbg) {
	      // Invalidate value and indicate the SDDbgValue should not be emitted.
	      Dbg->setIsInvalidated();
	      Dbg->setIsEmitted();
	    }
	  }

	  for (SDDbgValue *Dbg : ClonedDVs)
	    AddDbgValue(Dbg, ToNode, false);
	}

	/// Try to keep the debug values attached to N alive by re-expressing them in
	/// terms of N's operand. Only ISD::ADD with a constant second operand (and a
	/// non-constant first operand) is handled: each live debug value is cloned
	/// onto operand 0 with the offset folded into its DIExpression, and the
	/// original is marked invalidated and emitted.
	void SelectionDAG::salvageDebugInfo(SDNode &N) {
	  if (!N.getHasDebugValue())
	    return;

	  // Clones are collected first and attached once the walk is finished.
	  SmallVector<SDDbgValue *, 2> Salvaged;
	  for (auto DV : GetDbgValues(&N)) {
	    if (DV->isInvalidated())
	      continue;

	    // ADD is the only opcode that can be salvaged here.
	    if (N.getOpcode() != ISD::ADD)
	      continue;

	    SDValue Base = N.getOperand(0);
	    SDValue Addend = N.getOperand(1);
	    const bool BaseIsConst = isConstantIntBuildVectorOrConstantInt(Base);
	    if (BaseIsConst || !isConstantIntBuildVectorOrConstantInt(Addend))
	      continue;

	    uint64_t Offset = N.getConstantOperandVal(1);
	    // Rewrite an ADD constant node into a DIExpression. Because the
	    // expression now performs arithmetic to compute the variable's value,
	    // it has to be marked with a DW_OP_stack_value.
	    auto *DIExpr = DV->getExpression();
	    DIExpr = DIExpression::prepend(DIExpr, DIExpression::StackValue, Offset);
	    SDDbgValue *Clone =
	        getDbgValue(DV->getVariable(), DIExpr, Base.getNode(), Base.getResNo(),
	                    DV->isIndirect(), DV->getDebugLoc(), DV->getOrder());
	    Salvaged.push_back(Clone);
	    DV->setIsInvalidated();
	    DV->setIsEmitted();
	    LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
	               Base.getNode()->dumprFull(this);
	               dbgs() << " into " << *DIExpr << '\n');
	  }

	  for (SDDbgValue *Dbg : Salvaged)
	    AddDbgValue(Dbg, Dbg->getSDNode(), false);
	}

	/// Creates a SDDbgLabel node for Label at location DL with order O,
	/// allocated from DbgInfo's allocator.
	SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label,
	                                      const DebugLoc &DL, unsigned O) {
	  assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  return new (DbgInfo->getAlloc()) SDDbgLabel(Label, DL, O);
	}

	namespace {

	/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
	/// pointed to by a use iterator is deleted, increment the use iterator
	/// so that it doesn't dangle.
	///
	class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
	SDNode::use_iterator &UI;
	SDNode::use_iterator &UE;

	void NodeDeleted(SDNode N, SDNode E) override {
	// Increment the iterator as needed.
	while (UI != UE && N == *UI)
	++UI;
	}

	public:
	RAUWUpdateListener(SelectionDAG &d,
	SDNode::use_iterator &ui,
	SDNode::use_iterator &ue)
	: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
	};

	} // end anonymous namespace

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version assumes From has a single result value.
	///
	void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
	  SDNode *From = FromN.getNode();
	  assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
	         "Cannot replace with this method!");
	  assert(From != To.getNode() && "Cannot replace uses of with self");

	  // Preserve Debug Values
	  transferDbgValues(FromN, To);

	  // Iterate over all the existing uses of From. New uses will be added
	  // to the beginning of the use list, which we avoid visiting.
	  // This specifically avoids visiting uses of From that arise while the
	  // replacement is happening, because any such uses would be the result
	  // of CSE: If an existing node looks like From after one of its operands
	  // is replaced by To, we don't want to replace of all its users with To
	  // too. See PR3018 for more info.
	  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	  RAUWUpdateListener Listener(*this, UI, UE);
	  while (UI != UE) {
	    SDNode *User = *UI;

	    // This node is about to morph, remove its old self from the CSE maps.
	    RemoveNodeFromCSEMaps(User);

	    // A user can appear in a use list multiple times, and when this
	    // happens the uses are usually next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      SDUse &Use = UI.getUse();
	      // Advance before rewriting the use.
	      ++UI;
	      Use.set(To);
	      if (To->isDivergent() != From->isDivergent())
	        updateDivergence(User);
	    } while (UI != UE && *UI == User);
	    // Now that we have modified User, add it back to the CSE maps.  If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }

	  // If we just RAUW'd the root, take note.
	  if (FromN == getRoot())
	    setRoot(To);
	}

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version assumes that for each value of From, there is a
	/// corresponding value in To in the same position with the same type.
	///
	void SelectionDAG::ReplaceAllUsesWith(SDNode From, SDNode To) {
	#ifndef NDEBUG
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	assert((!From->hasAnyUseOfValue(i) \|\|
	From->getValueType(i) == To->getValueType(i)) &&
	"Cannot use this version of ReplaceAllUsesWith!");
	#endif

	// Handle the trivial case.
	if (From == To)
	return;

	// Preserve Debug Info. Only do this if there's a use.
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	if (From->hasAnyUseOfValue(i)) {
	assert((i < To->getNumValues()) && "Invalid To location");
	transferDbgValues(SDValue(From, i), SDValue(To, i));
	}

	// Iterate over just the existing users of From. See the comments in
	// the ReplaceAllUsesWith above.
	SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	RAUWUpdateListener Listener(*this, UI, UE);
	while (UI != UE) {
	SDNode User = UI;

	// This node is about to morph, remove its old self from the CSE maps.
	RemoveNodeFromCSEMaps(User);

	// A user can appear in a use list multiple times, and when this
	// happens the uses are usually next to each other in the list.
	// To help reduce the number of CSE recomputations, process all
	// the uses of this user that we can find this way.
	do {
	SDUse &Use = UI.getUse();
	++UI;
	Use.setNode(To);
	if (To->isDivergent() != From->isDivergent())
	updateDivergence(User);
	} while (UI != UE && *UI == User);

	// Now that we have modified User, add it back to the CSE maps. If it
	// already exists there, recursively merge the results together.
	AddModifiedNodeToCSEMaps(User);
	}

	// If we just RAUW'd the root, take note.
	if (From == getRoot().getNode())
	setRoot(SDValue(To, getRoot().getResNo()));
	}

	/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
	/// This can cause recursive merging of nodes in the DAG.
	///
	/// This version can replace From with any result values. To must match the
	/// number and types of values returned by From.
	void SelectionDAG::ReplaceAllUsesWith(SDNode From, const SDValue To) {
	if (From->getNumValues() == 1) // Handle the simple case efficiently.
	return ReplaceAllUsesWith(SDValue(From, 0), To[0]);

	// Preserve Debug Info.
	for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
	transferDbgValues(SDValue(From, i), To[i]);

	// Iterate over just the existing users of From. See the comments in
	// the ReplaceAllUsesWith above.
	SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
	RAUWUpdateListener Listener(*this, UI, UE);
	while (UI != UE) {
	SDNode User = UI;

	// This node is about to morph, remove its old self from the CSE maps.
	RemoveNodeFromCSEMaps(User);

	// A user can appear in a use list multiple times, and when this happens the
	// uses are usually next to each other in the list. To help reduce the
	// number of CSE and divergence recomputations, process all the uses of this
	// user that we can find this way.
	bool To_IsDivergent = false;
	do {
	SDUse &Use = UI.getUse();
	const SDValue &ToOp = To[Use.getResNo()];
	++UI;
	Use.set(ToOp);
	To_IsDivergent \|= ToOp->isDivergent();
	} while (UI != UE && *UI == User);

	if (To_IsDivergent != From->isDivergent())
	updateDivergence(User);

	// Now that we have modified User, add it back to the CSE maps. If it
	// already exists there, recursively merge the results together.
	AddModifiedNodeToCSEMaps(User);
	}

	// If we just RAUW'd the root, take note.
	if (From == getRoot().getNode())
	setRoot(SDValue(To[getRoot().getResNo()]));
	}

	/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
	/// uses of other values produced by From.getNode() alone.  The Deleted
	/// vector is handled the same way as for ReplaceAllUsesWith.
	void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
	  // Handle the really simple, really trivial case efficiently.
	  if (From == To) return;

	  // Handle the simple, trivial, case efficiently.
	  if (From.getNode()->getNumValues() == 1) {
	    ReplaceAllUsesWith(From, To);
	    return;
	  }

	  // Preserve Debug Info.
	  transferDbgValues(From, To);

	  // Iterate over just the existing users of From. See the comments in
	  // the ReplaceAllUsesWith above.
	  SDNode::use_iterator UI = From.getNode()->use_begin(),
	                       UE = From.getNode()->use_end();
	  RAUWUpdateListener Listener(*this, UI, UE);
	  while (UI != UE) {
	    SDNode *User = *UI;
	    bool UserRemovedFromCSEMaps = false;

	    // A user can appear in a use list multiple times, and when this
	    // happens the uses are usually next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      SDUse &Use = UI.getUse();

	      // Skip uses of different values from the same node.
	      if (Use.getResNo() != From.getResNo()) {
	        ++UI;
	        continue;
	      }

	      // If this node hasn't been modified yet, it's still in the CSE maps,
	      // so remove its old self from the CSE maps.
	      if (!UserRemovedFromCSEMaps) {
	        RemoveNodeFromCSEMaps(User);
	        UserRemovedFromCSEMaps = true;
	      }

	      // Advance before rewriting the use.
	      ++UI;
	      Use.set(To);
	      if (To->isDivergent() != From->isDivergent())
	        updateDivergence(User);
	    } while (UI != UE && *UI == User);
	    // We are iterating over all uses of the From node, so if a use
	    // doesn't use the specific value, no changes are made.
	    if (!UserRemovedFromCSEMaps)
	      continue;

	    // Now that we have modified User, add it back to the CSE maps.  If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }

	  // If we just RAUW'd the root, take note.
	  if (From == getRoot())
	    setRoot(To);
	}

	namespace {

	/// UseMemo - Record kept by SelectionDAG::ReplaceAllUsesOfValuesWith
	/// describing one use.
	struct UseMemo {
	  SDNode *User;
	  unsigned Index;
	  SDUse *Use;
	};

	/// operator< - Orders memos by the address of their User node.
	bool operator<(const UseMemo &L, const UseMemo &R) {
	  const intptr_t LeftKey = (intptr_t)L.User;
	  const intptr_t RightKey = (intptr_t)R.User;
	  return LeftKey < RightKey;
	}

	} // end anonymous namespace

	/// Recompute the divergence bit of N from the target hooks and from its
	/// operands; if the bit changes, re-run the computation on every user of N.
	void SelectionDAG::updateDivergence(SDNode * N)
	{
	  // Nodes the target declares always uniform are left untouched.
	  if (TLI->isSDNodeAlwaysUniform(N))
	    return;
	  bool IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA);
	  for (auto &Op : N->ops()) {
	    // Operands of type MVT::Other do not contribute to divergence.
	    if (Op.Val.getValueType() != MVT::Other)
	      IsDivergent |= Op.getNode()->isDivergent();
	  }
	  if (N->SDNodeBits.IsDivergent != IsDivergent) {
	    N->SDNodeBits.IsDivergent = IsDivergent;
	    for (auto U : N->uses()) {
	      updateDivergence(U);
	    }
	  }
	}

	/// Append the DAG's nodes to Order so that a node is only added once all of
	/// its operands have been added.
	void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode *> &Order) {
	  // For each node, the number of operand slots not yet accounted for.
	  DenseMap<SDNode *, unsigned> PendingOps;
	  Order.reserve(AllNodes.size());

	  // Nodes without operands are ready immediately.
	  for (auto &Node : allnodes()) {
	    unsigned NumOps = Node.getNumOperands();
	    PendingOps[&Node] = NumOps;
	    if (NumOps == 0)
	      Order.push_back(&Node);
	  }

	  // Order doubles as the work list: it keeps growing while it is scanned.
	  for (size_t Pos = 0; Pos != Order.size(); ++Pos) {
	    SDNode *Ready = Order[Pos];
	    for (auto User : Ready->uses()) {
	      unsigned &Remaining = PendingOps[User];
	      --Remaining;
	      if (Remaining == 0)
	        Order.push_back(User);
	    }
	  }
	}

	#ifndef NDEBUG
	/// Debug-only consistency check: recompute divergence for every node in
	/// topological order and assert that it matches the bit stored on the node.
	void SelectionDAG::VerifyDAGDiverence() {
	  std::vector<SDNode *> TopoOrder;
	  CreateTopologicalOrder(TopoOrder);
	  const TargetLowering &TLI = getTargetLoweringInfo();
	  DenseMap<const SDNode *, bool> DivergenceMap;
	  for (auto &N : allnodes()) {
	    DivergenceMap[&N] = false;
	  }
	  for (auto N : TopoOrder) {
	    bool IsDivergent = DivergenceMap[N];
	    bool IsSDNodeDivergent = TLI.isSDNodeSourceOfDivergence(N, FLI, DA);
	    for (auto &Op : N->ops()) {
	      // Operands of type MVT::Other do not contribute to divergence.
	      if (Op.Val.getValueType() != MVT::Other)
	        IsSDNodeDivergent |= DivergenceMap[Op.getNode()];
	    }
	    if (!IsDivergent && IsSDNodeDivergent && !TLI.isSDNodeAlwaysUniform(N)) {
	      DivergenceMap[N] = true;
	    }
	  }
	  for (auto &N : allnodes()) {
	    (void)N;
	    assert(DivergenceMap[&N] == N.isDivergent() &&
	           "Divergence bit inconsistency detected\n");
	  }
	}
	#endif

	/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
	/// uses of other values produced by From.getNode() alone. The same value
	/// may appear in both the From and To list. The Deleted vector is
	/// handled the same way as for ReplaceAllUsesWith.
	///
	/// From and To are parallel arrays of Num values; uses of From[i] are
	/// redirected to To[i].
	void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
	                                              const SDValue *To,
	                                              unsigned Num){
	  // Handle the simple, trivial case efficiently. The single-value overload
	  // takes SDValue by value, so pass the first (only) array element.
	  if (Num == 1)
	    return ReplaceAllUsesOfValueWith(*From, *To);

	  // NOTE(review): only the first From/To pair is handed to
	  // transferDbgValues here.
	  transferDbgValues(*From, *To);

	  // Read up all the uses and make records of them. This helps
	  // processing new uses that are introduced during the
	  // replacement process.
	  SmallVector<UseMemo, 4> Uses;
	  for (unsigned i = 0; i != Num; ++i) {
	    unsigned FromResNo = From[i].getResNo();
	    SDNode *FromNode = From[i].getNode();
	    for (SDNode::use_iterator UI = FromNode->use_begin(),
	         E = FromNode->use_end(); UI != E; ++UI) {
	      SDUse &Use = UI.getUse();
	      if (Use.getResNo() == FromResNo) {
	        UseMemo Memo = { *UI, i, &Use };
	        Uses.push_back(Memo);
	      }
	    }
	  }

	  // Sort the uses, so that all the uses from a given User are together.
	  llvm::sort(Uses);

	  for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
	       UseIndex != UseIndexEnd; ) {
	    // We know that this user uses some value of From. If it is the right
	    // value, update it.
	    SDNode *User = Uses[UseIndex].User;

	    // This node is about to morph, remove its old self from the CSE maps.
	    RemoveNodeFromCSEMaps(User);

	    // The Uses array is sorted, so all the uses for a given User
	    // are next to each other in the list.
	    // To help reduce the number of CSE recomputations, process all
	    // the uses of this user that we can find this way.
	    do {
	      unsigned i = Uses[UseIndex].Index;
	      SDUse &Use = *Uses[UseIndex].Use;
	      ++UseIndex;

	      Use.set(To[i]);
	    } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);

	    // Now that we have modified User, add it back to the CSE maps. If it
	    // already exists there, recursively merge the results together.
	    AddModifiedNodeToCSEMaps(User);
	  }
	}

	/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
	/// based on their topological order, reordering the AllNodes list to match.
	/// Returns the number of ids assigned, which is asserted to equal the number
	/// of nodes in the DAG.
	unsigned SelectionDAG::AssignTopologicalOrder() {
	  unsigned DAGSize = 0;

	  // SortedPos tracks the progress of the algorithm. Nodes before it are
	  // sorted, nodes after it are unsorted. When the algorithm completes
	  // it is at the end of the list.
	  allnodes_iterator SortedPos = allnodes_begin();

	  // Visit all the nodes. Move nodes with no operands to the front of
	  // the list immediately. Annotate nodes that do have operands with their
	  // operand count. Before we do this, the Node Id fields of the nodes
	  // may contain arbitrary values. After, the Node Id fields for nodes
	  // before SortedPos will contain the topological sort index, and the
	  // Node Id fields for nodes At SortedPos and after will contain the
	  // count of outstanding operands.
	  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
	    // Take the node's address and step past it first, since the node may
	    // be moved within the list below.
	    SDNode *N = &*I++;
	    checkForCycles(N, this);
	    unsigned Degree = N->getNumOperands();
	    if (Degree == 0) {
	      // A node with no operands, add it to the result array immediately.
	      N->setNodeId(DAGSize++);
	      allnodes_iterator Q(N);
	      if (Q != SortedPos)
	        SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
	      assert(SortedPos != AllNodes.end() && "Overran node list");
	      ++SortedPos;
	    } else {
	      // Temporarily use the Node Id as scratch space for the degree count.
	      N->setNodeId(Degree);
	    }
	  }

	  // Visit all the nodes. As we iterate, move nodes into sorted order,
	  // such that by the time the end is reached all nodes will be sorted.
	  for (SDNode &Node : allnodes()) {
	    SDNode *N = &Node;
	    checkForCycles(N, this);
	    // N is in sorted position, so all its uses have one less operand
	    // that needs to be sorted.
	    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	         UI != UE; ++UI) {
	      SDNode *P = *UI;
	      unsigned Degree = P->getNodeId();
	      assert(Degree != 0 && "Invalid node degree");
	      --Degree;
	      if (Degree == 0) {
	        // All of P's operands are sorted, so P may be sorted now.
	        P->setNodeId(DAGSize++);
	        if (P->getIterator() != SortedPos)
	          SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
	        assert(SortedPos != AllNodes.end() && "Overran node list");
	        ++SortedPos;
	      } else {
	        // Update P's outstanding operand count.
	        P->setNodeId(Degree);
	      }
	    }
	    // Reaching SortedPos means the node being visited was never sorted.
	    if (Node.getIterator() == SortedPos) {
	#ifndef NDEBUG
	      allnodes_iterator I(N);
	      SDNode *S = &*++I;
	      dbgs() << "Overran sorted position:\n";
	      S->dumprFull(this); dbgs() << "\n";
	      dbgs() << "Checking if this is due to cycles\n";
	      checkForCycles(this, true);
	#endif
	      llvm_unreachable(nullptr);
	    }
	  }

	  assert(SortedPos == AllNodes.end() &&
	         "Topological sort incomplete!");
	  assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
	         "First node in topological sort is not the entry token!");
	  assert(AllNodes.front().getNodeId() == 0 &&
	         "First node in topological sort has non-zero id!");
	  assert(AllNodes.front().getNumOperands() == 0 &&
	         "First node in topological sort has operands!");
	  assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
	         "Last node in topologic sort has unexpected id!");
	  assert(AllNodes.back().use_empty() &&
	         "Last node in topologic sort has users!");
	  assert(DAGSize == allnodes_size() && "Node count mismatch!");
	  return DAGSize;
	}

	/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
	/// value is produced by SD.
	void SelectionDAG::AddDbgValue(SDDbgValue DB, SDNode SD, bool isParameter) {
	if (SD) {
	assert(DbgInfo->getSDDbgValues(SD).empty() \|\| SD->getHasDebugValue());
	SD->setHasDebugValue(true);
	}
	DbgInfo->add(DB, SD, isParameter);
	}

	/// AddDbgLabel - Record the debug label DB in this DAG's debug info.
	void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) {
	DbgInfo->add(DB);
	}

	/// Give NewMemOp the same position as OldLoad with respect to memory
	/// dependencies and return the chain that callers should use afterwards.
	SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
	                                                   SDValue NewMemOp) {
	  assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");

	  // Result number 1 is used as the output chain of both nodes.
	  const unsigned ChainResNo = 1;
	  SDValue OldChain(OldLoad, ChainResNo);
	  SDValue NewChain(NewMemOp.getNode(), ChainResNo);

	  // Nobody is ordered after the old load: the new chain suffices.
	  if (!OldLoad->hasAnyUseOfValue(ChainResNo))
	    return NewChain;

	  // Otherwise join both chains in a TokenFactor and make every user of the
	  // old load's chain depend on that TokenFactor instead.
	  SDValue Joined =
	      getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
	  ReplaceAllUsesOfValueWith(OldChain, Joined);
	  // The replacement above also touched the TokenFactor's own OldChain
	  // operand, so set its operands back to the intended pair.
	  UpdateNodeOperands(Joined.getNode(), OldChain, NewChain);
	  return Joined;
	}

	/// Resolve the external symbol Op to a function of the current module and
	/// return a GlobalAddress node for it. If OutFunction is non-null it
	/// receives the looked-up function (possibly null). A symbol that names no
	/// function in the module is reported as a fatal error.
	SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
	                                                     Function **OutFunction) {
	  assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");

	  auto *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
	  auto *ParentModule = MF->getFunction().getParent();
	  auto *Callee = ParentModule->getFunction(SymName);

	  if (OutFunction != nullptr)
	    *OutFunction = Callee;

	  // Unknown symbol: build the diagnostic and abort.
	  if (Callee == nullptr) {
	    std::string ErrorStr;
	    raw_string_ostream ErrorFormatter(ErrorStr);
	    ErrorFormatter << "Undefined external symbol " << '"' << SymName << '"';
	    ErrorFormatter.flush();
	    report_fatal_error(ErrorStr);
	  }

	  auto PtrTy = TLI->getPointerTy(getDataLayout(), Callee->getAddressSpace());
	  return getGlobalAddress(Callee, SDLoc(Op), PtrTy);
	}

	//===----------------------------------------------------------------------===//
	// SDNode Class
	//===----------------------------------------------------------------------===//

	/// Return true if V is a ConstantSDNode whose value is null.
	bool llvm::isNullConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isNullValue();
	  return false;
	}

	/// Return true if V is a ConstantFPSDNode that is zero and not negative.
	bool llvm::isNullFPConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantFPSDNode>(V))
	    return C->isZero() && !C->isNegative();
	  return false;
	}

	/// Return true if V is a ConstantSDNode with every bit set.
	bool llvm::isAllOnesConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isAllOnesValue();
	  return false;
	}

	/// Return true if V is a ConstantSDNode whose value is one.
	bool llvm::isOneConstant(SDValue V) {
	  if (auto *C = dyn_cast<ConstantSDNode>(V))
	    return C->isOne();
	  return false;
	}

	/// Strip any chain of BITCAST nodes from V and return the underlying value.
	SDValue llvm::peekThroughBitcasts(SDValue V) {
	  for (;;) {
	    if (V.getOpcode() != ISD::BITCAST)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Strip BITCAST nodes from V for as long as the bitcast's source operand
	/// has exactly one use.
	SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
	  for (;;) {
	    if (V.getOpcode() != ISD::BITCAST || !V.getOperand(0).hasOneUse())
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Strip any chain of EXTRACT_SUBVECTOR nodes from V and return the vector
	/// they ultimately extract from.
	SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
	  for (;;) {
	    if (V.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Return true if V is an XOR whose second operand, looked at through
	/// bitcasts, is a constant or constant splat with all bits of the scalar
	/// width set. AllowUndefs is forwarded to isConstOrConstSplat.
	bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) {
	  if (V.getOpcode() != ISD::XOR)
	    return false;
	  V = peekThroughBitcasts(V.getOperand(1));
	  unsigned NumBits = V.getScalarValueSizeInBits();
	  ConstantSDNode *C =
	      isConstOrConstSplat(V, AllowUndefs, /*AllowTruncation*/ true);
	  // The constant may be wider than the scalar type (truncation is allowed),
	  // so only its low NumBits bits have to be ones.
	  return C && (C->getAPIntValue().countTrailingOnes() >= NumBits);
	}

	/// Return N as a ConstantSDNode if it is a constant, or the splatted constant
	/// if it is a build vector splatting one. Returns null otherwise. Splats
	/// with undef elements are accepted only when AllowUndefs is set; a constant
	/// wider than the vector's scalar type only when AllowTruncation is set.
	ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, bool AllowUndefs,
	                                          bool AllowTruncation) {
	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
	    return CN;

	  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
	    BitVector UndefElements;
	    ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements);

	    // BuildVectors can truncate their operands. Ignore that case here unless
	    // AllowTruncation is set.
	    if (CN && (UndefElements.none() || AllowUndefs)) {
	      EVT CVT = CN->getValueType(0);
	      EVT NSVT = N.getValueType().getScalarType();
	      assert(CVT.bitsGE(NSVT) && "Illegal build vector element extension");
	      if (AllowTruncation || (CVT == NSVT))
	        return CN;
	    }
	  }

	  return nullptr;
	}

	/// Variant of isConstOrConstSplat that forwards DemandedElts to the
	/// build-vector splat query. Returns the constant, or null if N is neither
	/// a constant nor an acceptable constant splat.
	ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, const APInt &DemandedElts,
	                                          bool AllowUndefs,
	                                          bool AllowTruncation) {
	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
	    return CN;

	  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
	    BitVector UndefElements;
	    ConstantSDNode *CN = BV->getConstantSplatNode(DemandedElts, &UndefElements);

	    // BuildVectors can truncate their operands. Ignore that case here unless
	    // AllowTruncation is set.
	    if (CN && (UndefElements.none() || AllowUndefs)) {
	      EVT CVT = CN->getValueType(0);
	      EVT NSVT = N.getValueType().getScalarType();
	      assert(CVT.bitsGE(NSVT) && "Illegal build vector element extension");
	      if (AllowTruncation || (CVT == NSVT))
	        return CN;
	    }
	  }

	  return nullptr;
	}

	/// Return N as a ConstantFPSDNode if it is an FP constant, or the splatted
	/// FP constant if it is a build vector splatting one. Returns null
	/// otherwise. Splats with undef elements need AllowUndefs.
	ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) {
	  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
	    return CN;

	  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
	    BitVector UndefElements;
	    ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements);
	    if (CN && (UndefElements.none() || AllowUndefs))
	      return CN;
	  }

	  return nullptr;
	}

	/// Variant of isConstOrConstSplatFP that forwards DemandedElts to the
	/// build-vector splat query. Returns the FP constant, or null if N is
	/// neither an FP constant nor an acceptable FP constant splat.
	ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N,
	                                              const APInt &DemandedElts,
	                                              bool AllowUndefs) {
	  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
	    return CN;

	  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
	    BitVector UndefElements;
	    ConstantFPSDNode *CN =
	        BV->getConstantFPSplatNode(DemandedElts, &UndefElements);
	    if (CN && (UndefElements.none() || AllowUndefs))
	      return CN;
	  }

	  return nullptr;
	}

	/// Return true if N is a null constant or a build-vector splat of one.
	bool llvm::isNullOrNullSplat(SDValue N, bool AllowUndefs) {
	  // TODO: may want to use peekThroughBitcast() here.
	  if (ConstantSDNode *Splat = isConstOrConstSplat(N, AllowUndefs))
	    return Splat->isNullValue();
	  return false;
	}

	/// Return true if N is the constant one, or a splat of it, and the constant
	/// is exactly as wide as N's scalar type.
	bool llvm::isOneOrOneSplat(SDValue N) {
	  // TODO: may want to use peekThroughBitcast() here.
	  const unsigned ScalarBits = N.getScalarValueSizeInBits();
	  ConstantSDNode *Splat = isConstOrConstSplat(N);
	  if (!Splat || !Splat->isOne())
	    return false;
	  return Splat->getValueSizeInBits(0) == ScalarBits;
	}

	/// Return true if N, looked at through bitcasts, is an all-ones constant or
	/// a splat of one, and the constant is exactly as wide as the scalar type.
	bool llvm::isAllOnesOrAllOnesSplat(SDValue N) {
	  N = peekThroughBitcasts(N);
	  const unsigned ScalarBits = N.getScalarValueSizeInBits();
	  ConstantSDNode *Splat = isConstOrConstSplat(N);
	  if (!Splat || !Splat->isAllOnesValue())
	    return false;
	  return Splat->getValueSizeInBits(0) == ScalarBits;
	}

	/// Drop this handle's operands when it is destroyed.
	HandleSDNode::~HandleSDNode() {
	DropOperands();
	}

	/// Construct a node of opcode Opc and value type VT that refers to the
	/// global GA, storing the offset o and the target flags TF.
	GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
	const DebugLoc &DL,
	const GlobalValue *GA, EVT VT,
	int64_t o, unsigned char TF)
	: SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
	TheGlobal = GA;
	}

	/// Construct an ISD::ADDRSPACECAST node of type VT, recording the source
	/// and destination address spaces.
	AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
	EVT VT, unsigned SrcAS,
	unsigned DestAS)
	: SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
	SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}

	/// Construct a memory node with memory type memvt and memory operand mmo,
	/// copying the operand's volatile / non-temporal / dereferenceable /
	/// invariant flags into MemSDNodeBits.
	MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
	SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
	: SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
	MemSDNodeBits.IsVolatile = MMO->isVolatile();
	MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
	MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
	MemSDNodeBits.IsInvariant = MMO->isInvariant();

	// We check here that the size of the memory operand fits within the size of
	// the MMO. This is because the MMO might indicate only a possible address
	// range instead of specifying the affected memory addresses precisely.
	assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
	}

	/// Profile - Gather unique data for the node by adding this node's
	/// identifying data to the folding-set ID.
	///
	void SDNode::Profile(FoldingSetNodeID &ID) const {
	AddNodeIDNode(ID, this);
	}

	namespace {

	struct EVTArray {
	std::vector<EVT> VTs;

	EVTArray() {
	VTs.reserve(MVT::LAST_VALUETYPE);
	for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
	VTs.push_back(MVT((MVT::SimpleValueType)i));
	}
	};

	} // end anonymous namespace

	// Set of extended value types; SDNode::getValueTypeList hands out the
	// addresses of its elements.
	static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
	// One EVT per simple value type, indexed by the simple type number.
	static ManagedStatic<EVTArray> SimpleVTArray;
	// Held while inserting into EVTs.
	static ManagedStatic<sys::SmartMutex<true>> VTMutex;

	/// getValueTypeList - Return a pointer to a stored copy of the specified
	/// value type. Simple types come from a fixed table; extended types are
	/// inserted into a shared set while holding VTMutex.
	///
	const EVT *SDNode::getValueTypeList(EVT VT) {
	  if (!VT.isExtended()) {
	    assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
	           "Value type out of range!");
	    return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
	  }

	  sys::SmartScopedLock<true> Lock(*VTMutex);
	  auto Inserted = EVTs->insert(VT);
	  return &(*Inserted.first);
	}

	/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
	/// indicated value. This method ignores uses of other values defined by this
	/// operation.
	bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
	assert(Value < getNumValues() && "Bad value!");

	// TODO: Only iterate over uses of a given value of the node
	for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
	if (UI.getUse().getResNo() == Value) {
	if (NUses == 0)
	return false;
	--NUses;
	}
	}

	// Found exactly the right number of uses?
	return NUses == 0;
	}

	/// hasAnyUseOfValue - Return true if there are any use of the indicated
	/// value. This method ignores uses of other values defined by this operation.
	bool SDNode::hasAnyUseOfValue(unsigned Value) const {
	assert(Value < getNumValues() && "Bad value!");

	for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
	if (UI.getUse().getResNo() == Value)
	return true;

	return false;
	}

	/// isOnlyUserOf - Return true if this node is the only use of N, i.e. N has
	/// at least one use and every use belongs to this node.
	bool SDNode::isOnlyUserOf(const SDNode *N) const {
	  bool Seen = false;
	  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
	    // Dereferencing the use iterator yields the using node.
	    SDNode *User = *I;
	    if (User == this)
	      Seen = true;
	    else
	      return false;
	  }

	  return Seen;
	}

	/// Return true if the only users of N are contained in Nodes. Returns false
	/// when N has no users at all.
	bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
	  bool Seen = false;
	  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
	    // Dereferencing the use iterator yields the using node.
	    SDNode *User = *I;
	    if (llvm::any_of(Nodes,
	                     [&User](const SDNode *Node) { return User == Node; }))
	      Seen = true;
	    else
	      return false;
	  }

	  return Seen;
	}

	/// isOperand - Return true if this node is an operand of N.
	bool SDValue::isOperandOf(const SDNode *N) const {
	return any_of(N->op_values(), [this](SDValue Op) { return *this == Op; });
	}

	bool SDNode::isOperandOf(const SDNode *N) const {
	return any_of(N->op_values(),
	[this](SDValue Op) { return this == Op.getNode(); });
	}

	/// reachesChainWithoutSideEffects - Return true if this operand (which must
	/// be a chain) reaches the specified operand without crossing any
	/// side-effecting instructions on any chain path. In practice, this looks
	/// through token factors and non-volatile loads. In order to remain efficient,
	/// this only looks a couple of nodes in, it does not do an exhaustive search.
	///
	/// Note that we only need to examine chains when we're searching for
	/// side-effects; SelectionDAG requires that all side-effects are represented
	/// by chains, even if another operand would force a specific ordering. This
	/// constraint is necessary to allow transformations like splitting loads.
	///
	/// Depth bounds the recursion: it is decremented on every step and the
	/// search gives up (returns false) when it reaches zero.
	bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
	                                             unsigned Depth) const {
	  if (*this == Dest) return true;

	  // Don't search too deeply, we just want to be able to see through
	  // TokenFactor's etc.
	  if (Depth == 0) return false;

	  // If this is a token factor, all inputs to the TF happen in parallel.
	  if (getOpcode() == ISD::TokenFactor) {
	    // First, try a shallow search.
	    if (is_contained((*this)->ops(), Dest)) {
	      // We found the chain we want as an operand of this TokenFactor.
	      // Essentially, we reach the chain without side-effects if we could
	      // serialize the TokenFactor into a simple chain of operations with
	      // Dest as the last operation. This is automatically true if the
	      // chain has one use: there are no other ordering constraints.
	      // If the chain has more than one use, we give up: some other
	      // use of Dest might force a side-effect between Dest and the current
	      // node.
	      if (Dest.hasOneUse())
	        return true;
	    }
	    // Next, try a deep search: check whether every operand of the TokenFactor
	    // reaches Dest.
	    return llvm::all_of((*this)->ops(), [=](SDValue Op) {
	      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
	    });
	  }

	  // Loads don't have side effects, look through them.
	  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
	    if (!Ld->isVolatile())
	      return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
	  }
	  return false;
	}

	bool SDNode::hasPredecessor(const SDNode *N) const {
	SmallPtrSet<const SDNode *, 32> Visited;
	SmallVector<const SDNode *, 16> Worklist;
	Worklist.push_back(this);
	return hasPredecessorHelper(N, Visited, Worklist);
	}

	/// Intersect this node's flags with Flags, storing the result in this node.
	void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
	this->Flags.intersectWith(Flags);
	}

	/// Match a log2(NumElts)-stage shuffle + binop reduction pyramid that ends in
	/// an extract of element 0. On success, BinOp is set to the matched opcode
	/// (one of CandidateBinOps) and the vector feeding the pyramid is returned;
	/// otherwise an empty SDValue is returned and BinOp is left untouched.
	SDValue
	SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
	                                  ArrayRef<ISD::NodeType> CandidateBinOps) {
	  // The pattern must end in an extract from index 0.
	  if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !isNullConstant(Extract->getOperand(1)))
	    return SDValue();

	  SDValue Op = Extract->getOperand(0);
	  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

	  // Match against one of the candidate binary ops.
	  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
	        return Op.getOpcode() == unsigned(BinOp);
	      }))
	    return SDValue();

	  // At each stage, we're looking for something that looks like:
	  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	  //                               i32 undef, i32 undef, i32 undef, i32 undef>
	  // %a = binop <8 x i32> %op, %s
	  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
	  // we expect something like:
	  // <4,5,6,7,u,u,u,u>
	  // <2,3,u,u,u,u,u,u>
	  // <1,u,u,u,u,u,u,u>
	  unsigned CandidateBinOp = Op.getOpcode();
	  for (unsigned i = 0; i < Stages; ++i) {
	    if (Op.getOpcode() != CandidateBinOp)
	      return SDValue();

	    SDValue Op0 = Op.getOperand(0);
	    SDValue Op1 = Op.getOperand(1);

	    // The shuffle may be either operand; continue with the other one.
	    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Op0);
	    if (Shuffle) {
	      Op = Op1;
	    } else {
	      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op1);
	      Op = Op0;
	    }

	    // The first operand of the shuffle should be the same as the other operand
	    // of the binop.
	    if (!Shuffle || Shuffle->getOperand(0) != Op)
	      return SDValue();

	    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
	    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
	      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
	        return SDValue();
	  }

	  BinOp = (ISD::NodeType)CandidateBinOp;
	  return Op;
	}

	/// Scalarize the single-result vector operation N and return the scalars
	/// as a build vector of ResNE elements. ResNE == 0 means "as many elements
	/// as N has"; if ResNE exceeds N's element count the tail is padded with
	/// undef, and if it is smaller only the first ResNE lanes are unrolled.
	SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
	  assert(N->getNumValues() == 1 &&
	         "Can't unroll a vector with multiple results!");

	  EVT VecTy = N->getValueType(0);
	  EVT ScalarTy = VecTy.getVectorElementType();
	  unsigned NumLanes = VecTy.getVectorNumElements();
	  const unsigned Opc = N->getOpcode();
	  const unsigned NumOps = N->getNumOperands();
	  SDLoc dl(N);

	  // If ResNE is 0, fully unroll the vector op.
	  if (ResNE == 0)
	    ResNE = NumLanes;
	  else if (NumLanes > ResNE)
	    NumLanes = ResNE;

	  SmallVector<SDValue, 8> Scalars;
	  SmallVector<SDValue, 4> Operands(NumOps);

	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    // Collect this lane's operands: vector operands contribute one
	    // extracted element, scalar operands are used unchanged.
	    for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
	      SDValue Operand = N->getOperand(OpIdx);
	      EVT OperandVT = Operand.getValueType();
	      if (!OperandVT.isVector()) {
	        Operands[OpIdx] = Operand;
	        continue;
	      }
	      SDValue LaneIdx =
	          getConstant(Lane, dl, TLI->getVectorIdxTy(getDataLayout()));
	      Operands[OpIdx] =
	          getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                  OperandVT.getVectorElementType(), Operand, LaneIdx);
	    }

	    // Build the scalar form of the operation for this lane.
	    SDValue Scalar;
	    switch (Opc) {
	    case ISD::VSELECT:
	      Scalar = getNode(ISD::SELECT, dl, ScalarTy, Operands);
	      break;
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	    case ISD::ROTL:
	    case ISD::ROTR: {
	      SDValue Amount =
	          getShiftAmountOperand(Operands[0].getValueType(), Operands[1]);
	      Scalar = getNode(Opc, dl, ScalarTy, Operands[0], Amount);
	      break;
	    }
	    case ISD::SIGN_EXTEND_INREG:
	    case ISD::FP_ROUND_INREG: {
	      EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
	      Scalar = getNode(Opc, dl, ScalarTy, Operands[0], getValueType(ExtVT));
	      break;
	    }
	    default:
	      Scalar = getNode(Opc, dl, ScalarTy, Operands, N->getFlags());
	      break;
	    }
	    Scalars.push_back(Scalar);
	  }

	  // Pad the result up to ResNE elements with undef.
	  for (unsigned Pad = NumLanes; Pad < ResNE; ++Pad)
	    Scalars.push_back(getUNDEF(ScalarTy));

	  EVT ResultTy = EVT::getVectorVT(*getContext(), ScalarTy, ResNE);
	  return getBuildVector(ResultTy, dl, Scalars);
	}

	/// Scalarize the vector overflow operation N (UADDO/SADDO/USUBO/SSUBO/
	/// UMULO/SMULO) and return the {result, overflow} build vectors, each with
	/// ResNE elements. ResNE == 0 means "as many elements as N has"; extra
	/// elements are filled with undef.
	std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
	    SDNode *N, unsigned ResNE) {
	  unsigned Opcode = N->getOpcode();
	  assert((Opcode == ISD::UADDO || Opcode == ISD::SADDO ||
	          Opcode == ISD::USUBO || Opcode == ISD::SSUBO ||
	          Opcode == ISD::UMULO || Opcode == ISD::SMULO) &&
	         "Expected an overflow opcode");

	  EVT ResVT = N->getValueType(0);
	  EVT OvVT = N->getValueType(1);
	  EVT ResEltVT = ResVT.getVectorElementType();
	  EVT OvEltVT = OvVT.getVectorElementType();
	  SDLoc dl(N);

	  // If ResNE is 0, fully unroll the vector op.
	  unsigned NE = ResVT.getVectorNumElements();
	  if (ResNE == 0)
	    ResNE = NE;
	  else if (NE > ResNE)
	    NE = ResNE;

	  SmallVector<SDValue, 8> LHSScalars;
	  SmallVector<SDValue, 8> RHSScalars;
	  ExtractVectorElements(N->getOperand(0), LHSScalars, 0, NE);
	  ExtractVectorElements(N->getOperand(1), RHSScalars, 0, NE);

	  EVT SVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), ResEltVT);
	  SDVTList VTs = getVTList(ResEltVT, SVT);
	  SmallVector<SDValue, 8> ResScalars;
	  SmallVector<SDValue, 8> OvScalars;
	  for (unsigned i = 0; i < NE; ++i) {
	    SDValue Res = getNode(Opcode, dl, VTs, LHSScalars[i], RHSScalars[i]);
	    // Convert the scalar overflow flag into a value of the vector's
	    // overflow element type.
	    SDValue Ov =
	        getSelect(dl, OvEltVT, Res.getValue(1),
	                  getBoolConstant(true, dl, OvEltVT, ResVT),
	                  getConstant(0, dl, OvEltVT));

	    ResScalars.push_back(Res);
	    OvScalars.push_back(Ov);
	  }

	  ResScalars.append(ResNE - NE, getUNDEF(ResEltVT));
	  OvScalars.append(ResNE - NE, getUNDEF(OvEltVT));

	  EVT NewResVT = EVT::getVectorVT(*getContext(), ResEltVT, ResNE);
	  EVT NewOvVT = EVT::getVectorVT(*getContext(), OvEltVT, ResNE);
	  return std::make_pair(getBuildVector(NewResVT, dl, ResScalars),
	                        getBuildVector(NewOvVT, dl, OvScalars));
	}

	/// Return true if LD is a non-volatile, unindexed load of Bytes bytes that
	/// shares Base's chain and whose address is exactly Dist * Bytes away from
	/// the address of the (also non-volatile, unindexed) load Base. Dist may be
	/// negative.
	bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
	                                                  LoadSDNode *Base,
	                                                  unsigned Bytes,
	                                                  int Dist) const {
	  if (LD->isVolatile() || Base->isVolatile())
	    return false;
	  if (LD->isIndexed() || Base->isIndexed())
	    return false;
	  if (LD->getChain() != Base->getChain())
	    return false;
	  EVT VT = LD->getValueType(0);
	  if (VT.getSizeInBits() / 8 != Bytes)
	    return false;

	  auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
	  auto LocDecomp = BaseIndexOffset::match(LD, *this);

	  int64_t Offset = 0;
	  if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset)) {
	    // Multiply in 64-bit signed arithmetic. `Dist * Bytes` would be computed
	    // as unsigned, so a negative Dist could never compare equal to a
	    // negative Offset.
	    return (Dist * static_cast<int64_t>(Bytes) == Offset);
	  }
	  return false;
	}

	/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
	/// it cannot be inferred. Handles a global address plus constant offset and
	/// a frame index, optionally plus a constant offset.
	unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
	  // If this is a GlobalAddress + cst, return the alignment.
	  const GlobalValue *GV = nullptr;
	  int64_t GVOffset = 0;
	  if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
	    unsigned IdxWidth = getDataLayout().getIndexTypeSizeInBits(GV->getType());
	    KnownBits Known(IdxWidth);
	    llvm::computeKnownBits(GV, Known, getDataLayout());
	    unsigned AlignBits = Known.countMinTrailingZeros();
	    // Shift an unsigned one: the shift amount can be 31, which would move a
	    // signed 1 into the sign bit.
	    unsigned Align = AlignBits ? 1u << std::min(31U, AlignBits) : 0;
	    if (Align)
	      return MinAlign(Align, GVOffset);
	  }

	  // If this is a direct reference to a stack slot, use information about the
	  // stack slot's alignment.
	  int FrameIdx = INT_MIN; // INT_MIN marks "no frame index found".
	  int64_t FrameOffset = 0;
	  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
	    FrameIdx = FI->getIndex();
	  } else if (isBaseWithConstantOffset(Ptr) &&
	             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	    // Handle FI+Cst
	    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	    FrameOffset = Ptr.getConstantOperandVal(1);
	  }

	  if (FrameIdx != INT_MIN) {
	    const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
	    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
	                                    FrameOffset);
	    return FIInfoAlign;
	  }

	  return 0;
	}

	/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
	/// which is split (or expanded) into two not necessarily identical pieces.
	std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
	// Currently all types are split in half.
	EVT LoVT, HiVT;
	if (!VT.isVector())
	LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
	else
	LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());

	return std::make_pair(LoVT, HiVT);
	}

	/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
	/// low/high part.
	std::pair<SDValue, SDValue>
	SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
	const EVT &HiVT) {
	assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
	N.getValueType().getVectorNumElements() &&
	"More vector elements requested than available!");
	SDValue Lo, Hi;
	Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
	getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
	Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
	getConstant(LoVT.getVectorNumElements(), DL,
	TLI->getVectorIdxTy(getDataLayout())));
	return std::make_pair(Lo, Hi);
	}

	/// Widen the vector up to the next power of two using INSERT_SUBVECTOR: N is
	/// inserted at index 0 of an undef vector of the wider type.
	SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
	  EVT NarrowVT = N.getValueType();
	  unsigned WideNumElts = NextPowerOf2(NarrowVT.getVectorNumElements());
	  EVT WideVT = EVT::getVectorVT(*getContext(),
	                                NarrowVT.getVectorElementType(), WideNumElts);
	  return getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, getUNDEF(WideVT), N,
	                 getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
	}

	/// Append to Args one EXTRACT_VECTOR_ELT node per element of Op, for Count
	/// consecutive elements beginning at Start. A Count of 0 is replaced by the
	/// number of elements of Op's vector type.
	void SelectionDAG::ExtractVectorElements(SDValue Op,
	                                         SmallVectorImpl<SDValue> &Args,
	                                         unsigned Start, unsigned Count) {
	  EVT VecVT = Op.getValueType();
	  if (Count == 0)
	    Count = VecVT.getVectorNumElements();

	  EVT ScalarVT = VecVT.getVectorElementType();
	  EVT IndexVT = TLI->getVectorIdxTy(getDataLayout());
	  SDLoc SL(Op);
	  for (unsigned Off = 0; Off != Count; ++Off) {
	    SDValue EltIdx = getConstant(Start + Off, SL, IndexVT);
	    Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, ScalarVT, Op, EltIdx));
	  }
	}

	// getAddressSpace - Return the address space this GlobalAddress belongs to,
	// taken from the type of the referenced global.
	unsigned GlobalAddressSDNode::getAddressSpace() const {
	return getGlobal()->getType()->getAddressSpace();
	}

	/// Return the type of the constant-pool entry, read from whichever of the
	/// machine-specific or plain constant value this node holds.
	Type *ConstantPoolSDNode::getType() const {
	  return isMachineConstantPoolEntry() ? Val.MachineCPVal->getType()
	                                      : Val.ConstVal->getType();
	}

	/// Check whether this build vector consists only of constants and undefs
	/// and, if so, find the smallest element size (not below MinSplatBits, and
	/// not below 8 bits) whose repetition produces the whole vector. On success
	/// returns true with SplatValue / SplatUndef holding the value and undef
	/// bits of one splat element, SplatBitSize its width, and HasAnyUndefs
	/// telling whether any bit was undef. IsBigEndian reverses the order in
	/// which operands are laid out in the bit image.
	bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
	                                        unsigned &SplatBitSize,
	                                        bool &HasAnyUndefs,
	                                        unsigned MinSplatBits,
	                                        bool IsBigEndian) const {
	  EVT VT = getValueType(0);
	  assert(VT.isVector() && "Expected a vector type");
	  unsigned VecWidth = VT.getSizeInBits();
	  if (MinSplatBits > VecWidth)
	    return false;

	  // FIXME: The widths are based on this node's type, but build vectors can
	  // truncate their operands.
	  SplatValue = APInt(VecWidth, 0);
	  SplatUndef = APInt(VecWidth, 0);

	  // Get the bits. Bits with undefined values (when the corresponding element
	  // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
	  // in SplatValue. If any of the values are not constant, give up and return
	  // false.
	  unsigned int NumOps = getNumOperands();
	  assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
	  unsigned EltWidth = VT.getScalarSizeInBits();

	  for (unsigned j = 0; j < NumOps; ++j) {
	    unsigned i = IsBigEndian ? NumOps - 1 - j : j;
	    SDValue OpVal = getOperand(i);
	    unsigned BitPos = j * EltWidth;

	    if (OpVal.isUndef())
	      SplatUndef.setBits(BitPos, BitPos + EltWidth);
	    else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
	      SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
	    else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
	      SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
	    else
	      return false;
	  }

	  // The build_vector is all constants or undefs. Find the smallest element
	  // size that splats the vector.
	  HasAnyUndefs = (SplatUndef != 0);

	  // FIXME: This does not work for vectors with elements less than 8 bits.
	  while (VecWidth > 8) {
	    unsigned HalfSize = VecWidth / 2;
	    APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
	    APInt LowValue = SplatValue.trunc(HalfSize);
	    APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
	    APInt LowUndef = SplatUndef.trunc(HalfSize);

	    // If the two halves do not match (ignoring undef bits), stop here.
	    if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
	        MinSplatBits > HalfSize)
	      break;

	    // Merge the halves: a bit is defined if either half defines it, and
	    // undef only if both halves leave it undef.
	    SplatValue = HighValue | LowValue;
	    SplatUndef = HighUndef & LowUndef;

	    VecWidth = HalfSize;
	  }

	  SplatBitSize = VecWidth;
	  return true;
	}

	/// Return the single value shared by every demanded, non-undef operand, or an
	/// empty SDValue if the demanded operands disagree or nothing is demanded.
	/// If all demanded operands are undef, the first demanded (undef) operand is
	/// returned. When \p UndefElements is non-null it is reset to one bit per
	/// operand and the bits of the demanded undef operands seen are set.
	SDValue BuildVectorSDNode::getSplatValue(const APInt &DemandedElts,
	                                         BitVector *UndefElements) const {
	  if (UndefElements) {
	    UndefElements->clear();
	    UndefElements->resize(getNumOperands());
	  }
	  assert(getNumOperands() == DemandedElts.getBitWidth() &&
	         "Unexpected vector size");
	  if (!DemandedElts)
	    return SDValue();

	  const unsigned NumElts = getNumOperands();
	  SDValue Common;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    if (!DemandedElts[Idx])
	      continue;

	    SDValue Elt = getOperand(Idx);
	    if (Elt.isUndef()) {
	      // Undef operands never break a splat; just record them if asked to.
	      if (UndefElements)
	        (*UndefElements)[Idx] = true;
	      continue;
	    }

	    if (!Common) {
	      Common = Elt;
	      continue;
	    }
	    if (Common != Elt)
	      return SDValue();
	  }

	  if (Common)
	    return Common;

	  // Only undef operands were demanded: hand back the first of them.
	  unsigned FirstDemandedIdx = DemandedElts.countTrailingZeros();
	  assert(getOperand(FirstDemandedIdx).isUndef() &&
	         "Can only have a splat without a constant for all undefs.");
	  return getOperand(FirstDemandedIdx);
	}

	/// Splat query over the whole vector: demand every operand and defer to the
	/// demanded-elements overload.
	SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
	  const APInt AllElts = APInt::getAllOnesValue(getNumOperands());
	  return getSplatValue(AllElts, UndefElements);
	}

	/// Return the splat value of the demanded elements as a ConstantSDNode, or
	/// null if there is no splat value or it is not an integer constant node.
	ConstantSDNode *
	BuildVectorSDNode::getConstantSplatNode(const APInt &DemandedElts,
	                                        BitVector *UndefElements) const {
	  const SDValue Splat = getSplatValue(DemandedElts, UndefElements);
	  return dyn_cast_or_null<ConstantSDNode>(Splat);
	}

	/// Return the splat value of the whole vector as a ConstantSDNode, or null if
	/// there is no splat value or it is not an integer constant node.
	ConstantSDNode *
	BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
	  const SDValue Splat = getSplatValue(UndefElements);
	  return dyn_cast_or_null<ConstantSDNode>(Splat);
	}

	/// Return the splat value of the demanded elements as a ConstantFPSDNode, or
	/// null if there is no splat value or it is not an FP constant node.
	ConstantFPSDNode *
	BuildVectorSDNode::getConstantFPSplatNode(const APInt &DemandedElts,
	                                          BitVector *UndefElements) const {
	  const SDValue Splat = getSplatValue(DemandedElts, UndefElements);
	  return dyn_cast_or_null<ConstantFPSDNode>(Splat);
	}

	/// Return the splat value of the whole vector as a ConstantFPSDNode, or null
	/// if there is no splat value or it is not an FP constant node.
	ConstantFPSDNode *
	BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
	  const SDValue Splat = getSplatValue(UndefElements);
	  return dyn_cast_or_null<ConstantFPSDNode>(Splat);
	}

	/// If this build vector is a splat of an FP constant that converts exactly
	/// (rounding toward zero) to an integer of \p BitWidth bits, return the
	/// result of exactLogBase2() on that integer; otherwise return -1.
	///
	/// \param UndefElements Forwarded to getSplatValue(); may be null.
	/// \param BitWidth      Width of the integer the FP value is converted to.
	/// \returns -1 when there is no FP-constant splat or the conversion is not
	/// exact. NOTE(review): exactLogBase2() presumably also yields -1 when the
	/// integer is not a power of two - confirm against the APInt documentation.
	int32_t
	BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
	                                                   uint32_t BitWidth) const {
	  if (ConstantFPSDNode *CN =
	          dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
	    bool IsExact;
	    APSInt IntVal(BitWidth);
	    const APFloat &APF = CN->getValueAPF();
	    // Reject anything that does not convert cleanly and exactly.
	    if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
	            APFloat::opOK ||
	        !IsExact)
	      return -1;

	    return IntVal.exactLogBase2();
	  }
	  return -1;
	}

	/// True when every operand is an UNDEF, Constant or ConstantFP node.
	bool BuildVectorSDNode::isConstant() const {
	  for (const SDValue &Operand : op_values()) {
	    switch (Operand.getOpcode()) {
	    case ISD::UNDEF:
	    case ISD::Constant:
	    case ISD::ConstantFP:
	      break;
	    default:
	      return false;
	    }
	  }
	  return true;
	}

	/// Return true if every non-negative entry of \p Mask (negative entries are
	/// treated as undef) names the same index. \p VT supplies the number of mask
	/// entries to inspect.
	bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
	  const unsigned NumElts = VT.getVectorNumElements();

	  // Skip the leading undef entries.
	  unsigned Pos = 0;
	  while (Pos != NumElts && Mask[Pos] < 0)
	    ++Pos;

	  // If all elements are undefined, this shuffle can be considered a splat
	  // (although it should eventually get simplified away completely).
	  if (Pos == NumElts)
	    return true;

	  // Every later entry must be undef or equal to the first defined one.
	  const int SplatIdx = Mask[Pos];
	  for (++Pos; Pos != NumElts; ++Pos) {
	    const int Entry = Mask[Pos];
	    if (Entry >= 0 && Entry != SplatIdx)
	      return false;
	  }
	  return true;
	}

	// Returns the node behind N if it is a constant integer, a build vector of
	// constant integers, or a GlobalAddress for which the target allows offset
	// folding; returns null otherwise.
	SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
	  SDNode *Node = N.getNode();
	  if (isa<ConstantSDNode>(N) || ISD::isBuildVectorOfConstantSDNodes(Node))
	    return Node;

	  // Treat a GlobalAddress supporting constant offset folding as a
	  // constant integer.
	  auto *GA = dyn_cast<GlobalAddressSDNode>(N);
	  if (!GA)
	    return nullptr;
	  if (GA->getOpcode() != ISD::GlobalAddress)
	    return nullptr;
	  if (!TLI->isOffsetFoldingLegal(GA))
	    return nullptr;
	  return GA;
	}

	/// Returns the node behind N if it is an FP constant or a build vector of FP
	/// constants; returns null otherwise.
	SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) {
	  SDNode *Node = N.getNode();
	  const bool IsFPConstant =
	      isa<ConstantFPSDNode>(N) || ISD::isBuildVectorOfConstantFPSDNodes(Node);
	  return IsFPConstant ? Node : nullptr;
	}

	/// Allocate the operand array for \p Node, attach one use per entry of
	/// \p Vals, and compute the node's divergence bit.
	///
	/// \param Node Node that must not have an operand list yet.
	/// \param Vals Operand values; must not exceed SDNode::getMaxNumOperands().
	void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
	  assert(!Node->OperandList && "Node already has operands");
	  assert(SDNode::getMaxNumOperands() >= Vals.size() &&
	         "too many operands to fit into SDNode");
	  SDUse *Ops = OperandRecycler.allocate(
	      ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);

	  bool IsDivergent = false;
	  for (unsigned I = 0; I != Vals.size(); ++I) {
	    Ops[I].setUser(Node);
	    Ops[I].setInitial(Vals[I]);
	    if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
	      IsDivergent = IsDivergent || Ops[I].getNode()->isDivergent();
	  }
	  Node->NumOperands = Vals.size();
	  Node->OperandList = Ops;
	  // The node is divergent if an operand is, or if the target says the node
	  // itself is a source of divergence.
	  IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA);
	  // Nodes the target reports as always uniform keep their existing bit.
	  if (!TLI->isSDNodeAlwaysUniform(Node))
	    Node->SDNodeBits.IsDivergent = IsDivergent;
	  checkForCycles(Node);
	}

	/// Build a TokenFactor over \p Vals. While the list is longer than a single
	/// node can hold, the trailing SDNode::getMaxNumOperands() values are merged
	/// into an intermediate TokenFactor that replaces them in \p Vals, so the
	/// caller's vector is modified.
	SDValue SelectionDAG::getTokenFactor(const SDLoc &DL,
	                                     SmallVectorImpl<SDValue> &Vals) {
	  const size_t MaxOps = SDNode::getMaxNumOperands();
	  while (Vals.size() > MaxOps) {
	    const unsigned TailStart = Vals.size() - MaxOps;
	    // The slice views Vals directly, so build the node before erasing.
	    auto Tail = ArrayRef<SDValue>(Vals).slice(TailStart, MaxOps);
	    SDValue Merged = getNode(ISD::TokenFactor, DL, MVT::Other, Tail);
	    Vals.erase(Vals.begin() + TailStart, Vals.end());
	    Vals.emplace_back(Merged);
	  }
	  return getNode(ISD::TokenFactor, DL, MVT::Other, Vals);
	}

	#ifndef NDEBUG
	/// Depth-first walk over the operands of \p N that aborts the process if a
	/// node is reached again while it is still on the current walk (a cycle).
	/// \p Visited holds the nodes on the current walk; \p Checked holds nodes
	/// whose operands have already been fully explored.
	static void checkForCyclesHelper(const SDNode *N,
	                                 SmallPtrSetImpl<const SDNode*> &Visited,
	                                 SmallPtrSetImpl<const SDNode*> &Checked,
	                                 const llvm::SelectionDAG *DAG) {
	  // Nodes that were fully explored before need no second look.
	  if (Checked.count(N))
	    return;

	  // A failed insert means N is already on the current walk: a cycle.
	  const bool FirstVisit = Visited.insert(N).second;
	  if (!FirstVisit) {
	    errs() << "Detected cycle in SelectionDAG\n";
	    dbgs() << "Offending node:\n";
	    N->dumprFull(DAG);
	    dbgs() << "\n";
	    abort();
	  }

	  for (const SDValue &Operand : N->op_values())
	    checkForCyclesHelper(Operand.getNode(), Visited, Checked, DAG);

	  // Done with N: mark it explored and take it off the current walk.
	  Checked.insert(N);
	  Visited.erase(N);
	}
	#endif

	/// Run the cycle check starting at \p N. The function body is empty when
	/// NDEBUG is defined. Otherwise the check runs when \p force is true, and
	/// always when EXPENSIVE_CHECKS is defined.
	void llvm::checkForCycles(const llvm::SDNode *N,
	                          const llvm::SelectionDAG *DAG,
	                          bool force) {
	#ifndef NDEBUG
	  bool ShouldCheck = force;
	#ifdef EXPENSIVE_CHECKS
	  ShouldCheck = true;
	#endif // EXPENSIVE_CHECKS
	  if (!ShouldCheck)
	    return;

	  assert(N && "Checking nonexistent SDNode");
	  SmallPtrSet<const SDNode*, 32> OnCurrentWalk;
	  SmallPtrSet<const SDNode*, 32> FullyExplored;
	  checkForCyclesHelper(N, OnCurrentWalk, FullyExplored, DAG);
	#endif // !NDEBUG
	}

	/// Run the cycle check starting from the root node of \p DAG.
	void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
	  const SDNode *Root = DAG->getRoot().getNode();
	  checkForCycles(Root, DAG, force);
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 351303)
	@@ -1,10458 +1,10479 @@
	//===- SelectionDAGBuilder.cpp - Selection-DAG building -------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements routines for translating from LLVM IR into SelectionDAG IR.
	//
	//===----------------------------------------------------------------------===//

	#include "SelectionDAGBuilder.h"
	#include "SDNodeDbgValue.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/BranchProbabilityInfo.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/Analysis/Loads.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/FunctionLoweringInfo.h"
	#include "llvm/CodeGen/GCMetadata.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/CodeGen/SwiftErrorValueTracking.h"
	#include "llvm/CodeGen/TargetFrameLowering.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetIntrinsicInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <iterator>
	#include <limits>
	#include <numeric>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace PatternMatch;
	using namespace SwitchCG;

	// Debug category name for this file.
	// NOTE(review): presumably consumed by LLVM's debug-output macros (e.g.
	// LLVM_DEBUG with -debug-only=isel) - confirm; no use is visible here.
	#define DEBUG_TYPE "isel"

	/// LimitFloatPrecision - Storage for the hidden "limit-float-precision"
	/// option below (default 0). It requests low-precision inline sequences for
	/// some float libcalls (6, 8 or 12 bits).
	static unsigned LimitFloatPrecision;

	static cl::opt<unsigned, true> LimitFPPrecision(
	    "limit-float-precision",
	    cl::desc("Generate low-precision inline sequences for some float libcalls"),
	    cl::location(LimitFloatPrecision), cl::Hidden, cl::init(0));

	/// Hidden "switch-peel-threshold" option (default 66): the case probability
	/// threshold used when deciding whether to peel a case from a switch.
	static cl::opt<unsigned> SwitchPeelThreshold(
	    "switch-peel-threshold", cl::Hidden, cl::init(66),
	    cl::desc("Set the case probability threshold for peeling the case "
	             "from a switch statement. A value greater than 100 will "
	             "void this optimization"));

	// Upper bound on the width of DAG chains. Bounding the width keeps
	// DAG-based analyses (for example alias analysis and load clustering) from
	// taking unreasonable time. Recognizing and avoiding the problem inside each
	// individual analysis is difficult, and future analyses are likely to behave
	// the same way, so limiting the width is the safe approach; it will matter
	// even more with global DAGs.
	//
	// The default is arbitrarily high so that optimization is not affected, but
	// it could be lowered to improve compile time. Any ld-ld-st-st sequence
	// longer than this should already have been turned into llvm.memcpy by the
	// frontend. Such a sequence is easy to provoke with .ll code such as:
	// %buffer = alloca [4096 x i8]
	// %data = load [4096 x i8]* %argPtr
	// store [4096 x i8] %data, [4096 x i8]* %buffer
	static const unsigned MaxParallelChains = 64;

	// Return the calling convention to use when the given Value needs ABI
	// mangling, i.e. when it is a return instruction (the enclosing function's
	// convention) or a call that is neither inline asm nor a call to an
	// intrinsic (the call's convention). Return None in every other case.
	static Optional<CallingConv::ID> getABIRegCopyCC(const Value *V) {
	  if (const auto *Ret = dyn_cast<ReturnInst>(V))
	    return Ret->getParent()->getParent()->getCallingConv();

	  const auto *Call = dyn_cast<CallInst>(V);
	  if (!Call || Call->isInlineAsm())
	    return None;

	  // getCalledFunction() is null for an indirect call; such calls are never
	  // treated as intrinsic calls.
	  const Function *Callee = Call->getCalledFunction();
	  const bool IsIntrinsicCall =
	      Callee && Callee->getIntrinsicID() != Intrinsic::not_intrinsic;
	  if (IsIntrinsicCall)
	    return None;

	  return Call->getCallingConv();
	}

	// Forward declaration: getCopyFromParts below hands vector value types to
	// this helper before its definition appears.
	static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	const SDValue *Parts, unsigned NumParts,
	MVT PartVT, EVT ValueVT, const Value *V,
	Optional<CallingConv::ID> CC);

	/// getCopyFromParts - Create a value that contains the specified legal parts
	/// combined into the value they represent. If the parts combine to a type
	/// larger than ValueVT then AssertOp can be used to specify whether the extra
	/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
	/// (ISD::AssertSext).
	///
	/// \param DAG      DAG in which the new nodes are created.
	/// \param DL       Location attached to every node created here.
	/// \param Parts    Array holding at least \p NumParts part values.
	/// \param NumParts Number of parts to combine; must be non-zero.
	/// \param PartVT   Type of each part.
	/// \param ValueVT  Type of the value to produce. Vector types are forwarded
	///                 to getCopyFromPartsVector.
	/// \param V        IR value being lowered; only passed on to the helpers.
	/// \param CC       Optional calling convention; only passed on to the helpers.
	/// \param AssertOp Optional assertion node kind applied before truncating an
	///                 over-wide integer part.
	/// \returns The assembled value of type \p ValueVT. Calls report_fatal_error
	/// when no known conversion from the assembled part to \p ValueVT applies.
	static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
	                                const SDValue *Parts, unsigned NumParts,
	                                MVT PartVT, EVT ValueVT, const Value *V,
	                                Optional<CallingConv::ID> CC = None,
	                                Optional<ISD::NodeType> AssertOp = None) {
	  if (ValueVT.isVector())
	    return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
	                                  CC);

	  assert(NumParts > 0 && "No parts to assemble!");
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Val = Parts[0];

	  if (NumParts > 1) {
	    // Assemble the value from multiple parts.
	    if (ValueVT.isInteger()) {
	      unsigned PartBits = PartVT.getSizeInBits();
	      unsigned ValueBits = ValueVT.getSizeInBits();

	      // Assemble the power of 2 part.
	      unsigned RoundParts =
	          (NumParts & (NumParts - 1)) ? 1 << Log2_32(NumParts) : NumParts;
	      unsigned RoundBits = PartBits * RoundParts;
	      EVT RoundVT = RoundBits == ValueBits ?
	        ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
	      SDValue Lo, Hi;

	      EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);

	      if (RoundParts > 2) {
	        // Recursively build each half from its share of the parts.
	        Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
	                              PartVT, HalfVT, V);
	        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
	                              RoundParts / 2, PartVT, HalfVT, V);
	      } else {
	        Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
	        Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
	      }

	      if (DAG.getDataLayout().isBigEndian())
	        std::swap(Lo, Hi);

	      Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);

	      if (RoundParts < NumParts) {
	        // Assemble the trailing non-power-of-2 part.
	        unsigned OddParts = NumParts - RoundParts;
	        EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
	        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT,
	                              OddVT, V, CC);

	        // Combine the round and odd parts.
	        Lo = Val;
	        if (DAG.getDataLayout().isBigEndian())
	          std::swap(Lo, Hi);
	        EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	        Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
	        // Shift Hi above Lo, then OR the two together.
	        Hi =
	            DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
	                        DAG.getConstant(Lo.getValueSizeInBits(), DL,
	                                        TLI.getPointerTy(DAG.getDataLayout())));
	        Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
	        Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
	      }
	    } else if (PartVT.isFloatingPoint()) {
	      // FP split into multiple FP parts (for ppcf128)
	      assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
	             "Unexpected split");
	      SDValue Lo, Hi;
	      Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
	      Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
	      if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
	        std::swap(Lo, Hi);
	      Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
	    } else {
	      // FP split into integer parts (soft fp)
	      assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
	             !PartVT.isVector() && "Unexpected split");
	      EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	      Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC);
	    }
	  }

	  // There is now one part, held in Val. Correct it to match ValueVT.
	  // PartEVT is the type of the register class that holds the value.
	  // ValueVT is the type of the inline asm operation.
	  EVT PartEVT = Val.getValueType();

	  if (PartEVT == ValueVT)
	    return Val;

	  if (PartEVT.isInteger() && ValueVT.isFloatingPoint() &&
	      ValueVT.bitsLT(PartEVT)) {
	    // For an FP value in an integer part, we need to truncate to the right
	    // width first.
	    PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	    Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
	  }

	  // Handle types that have the same size.
	  if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
	    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	  // Handle types with different sizes.
	  if (PartEVT.isInteger() && ValueVT.isInteger()) {
	    if (ValueVT.bitsLT(PartEVT)) {
	      // For a truncate, see if we have any information to
	      // indicate whether the truncated bits will always be
	      // zero or sign-extension.
	      if (AssertOp.hasValue())
	        Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
	                          DAG.getValueType(ValueVT));
	      return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	    }
	    return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
	  }

	  if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
	    // FP_ROUND's are always exact here.
	    if (ValueVT.bitsLT(Val.getValueType()))
	      return DAG.getNode(
	          ISD::FP_ROUND, DL, ValueVT, Val,
	          DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));

	    return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
	  }

	  // Handle MMX to a narrower integer type by bitcasting MMX to integer and
	  // then truncating.
	  if (PartEVT == MVT::x86mmx && ValueVT.isInteger() &&
	      ValueVT.bitsLT(PartEVT)) {
	    Val = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Val);
	    return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	  }

	  report_fatal_error("Unknown mismatch in getCopyFromParts!");
	}

	/// Emit \p ErrMsg as an error on \p Ctx, attached to \p V when \p V is an
	/// instruction. If \p V is a call to inline asm, a hint about a possibly
	/// invalid vector constraint is appended to the message.
	///
	/// \param Ctx    Context used to report the error.
	/// \param V      Value being lowered; may be null or a non-instruction, in
	///               which case the error is emitted without a location.
	/// \param ErrMsg Base error text.
	static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
	                                              const Twine &ErrMsg) {
	  const Instruction *I = dyn_cast_or_null<Instruction>(V);
	  // Test I rather than V: a non-null V that is not an Instruction also leaves
	  // I null, and the dyn_cast below must not be handed a null pointer.
	  if (!I)
	    return Ctx.emitError(ErrMsg);

	  const char *AsmError = ", possible invalid constraint for vector type";
	  if (const CallInst *CI = dyn_cast<CallInst>(I))
	    if (isa<InlineAsm>(CI->getCalledValue()))
	      return Ctx.emitError(I, ErrMsg + AsmError);

	  return Ctx.emitError(I, ErrMsg);
	}

	/// getCopyFromPartsVector - Create a vector value that contains the specified
	/// legal parts combined into the value they represent.
	///
	/// \param Parts    Array holding at least \p NumParts part values.
	/// \param NumParts Number of parts to combine; must be non-zero.
	/// \param PartVT   Type of each part.
	/// \param ValueVT  Vector type of the value to produce.
	/// \param V        IR value being lowered; used for diagnostics and passed
	///                 on to getCopyFromParts.
	/// \param CallConv When set, the vector breakdown is queried for that calling
	///                 convention instead of the default breakdown.
	/// \returns The assembled value, or UNDEF of \p ValueVT after emitting a
	/// diagnostic when a scalar part cannot be converted to the vector type.
	static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	const SDValue *Parts, unsigned NumParts,
	MVT PartVT, EVT ValueVT, const Value *V,
	Optional<CallingConv::ID> CallConv) {
	assert(ValueVT.isVector() && "Not a vector value");
	assert(NumParts > 0 && "No parts to assemble!");
	const bool IsABIRegCopy = CallConv.hasValue();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Val = Parts[0];

	// Handle a multi-element vector.
	if (NumParts > 1) {
	EVT IntermediateVT;
	MVT RegisterVT;
	unsigned NumIntermediates;
	unsigned NumRegs;

	// Ask the target how ValueVT is broken down into registers.
	if (IsABIRegCopy) {
	NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
	*DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT,
	NumIntermediates, RegisterVT);
	} else {
	NumRegs =
	TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
	NumIntermediates, RegisterVT);
	}

	assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
	NumParts = NumRegs; // Silence a compiler warning.
	assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
	assert(RegisterVT.getSizeInBits() ==
	Parts[0].getSimpleValueType().getSizeInBits() &&
	"Part type sizes don't match!");

	// Assemble the parts into intermediate operands.
	SmallVector<SDValue, 8> Ops(NumIntermediates);
	if (NumIntermediates == NumParts) {
	// If the register was not expanded, truncate or copy the value,
	// as appropriate.
	for (unsigned i = 0; i != NumParts; ++i)
	Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
	PartVT, IntermediateVT, V);
	} else if (NumParts > 0) {
	// If the intermediate type was expanded, build the intermediate
	// operands from the parts.
	assert(NumParts % NumIntermediates == 0 &&
	"Must expand into a divisible number of parts!");
	unsigned Factor = NumParts / NumIntermediates;
	for (unsigned i = 0; i != NumIntermediates; ++i)
	Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
	PartVT, IntermediateVT, V);
	}

	// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
	// intermediate operands.
	EVT BuiltVectorTy =
	EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
	(IntermediateVT.isVector()
	? IntermediateVT.getVectorNumElements() * NumParts
	: NumIntermediates));
	Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
	: ISD::BUILD_VECTOR,
	DL, BuiltVectorTy, Ops);
	}

	// There is now one part, held in Val. Correct it to match ValueVT.
	EVT PartEVT = Val.getValueType();

	if (PartEVT == ValueVT)
	return Val;

	if (PartEVT.isVector()) {
	// If the element type of the source/dest vectors are the same, but the
	// parts vector has more elements than the value vector, then we have a
	// vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
	// elements we want.
	if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
	assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
	"Cannot narrow, it would be a lossy transformation");
	return DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	}

	// Vector/Vector bitcast.
	if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
	"Cannot handle this kind of promotion");
	// Promoted vector extract
	return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);

	}

	// From here on the single part is a scalar.
	// Trivial bitcast if the types are the same size and the destination
	// vector type is legal.
	if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
	TLI.isTypeLegal(ValueVT))
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	if (ValueVT.getVectorNumElements() != 1) {
	// Certain ABIs require that vectors are passed as integers. For vectors
	// that are the same size, this is an obvious bitcast.
	if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
	} else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) {
	// Bitcast Val back to a wider vector of the original element type and
	// extract the subvector we want.
	unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits();
	EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
	ValueVT.getVectorElementType(), Elts);
	Val = DAG.getBitcast(WiderVecType, Val);
	return DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	}

	// The scalar is narrower than the vector: report it and return UNDEF.
	diagnosePossiblyInvalidConstraint(
	*DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
	return DAG.getUNDEF(ValueVT);
	}

	// Handle cases such as i8 -> <1 x i1>
	EVT ValueSVT = ValueVT.getVectorElementType();
	if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
	Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
	: DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);

	return DAG.getBuildVector(ValueVT, DL, Val);
	}

	// Forward declaration: getCopyToParts below hands vector values to this
	// helper before its definition appears.
	static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
	SDValue Val, SDValue *Parts, unsigned NumParts,
	MVT PartVT, const Value *V,
	Optional<CallingConv::ID> CallConv);

	/// getCopyToParts - Create a series of nodes that contain the specified value
	/// split into legal parts. If the parts contain more bits than Val, then, for
	/// integers, ExtendKind can be used to specify how to generate the extra bits.
	///
	/// \param DAG        DAG in which the new nodes are created.
	/// \param DL         Location attached to every node created here.
	/// \param Val        Value to split. Vector values are forwarded to
	///                   getCopyToPartsVector.
	/// \param [out] Parts Array with room for \p NumParts entries; receives the
	///                   parts.
	/// \param NumParts   Number of parts to produce; nothing is written if zero.
	/// \param PartVT     Type of each part; must be legal for the target.
	/// \param V          IR value being lowered; used for diagnostics only.
	/// \param CallConv   Optional calling convention; passed on to the helpers.
	/// \param ExtendKind Extension node used when the parts are wider than Val.
	static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
	                           SDValue *Parts, unsigned NumParts, MVT PartVT,
	                           const Value *V,
	                           Optional<CallingConv::ID> CallConv = None,
	                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
	  EVT ValueVT = Val.getValueType();

	  // Handle the vector case separately.
	  if (ValueVT.isVector())
	    return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
	                                CallConv);

	  unsigned PartBits = PartVT.getSizeInBits();
	  // NumParts is reduced below when an odd tail is split off; remember the
	  // full count for the final big-endian reversal.
	  unsigned OrigNumParts = NumParts;
	  assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
	         "Copying to an illegal type!");

	  if (NumParts == 0)
	    return;

	  assert(!ValueVT.isVector() && "Vector case handled elsewhere");
	  EVT PartEVT = PartVT;
	  if (PartEVT == ValueVT) {
	    assert(NumParts == 1 && "No-op copy with multiple parts!");
	    Parts[0] = Val;
	    return;
	  }

	  if (NumParts * PartBits > ValueVT.getSizeInBits()) {
	    // If the parts cover more bits than the value has, promote the value.
	    if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
	      assert(NumParts == 1 && "Do not know what to promote to!");
	      Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
	    } else {
	      if (ValueVT.isFloatingPoint()) {
	        // FP values need to be bitcast, then extended if they are being put
	        // into a larger container.
	        ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	        Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
	      }
	      assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
	             ValueVT.isInteger() &&
	             "Unknown mismatch!");
	      ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	      Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
	      if (PartVT == MVT::x86mmx)
	        Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	    }
	  } else if (PartBits == ValueVT.getSizeInBits()) {
	    // Different types of the same size.
	    assert(NumParts == 1 && PartEVT != ValueVT);
	    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	  } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
	    // If the parts cover less bits than value has, truncate the value.
	    assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
	           ValueVT.isInteger() &&
	           "Unknown mismatch!");
	    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	    if (PartVT == MVT::x86mmx)
	      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	  }

	  // The value may have changed - recompute ValueVT.
	  ValueVT = Val.getValueType();
	  assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
	         "Failed to tile the value with PartVT!");

	  if (NumParts == 1) {
	    if (PartEVT != ValueVT) {
	      diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
	                                        "scalar-to-vector conversion failed");
	      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	    }

	    Parts[0] = Val;
	    return;
	  }

	  // Expand the value into multiple parts.
	  if (NumParts & (NumParts - 1)) {
	    // The number of parts is not a power of 2. Split off and copy the tail.
	    assert(PartVT.isInteger() && ValueVT.isInteger() &&
	           "Do not know what to expand to!");
	    unsigned RoundParts = 1 << Log2_32(NumParts);
	    unsigned RoundBits = RoundParts * PartBits;
	    unsigned OddParts = NumParts - RoundParts;
	    SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
	      DAG.getShiftAmountConstant(RoundBits, ValueVT, DL, /*LegalTypes*/false));

	    getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V,
	                   CallConv);

	    if (DAG.getDataLayout().isBigEndian())
	      // The odd parts were reversed by getCopyToParts - unreverse them.
	      std::reverse(Parts + RoundParts, Parts + NumParts);

	    NumParts = RoundParts;
	    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	  }

	  // The number of parts is a power of 2. Repeatedly bisect the value using
	  // EXTRACT_ELEMENT.
	  Parts[0] = DAG.getNode(ISD::BITCAST, DL,
	                         EVT::getIntegerVT(*DAG.getContext(),
	                                           ValueVT.getSizeInBits()),
	                         Val);

	  for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
	    for (unsigned i = 0; i < NumParts; i += StepSize) {
	      unsigned ThisBits = StepSize * PartBits / 2;
	      EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
	      SDValue &Part0 = Parts[i];
	      SDValue &Part1 = Parts[i+StepSize/2];

	      // Part1 must be extracted first: it reads Part0 before Part0 is
	      // overwritten with its own low half.
	      Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
	                          ThisVT, Part0, DAG.getIntPtrConstant(1, DL));
	      Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
	                          ThisVT, Part0, DAG.getIntPtrConstant(0, DL));

	      if (ThisBits == PartBits && ThisVT != PartVT) {
	        Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
	        Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
	      }
	    }
	  }

	  if (DAG.getDataLayout().isBigEndian())
	    std::reverse(Parts, Parts + OrigNumParts);
	}

	/// If \p PartVT is a vector with the same element type as \p Val but more
	/// elements, return \p Val widened to \p PartVT by appending undef elements.
	/// Return an empty SDValue in every other case.
	static SDValue widenVectorToPartType(SelectionDAG &DAG,
	                                     SDValue Val, const SDLoc &DL, EVT PartVT) {
	  if (!PartVT.isVector())
	    return SDValue();

	  const EVT SrcVT = Val.getValueType();
	  const unsigned DstNumElts = PartVT.getVectorNumElements();
	  const unsigned SrcNumElts = SrcVT.getVectorNumElements();

	  // Only a strict widening with an unchanged element type is handled.
	  if (DstNumElts <= SrcNumElts ||
	      PartVT.getVectorElementType() != SrcVT.getVectorElementType())
	    return SDValue();

	  // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
	  // undef elements.
	  EVT EltVT = PartVT.getVectorElementType();
	  SmallVector<SDValue, 16> Elts;
	  DAG.ExtractVectorElements(Val, Elts);
	  SDValue UndefElt = DAG.getUNDEF(EltVT);
	  for (unsigned Pad = DstNumElts - SrcNumElts; Pad != 0; --Pad)
	    Elts.push_back(UndefElt);

	  // FIXME: Use CONCAT for 2x -> 4x.
	  return DAG.getBuildVector(PartVT, DL, Elts);
	}

	/// getCopyToPartsVector - Create a series of nodes that contain the specified
	/// value split into legal parts.
	///
	/// \p Val must have a vector type (asserted). The \p NumParts results, each of
	/// type \p PartVT, are written to \p Parts. When \p CallConv has a value, the
	/// vector type breakdown is queried through the calling-convention-specific
	/// TargetLowering hook instead of the generic one.
	static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	SDValue Val, SDValue *Parts, unsigned NumParts,
	MVT PartVT, const Value *V,
	Optional<CallingConv::ID> CallConv) {
	EVT ValueVT = Val.getValueType();
	assert(ValueVT.isVector() && "Not a vector");
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const bool IsABIRegCopy = CallConv.hasValue();

	// Single-part case: convert the whole vector into one value of type PartVT.
	if (NumParts == 1) {
	EVT PartEVT = PartVT;
	if (PartEVT == ValueVT) {
	// Nothing to do.
	} else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
	// Bitconvert vector->vector case.
	Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	} else if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, PartVT)) {
	Val = Widened;
	} else if (PartVT.isVector() &&
	PartEVT.getVectorElementType().bitsGE(
	ValueVT.getVectorElementType()) &&
	PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {

	// Promoted vector extract
	Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
	} else {
	if (ValueVT.getVectorNumElements() == 1) {
	// A one-element vector is passed as its only element.
	Val = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	} else {
	// Otherwise reinterpret the vector as one integer of the same width and
	// extend (or truncate) that integer to the part type.
	assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
	"lossy conversion of vector to scalar type");
	EVT IntermediateType =
	EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	Val = DAG.getBitcast(IntermediateType, Val);
	Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
	}
	}

	assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
	Parts[0] = Val;
	return;
	}

	// Handle a multi-element vector.
	EVT IntermediateVT;
	MVT RegisterVT;
	unsigned NumIntermediates;
	unsigned NumRegs;
	if (IsABIRegCopy) {
	NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
	*DAG.getContext(), CallConv.getValue(), ValueVT, IntermediateVT,
	NumIntermediates, RegisterVT);
	} else {
	NumRegs =
	TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
	NumIntermediates, RegisterVT);
	}

	assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
	NumParts = NumRegs; // Silence a compiler warning.
	assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");

	// A scalar intermediate type counts as one element.
	unsigned IntermediateNumElts = IntermediateVT.isVector() ?
	IntermediateVT.getVectorNumElements() : 1;

	// Convert the vector to the appropriate type if necessary.
	unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;

	EVT BuiltVectorTy = EVT::getVectorVT(
	*DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
	MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
	if (ValueVT != BuiltVectorTy) {
	// Widen with undef elements when possible, then bitcast to the built type.
	if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
	Val = Widened;

	Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
	}

	// Split the vector into intermediate operands.
	SmallVector<SDValue, 8> Ops(NumIntermediates);
	for (unsigned i = 0; i != NumIntermediates; ++i) {
	if (IntermediateVT.isVector()) {
	Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
	DAG.getConstant(i * IntermediateNumElts, DL, IdxVT));
	} else {
	Ops[i] = DAG.getNode(
	ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
	DAG.getConstant(i, DL, IdxVT));
	}
	}

	// Split the intermediate operands into legal parts.
	if (NumParts == NumIntermediates) {
	// If the register was not expanded, promote or copy the value,
	// as appropriate.
	for (unsigned i = 0; i != NumParts; ++i)
	getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V, CallConv);
	} else if (NumParts > 0) {
	// If the intermediate type was expanded, split each value into
	// legal parts.
	assert(NumIntermediates != 0 && "division by zero");
	assert(NumParts % NumIntermediates == 0 &&
	"Must expand into a divisible number of parts!");
	// Each intermediate operand fills Factor consecutive entries of Parts.
	unsigned Factor = NumParts / NumIntermediates;
	for (unsigned i = 0; i != NumIntermediates; ++i)
	getCopyToParts(DAG, DL, Ops[i], &Parts[i * Factor], Factor, PartVT, V,
	CallConv);
	}
	}

	/// Construct a RegsForValue describing a single value of type \p valuevt that
	/// is held in the registers \p regs, all recorded with register type \p regvt.
	/// \p CC is stored as the calling convention of the copy.
	RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
	EVT valuevt, Optional<CallingConv::ID> CC)
	: ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
	RegCount(1, regs.size()), CallConv(CC) {}

	/// Construct a RegsForValue for a value of IR type \p Ty whose registers are
	/// numbered consecutively starting at \p Reg.
	///
	/// \p Ty is broken down into its value types; for each one the number of
	/// registers and the register type are queried from \p TLI, using the
	/// calling-convention-specific hooks when isABIMangled() is true.
	RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
	                           const DataLayout &DL, unsigned Reg, Type *Ty,
	                           Optional<CallingConv::ID> CC) {
	  ComputeValueVTs(TLI, DL, Ty, ValueVTs);

	  CallConv = CC;

	  for (EVT ValueVT : ValueVTs) {
	    unsigned NumRegs;
	    MVT RegisterVT;
	    if (isABIMangled()) {
	      NumRegs =
	          TLI.getNumRegistersForCallingConv(Context, CC.getValue(), ValueVT);
	      RegisterVT =
	          TLI.getRegisterTypeForCallingConv(Context, CC.getValue(), ValueVT);
	    } else {
	      NumRegs = TLI.getNumRegisters(Context, ValueVT);
	      RegisterVT = TLI.getRegisterType(Context, ValueVT);
	    }

	    RegVTs.push_back(RegisterVT);
	    RegCount.push_back(NumRegs);

	    // Hand out the next NumRegs consecutive register numbers, leaving Reg
	    // pointing just past them for the following value type.
	    for (unsigned End = Reg + NumRegs; Reg != End; ++Reg)
	      Regs.push_back(Reg);
	  }
	}

	/// Emit CopyFromReg nodes for every register of this value and reassemble
	/// the parts into the original value(s).
	///
	/// \p Chain is updated to the chain result of the last copy emitted. If
	/// \p Flag is non-null, *Flag is passed to each copy as an extra operand and
	/// is updated with that copy's result number 2. \p V is forwarded to
	/// getCopyFromParts.
	///
	/// \returns an empty SDValue if there are no value types, otherwise a
	/// MERGE_VALUES node with one result per value type.
	SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
	                                      FunctionLoweringInfo &FuncInfo,
	                                      const SDLoc &dl, SDValue &Chain,
	                                      SDValue *Flag, const Value *V) const {
	  // A Value with type {} or [0 x %t] needs no registers.
	  if (ValueVTs.empty())
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Assemble the legal parts into the final values.
	  SmallVector<SDValue, 4> Values(ValueVTs.size());
	  SmallVector<SDValue, 8> Parts;
	  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
	    // Copy the legal parts from the registers.
	    EVT ValueVT = ValueVTs[Value];
	    unsigned NumRegs = RegCount[Value];
	    MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv(
	                                          *DAG.getContext(),
	                                          CallConv.getValue(), RegVTs[Value])
	                                    : RegVTs[Value];

	    Parts.resize(NumRegs);
	    for (unsigned i = 0; i != NumRegs; ++i) {
	      SDValue P;
	      if (!Flag) {
	        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
	      } else {
	        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
	        *Flag = P.getValue(2);
	      }

	      Chain = P.getValue(1);
	      Parts[i] = P;

	      // If the source register was virtual and if we know something about it,
	      // add an assert node.
	      if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
	          !RegisterVT.isInteger())
	        continue;

	      const FunctionLoweringInfo::LiveOutInfo *LOI =
	          FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
	      if (!LOI)
	        continue;

	      unsigned RegSize = RegisterVT.getScalarSizeInBits();
	      unsigned NumSignBits = LOI->NumSignBits;
	      unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();

	      if (NumZeroBits == RegSize) {
	        // The current value is a zero.
	        // Explicitly express that as it would be easier for
	        // optimizations to kick in.
	        Parts[i] = DAG.getConstant(0, dl, RegisterVT);
	        continue;
	      }

	      // FIXME: We capture more information than the dag can represent.  For
	      // now, just use the tightest assertzext/assertsext possible.
	      bool isSExt;
	      EVT FromVT(MVT::Other);
	      if (NumZeroBits) {
	        FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumZeroBits);
	        isSExt = false;
	      } else if (NumSignBits > 1) {
	        FromVT =
	            EVT::getIntegerVT(*DAG.getContext(), RegSize - NumSignBits + 1);
	        isSExt = true;
	      } else {
	        continue;
	      }
	      // Add an assertion node.
	      assert(FromVT != MVT::Other);
	      Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
	                             RegisterVT, P, DAG.getValueType(FromVT));
	    }

	    Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs,
	                                     RegisterVT, ValueVT, V, CallConv);
	    Part += NumRegs;
	    Parts.clear();
	  }

	  return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
	}

	/// Split \p Val into legal parts and emit a CopyToReg node for each register
	/// of this value.
	///
	/// \p Chain is updated to the resulting chain. If \p Flag is non-null, *Flag
	/// is passed to each copy as an extra operand and is updated with that copy's
	/// result number 1. \p PreferredExtendType selects the extension used by
	/// getCopyToParts; an ANY_EXTEND preference is switched to ZERO_EXTEND when
	/// the target reports the zero extension as free.
	void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
	                                 const SDLoc &dl, SDValue &Chain, SDValue *Flag,
	                                 const Value *V,
	                                 ISD::NodeType PreferredExtendType) const {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  ISD::NodeType ExtendKind = PreferredExtendType;

	  // Get the list of the value's legal parts.
	  unsigned NumRegs = Regs.size();
	  SmallVector<SDValue, 8> Parts(NumRegs);
	  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
	    unsigned NumParts = RegCount[Value];

	    MVT RegisterVT = isABIMangled() ? TLI.getRegisterTypeForCallingConv(
	                                          *DAG.getContext(),
	                                          CallConv.getValue(), RegVTs[Value])
	                                    : RegVTs[Value];

	    if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
	      ExtendKind = ISD::ZERO_EXTEND;

	    getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part],
	                   NumParts, RegisterVT, V, CallConv, ExtendKind);
	    Part += NumParts;
	  }

	  // Copy the parts into the registers.
	  SmallVector<SDValue, 8> Chains(NumRegs);
	  for (unsigned i = 0; i != NumRegs; ++i) {
	    SDValue Part;
	    if (!Flag) {
	      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
	    } else {
	      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
	      *Flag = Part.getValue(1);
	    }

	    Chains[i] = Part.getValue(0);
	  }

	  if (NumRegs == 1 || Flag)
	    // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
	    // flagged to it. That is the CopyToReg nodes and the user are considered
	    // a single scheduling unit. If we create a TokenFactor and return it as
	    // chain, then the TokenFactor is both a predecessor (operand) of the
	    // user as well as a successor (the TF operands are flagged to the user).
	    // c1, f1 = CopyToReg
	    // c2, f2 = CopyToReg
	    // c3     = TokenFactor c1, c2
	    // ...
	    //        = op c3, ..., f2
	    Chain = Chains[NumRegs-1];
	  else
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	/// Append the inline asm operands describing this value's registers to
	/// \p Ops: first a target constant holding the flag word built from \p Code
	/// and the register count, then one register operand per register.
	///
	/// When \p HasMatching is set, \p MatchingIdx is encoded into the flag word;
	/// otherwise, if the first register is virtual, its register class ID is
	/// encoded instead.
	void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
	                                        unsigned MatchingIdx, const SDLoc &dl,
	                                        SelectionDAG &DAG,
	                                        std::vector<SDValue> &Ops) const {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
	  if (HasMatching)
	    Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
	  else if (!Regs.empty() &&
	           TargetRegisterInfo::isVirtualRegister(Regs.front())) {
	    // Put the register class of the virtual registers in the flag word.  That
	    // way, later passes can recompute register class constraints for inline
	    // assembly as well as normal instructions.
	    // Don't do this for tied operands that can use the regclass information
	    // from the def.
	    const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	    const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
	    Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
	  }

	  SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
	  Ops.push_back(Res);

	  if (Code == InlineAsm::Kind_Clobber) {
	    // Clobbers should always have a 1:1 mapping with registers, and may
	    // reference registers that have illegal (e.g. vector) types. Hence, we
	    // shouldn't try to apply any sort of splitting logic to them.
	    assert(Regs.size() == RegVTs.size() && Regs.size() == ValueVTs.size() &&
	           "No 1:1 mapping from clobbers to regs?");
	    unsigned SP = TLI.getStackPointerRegisterToSaveRestore();
	    (void)SP; // Only read by the assert below.
	    for (unsigned I = 0, E = ValueVTs.size(); I != E; ++I) {
	      Ops.push_back(DAG.getRegister(Regs[I], RegVTs[I]));
	      assert(
	          (Regs[I] != SP ||
	           DAG.getMachineFunction().getFrameInfo().hasOpaqueSPAdjustment()) &&
	          "If we clobbered the stack pointer, MFI should know about it.");
	    }
	    return;
	  }

	  // Non-clobber operands: emit the registers of each value type in order,
	  // all with that value's register type.
	  for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
	    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
	    MVT RegisterVT = RegVTs[Value];
	    for (unsigned i = 0; i != NumRegs; ++i) {
	      assert(Reg < Regs.size() && "Mismatch in # registers expected");
	      unsigned TheReg = Regs[Reg++];
	      Ops.push_back(DAG.getRegister(TheReg, RegisterVT));
	    }
	  }
	}

	/// Return one (register, size in bits) pair per register of this value, in
	/// register order. The size is that of the register type recorded for the
	/// value type the register belongs to.
	SmallVector<std::pair<unsigned, unsigned>, 4>
	RegsForValue::getRegsAndSizes() const {
	  SmallVector<std::pair<unsigned, unsigned>, 4> Result;
	  unsigned NextReg = 0;
	  for (auto CountAndVT : zip_first(RegCount, RegVTs)) {
	    const unsigned NumRegsForValue = std::get<0>(CountAndVT);
	    const MVT RegisterVT = std::get<1>(CountAndVT);
	    const unsigned SizeInBits = RegisterVT.getSizeInBits();
	    // NextReg keeps running across value types so each register is visited
	    // exactly once.
	    for (unsigned Remaining = NumRegsForValue; Remaining != 0;
	         --Remaining, ++NextReg)
	      Result.push_back(std::make_pair(Regs[NextReg], SizeInBits));
	  }
	  return Result;
	}

	void SelectionDAGBuilder::init(GCFunctionInfo gfi, AliasAnalysis aa,
	const TargetLibraryInfo *li) {
	AA = aa;
	GFI = gfi;
	LibInfo = li;
	DL = &DAG.getDataLayout();
	Context = DAG.getContext();
	LPadToCallSiteMap.clear();
	SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout());
	}

	/// Reset the builder's state: empties the node maps, the pending loads and
	/// exports, forgets the current instruction and the tail-call flag, restores
	/// SDNodeOrder to LowestSDNodeOrder and clears the statepoint lowering state.
	/// Dangling debug info is not touched here.
	void SelectionDAGBuilder::clear() {
	NodeMap.clear();
	UnusedArgNodeMap.clear();
	PendingLoads.clear();
	PendingExports.clear();
	CurInst = nullptr;
	HasTailCall = false;
	SDNodeOrder = LowestSDNodeOrder;
	StatepointLowering.clear();
	}

	/// Discard every entry of DanglingDebugInfoMap without emitting anything for
	/// the dropped entries.
	void SelectionDAGBuilder::clearDanglingDebugInfo() {
	DanglingDebugInfoMap.clear();
	}

	/// Return the DAG root after folding in any pending loads.
	///
	/// With no pending loads the current root is returned unchanged. Otherwise
	/// the pending loads (a single one directly, several through a token factor)
	/// become the new root and the pending list is emptied.
	SDValue SelectionDAGBuilder::getRoot() {
	  if (PendingLoads.empty())
	    return DAG.getRoot();

	  // One pending load can serve as the root itself; otherwise, we have to
	  // make a token factor node.
	  SDValue NewRoot = PendingLoads.size() == 1
	                        ? PendingLoads[0]
	                        : DAG.getTokenFactor(getCurSDLoc(), PendingLoads);
	  PendingLoads.clear();
	  DAG.setRoot(NewRoot);
	  return NewRoot;
	}

	/// Return the DAG root after folding in any pending exports.
	///
	/// With no pending exports the current root is returned unchanged. Otherwise
	/// all pending exports are combined into one TokenFactor, which becomes the
	/// new root, and the pending list is emptied.
	SDValue SelectionDAGBuilder::getControlRoot() {
	  SDValue Root = DAG.getRoot();

	  if (PendingExports.empty())
	    return Root;

	  // Turn all of the CopyToReg chains into one factored node.
	  if (Root.getOpcode() != ISD::EntryToken) {
	    // Don't add the root if we already indirectly depend on it, i.e. if some
	    // pending export already has it as operand 0.
	    bool DependsOnRoot = false;
	    for (const SDValue &Export : PendingExports) {
	      assert(Export.getNode()->getNumOperands() > 1);
	      if (Export.getNode()->getOperand(0) == Root) {
	        DependsOnRoot = true;
	        break;
	      }
	    }

	    if (!DependsOnRoot)
	      PendingExports.push_back(Root);
	  }

	  Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
	                     PendingExports);
	  PendingExports.clear();
	  DAG.setRoot(Root);
	  return Root;
	}

	void SelectionDAGBuilder::visit(const Instruction &I) {
	// Set up outgoing PHI node register values before emitting the terminator.
	if (I.isTerminator()) {
	HandlePHINodesInSuccessorBlocks(I.getParent());
	}

	// Increase the SDNodeOrder if dealing with a non-debug instruction.
	if (!isa<DbgInfoIntrinsic>(I))
	++SDNodeOrder;

	CurInst = &I;

	visit(I.getOpcode(), I);

	if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
	// Propagate the fast-math-flags of this IR instruction to the DAG node that
	// maps to this instruction.
	// TODO: We could handle all flags (nsw, etc) here.
	// TODO: If an IR instruction maps to >1 node, only the final node will have
	// flags set.
	if (SDNode *Node = getNodeForIRValue(&I)) {
	SDNodeFlags IncomingFlags;
	IncomingFlags.copyFMF(*FPMO);
	if (!Node->getFlags().isDefined())
	Node->setFlags(IncomingFlags);
	else
	Node->intersectFlagsWith(IncomingFlags);
	}
	}

	if (!I.isTerminator() && !HasTailCall &&
	!isStatepoint(&I)) // statepoints handle their exports internally
	CopyToExportRegsIfNeeded(&I);

	CurInst = nullptr;
	}

	/// PHI nodes must never reach the builder's visitor; reaching this function
	/// is reported as unreachable.
	void SelectionDAGBuilder::visitPHI(const PHINode &) {
	llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
	}

	/// Dispatch \p I to the visit<Opcode> member matching \p Opcode. An opcode
	/// that is not listed in Instruction.def is reported as unreachable.
	void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
	// Note: this doesn't use InstVisitor, because it has to work with
	// ConstantExpr's in addition to instructions.
	switch (Opcode) {
	default: llvm_unreachable("Unknown instruction type encountered!");
	// Build the switch statement using the Instruction.def file.
	// Each HANDLE_INST entry expands to one case that casts I to the entry's
	// class and calls the corresponding visit method.
	#define HANDLE_INST(NUM, OPCODE, CLASS) \
	case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
	#include "llvm/IR/Instruction.def"
	}
	}

	void SelectionDAGBuilder::dropDanglingDebugInfo(const DILocalVariable *Variable,
	const DIExpression *Expr) {
	auto isMatchingDbgValue = [&](DanglingDebugInfo &DDI) {
	const DbgValueInst *DI = DDI.getDI();
	DIVariable *DanglingVariable = DI->getVariable();
	DIExpression *DanglingExpr = DI->getExpression();
	if (DanglingVariable == Variable && Expr->fragmentsOverlap(DanglingExpr)) {
	LLVM_DEBUG(dbgs() << "Dropping dangling debug info for " << *DI << "\n");
	return true;
	}
	return false;
	};

	for (auto &DDIMI : DanglingDebugInfoMap) {
	DanglingDebugInfoVector &DDIV = DDIMI.second;

	// If debug info is to be dropped, run it through final checks to see
	// whether it can be salvaged.
	for (auto &DDI : DDIV)
	if (isMatchingDbgValue(DDI))
	salvageUnresolvedDbgValue(DDI);

	DDIV.erase(remove_if(DDIV, isMatchingDbgValue), DDIV.end());
	}
	}

	// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
	// generate the debug data structures now that we've seen its definition.
	//
	// Every dangling entry recorded for V is resolved against Val and the entry
	// list for V is then emptied. If Val has no node, an undef constant debug
	// value is emitted for each entry instead.
	void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
	                                                   SDValue Val) {
	  auto DanglingDbgInfoIt = DanglingDebugInfoMap.find(V);
	  if (DanglingDbgInfoIt == DanglingDebugInfoMap.end())
	    return;

	  DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second;
	  for (auto &DDI : DDIV) {
	    const DbgValueInst *DI = DDI.getDI();
	    assert(DI && "Ill-formed DanglingDebugInfo");
	    DebugLoc dl = DDI.getdl();
	    unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
	    DILocalVariable *Variable = DI->getVariable();
	    DIExpression *Expr = DI->getExpression();
	    assert(Variable->isValidLocationForIntrinsic(dl) &&
	           "Expected inlined-at fields to agree");
	    if (Val.getNode()) {
	      // Only read the node's IR order once the node is known to be non-null.
	      unsigned ValSDNodeOrder = Val.getNode()->getIROrder();
	      // FIXME: I doubt that it is correct to resolve a dangling DbgValue as a
	      // FuncArgumentDbgValue (it would be hoisted to the function entry, and if
	      // we couldn't resolve it directly when examining the DbgValue intrinsic
	      // in the first place we should not be more successful here). Unless we
	      // have some test case that prove this to be correct we should avoid
	      // calling EmitFuncArgumentDbgValue here.
	      if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, false, Val)) {
	        LLVM_DEBUG(dbgs() << "Resolve dangling debug info [order="
	                          << DbgSDNodeOrder << "] for:\n  " << *DI << "\n");
	        LLVM_DEBUG(dbgs() << "  By mapping to:\n    "; Val.dump());
	        // Increase the SDNodeOrder for the DbgValue here to make sure it is
	        // inserted after the definition of Val when emitting the instructions
	        // after ISel. An alternative could be to teach
	        // ScheduleDAGSDNodes::EmitSchedule to delay the insertion properly.
	        LLVM_DEBUG(if (ValSDNodeOrder > DbgSDNodeOrder) dbgs()
	                   << "changing SDNodeOrder from " << DbgSDNodeOrder << " to "
	                   << ValSDNodeOrder << "\n");
	        SDDbgValue *SDV = getDbgValue(Val, Variable, Expr, dl,
	                                      std::max(DbgSDNodeOrder, ValSDNodeOrder));
	        DAG.AddDbgValue(SDV, Val.getNode(), false);
	      } else
	        LLVM_DEBUG(dbgs() << "Resolved dangling debug info for " << *DI
	                          << "in EmitFuncArgumentDbgValue\n");
	    } else {
	      // No node to attach to: describe the variable as undef at the order
	      // recorded for the dangling entry.
	      LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
	      auto Undef =
	          UndefValue::get(DDI.getDI()->getVariableLocation()->getType());
	      auto SDV =
	          DAG.getConstantDbgValue(Variable, Expr, Undef, dl, DbgSDNodeOrder);
	      DAG.AddDbgValue(SDV, nullptr, false);
	    }
	  }
	  DDIV.clear();
	}

	/// Make a last attempt to describe the dangling debug value \p DDI.
	///
	/// The value is first tried as-is through handleDebugValue. If that fails it
	/// is repeatedly replaced by operand 0 of its defining instruction (with the
	/// expression rewritten by salvageDebugInfoImpl) until handleDebugValue
	/// succeeds, the value is no longer an instruction, or salvaging fails. If
	/// nothing works, an undef constant debug value is emitted.
	void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) {
	  Value *V = DDI.getDI()->getValue();
	  DILocalVariable *Var = DDI.getDI()->getVariable();
	  DIExpression *Expr = DDI.getDI()->getExpression();
	  DebugLoc DL = DDI.getdl();
	  DebugLoc InstDL = DDI.getDI()->getDebugLoc();
	  unsigned SDOrder = DDI.getSDNodeOrder();

	  // Currently we consider only dbg.value intrinsics -- we tell the salvager
	  // that DW_OP_stack_value is desired.
	  assert(isa<DbgValueInst>(DDI.getDI()));
	  bool StackValue = true;

	  // Can this Value be encoded without any further work?
	  if (handleDebugValue(V, Var, Expr, DL, InstDL, SDOrder))
	    return;

	  // Attempt to salvage back through as many instructions as possible. Bail if
	  // a non-instruction is seen, such as a constant expression or global
	  // variable. FIXME: Further work could recover those too.
	  while (isa<Instruction>(V)) {
	    Instruction &VAsInst = *cast<Instruction>(V);
	    DIExpression *NewExpr = salvageDebugInfoImpl(VAsInst, Expr, StackValue);

	    // If we cannot salvage any further, and haven't yet found a suitable debug
	    // expression, bail out.
	    if (!NewExpr)
	      break;

	    // New value and expr now represent this debuginfo.
	    V = VAsInst.getOperand(0);
	    Expr = NewExpr;

	    // Some kind of simplification occurred: check whether the operand of the
	    // salvaged debug expression can be encoded in this DAG.
	    if (handleDebugValue(V, Var, Expr, DL, InstDL, SDOrder)) {
	      // Print the instruction and value themselves, not their addresses.
	      LLVM_DEBUG(dbgs() << "Salvaged debug location info for:\n  "
	                        << *DDI.getDI() << "\nBy stripping back to:\n  " << *V);
	      return;
	    }
	  }

	  // This was the final opportunity to salvage this debug information, and it
	  // couldn't be done. Place an undef DBG_VALUE at this location to terminate
	  // any earlier variable location.
	  // NOTE(review): this uses the builder's current SDNodeOrder rather than the
	  // SDOrder recorded in DDI -- confirm that is intended.
	  auto Undef = UndefValue::get(DDI.getDI()->getVariableLocation()->getType());
	  auto SDV = DAG.getConstantDbgValue(Var, Expr, Undef, DL, SDNodeOrder);
	  DAG.AddDbgValue(SDV, nullptr, false);

	  LLVM_DEBUG(dbgs() << "Dropping debug value info for:\n  " << *DDI.getDI()
	                    << "\n");
	  LLVM_DEBUG(dbgs() << "  Last seen at:\n    " << *DDI.getDI()->getOperand(0)
	                    << "\n");
	}

	bool SelectionDAGBuilder::handleDebugValue(const Value V, DILocalVariable Var,
	DIExpression *Expr, DebugLoc dl,
	DebugLoc InstDL, unsigned Order) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDDbgValue *SDV;
	if (isa<ConstantInt>(V) \|\| isa<ConstantFP>(V) \|\| isa<UndefValue>(V) \|\|
	isa<ConstantPointerNull>(V)) {
	SDV = DAG.getConstantDbgValue(Var, Expr, V, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, nullptr, false);
	return true;
	}

	// If the Value is a frame index, we can create a FrameIndex debug value
	// without relying on the DAG at all.
	if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
	auto SI = FuncInfo.StaticAllocaMap.find(AI);
	if (SI != FuncInfo.StaticAllocaMap.end()) {
	auto SDV =
	DAG.getFrameIndexDbgValue(Var, Expr, SI->second,
	/IsIndirect/ false, dl, SDNodeOrder);
	// Do not attach the SDNodeDbgValue to an SDNode: this variable location
	// is still available even if the SDNode gets optimized out.
	DAG.AddDbgValue(SDV, nullptr, false);
	return true;
	}
	}

	// Do not use getValue() in here; we don't want to generate code at
	// this point if it hasn't been done yet.
	SDValue N = NodeMap[V];
	if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map.
	N = UnusedArgNodeMap[V];
	if (N.getNode()) {
	if (EmitFuncArgumentDbgValue(V, Var, Expr, dl, false, N))
	return true;
	SDV = getDbgValue(N, Var, Expr, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, N.getNode(), false);
	return true;
	}

	// Special rules apply for the first dbg.values of parameter variables in a
	// function. Identify them by the fact they reference Argument Values, that
	// they're parameters, and they are parameters of the current function. We
	// need to let them dangle until they get an SDNode.
	bool IsParamOfFunc = isa<Argument>(V) && Var->isParameter() &&
	!InstDL.getInlinedAt();
	if (!IsParamOfFunc) {
	// The value is not used in this block yet (or it would have an SDNode).
	// We still want the value to appear for the user if possible -- if it has
	// an associated VReg, we can refer to that instead.
	auto VMI = FuncInfo.ValueMap.find(V);
	if (VMI != FuncInfo.ValueMap.end()) {
	unsigned Reg = VMI->second;
	// If this is a PHI node, it may be split up into several MI PHI nodes
	// (in FunctionLoweringInfo::set).
	RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
	V->getType(), None);
	if (RFV.occupiesMultipleRegs()) {
	unsigned Offset = 0;
	unsigned BitsToDescribe = 0;
	if (auto VarSize = Var->getSizeInBits())
	BitsToDescribe = *VarSize;
	if (auto Fragment = Expr->getFragmentInfo())
	BitsToDescribe = Fragment->SizeInBits;
	for (auto RegAndSize : RFV.getRegsAndSizes()) {
	unsigned RegisterSize = RegAndSize.second;
	// Bail out if all bits are described already.
	if (Offset >= BitsToDescribe)
	break;
	unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe)
	? BitsToDescribe - Offset
	: RegisterSize;
	auto FragmentExpr = DIExpression::createFragmentExpression(
	Expr, Offset, FragmentSize);
	if (!FragmentExpr)
	continue;
	SDV = DAG.getVRegDbgValue(Var, *FragmentExpr, RegAndSize.first,
	false, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, nullptr, false);
	Offset += RegisterSize;
	}
	} else {
	SDV = DAG.getVRegDbgValue(Var, Expr, Reg, false, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, nullptr, false);
	}
	return true;
	}
	}

	return false;
	}

	void SelectionDAGBuilder::resolveOrClearDbgInfo() {
	// Try to fixup any remaining dangling debug info -- and drop it if we can't.
	for (auto &Pair : DanglingDebugInfoMap)
	for (auto &DDI : Pair.second)
	salvageUnresolvedDbgValue(DDI);
	clearDanglingDebugInfo();
	}

	/// getCopyFromRegs - If there was virtual register allocated for the value V
	/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
	///
	/// The copy is chained to the DAG entry node, and any dangling debug info
	/// recorded for \p V is resolved against the result.
	SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
	  DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
	  SDValue Result;

	  if (It != FuncInfo.ValueMap.end()) {
	    unsigned InReg = It->second;

	    RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
	                     DAG.getDataLayout(), InReg, Ty,
	                     None); // This is not an ABI copy.
	    SDValue Chain = DAG.getEntryNode();
	    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
	                                 V);
	    resolveDanglingDebugInfo(V, Result);
	  }

	  return Result;
	}

	/// getValue - Return an SDValue for the given Value.
	///
	/// Lookup order: an SDValue already recorded in NodeMap, then a copy from a
	/// virtual register allocated for \p V, and finally a freshly built value
	/// from getValueImpl, which is recorded in NodeMap and used to resolve any
	/// dangling debug info for \p V.
	SDValue SelectionDAGBuilder::getValue(const Value *V) {
	// If we already have an SDValue for this value, use it. It's important
	// to do this first, so that we don't create a CopyFromReg if we already
	// have a regular SDValue.
	// Note that operator[] creates an empty NodeMap entry for V if none exists.
	SDValue &N = NodeMap[V];
	if (N.getNode()) return N;

	// If there's a virtual register allocated and initialized for this
	// value, use it.
	if (SDValue copyFromReg = getCopyFromRegs(V, V->getType()))
	return copyFromReg;

	// Otherwise create a new SDValue and remember it.
	// NodeMap is indexed again rather than assigning through N -- presumably
	// because getValueImpl can add entries to NodeMap and invalidate the
	// reference; confirm before simplifying.
	SDValue Val = getValueImpl(V);
	NodeMap[V] = Val;
	resolveDanglingDebugInfo(V, Val);
	return Val;
	}

	// Return true if SDValue exists for the given Value, i.e. if V has an entry
	// in either NodeMap or FuncInfo.ValueMap.
	bool SelectionDAGBuilder::findValue(const Value *V) const {
	  return (NodeMap.find(V) != NodeMap.end()) ||
	         (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
	}

	/// getNonRegisterValue - Return an SDValue for the given Value, but
	/// don't look in FuncInfo.ValueMap for a virtual register.
	///
	/// A value not yet in NodeMap is built by getValueImpl, recorded in NodeMap
	/// and used to resolve any dangling debug info for \p V.
	SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
	  // If we already have an SDValue for this value, use it.
	  SDValue &N = NodeMap[V];
	  if (N.getNode()) {
	    if (isa<ConstantSDNode>(N) || isa<ConstantFPSDNode>(N)) {
	      // Remove the debug location from the node as the node is about to be used
	      // in a location which may differ from the original debug location.  This
	      // is relevant to Constant and ConstantFP nodes because they can appear
	      // as constant expressions inside PHI nodes.
	      N->setDebugLoc(DebugLoc());
	    }
	    return N;
	  }

	  // Otherwise create a new SDValue and remember it.
	  SDValue Val = getValueImpl(V);
	  NodeMap[V] = Val;
	  resolveDanglingDebugInfo(V, Val);
	  return Val;
	}

	/// getValueImpl - Helper function for getValue and getNonRegisterValue.
	/// Create an SDValue for the given value.
	///
	/// Constants are materialized according to their kind (aggregates become a
	/// flattened MERGE_VALUES, vectors a BUILD_VECTOR), static allocas become
	/// frame indices, and any other instruction is read back from a newly
	/// initialized virtual register. Any other kind of value is unreachable.
	SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (const Constant *C = dyn_cast<Constant>(V)) {
	    EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);

	    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
	      return DAG.getConstant(*CI, getCurSDLoc(), VT);

	    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
	      return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);

	    if (isa<ConstantPointerNull>(C)) {
	      unsigned AS = V->getType()->getPointerAddressSpace();
	      return DAG.getConstant(0, getCurSDLoc(),
	                             TLI.getPointerTy(DAG.getDataLayout(), AS));
	    }

	    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
	      return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);

	    if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
	      return DAG.getUNDEF(VT);

	    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
	      // Lower the constant expression like an instruction; the visitor is
	      // expected to record its result in NodeMap.
	      visit(CE->getOpcode(), *CE);
	      SDValue N1 = NodeMap[V];
	      assert(N1.getNode() && "visit didn't populate the NodeMap!");
	      return N1;
	    }

	    if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
	      SmallVector<SDValue, 4> Constants;
	      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
	           OI != OE; ++OI) {
	        SDNode *Val = getValue(*OI).getNode();
	        // If the operand is an empty aggregate, there are no values.
	        if (!Val) continue;
	        // Add each leaf value from the operand to the Constants list
	        // to form a flattened list of all the values.
	        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
	          Constants.push_back(SDValue(Val, i));
	      }

	      return DAG.getMergeValues(Constants, getCurSDLoc());
	    }

	    if (const ConstantDataSequential *CDS =
	          dyn_cast<ConstantDataSequential>(C)) {
	      SmallVector<SDValue, 4> Ops;
	      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
	        SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
	        // Add each leaf value from the operand to the Constants list
	        // to form a flattened list of all the values.
	        for (unsigned j = 0, je = Val->getNumValues(); j != je; ++j)
	          Ops.push_back(SDValue(Val, j));
	      }

	      // Arrays are returned as merged values, anything else as a vector.
	      if (isa<ArrayType>(CDS->getType()))
	        return DAG.getMergeValues(Ops, getCurSDLoc());
	      return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
	    }

	    if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
	      assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
	             "Unknown struct or array constant!");

	      SmallVector<EVT, 4> ValueVTs;
	      ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
	      unsigned NumElts = ValueVTs.size();
	      if (NumElts == 0)
	        return SDValue(); // empty struct
	      // One undef or zero constant per leaf value type.
	      SmallVector<SDValue, 4> Constants(NumElts);
	      for (unsigned i = 0; i != NumElts; ++i) {
	        EVT EltVT = ValueVTs[i];
	        if (isa<UndefValue>(C))
	          Constants[i] = DAG.getUNDEF(EltVT);
	        else if (EltVT.isFloatingPoint())
	          Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
	        else
	          Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT);
	      }

	      return DAG.getMergeValues(Constants, getCurSDLoc());
	    }

	    if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
	      return DAG.getBlockAddress(BA, VT);

	    // Every remaining constant kind must have a vector type.
	    VectorType *VecTy = cast<VectorType>(V->getType());
	    unsigned NumElements = VecTy->getNumElements();

	    // Now that we know the number and type of the elements, get that number of
	    // elements into the Ops array based on what kind of constant it is.
	    SmallVector<SDValue, 16> Ops;
	    if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
	      for (unsigned i = 0; i != NumElements; ++i)
	        Ops.push_back(getValue(CV->getOperand(i)));
	    } else {
	      assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
	      EVT EltVT =
	          TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());

	      SDValue Op;
	      if (EltVT.isFloatingPoint())
	        Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
	      else
	        Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
	      Ops.assign(NumElements, Op);
	    }

	    // Create a BUILD_VECTOR node.
	    return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
	  }

	  // If this is a static alloca, generate it as the frameindex instead of
	  // computation.
	  if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
	    DenseMap<const AllocaInst*, int>::iterator SI =
	      FuncInfo.StaticAllocaMap.find(AI);
	    if (SI != FuncInfo.StaticAllocaMap.end())
	      return DAG.getFrameIndex(SI->second,
	                               TLI.getFrameIndexTy(DAG.getDataLayout()));
	  }

	  // If this is an instruction which fast-isel has deferred, select it now.
	  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
	    unsigned InReg = FuncInfo.InitializeRegForValue(Inst);

	    RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
	                     Inst->getType(), getABIRegCopyCC(V));
	    SDValue Chain = DAG.getEntryNode();
	    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
	  }

	  llvm_unreachable("Can't get register for value!");
	}

	/// Lower a catchpad: mark the current machine basic block according to the
	/// function's EH personality and, except for the Wasm C++ personality, emit a
	/// CATCHPAD node as the new DAG root.
	void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
	  auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
	  bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
	  bool IsSEH = isAsynchronousEHPersonality(Pers);
	  bool IsWasmCXX = Pers == EHPersonality::Wasm_CXX;
	  MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
	  if (!IsSEH)
	    CatchPadMBB->setIsEHScopeEntry();
	  // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
	  if (IsMSVCCXX || IsCoreCLR)
	    CatchPadMBB->setIsEHFuncletEntry();
	  // Wasm does not need catchpads anymore
	  if (!IsWasmCXX)
	    DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other,
	                            getControlRoot()));
	}

	/// Lower a catchret instruction.
	///
	/// Adds the machine-CFG edge to the successor. For asynchronous (SEH)
	/// personalities the catchret is lowered as a plain branch; otherwise a
	/// CATCHRET terminator is emitted that also names the block representing the
	/// funclet "color" the successor belongs to.
	void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
	  // Update machine-CFG edge.
	  MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()];
	  FuncInfo.MBB->addSuccessor(TargetMBB);

	  auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsSEH = isAsynchronousEHPersonality(Pers);
	  if (IsSEH) {
	    // If this is not a fall-through branch or optimizations are switched off,
	    // emit the branch.
	    if (TargetMBB != NextBlock(FuncInfo.MBB) ||
	        TM.getOptLevel() == CodeGenOpt::None)
	      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
	                              getControlRoot(), DAG.getBasicBlock(TargetMBB)));
	    return;
	  }

	  // Figure out the funclet membership for the catchret's successor.
	  // This will be used by the FuncletLayout pass to determine how to order the
	  // BB's.
	  // A 'catchret' returns to the outer scope's color.
	  Value *ParentPad = I.getCatchSwitchParentPad();
	  const BasicBlock *SuccessorColor;
	  if (isa<ConstantTokenNone>(ParentPad))
	    // No enclosing pad: the successor is colored by the entry block.
	    SuccessorColor = &FuncInfo.Fn->getEntryBlock();
	  else
	    SuccessorColor = cast<Instruction>(ParentPad)->getParent();
	  assert(SuccessorColor && "No parent funclet for catchret!");
	  MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor];
	  assert(SuccessorColorMBB && "No MBB for SuccessorColor!");

	  // Create the terminator node.
	  SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other,
	                            getControlRoot(), DAG.getBasicBlock(TargetMBB),
	                            DAG.getBasicBlock(SuccessorColorMBB));
	  DAG.setRoot(Ret);
	}

	void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
	// Don't emit any special code for the cleanuppad instruction. It just marks
	// the start of an EH scope/funclet.
	FuncInfo.MBB->setIsEHScopeEntry();
	auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	if (Pers != EHPersonality::Wasm_CXX) {
	FuncInfo.MBB->setIsEHFuncletEntry();
	FuncInfo.MBB->setIsCleanupFuncletEntry();
	}
	}

	// For wasm, there's always a single catch pad attached to a catchswitch, and
	// the control flow always stops at the single catch pad, as it does for a
	// cleanup pad. In case the exception caught is not of the types the catch pad
	// catches, it will be rethrown by a rethrow.
	//
	// Appends to UnwindDests the machine blocks reachable when unwinding to
	// EHPadBB, each paired with Prob, and marks them as EH scope entries. A null
	// EHPadBB adds nothing.
	static void findWasmUnwindDestinations(
	    FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
	    BranchProbability Prob,
	    SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
	        &UnwindDests) {
	  if (!EHPadBB)
	    return;

	  const Instruction *Pad = EHPadBB->getFirstNonPHI();
	  if (isa<CleanupPadInst>(Pad)) {
	    // Stop on cleanup pads.
	    UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
	    UnwindDests.back().first->setIsEHScopeEntry();
	    return;
	  }

	  if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
	    // Add the catchpad handlers to the possible destinations. We don't
	    // continue to the unwind destination of the catchswitch for wasm.
	    for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
	      UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
	      UnwindDests.back().first->setIsEHScopeEntry();
	    }
	    return;
	  }

	  // The previous code looped with 'continue' here without ever advancing
	  // EHPadBB, which would spin forever on any other kind of pad. Every handled
	  // case stops after one step, so no loop is needed; anything else is an
	  // unexpected pad.
	  llvm_unreachable("Unexpected EH pad kind for wasm unwind destination!");
	}

	/// When an invoke or a cleanupret unwinds to the next EH pad, there are
	/// many places it could ultimately go. In the IR, we have a single unwind
	/// destination, but in the machine CFG, we enumerate all the possible blocks.
	/// This function skips over imaginary basic blocks that hold catchswitch
	/// instructions, and finds all the "real" machine
	/// basic block destinations. As those destinations may not be successors of
	/// EHPadBB, here we also calculate the edge probability to those destinations.
	/// The passed-in Prob is the edge probability to EHPadBB.
	///
	/// Results are appended to UnwindDests; a null EHPadBB adds nothing. The Wasm
	/// C++ personality is delegated to findWasmUnwindDestinations.
	static void findUnwindDestinations(
	    FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
	    BranchProbability Prob,
	    SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
	        &UnwindDests) {
	  EHPersonality Personality =
	      classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
	  bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
	  bool IsWasmCXX = Personality == EHPersonality::Wasm_CXX;
	  bool IsSEH = isAsynchronousEHPersonality(Personality);

	  if (IsWasmCXX) {
	    findWasmUnwindDestinations(FuncInfo, EHPadBB, Prob, UnwindDests);
	    assert(UnwindDests.size() <= 1 &&
	           "There should be at most one unwind destination for wasm");
	    return;
	  }

	  while (EHPadBB) {
	    const Instruction *Pad = EHPadBB->getFirstNonPHI();
	    BasicBlock *NewEHPadBB = nullptr;
	    if (isa<LandingPadInst>(Pad)) {
	      // Stop on landingpads. They are not funclets.
	      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
	      break;
	    } else if (isa<CleanupPadInst>(Pad)) {
	      // Stop on cleanup pads. Cleanups are always funclet entries for all known
	      // personalities.
	      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
	      UnwindDests.back().first->setIsEHScopeEntry();
	      UnwindDests.back().first->setIsEHFuncletEntry();
	      break;
	    } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
	      // Add the catchpad handlers to the possible destinations.
	      for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
	        UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
	        // For MSVC++ and the CLR, catchblocks are funclets and need prologues.
	        if (IsMSVCCXX || IsCoreCLR)
	          UnwindDests.back().first->setIsEHFuncletEntry();
	        if (!IsSEH)
	          UnwindDests.back().first->setIsEHScopeEntry();
	      }
	      // Keep walking: the catchswitch itself may unwind further.
	      NewEHPadBB = CatchSwitch->getUnwindDest();
	    } else {
	      // The previous 'continue' re-tested the same, unchanged EHPadBB and
	      // would therefore never terminate. Any other pad kind is unexpected.
	      llvm_unreachable("Unexpected EH pad kind for unwind destination!");
	    }

	    // Scale the probability by the edge from this catchswitch to its own
	    // unwind destination before moving on to it.
	    BranchProbabilityInfo *BPI = FuncInfo.BPI;
	    if (BPI && NewEHPadBB)
	      Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
	    EHPadBB = NewEHPadBB;
	  }
	}

	void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) {
	// Update successor info.
	SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
	auto UnwindDest = I.getUnwindDest();
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	BranchProbability UnwindDestProb =
	(BPI && UnwindDest)
	? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest)
	: BranchProbability::getZero();
	findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests);
	for (auto &UnwindDest : UnwindDests) {
	UnwindDest.first->setIsEHPad();
	addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second);
	}
	FuncInfo.MBB->normalizeSuccProbs();

	// Create the terminator node.
	SDValue Ret =
	DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot());
	DAG.setRoot(Ret);
	}

	/// Lowering of catchswitch is not implemented in this builder; reaching this
	/// visitor unconditionally reports a fatal error.
	void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) {
	report_fatal_error("visitCatchSwitch not yet implemented!");
	}

	/// Lower a 'ret' instruction.
	///
	/// Three cases are handled before handing over to the target's LowerReturn:
	///  - a return fed by @llvm.experimental.deoptimize is lowered separately;
	///  - when the return cannot be lowered directly (FuncInfo.CanLowerReturn is
	///    false), the value is stored through the demoted-return pointer register
	///    and Outs is left empty;
	///  - otherwise each returned value is split into register-sized parts and
	///    described in Outs/OutVals together with its argument flags.
	/// A swifterror virtual register, when applicable, is appended last.
	void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  auto &DL = DAG.getDataLayout();
	  SDValue Chain = getControlRoot();
	  SmallVector<ISD::OutputArg, 8> Outs;
	  SmallVector<SDValue, 8> OutVals;

	  // Calls to @llvm.experimental.deoptimize don't generate a return value, so
	  // lower
	  //
	  //   %val = call <ty> @llvm.experimental.deoptimize()
	  //   ret <ty> %val
	  //
	  // differently.
	  if (I.getParent()->getTerminatingDeoptimizeCall()) {
	    LowerDeoptimizingReturn();
	    return;
	  }

	  if (!FuncInfo.CanLowerReturn) {
	    unsigned DemoteReg = FuncInfo.DemoteRegister;
	    const Function *F = I.getParent()->getParent();

	    // Emit a store of the return value through the virtual register.
	    // Leave Outs empty so that LowerReturn won't try to load return
	    // registers the usual way.
	    SmallVector<EVT, 1> PtrValueVTs;
	    ComputeValueVTs(TLI, DL,
	                    F->getReturnType()->getPointerTo(
	                        DAG.getDataLayout().getAllocaAddrSpace()),
	                    PtrValueVTs);

	    SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
	                                        DemoteReg, PtrValueVTs[0]);
	    SDValue RetOp = getValue(I.getOperand(0));

	    SmallVector<EVT, 4> ValueVTs, MemVTs;
	    SmallVector<uint64_t, 4> Offsets;
	    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &MemVTs,
	                    &Offsets);
	    unsigned NumValues = ValueVTs.size();

	    SmallVector<SDValue, 4> Chains(NumValues);
	    for (unsigned i = 0; i != NumValues; ++i) {
	      // An aggregate return value cannot wrap around the address space, so
	      // offsets to its parts don't wrap either.
	      SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr, Offsets[i]);

	      SDValue Val = RetOp.getValue(i);
	      // Convert to the in-memory type when it differs from the DAG type.
	      if (MemVTs[i] != ValueVTs[i])
	        Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]);
	      Chains[i] = DAG.getStore(Chain, getCurSDLoc(), Val,
	          // FIXME: better loc info would be nice.
	          Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
	    }

	    // All stores hang off the same incoming chain; join them.
	    Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
	                        MVT::Other, Chains);
	  } else if (I.getNumOperands() != 0) {
	    SmallVector<EVT, 4> ValueVTs;
	    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
	    unsigned NumValues = ValueVTs.size();
	    if (NumValues) {
	      SDValue RetOp = getValue(I.getOperand(0));

	      const Function *F = I.getParent()->getParent();

	      bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
	          I.getOperand(0)->getType(), F->getCallingConv(),
	          /*IsVarArg*/ false);

	      // Pick the extension implied by the return attributes, if any.
	      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
	      if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
	                                          Attribute::SExt))
	        ExtendKind = ISD::SIGN_EXTEND;
	      else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
	                                               Attribute::ZExt))
	        ExtendKind = ISD::ZERO_EXTEND;

	      LLVMContext &Context = F->getContext();
	      bool RetInReg = F->getAttributes().hasAttribute(
	          AttributeList::ReturnIndex, Attribute::InReg);

	      for (unsigned j = 0; j != NumValues; ++j) {
	        EVT VT = ValueVTs[j];

	        if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
	          VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);

	        CallingConv::ID CC = F->getCallingConv();

	        unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, CC, VT);
	        MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, CC, VT);
	        SmallVector<SDValue, 4> Parts(NumParts);
	        getCopyToParts(DAG, getCurSDLoc(),
	                       SDValue(RetOp.getNode(), RetOp.getResNo() + j),
	                       &Parts[0], NumParts, PartVT, &I, CC, ExtendKind);

	        // 'inreg' on function refers to return value
	        ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
	        if (RetInReg)
	          Flags.setInReg();

	        if (I.getOperand(0)->getType()->isPointerTy()) {
	          Flags.setPointer();
	          Flags.setPointerAddrSpace(
	              cast<PointerType>(I.getOperand(0)->getType())->getAddressSpace());
	        }

	        if (NeedsRegBlock) {
	          Flags.setInConsecutiveRegs();
	          if (j == NumValues - 1)
	            Flags.setInConsecutiveRegsLast();
	        }

	        // Propagate extension type if any
	        if (ExtendKind == ISD::SIGN_EXTEND)
	          Flags.setSExt();
	        else if (ExtendKind == ISD::ZERO_EXTEND)
	          Flags.setZExt();

	        for (unsigned i = 0; i < NumParts; ++i) {
	          Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
	                                        VT, /*isfixed=*/true, 0, 0));
	          OutVals.push_back(Parts[i]);
	        }
	      }
	    }
	  }

	  // Push in swifterror virtual register as the last element of Outs. This makes
	  // sure swifterror virtual register will be returned in the swifterror
	  // physical register.
	  const Function *F = I.getParent()->getParent();
	  if (TLI.supportSwiftError() &&
	      F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
	    assert(SwiftError.getFunctionArg() && "Need a swift error argument");
	    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
	    Flags.setSwiftError();
	    Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
	                                  EVT(TLI.getPointerTy(DL)) /*argvt*/,
	                                  true /*isfixed*/, 1 /*origidx*/,
	                                  0 /*partOffs*/));
	    // Create SDNode for the swifterror virtual register.
	    OutVals.push_back(
	        DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
	                            &I, FuncInfo.MBB, SwiftError.getFunctionArg()),
	                        EVT(TLI.getPointerTy(DL))));
	  }

	  bool isVarArg = DAG.getMachineFunction().getFunction().isVarArg();
	  CallingConv::ID CallConv =
	      DAG.getMachineFunction().getFunction().getCallingConv();
	  Chain = DAG.getTargetLoweringInfo().LowerReturn(
	      Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);

	  // Verify that the target's LowerReturn behaved as expected.
	  assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
	         "LowerReturn didn't return a valid chain!");

	  // Update the DAG with the new chain value resulting from return lowering.
	  DAG.setRoot(Chain);
	}

	/// Copy \p V into its virtual register(s) when FuncInfo.ValueMap already has
	/// an entry for it. Values of empty type and values without an entry are left
	/// alone.
	void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
	  // Nothing to copy for empty types.
	  if (V->getType()->isEmptyTy())
	    return;

	  auto Entry = FuncInfo.ValueMap.find(V);
	  if (Entry == FuncInfo.ValueMap.end())
	    return;

	  assert(!V->use_empty() && "Unused value assigned virtual registers!");
	  CopyValueToVirtualRegister(V, Entry->second);
	}

	/// Make \p V available outside the current basic block: unless it is already
	/// exported, give it a virtual register and copy it there so that a
	/// CopyTo/FromReg pair is produced. Only instructions and arguments are
	/// exported; anything else (e.g. a constant) is ignored.
	void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
	  const bool NeedsRegister = isa<Instruction>(V) || isa<Argument>(V);
	  if (!NeedsRegister || FuncInfo.isExportedInst(V))
	    return;

	  CopyValueToVirtualRegister(V, FuncInfo.InitializeRegForValue(V));
	}

	bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
	const BasicBlock *FromBB) {
	// The operands of the setcc have to be in this block. We don't know
	// how to export them from some other block.
	if (const Instruction *VI = dyn_cast<Instruction>(V)) {
	// Can export from current BB.
	if (VI->getParent() == FromBB)
	return true;

	// Is already exported, noop.
	return FuncInfo.isExportedInst(V);
	}

	// If this is an argument, we can export it if the BB is the entry block or
	// if it is already exported.
	if (isa<Argument>(V)) {
	if (FromBB == &FromBB->getParent()->getEntryBlock())
	return true;

	// Otherwise, can only export this if it is already exported.
	return FuncInfo.isExportedInst(V);
	}

	// Otherwise, constants can always be exported.
	return true;
	}

	/// Probability of the edge \p Src -> \p Dst, taken from BranchProbabilityInfo
	/// on the underlying IR blocks. When no BPI is available, every successor of
	/// the source IR block is treated as equally likely.
	BranchProbability
	SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
	                                        const MachineBasicBlock *Dst) const {
	  const BasicBlock *FromBB = Src->getBasicBlock();
	  const BasicBlock *ToBB = Dst->getBasicBlock();

	  if (BranchProbabilityInfo *ProbInfo = FuncInfo.BPI)
	    return ProbInfo->getEdgeProbability(FromBB, ToBB);

	  // Fall back to 1 / N, clamping N to at least one so the denominator is
	  // never zero.
	  const uint32_t NumSuccs = std::max<uint32_t>(succ_size(FromBB), 1);
	  return BranchProbability(1, NumSuccs);
	}

	/// Add \p Dst as a successor of \p Src. Without BranchProbabilityInfo the edge
	/// is added with no probability; otherwise \p Prob is used, falling back to
	/// getEdgeProbability() when \p Prob is unknown.
	void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src,
	                                               MachineBasicBlock *Dst,
	                                               BranchProbability Prob) {
	  if (!FuncInfo.BPI) {
	    Src->addSuccessorWithoutProb(Dst);
	    return;
	  }

	  const BranchProbability EdgeProb =
	      Prob.isUnknown() ? getEdgeProbability(Src, Dst) : Prob;
	  Src->addSuccessor(Dst, EdgeProb);
	}

	/// Return true if \p V is usable as if it were defined in \p BB: either it is
	/// an instruction whose parent is \p BB, or it is not an instruction at all.
	static bool InBlock(const Value *V, const BasicBlock *BB) {
	  if (const Instruction *I = dyn_cast<Instruction>(V))
	    return I->getParent() == BB;
	  return true;
	}

	/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
	/// This function emits a branch and is used at the leaves of an OR or an
	/// AND operator tree.
	///
	/// The branch is recorded as a CaseBlock appended to SL->SwitchCases. When
	/// \p Cond is a comparison whose operands are usable from \p CurBB, the
	/// comparison itself becomes the case condition (inverted if \p InvertCond);
	/// otherwise \p Cond is compared against 'true' with SETEQ, or SETNE when
	/// inverted.
	void
	SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
	                                                  MachineBasicBlock *TBB,
	                                                  MachineBasicBlock *FBB,
	                                                  MachineBasicBlock *CurBB,
	                                                  MachineBasicBlock *SwitchBB,
	                                                  BranchProbability TProb,
	                                                  BranchProbability FProb,
	                                                  bool InvertCond) {
	  const BasicBlock *BB = CurBB->getBasicBlock();

	  // If the leaf of the tree is a comparison, merge the condition into
	  // the caseblock.
	  if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
	    // The operands of the cmp have to be in this block.  We don't know
	    // how to export them from some other block.  If this is the first block
	    // of the sequence, no exporting is needed.
	    if (CurBB == SwitchBB ||
	        (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
	         isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
	      ISD::CondCode Condition;
	      if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
	        ICmpInst::Predicate Pred =
	            InvertCond ? IC->getInversePredicate() : IC->getPredicate();
	        Condition = getICmpCondCode(Pred);
	      } else {
	        const FCmpInst *FC = cast<FCmpInst>(Cond);
	        FCmpInst::Predicate Pred =
	            InvertCond ? FC->getInversePredicate() : FC->getPredicate();
	        Condition = getFCmpCondCode(Pred);
	        if (TM.Options.NoNaNsFPMath)
	          Condition = getFCmpCodeWithoutNaN(Condition);
	      }

	      CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr,
	                   TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
	      SL->SwitchCases.push_back(CB);
	      return;
	    }
	  }

	  // Create a CaseBlock record representing this branch.
	  ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
	  CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
	               nullptr, TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
	  SL->SwitchCases.push_back(CB);
	}

	void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
	MachineBasicBlock *TBB,
	MachineBasicBlock *FBB,
	MachineBasicBlock *CurBB,
	MachineBasicBlock *SwitchBB,
	Instruction::BinaryOps Opc,
	BranchProbability TProb,
	BranchProbability FProb,
	bool InvertCond) {
	// Skip over not part of the tree and remember to invert op and operands at
	// next level.
	Value *NotCond;
	if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
	InBlock(NotCond, CurBB->getBasicBlock())) {
	FindMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
	!InvertCond);
	return;
	}

	const Instruction *BOp = dyn_cast<Instruction>(Cond);
	// Compute the effective opcode for Cond, taking into account whether it needs
	// to be inverted, e.g.
	// and (not (or A, B)), C
	// gets lowered as
	// and (and (not A, not B), C)
	unsigned BOpc = 0;
	if (BOp) {
	BOpc = BOp->getOpcode();
	if (InvertCond) {
	if (BOpc == Instruction::And)
	BOpc = Instruction::Or;
	else if (BOpc == Instruction::Or)
	BOpc = Instruction::And;
	}
	}

	// If this node is not part of the or/and tree, emit it as a branch.
	if (!BOp \|\| !(isa<BinaryOperator>(BOp) \|\| isa<CmpInst>(BOp)) \|\|
	BOpc != unsigned(Opc) \|\| !BOp->hasOneUse() \|\|
	BOp->getParent() != CurBB->getBasicBlock() \|\|
	!InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) \|\|
	!InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
	EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
	TProb, FProb, InvertCond);
	return;
	}

	// Create TmpBB after CurBB.
	MachineFunction::iterator BBI(CurBB);
	MachineFunction &MF = DAG.getMachineFunction();
	MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
	CurBB->getParent()->insert(++BBI, TmpBB);

	if (Opc == Instruction::Or) {
	// Codegen X \| Y as:
	// BB1:
	// jmp_if_X TBB
	// jmp TmpBB
	// TmpBB:
	// jmp_if_Y TBB
	// jmp FBB
	//

	// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	// The requirement is that
	// TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
	// = TrueProb for original BB.
	// Assuming the original probabilities are A and B, one choice is to set
	// BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
	// A/(1+B) and 2B/(1+B). This choice assumes that
	// TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
	// Another choice is to assume TrueProb for BB1 equals to TrueProb for
	// TmpBB, but the math is more complicated.

	auto NewTrueProb = TProb / 2;
	auto NewFalseProb = TProb / 2 + FProb;
	// Emit the LHS condition.
	FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
	NewTrueProb, NewFalseProb, InvertCond);

	// Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
	SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
	BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
	// Emit the RHS condition into TmpBB.
	FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
	Probs[0], Probs[1], InvertCond);
	} else {
	assert(Opc == Instruction::And && "Unknown merge op!");
	// Codegen X & Y as:
	// BB1:
	// jmp_if_X TmpBB
	// jmp FBB
	// TmpBB:
	// jmp_if_Y TBB
	// jmp FBB
	//
	// This requires creation of TmpBB after CurBB.

	// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	// The requirement is that
	// FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
	// = FalseProb for original BB.
	// Assuming the original probabilities are A and B, one choice is to set
	// BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
	// 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
	// TrueProb for BB1 * FalseProb for TmpBB.

	auto NewTrueProb = TProb + FProb / 2;
	auto NewFalseProb = FProb / 2;
	// Emit the LHS condition.
	FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
	NewTrueProb, NewFalseProb, InvertCond);

	// Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
	SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
	BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
	// Emit the RHS condition into TmpBB.
	FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
	Probs[0], Probs[1], InvertCond);
	}
	}

	/// If the set of cases should be emitted as a series of branches, return true.
	/// If we should emit this as a bunch of and/or'd together conditions, return
	/// false.
	///
	/// Only exactly two cases are ever rejected: those comparing the same pair of
	/// values, and those comparing two values against the same null constant in a
	/// way that can be expressed as a single test of their bitwise OR.
	bool
	SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
	  if (Cases.size() != 2) return true;

	  // If this is two comparisons of the same values or'd or and'd together, they
	  // will get folded into a single comparison, so don't emit two blocks.
	  if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
	       Cases[0].CmpRHS == Cases[1].CmpRHS) ||
	      (Cases[0].CmpRHS == Cases[1].CmpLHS &&
	       Cases[0].CmpLHS == Cases[1].CmpRHS)) {
	    return false;
	  }

	  // Handle: (X != null) | (Y != null) --> (X|Y) != 0
	  // Handle: (X == null) & (Y == null) --> (X|Y) == 0
	  if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
	      Cases[0].CC == Cases[1].CC &&
	      isa<Constant>(Cases[0].CmpRHS) &&
	      cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
	    if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
	      return false;
	    if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
	      return false;
	  }

	  return true;
	}

	void SelectionDAGBuilder::visitBr(const BranchInst &I) {
	MachineBasicBlock *BrMBB = FuncInfo.MBB;

	// Update machine-CFG edges.
	MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];

	if (I.isUnconditional()) {
	// Update machine-CFG edges.
	BrMBB->addSuccessor(Succ0MBB);

	// If this is not a fall-through branch or optimizations are switched off,
	// emit the branch.
	if (Succ0MBB != NextBlock(BrMBB) \|\| TM.getOptLevel() == CodeGenOpt::None)
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
	MVT::Other, getControlRoot(),
	DAG.getBasicBlock(Succ0MBB)));

	return;
	}

	// If this condition is one of the special cases we handle, do special stuff
	// now.
	const Value *CondVal = I.getCondition();
	MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];

	// If this is a series of conditions that are or'd or and'd together, emit
	// this as a sequence of branches instead of setcc's with and/or operations.
	// As long as jumps are not expensive, this should improve performance.
	// For example, instead of something like:
	// cmp A, B
	// C = seteq
	// cmp D, E
	// F = setle
	// or C, F
	// jnz foo
	// Emit:
	// cmp A, B
	// je foo
	// cmp D, E
	// jle foo
	if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
	Instruction::BinaryOps Opcode = BOp->getOpcode();
	if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
	!I.getMetadata(LLVMContext::MD_unpredictable) &&
	(Opcode == Instruction::And \|\| Opcode == Instruction::Or)) {
	FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
	Opcode,
	getEdgeProbability(BrMBB, Succ0MBB),
	getEdgeProbability(BrMBB, Succ1MBB),
	/InvertCond=/false);
	// If the compares in later blocks need to use values not currently
	// exported from this block, export them now. This block should always
	// be the first entry.
	assert(SL->SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");

	// Allow some cases to be rejected.
	if (ShouldEmitAsBranches(SL->SwitchCases)) {
	for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i) {
	ExportFromCurrentBlock(SL->SwitchCases[i].CmpLHS);
	ExportFromCurrentBlock(SL->SwitchCases[i].CmpRHS);
	}

	// Emit the branch for this block.
	visitSwitchCase(SL->SwitchCases[0], BrMBB);
	SL->SwitchCases.erase(SL->SwitchCases.begin());
	return;
	}

	// Okay, we decided not to do this, remove any inserted MBB's and clear
	// SwitchCases.
	for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i)
	FuncInfo.MF->erase(SL->SwitchCases[i].ThisBB);

	SL->SwitchCases.clear();
	}
	}

	// Create a CaseBlock record representing this branch.
	CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
	nullptr, Succ0MBB, Succ1MBB, BrMBB, getCurSDLoc());

	// Use visitSwitchCase to actually insert the fast branch sequence for this
	// cond branch.
	visitSwitchCase(CB, BrMBB);
	}

	/// visitSwitchCase - Emits the necessary code to represent a single node in
	/// the binary search tree resulting from lowering a switch instruction.
	///
	/// \p CB describes the comparison and its true/false destinations; \p SwitchBB
	/// is the machine block that receives the successor edges. Note that \p CB may
	/// be modified: its TrueBB/FalseBB are swapped when the true destination is
	/// the fall-through block.
	void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
	MachineBasicBlock *SwitchBB) {
	SDValue Cond;
	// The LHS value is materialized up front, before the SETTRUE early exit.
	SDValue CondLHS = getValue(CB.CmpLHS);
	SDLoc dl = CB.DL;

	// SETTRUE is an unconditional transfer: only one successor, no compare.
	if (CB.CC == ISD::SETTRUE) {
	// Branch or fall through to TrueBB.
	addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
	SwitchBB->normalizeSuccProbs();
	if (CB.TrueBB != NextBlock(SwitchBB)) {
	DAG.setRoot(DAG.getNode(ISD::BR, dl, MVT::Other, getControlRoot(),
	DAG.getBasicBlock(CB.TrueBB)));
	}
	return;
	}

	auto &TLI = DAG.getTargetLoweringInfo();
	EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), CB.CmpLHS->getType());

	// Build the setcc now.
	// A null CmpMHS means a plain two-operand comparison of CmpLHS and CmpRHS.
	if (!CB.CmpMHS) {
	// Fold "(X == true)" to X and "(X == false)" to !X to
	// handle common cases produced by branch lowering.
	if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
	CB.CC == ISD::SETEQ)
	Cond = CondLHS;
	else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
	CB.CC == ISD::SETEQ) {
	// !X is formed as X xor 1.
	SDValue True = DAG.getConstant(1, dl, CondLHS.getValueType());
	Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
	} else {
	SDValue CondRHS = getValue(CB.CmpRHS);

	// If a pointer's DAG type is larger than its memory type then the DAG
	// values are zero-extended. This breaks signed comparisons so truncate
	// back to the underlying type before doing the compare.
	if (CondLHS.getValueType() != MemVT) {
	CondLHS = DAG.getPtrExtOrTrunc(CondLHS, getCurSDLoc(), MemVT);
	CondRHS = DAG.getPtrExtOrTrunc(CondRHS, getCurSDLoc(), MemVT);
	}
	Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, CondRHS, CB.CC);
	}
	} else {
	// Range check: CmpLHS and CmpRHS are the constant bounds and CmpMHS is the
	// value being tested.
	assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");

	const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
	const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();

	SDValue CmpOp = getValue(CB.CmpMHS);
	EVT VT = CmpOp.getValueType();

	// When the lower bound is the minimum signed value, only the upper bound
	// needs to be tested.
	if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
	Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, dl, VT),
	ISD::SETLE);
	} else {
	// Otherwise test (X - Low) <=u (High - Low), one unsigned compare for
	// both bounds.
	SDValue SUB = DAG.getNode(ISD::SUB, dl,
	VT, CmpOp, DAG.getConstant(Low, dl, VT));
	Cond = DAG.getSetCC(dl, MVT::i1, SUB,
	DAG.getConstant(High-Low, dl, VT), ISD::SETULE);
	}
	}

	// Update successor info
	addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
	// TrueBB and FalseBB are always different unless the incoming IR is
	// degenerate. This only happens when running llc on weird IR.
	if (CB.TrueBB != CB.FalseBB)
	addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb);
	SwitchBB->normalizeSuccProbs();

	// If the lhs block is the next block, invert the condition so that we can
	// fall through to the lhs instead of the rhs block.
	if (CB.TrueBB == NextBlock(SwitchBB)) {
	std::swap(CB.TrueBB, CB.FalseBB);
	SDValue True = DAG.getConstant(1, dl, Cond.getValueType());
	Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
	}

	SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
	MVT::Other, getControlRoot(), Cond,
	DAG.getBasicBlock(CB.TrueBB));

	// Insert the false branch. Do this even if it's a fall through branch,
	// this makes it easier to do DAG optimizations which require inverting
	// the branch condition.
	BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
	DAG.getBasicBlock(CB.FalseBB));

	DAG.setRoot(BrCond);
	}

	/// visitJumpTable - Emit the BR_JT node for \p JT in the current MBB. The
	/// table index is read back from the virtual register recorded in JT.Reg, so
	/// the jump table header must already have been lowered.
	void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) {
	  assert(JT.Reg != -1U && "Should lower JT Header first!");

	  const EVT PtrVT =
	      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // Fetch the index out of its virtual register; result 1 of the copy is the
	  // chain the branch is hooked onto.
	  SDValue JTIndex =
	      DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(), JT.Reg, PtrVT);
	  SDValue JTAddr = DAG.getJumpTable(JT.JTI, PtrVT);

	  DAG.setRoot(DAG.getNode(ISD::BR_JT, getCurSDLoc(), MVT::Other,
	                          JTIndex.getValue(1), JTAddr, JTIndex));
	}

	/// visitJumpTableHeader - Emit the code that turns the switched-on value into
	/// a jump table index.
	///
	/// The index (value minus the lowest case, resized to pointer width) is copied
	/// into a fresh virtual register that is recorded in \p JT.Reg. Unless
	/// \p JTH.OmitRangeCheck is set, a branch to the default block is emitted for
	/// values beyond the last case. A branch to the jump table block is added only
	/// when that block does not directly follow \p SwitchBB.
	void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT,
	                                               JumpTableHeader &JTH,
	                                               MachineBasicBlock *SwitchBB) {
	  SDLoc dl = getCurSDLoc();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Rebase the switched-on value so that the lowest case maps to zero.
	  SDValue SwitchVal = getValue(JTH.SValue);
	  EVT VT = SwitchVal.getValueType();
	  SDValue Rebased = DAG.getNode(ISD::SUB, dl, VT, SwitchVal,
	                                DAG.getConstant(JTH.First, dl, VT));

	  // The rebased value has to survive into a later basic block, where it is
	  // used to index the table, so it goes into a virtual register. Its type may
	  // be narrower or wider than a pointer, hence the zext-or-trunc.
	  SDValue JTIndex =
	      DAG.getZExtOrTrunc(Rebased, dl, TLI.getPointerTy(DAG.getDataLayout()));

	  unsigned IndexReg = FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue NewRoot = DAG.getCopyToReg(getControlRoot(), dl, IndexReg, JTIndex);
	  JT.Reg = IndexReg;

	  if (!JTH.OmitRangeCheck) {
	    // Guard the table: anything above (Last - First) goes to the default
	    // block of the switch.
	    EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                      Rebased.getValueType());
	    SDValue OutOfRange =
	        DAG.getSetCC(dl, CCVT, Rebased,
	                     DAG.getConstant(JTH.Last - JTH.First, dl, VT), ISD::SETUGT);
	    NewRoot = DAG.getNode(ISD::BRCOND, dl, MVT::Other, NewRoot, OutOfRange,
	                          DAG.getBasicBlock(JT.Default));
	  }

	  // Avoid emitting unnecessary branches to the next block.
	  if (JT.MBB != NextBlock(SwitchBB))
	    NewRoot = DAG.getNode(ISD::BR, dl, MVT::Other, NewRoot,
	                          DAG.getBasicBlock(JT.MBB));

	  DAG.setRoot(NewRoot);
	}

	/// Create a LOAD_STACK_GUARD node, and let it carry the target specific global
	/// variable if there exists one.
	///
	/// The node is chained on \p Chain. When the target reports a stack guard
	/// global, a load/invariant/dereferenceable memory operand referring to it is
	/// attached. The result is converted to the pointer memory type when that
	/// differs from the pointer type.
	static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
	                                 SDValue &Chain) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	  EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
	  MachineFunction &MF = DAG.getMachineFunction();
	  Value *Global = TLI.getSDagStackGuard(*MF.getFunction().getParent());
	  MachineSDNode *Node =
	      DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);
	  if (Global) {
	    MachinePointerInfo MPInfo(Global);
	    auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
	                 MachineMemOperand::MODereferenceable;
	    MachineMemOperand *MemRef = MF.getMachineMemOperand(
	        MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlignment(PtrTy));
	    DAG.setNodeMemRefs(Node, {MemRef});
	  }
	  if (PtrTy != PtrMemTy)
	    return DAG.getPtrExtOrTrunc(SDValue(Node, 0), DL, PtrMemTy);
	  return SDValue(Node, 0);
	}

	/// Codegen a new tail for a stack protector check ParentMBB which has had its
	/// tail spliced into a stack protector check success bb.
	///
	/// The emitted code loads the value stored in the function's stack protector
	/// slot and then either
	///  - calls the target supplied guard check function with that value, or
	///  - loads the reference guard, compares the two, and branches to the
	///    failure block of \p SPD on mismatch and to its success block otherwise.
	///
	/// For a high level explanation of how this fits into the stack protector
	/// generation see the comment on the declaration of class
	/// StackProtectorDescriptor.
	void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
	MachineBasicBlock *ParentBB) {

	// First create the loads to the guard/stack slot for the comparison.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());

	// The frame index of the stack protector slot is recorded in the frame info.
	MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
	int FI = MFI.getStackProtectorIndex();

	SDValue Guard;
	SDLoc dl = getCurSDLoc();
	SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
	const Module &M = *ParentBB->getParent()->getFunction().getParent();
	// Both guard loads below use the preferred alignment of an i8* value.
	unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext()));

	// Generate code to load the content of the guard slot.
	SDValue GuardVal = DAG.getLoad(
	PtrMemTy, dl, DAG.getEntryNode(), StackSlotPtr,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
	MachineMemOperand::MOVolatile);

	// Targets that opt in via useStackGuardXorFP get to rewrite the loaded value.
	if (TLI.useStackGuardXorFP())
	GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);

	// Retrieve guard check function, nullptr if instrumentation is inlined.
	if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
	// The target provides a guard check function to validate the guard value.
	// Generate a call to that function with the content of the guard slot as
	// argument.
	FunctionType *FnTy = GuardCheckFn->getFunctionType();
	assert(FnTy->getNumParams() == 1 && "Invalid function signature");

	TargetLowering::ArgListTy Args;
	TargetLowering::ArgListEntry Entry;
	Entry.Node = GuardVal;
	Entry.Ty = FnTy->getParamType(0);
	// Forward an inreg attribute on the parameter to the lowered argument.
	if (GuardCheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
	Entry.IsInReg = true;
	Args.push_back(Entry);

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(getCurSDLoc())
	.setChain(DAG.getEntryNode())
	.setCallee(GuardCheckFn->getCallingConv(), FnTy->getReturnType(),
	getValue(GuardCheckFn), std::move(Args));

	// Only the output chain of the call is kept; it becomes the new DAG root and
	// no compare/branch sequence is emitted on this path.
	std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
	DAG.setRoot(Result.second);
	return;
	}

	// If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
	// Otherwise, emit a volatile load to retrieve the stack guard value.
	SDValue Chain = DAG.getEntryNode();
	if (TLI.useLoadStackGuardNode()) {
	Guard = getLoadStackGuard(DAG, dl, Chain);
	} else {
	const Value *IRGuard = TLI.getSDagStackGuard(M);
	SDValue GuardPtr = getValue(IRGuard);

	Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr,
	MachinePointerInfo(IRGuard, 0), Align,
	MachineMemOperand::MOVolatile);
	}

	// Perform the comparison via a subtract/getsetcc.
	EVT VT = Guard.getValueType();
	SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, GuardVal);

	SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(),
	Sub.getValueType()),
	Sub, DAG.getConstant(0, dl, VT), ISD::SETNE);

	// If the sub is not 0, then we know the guard/stackslot do not equal, so
	// branch to failure MBB.
	// NOTE(review): the branch is chained on operand 0 of GuardVal, presumably
	// the chain input of the slot load; confirm this still holds when
	// emitStackGuardXorFP has replaced GuardVal.
	SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
	MVT::Other, GuardVal.getOperand(0),
	Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
	// Otherwise branch to success MBB.
	SDValue Br = DAG.getNode(ISD::BR, dl,
	MVT::Other, BrCond,
	DAG.getBasicBlock(SPD.getSuccessMBB()));

	DAG.setRoot(Br);
	}

	/// Emit the body of the stack protector failure block.
	///
	/// The block consists of the STACKPROTECTOR_CHECK_FAIL library call (i.e.
	/// __stack_chk_fail()), followed by an explicit trap on PS4.
	///
	/// For a high level explanation of how this fits into the stack protector
	/// generation see the comment on the declaration of class
	/// StackProtectorDescriptor.
	void
	SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Only the output chain of the library call is needed.
	  std::pair<SDValue, SDValue> FailCall =
	      TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid, None,
	                      false, getCurSDLoc(), false, false);
	  SDValue NewRoot = FailCall.second;

	  // On PS4, the "return address" must still be within the calling function,
	  // even if it's at the very end, so emit an explicit TRAP here.
	  // Passing 'true' for doesNotReturn above won't generate the trap for us.
	  const bool NeedsTrap = TM.getTargetTriple().isPS4CPU();
	  if (NeedsTrap)
	    NewRoot = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, NewRoot);

	  DAG.setRoot(NewRoot);
	}

	/// visitBitTestHeader - Emit the header of a "bit test" switch lowering: the
	/// rebased switch value is range checked, copied into a fresh virtual
	/// register (recorded in \p B) and control is sent either to the default
	/// block or to the first bit test block.
	void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
	                                             MachineBasicBlock *SwitchBB) {
	  SDLoc Loc = getCurSDLoc();

	  // Rebase the switched value so that the smallest case becomes zero.
	  SDValue Switched = getValue(B.SValue);
	  EVT VT = Switched.getValueType();
	  SDValue Rebased = DAG.getNode(ISD::SUB, Loc, VT, Switched,
	                                DAG.getConstant(B.First, Loc, VT));

	  // Anything above B.Range is outside the table.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue OutOfRange = DAG.getSetCC(
	      Loc,
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                             Rebased.getValueType()),
	      Rebased, DAG.getConstant(B.Range, Loc, VT), ISD::SETUGT);

	  // Determine the type of the test operands. Switch table case ranges are
	  // encoded into a series of masks; if the type is illegal or any mask does
	  // not fit, fall back to the pointer type, which is guaranteed to fit.
	  bool NeedPtrType = !TLI.isTypeLegal(VT);
	  if (!NeedPtrType) {
	    for (const auto &Case : B.Cases) {
	      if (isUIntN(VT.getSizeInBits(), Case.Mask))
	        continue;
	      NeedPtrType = true;
	      break;
	    }
	  }
	  if (NeedPtrType) {
	    VT = TLI.getPointerTy(DAG.getDataLayout());
	    Rebased = DAG.getZExtOrTrunc(Rebased, Loc, VT);
	  }

	  B.RegVT = VT.getSimpleVT();
	  B.Reg = FuncInfo.CreateReg(B.RegVT);
	  SDValue RegCopy = DAG.getCopyToReg(getControlRoot(), Loc, B.Reg, Rebased);

	  MachineBasicBlock *FirstTestBB = B.Cases[0].ThisBB;

	  addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
	  addSuccessorWithProb(SwitchBB, FirstTestBB, B.Prob);
	  SwitchBB->normalizeSuccProbs();

	  SDValue NewRoot = DAG.getNode(ISD::BRCOND, Loc, MVT::Other, RegCopy,
	                                OutOfRange, DAG.getBasicBlock(B.Default));

	  // Avoid emitting unnecessary branches to the next block.
	  if (FirstTestBB != NextBlock(SwitchBB))
	    NewRoot = DAG.getNode(ISD::BR, Loc, MVT::Other, NewRoot,
	                          DAG.getBasicBlock(FirstTestBB));

	  DAG.setRoot(NewRoot);
	}

	/// visitBitTestCase - Emit a single "bit test": read the value prepared by
	/// the header from \p Reg, test it against the mask of \p B and branch to
	/// B.TargetBB on a hit, otherwise continue with \p NextMBB.
	void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
	                                           MachineBasicBlock* NextMBB,
	                                           BranchProbability BranchProbToNext,
	                                           unsigned Reg,
	                                           BitTestCase &B,
	                                           MachineBasicBlock *SwitchBB) {
	  SDLoc Loc = getCurSDLoc();
	  MVT VT = BB.RegVT;
	  SDValue ShiftAmt = DAG.getCopyFromReg(getControlRoot(), Loc, Reg, VT);
	  const unsigned NumSetBits = countPopulation(B.Mask);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  // All three strategies below produce a setcc of this type.
	  EVT CCVT =
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

	  SDValue Hit;
	  if (NumSetBits == 1) {
	    // Testing for a single bit; just compare the shift count with what it
	    // would need to be to shift a 1 bit in that position.
	    Hit = DAG.getSetCC(Loc, CCVT, ShiftAmt,
	                       DAG.getConstant(countTrailingZeros(B.Mask), Loc, VT),
	                       ISD::SETEQ);
	  } else if (NumSetBits == BB.Range) {
	    // There is only one zero bit in the range, test for it directly.
	    Hit = DAG.getSetCC(Loc, CCVT, ShiftAmt,
	                       DAG.getConstant(countTrailingOnes(B.Mask), Loc, VT),
	                       ISD::SETNE);
	  } else {
	    // General case: (1 << value) & mask != 0.
	    SDValue OneBit = DAG.getNode(ISD::SHL, Loc, VT,
	                                 DAG.getConstant(1, Loc, VT), ShiftAmt);
	    SDValue Masked = DAG.getNode(ISD::AND, Loc, VT, OneBit,
	                                 DAG.getConstant(B.Mask, Loc, VT));
	    Hit = DAG.getSetCC(Loc, CCVT, Masked, DAG.getConstant(0, Loc, VT),
	                       ISD::SETNE);
	  }

	  // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb.
	  addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
	  // The branch probability from SwitchBB to NextMBB is BranchProbToNext.
	  addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
	  // B.ExtraProb and BranchProbToNext are relative probabilities (they work
	  // more like weights) and are not guaranteed to sum to one, so normalize.
	  SwitchBB->normalizeSuccProbs();

	  SDValue NewRoot = DAG.getNode(ISD::BRCOND, Loc, MVT::Other,
	                                getControlRoot(), Hit,
	                                DAG.getBasicBlock(B.TargetBB));

	  // Avoid emitting unnecessary branches to the next block.
	  if (NextMBB != NextBlock(SwitchBB))
	    NewRoot = DAG.getNode(ISD::BR, Loc, MVT::Other, NewRoot,
	                          DAG.getBasicBlock(NextMBB));

	  DAG.setRoot(NewRoot);
	}

	/// Lower an invoke instruction.
	///
	/// The callee is dispatched on its kind (inline asm, one of the few invokable
	/// intrinsics, a call carrying a deopt bundle, or an ordinary call). Then the
	/// machine CFG successors of the current block are recorded (the normal
	/// destination plus every unwind destination) and an unconditional branch to
	/// the normal destination is emitted.
	void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
	MachineBasicBlock *InvokeMBB = FuncInfo.MBB;

	// Retrieve successors. Look through artificial IR level blocks like
	// catchswitch for successors.
	MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
	const BasicBlock *EHPadBB = I.getSuccessor(1);

	// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
	// have to do anything here to lower funclet bundles.
	assert(!I.hasOperandBundlesOtherThan(
	{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
	"Cannot lower invokes with arbitrary operand bundles yet!");

	const Value *Callee(I.getCalledValue());
	// Fn is null when the callee is not a Function (e.g. an indirect call).
	const Function *Fn = dyn_cast<Function>(Callee);
	if (isa<InlineAsm>(Callee))
	visitInlineAsm(&I);
	else if (Fn && Fn->isIntrinsic()) {
	// Only the intrinsics listed below may be invoked; anything else is a bug.
	switch (Fn->getIntrinsicID()) {
	default:
	llvm_unreachable("Cannot invoke this intrinsic");
	case Intrinsic::donothing:
	// Ignore invokes to @llvm.donothing: jump directly to the next BB.
	break;
	case Intrinsic::experimental_patchpoint_void:
	case Intrinsic::experimental_patchpoint_i64:
	visitPatchpoint(&I, EHPadBB);
	break;
	case Intrinsic::experimental_gc_statepoint:
	LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
	break;
	case Intrinsic::wasm_rethrow_in_catch: {
	// This is usually done in visitTargetIntrinsic, but this intrinsic is
	// special because it can be invoked, so we manually lower it to a DAG
	// node here.
	SmallVector<SDValue, 8> Ops;
	Ops.push_back(getRoot()); // inchain
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// The intrinsic ID is passed as a target constant of pointer type.
	Ops.push_back(
	DAG.getTargetConstant(Intrinsic::wasm_rethrow_in_catch, getCurSDLoc(),
	TLI.getPointerTy(DAG.getDataLayout())));
	SDVTList VTs = DAG.getVTList(ArrayRef<EVT>({MVT::Other})); // outchain
	DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
	break;
	}
	}
	} else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
	// Currently we do not lower any intrinsic calls with deopt operand bundles.
	// Eventually we will support lowering the @llvm.experimental.deoptimize
	// intrinsic, and right now there are no plans to support other intrinsics
	// with deopt state.
	LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
	} else {
	LowerCallTo(&I, getValue(Callee), false, EHPadBB);
	}

	// If the value of the invoke is used outside of its defining block, make it
	// available as a virtual register.
	// We already took care of the exported value for the statepoint instruction
	// during call to the LowerStatepoint.
	if (!isStatepoint(I)) {
	CopyToExportRegsIfNeeded(&I);
	}

	// Collect the unwind destinations reachable from the EH pad. The edge
	// probability is zero when no branch probability info is available.
	SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	BranchProbability EHPadBBProb =
	BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
	: BranchProbability::getZero();
	findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests);

	// Update successor info.
	addSuccessorWithProb(InvokeMBB, Return);
	for (auto &UnwindDest : UnwindDests) {
	// Every unwind destination is flagged as an EH pad before being linked.
	UnwindDest.first->setIsEHPad();
	addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
	}
	InvokeMBB->normalizeSuccProbs();

	// Drop into normal successor.
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(),
	DAG.getBasicBlock(Return)));
	}

	void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
	MachineBasicBlock *CallBrMBB = FuncInfo.MBB;

	// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
	// have to do anything here to lower funclet bundles.
	assert(!I.hasOperandBundlesOtherThan(
	{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
	"Cannot lower callbrs with arbitrary operand bundles yet!");

	assert(isa<InlineAsm>(I.getCalledValue()) &&
	"Only know how to handle inlineasm callbr");
	visitInlineAsm(&I);

	// Retrieve successors.
	MachineBasicBlock *Return = FuncInfo.MBBMap[I.getDefaultDest()];

	// Update successor info.
	addSuccessorWithProb(CallBrMBB, Return);
	for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
	MachineBasicBlock *Target = FuncInfo.MBBMap[I.getIndirectDest(i)];
	addSuccessorWithProb(CallBrMBB, Target);
	}
	CallBrMBB->normalizeSuccProbs();

	// Drop into default successor.
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
	MVT::Other, getControlRoot(),
	DAG.getBasicBlock(Return)));
	}

	/// Resume instructions are never expected to reach the DAG builder; hitting
	/// this visitor is a hard error.
	void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
	  llvm_unreachable(
	      "SelectionDAGBuilder shouldn't visit resume instructions!");
	}

	void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
	assert(FuncInfo.MBB->isEHPad() &&
	"Call to landingpad not in landing pad!");

	// If there aren't registers to copy the values into (e.g., during SjLj
	// exceptions), then don't bother to create these DAG nodes.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn();
	if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
	TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
	return;

	// If landingpad's return type is token type, we don't create DAG nodes
	// for its exception pointer and selector value. The extraction of exception
	// pointer or selector value from token type landingpads is not currently
	// supported.
	if (LP.getType()->isTokenTy())
	return;

	SmallVector<EVT, 2> ValueVTs;
	SDLoc dl = getCurSDLoc();
	ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
	assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");

	// Get the two live-in registers as SDValues. The physregs have already been
	// copied into virtual registers.
	SDValue Ops[2];
	if (FuncInfo.ExceptionPointerVirtReg) {
	Ops[0] = DAG.getZExtOrTrunc(
	DAG.getCopyFromReg(DAG.getEntryNode(), dl,
	FuncInfo.ExceptionPointerVirtReg,
	TLI.getPointerTy(DAG.getDataLayout())),
	dl, ValueVTs[0]);
	} else {
	Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
	}
	Ops[1] = DAG.getZExtOrTrunc(
	DAG.getCopyFromReg(DAG.getEntryNode(), dl,
	FuncInfo.ExceptionSelectorVirtReg,
	TLI.getPointerTy(DAG.getDataLayout())),
	dl, ValueVTs[1]);

	// Merge into one.
	SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
	DAG.getVTList(ValueVTs), Ops);
	setValue(&LP, Res);
	}

	/// Retarget pending switch lowering records after a block split: every jump
	/// table header and bit test block that referred to \p First now refers to
	/// \p Last.
	void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
	                                           MachineBasicBlock *Last) {
	  // Update JTCases.
	  for (auto &JTCase : SL->JTCases) {
	    auto &Header = JTCase.first;
	    if (Header.HeaderBB == First)
	      Header.HeaderBB = Last;
	  }

	  // Update BitTestCases.
	  for (auto &BTCase : SL->BitTestCases) {
	    if (BTCase.Parent == First)
	      BTCase.Parent = Last;
	  }
	}

	void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
	MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;

	// Update machine-CFG edges with unique successors.
	SmallSet<BasicBlock*, 32> Done;
	for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
	BasicBlock *BB = I.getSuccessor(i);
	bool Inserted = Done.insert(BB).second;
	if (!Inserted)
	continue;

	MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
	addSuccessorWithProb(IndirectBrMBB, Succ);
	}
	IndirectBrMBB->normalizeSuccProbs();

	DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
	MVT::Other, getControlRoot(),
	getValue(I.getAddress())));
	}

	void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
	if (!DAG.getTarget().Options.TrapUnreachable)
	return;

	// We may be able to ignore unreachable behind a noreturn call.
	if (DAG.getTarget().Options.NoTrapAfterNoreturn) {
	const BasicBlock &BB = *I.getParent();
	if (&I != &BB.front()) {
	BasicBlock::const_iterator PredI =
	std::prev(BasicBlock::const_iterator(&I));
	if (const CallInst Call = dyn_cast<CallInst>(&PredI)) {
	if (Call->doesNotReturn())
	return;
	}
	}
	}

	DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
	}

	void SelectionDAGBuilder::visitFSub(const User &I) {
	// -0.0 - X --> fneg
	Type *Ty = I.getType();
	if (isa<Constant>(I.getOperand(0)) &&
	I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
	SDValue Op2 = getValue(I.getOperand(1));
	setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
	Op2.getValueType(), Op2));
	return;
	}

	visitBinary(I, ISD::FSUB);
	}

	/// Checks if the given instruction performs a vector reduction, in which case
	/// we have the freedom to alter the elements in the result as long as the
	/// reduction of them stays unchanged.
	static bool isVectorReductionOp(const User *I) {
	const Instruction *Inst = dyn_cast<Instruction>(I);
	if (!Inst \|\| !Inst->getType()->isVectorTy())
	return false;

	auto OpCode = Inst->getOpcode();
	switch (OpCode) {
	case Instruction::Add:
	case Instruction::Mul:
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor:
	break;
	case Instruction::FAdd:
	case Instruction::FMul:
	if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
	if (FPOp->getFastMathFlags().isFast())
	break;
	LLVM_FALLTHROUGH;
	default:
	return false;
	}

	unsigned ElemNum = Inst->getType()->getVectorNumElements();
	// Ensure the reduction size is a power of 2.
	if (!isPowerOf2_32(ElemNum))
	return false;

	unsigned ElemNumToReduce = ElemNum;

	// Do DFS search on the def-use chain from the given instruction. We only
	// allow four kinds of operations during the search until we reach the
	// instruction that extracts the first element from the vector:
	//
	// 1. The reduction operation of the same opcode as the given instruction.
	//
	// 2. PHI node.
	//
	// 3. ShuffleVector instruction together with a reduction operation that
	// does a partial reduction.
	//
	// 4. ExtractElement that extracts the first element from the vector, and we
	// stop searching the def-use chain here.
	//
	// 3 & 4 above perform a reduction on all elements of the vector. We push defs
	// from 1-3 to the stack to continue the DFS. The given instruction is not
	// a reduction operation if we meet any other instructions other than those
	// listed above.

	SmallVector<const User *, 16> UsersToVisit{Inst};
	SmallPtrSet<const User *, 16> Visited;
	bool ReduxExtracted = false;

	while (!UsersToVisit.empty()) {
	auto User = UsersToVisit.back();
	UsersToVisit.pop_back();
	if (!Visited.insert(User).second)
	continue;

	for (const auto &U : User->users()) {
	auto Inst = dyn_cast<Instruction>(U);
	if (!Inst)
	return false;

	if (Inst->getOpcode() == OpCode \|\| isa<PHINode>(U)) {
	if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
	if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
	return false;
	UsersToVisit.push_back(U);
	} else if (const ShuffleVectorInst *ShufInst =
	dyn_cast<ShuffleVectorInst>(U)) {
	// Detect the following pattern: A ShuffleVector instruction together
	// with a reduction that do partial reduction on the first and second
	// ElemNumToReduce / 2 elements, and store the result in
	// ElemNumToReduce / 2 elements in another vector.

	unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
	if (ResultElements < ElemNum)
	return false;

	if (ElemNumToReduce == 1)
	return false;
	if (!isa<UndefValue>(U->getOperand(1)))
	return false;
	for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
	if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
	return false;
	for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
	if (ShufInst->getMaskValue(i) != -1)
	return false;

	// There is only one user of this ShuffleVector instruction, which
	// must be a reduction operation.
	if (!U->hasOneUse())
	return false;

	auto U2 = dyn_cast<Instruction>(*U->user_begin());
	if (!U2 \|\| U2->getOpcode() != OpCode)
	return false;

	// Check operands of the reduction operation.
	if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) \|\|
	(U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
	UsersToVisit.push_back(U2);
	ElemNumToReduce /= 2;
	} else
	return false;
	} else if (isa<ExtractElementInst>(U)) {
	// At this moment we should have reduced all elements in the vector.
	if (ElemNumToReduce != 1)
	return false;

	const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
	if (!Val \|\| !Val->isZero())
	return false;

	ReduxExtracted = true;
	} else
	return false;
	}
	}
	return ReduxExtracted;
	}

	void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
	SDNodeFlags Flags;

	SDValue Op = getValue(I.getOperand(0));
	SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(),
	Op, Flags);
	setValue(&I, UnNodeValue);
	}

	void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
	SDNodeFlags Flags;
	if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
	Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
	Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
	}
	if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
	Flags.setExact(ExactOp->isExact());
	}
	if (isVectorReductionOp(&I)) {
	Flags.setVectorReduction(true);
	LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
	}

	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));
	SDValue BinNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(),
	Op1, Op2, Flags);
	setValue(&I, BinNodeValue);
	}

	/// Lower a shift instruction.
	///
	/// For scalar shifts the shift amount is first coerced towards the target's
	/// shift amount type. The nuw/nsw/exact IR flags are copied onto the node
	/// for SRL, SRA and SHL.
	///
	/// \param I      the shift instruction (or constant expression).
	/// \param Opcode the ISD opcode to emit.
	void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
	  SDValue Op1 = getValue(I.getOperand(0));
	  SDValue Op2 = getValue(I.getOperand(1));

	  EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
	      Op1.getValueType(), DAG.getDataLayout());

	  // Coerce the shift amount to the right type if we can.
	  if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
	    unsigned ShiftSize = ShiftTy.getSizeInBits();
	    unsigned Op2Size = Op2.getValueSizeInBits();
	    SDLoc DL = getCurSDLoc();

	    // If the operand is smaller than the shift count type, promote it.
	    if (ShiftSize > Op2Size)
	      Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);

	    // If the operand is larger than the shift count type but the shift
	    // count type has enough bits to represent any shift value, truncate
	    // it now. This is a common case and it exposes the truncate to
	    // optimization early.
	    else if (ShiftSize >= Log2_32_Ceil(Op2Size))
	      Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
	    // Otherwise we'll need to temporarily settle for some other convenient
	    // type. Type legalization will make adjustments once the shiftee is split.
	    else
	      Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
	  }

	  bool nuw = false;
	  bool nsw = false;
	  bool exact = false;

	  if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
	    if (const OverflowingBinaryOperator *OFBinOp =
	            dyn_cast<const OverflowingBinaryOperator>(&I)) {
	      nuw = OFBinOp->hasNoUnsignedWrap();
	      nsw = OFBinOp->hasNoSignedWrap();
	    }
	    if (const PossiblyExactOperator *ExactOp =
	            dyn_cast<const PossiblyExactOperator>(&I))
	      exact = ExactOp->isExact();
	  }

	  // All three flags are always written, even when they stay false.
	  SDNodeFlags Flags;
	  Flags.setExact(exact);
	  Flags.setNoSignedWrap(nsw);
	  Flags.setNoUnsignedWrap(nuw);
	  SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
	                            Flags);
	  setValue(&I, Res);
	}

	void SelectionDAGBuilder::visitSDiv(const User &I) {
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));

	SDNodeFlags Flags;
	Flags.setExact(isa<PossiblyExactOperator>(&I) &&
	cast<PossiblyExactOperator>(&I)->isExact());
	setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
	Op2, Flags));
	}

	void SelectionDAGBuilder::visitICmp(const User &I) {
	ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
	if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
	predicate = IC->getPredicate();
	else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
	predicate = ICmpInst::Predicate(IC->getPredicate());
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));
	ISD::CondCode Opcode = getICmpCondCode(predicate);

	auto &TLI = DAG.getTargetLoweringInfo();
	EVT MemVT =
	TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());

	// If a pointer's DAG type is larger than its memory type then the DAG values
	// are zero-extended. This breaks signed comparisons so truncate back to the
	// underlying type before doing the compare.
	if (Op1.getValueType() != MemVT) {
	Op1 = DAG.getPtrExtOrTrunc(Op1, getCurSDLoc(), MemVT);
	Op2 = DAG.getPtrExtOrTrunc(Op2, getCurSDLoc(), MemVT);
	}

	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
	}

	/// Lower a floating point comparison (instruction or constant expression) to
	/// a SETCC node.
	///
	/// When the operation carries the nnan fast-math flag, or NoNaNsFPMath is set
	/// in the target options, the condition code is replaced by the result of
	/// getFCmpCodeWithoutNaN.
	void SelectionDAGBuilder::visitFCmp(const User &I) {
	  FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
	  if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
	    predicate = FC->getPredicate();
	  else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
	    predicate = FCmpInst::Predicate(FC->getPredicate());
	  SDValue Op1 = getValue(I.getOperand(0));
	  SDValue Op2 = getValue(I.getOperand(1));

	  ISD::CondCode Condition = getFCmpCondCode(predicate);
	  // FPMO is null when I is not a floating point math operator.
	  auto *FPMO = dyn_cast<FPMathOperator>(&I);
	  if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
	    Condition = getFCmpCodeWithoutNaN(Condition);

	  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	                                                        I.getType());
	  setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
	}

	// Check if the condition of the select has one use or two users that are both
	// selects with the same condition.
	static bool hasOnlySelectUsers(const Value *Cond) {
	return llvm::all_of(Cond->users(), [](const Value *V) {
	return isa<SelectInst>(V);
	});
	}

	/// Lower a select.
	///
	/// By default one SELECT (or VSELECT for vector conditions) is emitted per
	/// result value. If all result types agree and matchSelectPattern recognizes
	/// the select as a min/max or abs idiom that is legal or custom on the
	/// legalized type (or, for vectors that will be scalarized, on the scalar
	/// type), the matching node is emitted instead and the condition operand is
	/// dropped.
	void SelectionDAGBuilder::visitSelect(const User &I) {
	  SmallVector<EVT, 4> ValueVTs;
	  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
	                  ValueVTs);
	  unsigned NumValues = ValueVTs.size();
	  if (NumValues == 0) return;

	  SmallVector<SDValue, 4> Values(NumValues);
	  SDValue Cond = getValue(I.getOperand(0));
	  SDValue LHSVal = getValue(I.getOperand(1));
	  SDValue RHSVal = getValue(I.getOperand(2));
	  // Leading operands shared by every emitted node: the condition for a plain
	  // select, nothing once the select has been turned into min/max/abs.
	  auto BaseOps = {Cond};
	  ISD::NodeType OpCode = Cond.getValueType().isVector() ?
	    ISD::VSELECT : ISD::SELECT;

	  bool IsUnaryAbs = false;

	  // Min/max matching is only viable if all output VTs are the same.
	  if (is_splat(ValueVTs)) {
	    EVT VT = ValueVTs[0];
	    LLVMContext &Ctx = *DAG.getContext();
	    auto &TLI = DAG.getTargetLoweringInfo();

	    // We care about the legality of the operation after it has been type
	    // legalized.
	    while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
	           VT != TLI.getTypeToTransformTo(Ctx, VT))
	      VT = TLI.getTypeToTransformTo(Ctx, VT);

	    // If the vselect is legal, assume we want to leave this as a vector setcc +
	    // vselect. Otherwise, if this is going to be scalarized, we want to see if
	    // min/max is legal on the scalar type.
	    bool UseScalarMinMax = VT.isVector() &&
	      !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);

	    // Filled in by matchSelectPattern, which takes both by reference.
	    Value *LHS = nullptr, *RHS = nullptr;
	    auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
	    ISD::NodeType Opc = ISD::DELETED_NODE;
	    switch (SPR.Flavor) {
	    case SPF_UMAX:    Opc = ISD::UMAX; break;
	    case SPF_UMIN:    Opc = ISD::UMIN; break;
	    case SPF_SMAX:    Opc = ISD::SMAX; break;
	    case SPF_SMIN:    Opc = ISD::SMIN; break;
	    case SPF_FMINNUM:
	      switch (SPR.NaNBehavior) {
	      case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
	      case SPNB_RETURNS_NAN:   Opc = ISD::FMINIMUM; break;
	      case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
	      case SPNB_RETURNS_ANY: {
	        // Either flavor is acceptable; prefer whichever the target supports.
	        if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
	          Opc = ISD::FMINNUM;
	        else if (TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT))
	          Opc = ISD::FMINIMUM;
	        else if (UseScalarMinMax)
	          Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
	            ISD::FMINNUM : ISD::FMINIMUM;
	        break;
	      }
	      }
	      break;
	    case SPF_FMAXNUM:
	      switch (SPR.NaNBehavior) {
	      case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
	      case SPNB_RETURNS_NAN:   Opc = ISD::FMAXIMUM; break;
	      case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
	      case SPNB_RETURNS_ANY:
	        // Either flavor is acceptable; prefer whichever the target supports.
	        if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
	          Opc = ISD::FMAXNUM;
	        else if (TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT))
	          Opc = ISD::FMAXIMUM;
	        else if (UseScalarMinMax)
	          Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
	            ISD::FMAXNUM : ISD::FMAXIMUM;
	        break;
	      }
	      break;
	    case SPF_ABS:
	      IsUnaryAbs = true;
	      Opc = ISD::ABS;
	      break;
	    case SPF_NABS:
	      // TODO: we need to produce sub(0, abs(X)).
	    default: break;
	    }

	    if (!IsUnaryAbs && Opc != ISD::DELETED_NODE &&
	        (TLI.isOperationLegalOrCustom(Opc, VT) ||
	         (UseScalarMinMax &&
	          TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
	        // If the underlying comparison instruction is used by any other
	        // instruction, the consumed instructions won't be destroyed, so it is
	        // not profitable to convert to a min/max.
	        hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
	      OpCode = Opc;
	      LHSVal = getValue(LHS);
	      RHSVal = getValue(RHS);
	      BaseOps = {};
	    }

	    if (IsUnaryAbs) {
	      OpCode = Opc;
	      LHSVal = getValue(LHS);
	      BaseOps = {};
	    }
	  }

	  if (IsUnaryAbs) {
	    // ABS takes a single operand per result value.
	    for (unsigned i = 0; i != NumValues; ++i) {
	      Values[i] =
	          DAG.getNode(OpCode, getCurSDLoc(),
	                      LHSVal.getNode()->getValueType(LHSVal.getResNo() + i),
	                      SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
	    }
	  } else {
	    for (unsigned i = 0; i != NumValues; ++i) {
	      SmallVector<SDValue, 3> Ops(BaseOps.begin(), BaseOps.end());
	      Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
	      Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
	      Values[i] = DAG.getNode(
	          OpCode, getCurSDLoc(),
	          LHSVal.getNode()->getValueType(LHSVal.getResNo() + i), Ops);
	    }
	  }

	  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	                           DAG.getVTList(ValueVTs), Values));
	}

	void SelectionDAGBuilder::visitTrunc(const User &I) {
	// TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitZExt(const User &I) {
	// ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
	// ZExt also can't be a cast to bool for same reason. So, nothing much to do
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitSExt(const User &I) {
	// SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
	// SExt also can't be a cast to bool for same reason. So, nothing much to do
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPTrunc(const User &I) {
	// FPTrunc is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	SDLoc dl = getCurSDLoc();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
	DAG.getTargetConstant(
	0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
	}

	void SelectionDAGBuilder::visitFPExt(const User &I) {
	// FPExt is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPToUI(const User &I) {
	// FPToUI is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPToSI(const User &I) {
	// FPToSI is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitUIToFP(const User &I) {
	// UIToFP is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitSIToFP(const User &I) {
	// SIToFP is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitPtrToInt(const User &I) {
	// What to do depends on the size of the integer and the size of the pointer.
	// We can either truncate, zero extend, or no-op, accordingly.
	SDValue N = getValue(I.getOperand(0));
	auto &TLI = DAG.getTargetLoweringInfo();
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	EVT PtrMemVT =
	TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());
	N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
	N = DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT);
	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitIntToPtr(const User &I) {
	// What to do depends on the size of the integer and the size of the pointer.
	// We can either truncate, zero extend, or no-op, accordingly.
	SDValue N = getValue(I.getOperand(0));
	auto &TLI = DAG.getTargetLoweringInfo();
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	EVT PtrMemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
	N = DAG.getZExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
	N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), DestVT);
	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitBitCast(const User &I) {
	  // Lower a bitcast. Source and destination have the same size, so the
	  // result is either a real BITCAST node or a no-op.
	  SDValue N = getValue(I.getOperand(0));
	  SDLoc dl = getCurSDLoc();
	  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	                                                        I.getType());

	  if (DestVT != N.getValueType())
	    setValue(&I, DAG.getNode(ISD::BITCAST, dl,
	                             DestVT, N)); // convert types.
	  // Check if the original LLVM IR Operand was a ConstantInt, because getValue()
	  // might fold any kind of constant expression to an integer constant and that
	  // is not what we are looking for. Only recognize a bitcast of a genuine
	  // constant integer as an opaque constant.
	  else if (ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
	    setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
	                                 /*isOpaque*/true));
	  else
	    setValue(&I, N); // noop cast.
	}

	void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Value *SV = I.getOperand(0);
	SDValue N = getValue(SV);
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());

	unsigned SrcAS = SV->getType()->getPointerAddressSpace();
	unsigned DestAS = I.getType()->getPointerAddressSpace();

	if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
	N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);

	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitInsertElement(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue InVec = getValue(I.getOperand(0));
	SDValue InVal = getValue(I.getOperand(1));
	SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
	TLI.getVectorIdxTy(DAG.getDataLayout()));
	setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	InVec, InVal, InIdx));
	}

	void SelectionDAGBuilder::visitExtractElement(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue InVec = getValue(I.getOperand(0));
	SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
	TLI.getVectorIdxTy(DAG.getDataLayout()));
	setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	InVec, InIdx));
	}

	void SelectionDAGBuilder::visitShuffleVector(const User &I) {
	SDValue Src1 = getValue(I.getOperand(0));
	SDValue Src2 = getValue(I.getOperand(1));
	SDLoc DL = getCurSDLoc();

	SmallVector<int, 8> Mask;
	ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
	unsigned MaskNumElts = Mask.size();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	EVT SrcVT = Src1.getValueType();
	unsigned SrcNumElts = SrcVT.getVectorNumElements();

	if (SrcNumElts == MaskNumElts) {
	setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask));
	return;
	}

	// Normalize the shuffle vector since mask and vector length don't match.
	if (SrcNumElts < MaskNumElts) {
	// Mask is longer than the source vectors. We can use concatenate vector to
	// make the mask and vectors lengths match.

	if (MaskNumElts % SrcNumElts == 0) {
	// Mask length is a multiple of the source vector length.
	// Check if the shuffle is some kind of concatenation of the input
	// vectors.
	unsigned NumConcat = MaskNumElts / SrcNumElts;
	bool IsConcat = true;
	SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
	for (unsigned i = 0; i != MaskNumElts; ++i) {
	int Idx = Mask[i];
	if (Idx < 0)
	continue;
	// Ensure the indices in each SrcVT sized piece are sequential and that
	// the same source is used for the whole piece.
	if ((Idx % SrcNumElts != (i % SrcNumElts)) \|\|
	(ConcatSrcs[i / SrcNumElts] >= 0 &&
	ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
	IsConcat = false;
	break;
	}
	// Remember which source this index came from.
	ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
	}

	// The shuffle is concatenating multiple vectors together. Just emit
	// a CONCAT_VECTORS operation.
	if (IsConcat) {
	SmallVector<SDValue, 8> ConcatOps;
	for (auto Src : ConcatSrcs) {
	if (Src < 0)
	ConcatOps.push_back(DAG.getUNDEF(SrcVT));
	else if (Src == 0)
	ConcatOps.push_back(Src1);
	else
	ConcatOps.push_back(Src2);
	}
	setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps));
	return;
	}
	}

	unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
	unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
	EVT PaddedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	PaddedMaskNumElts);

	// Pad both vectors with undefs to make them the same length as the mask.
	SDValue UndefVal = DAG.getUNDEF(SrcVT);

	SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
	SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
	MOps1[0] = Src1;
	MOps2[0] = Src2;

	Src1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps1);
	Src2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps2);

	// Readjust mask for new input vector length.
	SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
	for (unsigned i = 0; i != MaskNumElts; ++i) {
	int Idx = Mask[i];
	if (Idx >= (int)SrcNumElts)
	Idx -= SrcNumElts - PaddedMaskNumElts;
	MappedOps[i] = Idx;
	}

	SDValue Result = DAG.getVectorShuffle(PaddedVT, DL, Src1, Src2, MappedOps);

	// If the concatenated vector was padded, extract a subvector with the
	// correct number of elements.
	if (MaskNumElts != PaddedMaskNumElts)
	Result = DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));

	setValue(&I, Result);
	return;
	}

	if (SrcNumElts > MaskNumElts) {
	// Analyze the access pattern of the vector to see if we can extract
	// two subvectors and do the shuffle.
	int StartIdx[2] = { -1, -1 }; // StartIdx to extract from
	bool CanExtract = true;
	for (int Idx : Mask) {
	unsigned Input = 0;
	if (Idx < 0)
	continue;

	if (Idx >= (int)SrcNumElts) {
	Input = 1;
	Idx -= SrcNumElts;
	}

	// If all the indices come from the same MaskNumElts sized portion of
	// the sources we can use extract. Also make sure the extract wouldn't
	// extract past the end of the source.
	int NewStartIdx = alignDown(Idx, MaskNumElts);
	if (NewStartIdx + MaskNumElts > SrcNumElts \|\|
	(StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
	CanExtract = false;
	// Make sure we always update StartIdx as we use it to track if all
	// elements are undef.
	StartIdx[Input] = NewStartIdx;
	}

	if (StartIdx[0] < 0 && StartIdx[1] < 0) {
	setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
	return;
	}
	if (CanExtract) {
	// Extract appropriate subvector and generate a vector shuffle
	for (unsigned Input = 0; Input < 2; ++Input) {
	SDValue &Src = Input == 0 ? Src1 : Src2;
	if (StartIdx[Input] < 0)
	Src = DAG.getUNDEF(VT);
	else {
	Src = DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
	DAG.getConstant(StartIdx[Input], DL,
	TLI.getVectorIdxTy(DAG.getDataLayout())));
	}
	}

	// Calculate new mask.
	SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
	for (int &Idx : MappedOps) {
	if (Idx >= (int)SrcNumElts)
	Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
	else if (Idx >= 0)
	Idx -= StartIdx[0];
	}

	setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
	return;
	}
	}

	// We can't use either concat vectors or extract subvectors so fall back to
	// replacing the shuffle with extract and build vector.
	// to insert and build vector.
	EVT EltVT = VT.getVectorElementType();
	EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
	SmallVector<SDValue,8> Ops;
	for (int Idx : Mask) {
	SDValue Res;

	if (Idx < 0) {
	Res = DAG.getUNDEF(EltVT);
	} else {
	SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
	if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;

	Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	EltVT, Src, DAG.getConstant(Idx, DL, IdxVT));
	}

	Ops.push_back(Res);
	}

	setValue(&I, DAG.getBuildVector(VT, DL, Ops));
	}

	void SelectionDAGBuilder::visitInsertValue(const User &I) {
	ArrayRef<unsigned> Indices;
	if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I))
	Indices = IV->getIndices();
	else
	Indices = cast<ConstantExpr>(&I)->getIndices();

	const Value *Op0 = I.getOperand(0);
	const Value *Op1 = I.getOperand(1);
	Type *AggTy = I.getType();
	Type *ValTy = Op1->getType();
	bool IntoUndef = isa<UndefValue>(Op0);
	bool FromUndef = isa<UndefValue>(Op1);

	unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SmallVector<EVT, 4> AggValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
	SmallVector<EVT, 4> ValValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);

	unsigned NumAggValues = AggValueVTs.size();
	unsigned NumValValues = ValValueVTs.size();
	SmallVector<SDValue, 4> Values(NumAggValues);

	// Ignore an insertvalue that produces an empty object
	if (!NumAggValues) {
	setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
	return;
	}

	SDValue Agg = getValue(Op0);
	unsigned i = 0;
	// Copy the beginning value(s) from the original aggregate.
	for (; i != LinearIndex; ++i)
	Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);
	// Copy values from the inserted value(s).
	if (NumValValues) {
	SDValue Val = getValue(Op1);
	for (; i != LinearIndex + NumValValues; ++i)
	Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
	}
	// Copy remaining value(s) from the original aggregate.
	for (; i != NumAggValues; ++i)
	Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(AggValueVTs), Values));
	}

	void SelectionDAGBuilder::visitExtractValue(const User &I) {
	ArrayRef<unsigned> Indices;
	if (const ExtractValueInst *EV = dyn_cast<ExtractValueInst>(&I))
	Indices = EV->getIndices();
	else
	Indices = cast<ConstantExpr>(&I)->getIndices();

	const Value *Op0 = I.getOperand(0);
	Type *AggTy = Op0->getType();
	Type *ValTy = I.getType();
	bool OutOfUndef = isa<UndefValue>(Op0);

	unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SmallVector<EVT, 4> ValValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);

	unsigned NumValValues = ValValueVTs.size();

	// Ignore a extractvalue that produces an empty object
	if (!NumValValues) {
	setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
	return;
	}

	SmallVector<SDValue, 4> Values(NumValValues);

	SDValue Agg = getValue(Op0);
	// Copy out the selected value(s).
	for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
	Values[i - LinearIndex] =
	OutOfUndef ?
	DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(ValValueVTs), Values));
	}

	// Lower a getelementptr. Starting from the value of the pointer operand, walk
	// the GEP indices and fold each one into the running address N with ISD::ADD:
	//  - struct index: add the constant byte offset of the selected field;
	//  - constant (or constant-splat) index: add index * element size, computed
	//    at compile time;
	//  - variable index: sign-extend/truncate it to N's type, scale it by the
	//    element size (SHL for powers of two, MUL otherwise) and add it.
	// For vector GEPs, scalar operands are splatted to the result vector width.
	void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
	Value *Op0 = I.getOperand(0);
	// Note that the pointer operand may be a vector of pointers. Take the scalar
	// element which holds a pointer.
	unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
	SDValue N = getValue(Op0);
	SDLoc dl = getCurSDLoc();
	auto &TLI = DAG.getTargetLoweringInfo();
	MVT PtrTy = TLI.getPointerTy(DAG.getDataLayout(), AS);
	MVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout(), AS);

	// Normalize Vector GEP - all scalar operands should be converted to the
	// splat vector. VectorWidth is 0 for a scalar GEP.
	unsigned VectorWidth = I.getType()->isVectorTy() ?
	cast<VectorType>(I.getType())->getVectorNumElements() : 0;

	if (VectorWidth && !N.getValueType().isVector()) {
	LLVMContext &Context = *DAG.getContext();
	EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
	N = DAG.getSplatBuildVector(VT, dl, N);
	}

	for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
	GTI != E; ++GTI) {
	const Value *Idx = GTI.getOperand();
	if (StructType *StTy = GTI.getStructTypeOrNull()) {
	// Struct index: the operand is a constant field number. Field 0 adds
	// nothing, so no node is emitted for it.
	unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
	if (Field) {
	// N = N + Offset
	uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);

	// In an inbounds GEP with an offset that is nonnegative even when
	// interpreted as signed, assume there is no unsigned overflow.
	SDNodeFlags Flags;
	if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
	Flags.setNoUnsignedWrap(true);

	N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
	DAG.getConstant(Offset, dl, N.getValueType()), Flags);
	}
	} else {
	// Non-struct index: the offset is Idx * alloc size of the indexed type,
	// with arithmetic done at the index width of address space AS.
	unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS);
	MVT IdxTy = MVT::getIntegerVT(IdxSize);
	APInt ElementSize(IdxSize, DL->getTypeAllocSize(GTI.getIndexedType()));

	// If this is a scalar constant or a splat vector of constants,
	// handle it quickly.
	const auto *CI = dyn_cast<ConstantInt>(Idx);
	if (!CI && isa<ConstantDataVector>(Idx) &&
	cast<ConstantDataVector>(Idx)->getSplatValue())
	CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());

	if (CI) {
	// A zero index contributes no offset.
	if (CI->isZero())
	continue;
	APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize);
	LLVMContext &Context = *DAG.getContext();
	SDValue OffsVal = VectorWidth ?
	DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) :
	DAG.getConstant(Offs, dl, IdxTy);

	// In an inbounds GEP with an offset that is nonnegative even when
	// interpreted as signed, assume there is no unsigned overflow.
	SDNodeFlags Flags;
	if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
	Flags.setNoUnsignedWrap(true);

	OffsVal = DAG.getSExtOrTrunc(OffsVal, dl, N.getValueType());

	N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags);
	continue;
	}

	// N = N + Idx * ElementSize;
	SDValue IdxN = getValue(Idx);

	// NOTE(review): the local `Context` references above are scoped to their
	// own blocks, so `*Context` here names something declared outside this
	// function (presumably a class member) - confirm.
	if (!IdxN.getValueType().isVector() && VectorWidth) {
	EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth);
	IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
	}

	// If the index is smaller or larger than intptr_t, truncate or extend
	// it.
	IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());

	// If this is a multiply by a power of two, turn it into a shl
	// immediately. This is a very common case. An element size of 1 needs
	// no scaling at all.
	if (ElementSize != 1) {
	if (ElementSize.isPowerOf2()) {
	unsigned Amt = ElementSize.logBase2();
	IdxN = DAG.getNode(ISD::SHL, dl,
	N.getValueType(), IdxN,
	DAG.getConstant(Amt, dl, IdxN.getValueType()));
	} else {
	SDValue Scale = DAG.getConstant(ElementSize.getZExtValue(), dl,
	IdxN.getValueType());
	IdxN = DAG.getNode(ISD::MUL, dl,
	N.getValueType(), IdxN, Scale);
	}
	}

	N = DAG.getNode(ISD::ADD, dl,
	N.getValueType(), N, IdxN);
	}
	}

	// When the in-memory pointer type differs from the pointer value type and
	// the GEP is not inbounds, extend the result in-register from PtrMemTy.
	if (PtrMemTy != PtrTy && !cast<GEPOperator>(I).isInBounds())
	N = DAG.getPtrExtendInReg(N, dl, PtrMemTy);

	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
	// If this is a fixed sized alloca in the entry block of the function,
	// allocate it statically on the stack.
	if (FuncInfo.StaticAllocaMap.count(&I))
	return; // getValue will auto-populate this.

	SDLoc dl = getCurSDLoc();
	Type *Ty = I.getAllocatedType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	auto &DL = DAG.getDataLayout();
	uint64_t TySize = DL.getTypeAllocSize(Ty);
	unsigned Align =
	std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());

	SDValue AllocSize = getValue(I.getArraySize());

	EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout(), DL.getAllocaAddrSpace());
	if (AllocSize.getValueType() != IntPtr)
	AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);

	AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr,
	AllocSize,
	DAG.getConstant(TySize, dl, IntPtr));

	// Handle alignment. If the requested alignment is less than or equal to
	// the stack alignment, ignore it. If the size is greater than or equal to
	// the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
	unsigned StackAlign =
	DAG.getSubtarget().getFrameLowering()->getStackAlignment();
	if (Align <= StackAlign)
	Align = 0;

	// Round the size of the allocation up to the stack alignment size
	// by add SA-1 to the size. This doesn't overflow because we're computing
	// an address inside an alloca.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);
	AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize,
	DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags);

	// Mask out the low bits for alignment purposes.
	AllocSize =
	DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize,
	DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr));

	SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)};
	SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
	SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
	setValue(&I, DSA);
	DAG.setRoot(DSA.getValue(1));

	assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects());
	}

	void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
	if (I.isAtomic())
	return visitAtomicLoad(I);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Value *SV = I.getOperand(0);
	if (TLI.supportSwiftError()) {
	// Swifterror values can come from either a function parameter with
	// swifterror attribute or an alloca with swifterror attribute.
	if (const Argument *Arg = dyn_cast<Argument>(SV)) {
	if (Arg->hasSwiftErrorAttr())
	return visitLoadFromSwiftError(I);
	}

	if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
	if (Alloca->isSwiftError())
	return visitLoadFromSwiftError(I);
	}
	}

	SDValue Ptr = getValue(SV);

	Type *Ty = I.getType();

	bool isVolatile = I.isVolatile();
	bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
	bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr;
	bool isDereferenceable =
	isDereferenceablePointer(SV, I.getType(), DAG.getDataLayout());
	unsigned Alignment = I.getAlignment();

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	SmallVector<EVT, 4> ValueVTs, MemVTs;
	SmallVector<uint64_t, 4> Offsets;
	ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0)
	return;

	SDValue Root;
	bool ConstantMemory = false;
	if (isVolatile \|\| NumValues > MaxParallelChains)
	// Serialize volatile loads with other side effects.
	Root = getRoot();
	else if (AA &&
	AA->pointsToConstantMemory(MemoryLocation(
	SV,
	LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
	AAInfo))) {
	// Do not serialize (non-volatile) loads of constant memory with anything.
	Root = DAG.getEntryNode();
	ConstantMemory = true;
	} else {
	// Do not serialize non-volatile loads against each other.
	Root = DAG.getRoot();
	}

	SDLoc dl = getCurSDLoc();

	if (isVolatile)
	Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);

	// An aggregate load cannot wrap around the address space, so offsets to its
	// parts don't wrap either.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);

	SmallVector<SDValue, 4> Values(NumValues);
	SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
	EVT PtrVT = Ptr.getValueType();
	unsigned ChainI = 0;
	for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
	// Serializing loads here may result in excessive register pressure, and
	// TokenFactor places arbitrary choke points on the scheduler. SD scheduling
	// could recover a bit by hoisting nodes upward in the chain by recognizing
	// they are side-effect free or do not alias. The optimizer should really
	// avoid this case by converting large object/array copies to llvm.memcpy
	// (MaxParallelChains should always remain as failsafe).
	if (ChainI == MaxParallelChains) {
	assert(PendingLoads.empty() && "PendingLoads must be serialized first");
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	Root = Chain;
	ChainI = 0;
	}
	SDValue A = DAG.getNode(ISD::ADD, dl,
	PtrVT, Ptr,
	DAG.getConstant(Offsets[i], dl, PtrVT),
	Flags);
	auto MMOFlags = MachineMemOperand::MONone;
	if (isVolatile)
	MMOFlags \|= MachineMemOperand::MOVolatile;
	if (isNonTemporal)
	MMOFlags \|= MachineMemOperand::MONonTemporal;
	if (isInvariant)
	MMOFlags \|= MachineMemOperand::MOInvariant;
	if (isDereferenceable)
	MMOFlags \|= MachineMemOperand::MODereferenceable;
	MMOFlags \|= TLI.getMMOFlags(I);

	SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A,
	MachinePointerInfo(SV, Offsets[i]), Alignment,
	MMOFlags, AAInfo, Ranges);
	Chains[ChainI] = L.getValue(1);

	if (MemVTs[i] != ValueVTs[i])
	L = DAG.getZExtOrTrunc(L, dl, ValueVTs[i]);

	Values[i] = L;
	}

	if (!ConstantMemory) {
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	if (isVolatile)
	DAG.setRoot(Chain);
	else
	PendingLoads.push_back(Chain);
	}

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl,
	DAG.getVTList(ValueVTs), Values));
	}

	void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
	assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
	"call visitStoreToSwiftError when backend supports swifterror");

	SmallVector<EVT, 4> ValueVTs;
	SmallVector<uint64_t, 4> Offsets;
	const Value *SrcV = I.getOperand(0);
	ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
	SrcV->getType(), ValueVTs, &Offsets);
	assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
	"expect a single EVT for swifterror");

	SDValue Src = getValue(SrcV);
	// Create a virtual register, then update the virtual register.
	unsigned VReg =
	SwiftError.getOrCreateVRegDefAt(&I, FuncInfo.MBB, I.getPointerOperand());
	// Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
	// Chain can be getRoot or getControlRoot.
	SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
	SDValue(Src.getNode(), Src.getResNo()));
	DAG.setRoot(CopyNode);
	}

	// Lower a load from a swifterror value: instead of a memory load, copy the
	// value out of the virtual register SwiftError assigns for this use.
	void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
	  assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
	         "call visitLoadFromSwiftError when backend supports swifterror");

	  assert(!I.isVolatile() &&
	         I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
	         I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
	         "Support volatile, non temporal, invariant for load_from_swift_error");

	  const Value *SV = I.getOperand(0);
	  Type *Ty = I.getType();
	  AAMDNodes AAInfo;
	  I.getAAMetadata(AAInfo);
	  // With alias analysis available, the loaded location must not be known
	  // constant memory.
	  assert(
	      (!AA ||
	       !AA->pointsToConstantMemory(MemoryLocation(
	           SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
	           AAInfo))) &&
	      "load_from_swift_error should not be constant memory");

	  SmallVector<EVT, 4> ValueVTs;
	  SmallVector<uint64_t, 4> Offsets;
	  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
	                  ValueVTs, &Offsets);
	  assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
	         "expect a single EVT for swifterror");

	  // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
	  SDValue L = DAG.getCopyFromReg(
	      getRoot(), getCurSDLoc(),
	      SwiftError.getOrCreateVRegUseAt(&I, FuncInfo.MBB, SV), ValueVTs[0]);

	  setValue(&I, L);
	}

	void SelectionDAGBuilder::visitStore(const StoreInst &I) {
	if (I.isAtomic())
	return visitAtomicStore(I);

	const Value *SrcV = I.getOperand(0);
	const Value *PtrV = I.getOperand(1);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.supportSwiftError()) {
	// Swifterror values can come from either a function parameter with
	// swifterror attribute or an alloca with swifterror attribute.
	if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
	if (Arg->hasSwiftErrorAttr())
	return visitStoreToSwiftError(I);
	}

	if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
	if (Alloca->isSwiftError())
	return visitStoreToSwiftError(I);
	}
	}

	SmallVector<EVT, 4> ValueVTs, MemVTs;
	SmallVector<uint64_t, 4> Offsets;
	ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
	SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0)
	return;

	// Get the lowered operands. Note that we do this after
	// checking if NumResults is zero, because with zero results
	// the operands won't have values in the map.
	SDValue Src = getValue(SrcV);
	SDValue Ptr = getValue(PtrV);

	SDValue Root = getRoot();
	SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
	SDLoc dl = getCurSDLoc();
	EVT PtrVT = Ptr.getValueType();
	unsigned Alignment = I.getAlignment();
	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);

	auto MMOFlags = MachineMemOperand::MONone;
	if (I.isVolatile())
	MMOFlags \|= MachineMemOperand::MOVolatile;
	if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
	MMOFlags \|= MachineMemOperand::MONonTemporal;
	MMOFlags \|= TLI.getMMOFlags(I);

	// An aggregate load cannot wrap around the address space, so offsets to its
	// parts don't wrap either.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);

	unsigned ChainI = 0;
	for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
	// See visitLoad comments.
	if (ChainI == MaxParallelChains) {
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	Root = Chain;
	ChainI = 0;
	}
	SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
	DAG.getConstant(Offsets[i], dl, PtrVT), Flags);
	SDValue Val = SDValue(Src.getNode(), Src.getResNo() + i);
	if (MemVTs[i] != ValueVTs[i])
	Val = DAG.getPtrExtOrTrunc(Val, dl, MemVTs[i]);
	SDValue St =
	DAG.getStore(Root, dl, Val, Add, MachinePointerInfo(PtrV, Offsets[i]),
	Alignment, MMOFlags, AAInfo);
	Chains[ChainI] = St;
	}

	SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	DAG.setRoot(StoreNode);
	}

	// Lower llvm.masked.store (IsCompressing == false) or
	// llvm.masked.compressstore (IsCompressing == true) to a masked store node,
	// which becomes the new DAG root.
	void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
	                                           bool IsCompressing) {
	  SDLoc sdl = getCurSDLoc();

	  auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	                               unsigned& Alignment) {
	    // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
	    Src0 = I.getArgOperand(0);
	    Ptr = I.getArgOperand(1);
	    Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
	    Mask = I.getArgOperand(3);
	  };
	  auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	                                    unsigned& Alignment) {
	    // llvm.masked.compressstore.*(Src0, Ptr, Mask)
	    Src0 = I.getArgOperand(0);
	    Ptr = I.getArgOperand(1);
	    Mask = I.getArgOperand(2);
	    Alignment = 0;
	  };

	  // All three are pointers: the helpers above bind them as `Value* &`.
	  Value *PtrOperand, *MaskOperand, *Src0Operand;
	  unsigned Alignment;
	  if (IsCompressing)
	    getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
	  else
	    getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);

	  SDValue Ptr = getValue(PtrOperand);
	  SDValue Src0 = getValue(Src0Operand);
	  SDValue Mask = getValue(MaskOperand);

	  // An alignment of 0 (always the case for compressstore) falls back to the
	  // alignment DAG.getEVTAlignment reports for the stored value type.
	  EVT VT = Src0.getValueType();
	  if (!Alignment)
	    Alignment = DAG.getEVTAlignment(VT);

	  AAMDNodes AAInfo;
	  I.getAAMetadata(AAInfo);

	  MachineMemOperand *MMO =
	      DAG.getMachineFunction().
	      getMachineMemOperand(MachinePointerInfo(PtrOperand),
	                           MachineMemOperand::MOStore, VT.getStoreSize(),
	                           Alignment, AAInfo);
	  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
	                                         MMO, false /* Truncating */,
	                                         IsCompressing);
	  DAG.setRoot(StoreNode);
	  setValue(&I, StoreNode);
	}

	// Get a uniform base for the Gather/Scatter intrinsic.
	// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
	// We try to represent it as a base pointer + vector of indices.
	// Usually, the vector of pointers comes from a 'getelementptr' instruction.
	// The first operand of the GEP may be a single pointer or a vector of pointers
	// Example:
	//   %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind
	//  or
	//   %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind
	//   %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, ..
	//
	// When the first GEP operand is a single pointer - it is the uniform base we
	// are looking for. If first operand of the GEP is a splat vector - we
	// extract the splat value and use it as a uniform base.
	// In all other cases the function returns 'false'.
	//
	// On success Ptr is replaced by the uniform base pointer, and Base, Index
	// and Scale receive the DAG values for the base, the (vector) index and the
	// alloc size of the GEP result element type. Note that Ptr may already have
	// been overwritten when 'false' is returned after the base was identified.
	static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
	                           SDValue &Scale, SelectionDAGBuilder* SDB) {
	  SelectionDAG& DAG = SDB->DAG;
	  LLVMContext &Context = *DAG.getContext();

	  assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
	  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
	  if (!GEP)
	    return false;

	  const Value *GEPPtr = GEP->getPointerOperand();
	  if (!GEPPtr->getType()->isVectorTy())
	    Ptr = GEPPtr;
	  else if (!(Ptr = getSplatValue(GEPPtr)))
	    return false;

	  unsigned FinalIndex = GEP->getNumOperands() - 1;
	  Value *IndexVal = GEP->getOperand(FinalIndex);

	  // Ensure all the other indices are 0.
	  for (unsigned i = 1; i < FinalIndex; ++i) {
	    auto *C = dyn_cast<ConstantInt>(GEP->getOperand(i));
	    if (!C || !C->isZero())
	      return false;
	  }

	  // The operands of the GEP may be defined in another basic block.
	  // In this case we'll not find nodes for the operands.
	  if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal))
	    return false;

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  Scale = DAG.getTargetConstant(DL.getTypeAllocSize(GEP->getResultElementType()),
	                                SDB->getCurSDLoc(), TLI.getPointerTy(DL));
	  Base = SDB->getValue(Ptr);
	  Index = SDB->getValue(IndexVal);

	  // A scalar index is splatted to the width of the GEP's vector result.
	  if (!Index.getValueType().isVector()) {
	    unsigned GEPWidth = GEP->getType()->getVectorNumElements();
	    EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
	    Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
	  }
	  return true;
	}

	void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
	SDLoc sdl = getCurSDLoc();

	// llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask)
	const Value *Ptr = I.getArgOperand(1);
	SDValue Src0 = getValue(I.getArgOperand(0));
	SDValue Mask = getValue(I.getArgOperand(3));
	EVT VT = Src0.getValueType();
	unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);

	SDValue Base;
	SDValue Index;
	SDValue Scale;
	const Value *BasePtr = Ptr;
	bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);

	const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
	MachineMemOperand *MMO = DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
	MachineMemOperand::MOStore, VT.getStoreSize(),
	Alignment, AAInfo);
	if (!UniformBase) {
	Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	Index = getValue(Ptr);
	Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	}
	SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale };
	SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
	Ops, MMO);
	DAG.setRoot(Scatter);
	setValue(&I, Scatter);
	}

	void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
	SDLoc sdl = getCurSDLoc();

	auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	unsigned& Alignment) {
	// @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
	Ptr = I.getArgOperand(0);
	Alignment = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
	Mask = I.getArgOperand(2);
	Src0 = I.getArgOperand(3);
	};
	auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	unsigned& Alignment) {
	// @llvm.masked.expandload.*(Ptr, Mask, Src0)
	Ptr = I.getArgOperand(0);
	Alignment = 0;
	Mask = I.getArgOperand(1);
	Src0 = I.getArgOperand(2);
	};

	Value PtrOperand, MaskOperand, *Src0Operand;
	unsigned Alignment;
	if (IsExpanding)
	getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
	else
	getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);

	SDValue Ptr = getValue(PtrOperand);
	SDValue Src0 = getValue(Src0Operand);
	SDValue Mask = getValue(MaskOperand);

	EVT VT = Src0.getValueType();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	// Do not serialize masked loads of constant memory with anything.
	bool AddToChain =
	!AA \|\| !AA->pointsToConstantMemory(MemoryLocation(
	PtrOperand,
	LocationSize::precise(
	DAG.getDataLayout().getTypeStoreSize(I.getType())),
	AAInfo));
	SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();

	MachineMemOperand *MMO =
	DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(PtrOperand),
	MachineMemOperand::MOLoad, VT.getStoreSize(),
	Alignment, AAInfo, Ranges);

	SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
	ISD::NON_EXTLOAD, IsExpanding);
	if (AddToChain)
	PendingLoads.push_back(Load.getValue(1));
	setValue(&I, Load);
	}

	void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
	SDLoc sdl = getCurSDLoc();

	// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
	const Value *Ptr = I.getArgOperand(0);
	SDValue Src0 = getValue(I.getArgOperand(3));
	SDValue Mask = getValue(I.getArgOperand(2));

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	SDValue Root = DAG.getRoot();
	SDValue Base;
	SDValue Index;
	SDValue Scale;
	const Value *BasePtr = Ptr;
	bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
	bool ConstantMemory = false;
	if (UniformBase && AA &&
	AA->pointsToConstantMemory(
	MemoryLocation(BasePtr,
	LocationSize::precise(
	DAG.getDataLayout().getTypeStoreSize(I.getType())),
	AAInfo))) {
	// Do not serialize (non-volatile) loads of constant memory with anything.
	Root = DAG.getEntryNode();
	ConstantMemory = true;
	}

	MachineMemOperand *MMO =
	DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
	MachineMemOperand::MOLoad, VT.getStoreSize(),
	Alignment, AAInfo, Ranges);

	if (!UniformBase) {
	Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	Index = getValue(Ptr);
	Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	}
	SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
	SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
	Ops, MMO);

	SDValue OutChain = Gather.getValue(1);
	if (!ConstantMemory)
	PendingLoads.push_back(OutChain);
	setValue(&I, Gather);
	}

	void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
	SDLoc dl = getCurSDLoc();
	AtomicOrdering SuccessOrdering = I.getSuccessOrdering();
	AtomicOrdering FailureOrdering = I.getFailureOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
	SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);

	auto Alignment = DAG.getEVTAlignment(MemVT);

	auto Flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
	if (I.isVolatile())
	Flags \|= MachineMemOperand::MOVolatile;
	Flags \|= DAG.getTargetLoweringInfo().getMMOFlags(I);

	MachineFunction &MF = DAG.getMachineFunction();
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
	Flags, MemVT.getStoreSize(), Alignment,
	AAMDNodes(), nullptr, SSID, SuccessOrdering,
	FailureOrdering);

	SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
	dl, MemVT, VTs, InChain,
	getValue(I.getPointerOperand()),
	getValue(I.getCompareOperand()),
	getValue(I.getNewValOperand()), MMO);

	SDValue OutChain = L.getValue(2);

	setValue(&I, L);
	DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
	SDLoc dl = getCurSDLoc();
	ISD::NodeType NT;
	switch (I.getOperation()) {
	default: llvm_unreachable("Unknown atomicrmw operation");
	case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
	case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
	case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
	case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
	case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
	case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
	case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
	case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
	case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
	case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
	case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
	case AtomicRMWInst::FAdd: NT = ISD::ATOMIC_LOAD_FADD; break;
	case AtomicRMWInst::FSub: NT = ISD::ATOMIC_LOAD_FSUB; break;
	}
	AtomicOrdering Ordering = I.getOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	auto MemVT = getValue(I.getValOperand()).getSimpleValueType();
	auto Alignment = DAG.getEVTAlignment(MemVT);

	auto Flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
	if (I.isVolatile())
	Flags \|= MachineMemOperand::MOVolatile;
	Flags \|= DAG.getTargetLoweringInfo().getMMOFlags(I);

	MachineFunction &MF = DAG.getMachineFunction();
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
	MemVT.getStoreSize(), Alignment, AAMDNodes(),
	nullptr, SSID, Ordering);

	SDValue L =
	DAG.getAtomic(NT, dl, MemVT, InChain,
	getValue(I.getPointerOperand()), getValue(I.getValOperand()),
	MMO);

	SDValue OutChain = L.getValue(1);

	setValue(&I, L);
	DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitFence(const FenceInst &I) {
	SDLoc dl = getCurSDLoc();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Ops[3];
	Ops[0] = getRoot();
	Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
	TLI.getFenceOperandTy(DAG.getDataLayout()));
	Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl,
	TLI.getFenceOperandTy(DAG.getDataLayout()));
	DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
	}

	void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
	SDLoc dl = getCurSDLoc();
	AtomicOrdering Order = I.getOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());

	if (!TLI.supportsUnalignedAtomics() &&
	I.getAlignment() < MemVT.getSizeInBits() / 8)
	report_fatal_error("Cannot generate unaligned atomic load");

	auto Flags = MachineMemOperand::MOLoad;
	if (I.isVolatile())
	Flags \|= MachineMemOperand::MOVolatile;
	if (I.getMetadata(LLVMContext::MD_invariant_load) != nullptr)
	Flags \|= MachineMemOperand::MOInvariant;
	if (isDereferenceablePointer(I.getPointerOperand(), I.getType(),
	DAG.getDataLayout()))
	Flags \|= MachineMemOperand::MODereferenceable;

	Flags \|= TLI.getMMOFlags(I);

	MachineMemOperand *MMO =
	DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
	Flags, MemVT.getStoreSize(),
	I.getAlignment() ? I.getAlignment() :
	DAG.getEVTAlignment(MemVT),
	AAMDNodes(), nullptr, SSID, Order);

	InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
	SDValue L =
	DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
	getValue(I.getPointerOperand()), MMO);

	SDValue OutChain = L.getValue(1);
	if (MemVT != VT)
	L = DAG.getPtrExtOrTrunc(L, dl, VT);

	setValue(&I, L);
	DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
	SDLoc dl = getCurSDLoc();

	AtomicOrdering Ordering = I.getOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT MemVT =
	TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType());

	if (I.getAlignment() < MemVT.getSizeInBits() / 8)
	report_fatal_error("Cannot generate unaligned atomic store");

	auto Flags = MachineMemOperand::MOStore;
	if (I.isVolatile())
	Flags \|= MachineMemOperand::MOVolatile;
	Flags \|= TLI.getMMOFlags(I);

	MachineFunction &MF = DAG.getMachineFunction();
	MachineMemOperand *MMO =
	MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
	MemVT.getStoreSize(), I.getAlignment(), AAMDNodes(),
	nullptr, SSID, Ordering);

	SDValue Val = getValue(I.getValueOperand());
	if (Val.getValueType() != MemVT)
	Val = DAG.getPtrExtOrTrunc(Val, dl, MemVT);

	SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, MemVT, InChain,
	getValue(I.getPointerOperand()), Val, MMO);


	DAG.setRoot(OutChain);
	}

	/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
	/// node.
	///
	/// The operand list is built as: [chain], [intrinsic ID], call arguments...
	/// The chain operand is present unless the called function is marked as not
	/// accessing memory. Depending on what getTgtMemIntrinsic reports and on
	/// whether there is a chain / a result, the node is a memory intrinsic node,
	/// INTRINSIC_WO_CHAIN, INTRINSIC_W_CHAIN or INTRINSIC_VOID.
	void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
	unsigned Intrinsic) {
	// Ignore the callsite's attributes. A specific call site may be marked with
	// readnone, but the lowering code will expect the chain based on the
	// definition.
	const Function *F = I.getCalledFunction();
	bool HasChain = !F->doesNotAccessMemory();
	bool OnlyLoad = HasChain && F->onlyReadsMemory();

	// Build the operand list.
	SmallVector<SDValue, 8> Ops;
	if (HasChain) { // If this intrinsic has side-effects, chainify it.
	if (OnlyLoad) {
	// We don't need to serialize loads against other loads.
	Ops.push_back(DAG.getRoot());
	} else {
	Ops.push_back(getRoot());
	}
	}

	// Info is set by getTgtMemIntrinsic
	TargetLowering::IntrinsicInfo Info;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
	DAG.getMachineFunction(),
	Intrinsic);

	// Add the intrinsic ID as an integer operand if it's not a target intrinsic.
	// The ID is also added when the reported opcode is one of the generic
	// INTRINSIC_VOID / INTRINSIC_W_CHAIN opcodes.
	if (!IsTgtIntrinsic \|\| Info.opc == ISD::INTRINSIC_VOID \|\|
	Info.opc == ISD::INTRINSIC_W_CHAIN)
	Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
	TLI.getPointerTy(DAG.getDataLayout())));

	// Add all operands of the call to the operand list.
	for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
	SDValue Op = getValue(I.getArgOperand(i));
	Ops.push_back(Op);
	}

	// Result types: the value types of the call's result, followed by a chain
	// (MVT::Other) when the node is chained.
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);

	if (HasChain)
	ValueVTs.push_back(MVT::Other);

	SDVTList VTs = DAG.getVTList(ValueVTs);

	// Create the node.
	SDValue Result;
	if (IsTgtIntrinsic) {
	// This is target intrinsic that touches memory
	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	Result =
	DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
	MachinePointerInfo(Info.ptrVal, Info.offset),
	Info.align, Info.flags, Info.size, AAInfo);
	} else if (!HasChain) {
	Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
	} else if (!I.getType()->isVoidTy()) {
	Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
	} else {
	Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
	}

	if (HasChain) {
	// The chain is the last value produced by the node. Read-only intrinsics
	// are queued as pending loads; everything else becomes the new root.
	SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
	if (OnlyLoad)
	PendingLoads.push_back(Chain);
	else
	DAG.setRoot(Chain);
	}

	if (!I.getType()->isVoidTy()) {
	// Vector results are bitcast to the value type the target reports for the
	// IR vector type; other results go through lowerRangeToAssertZExt.
	if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
	EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
	Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
	} else
	Result = lowerRangeToAssertZExt(DAG, I, Result);

	setValue(&I, Result);
	}
	}

	/// GetSignificand - Keep the 23 significand bits of Op and combine them with
	/// the fixed bit pattern 0x3f800000 (the encoding of 1.0f):
	///
	///   Op = (Op & 0x007fffff) | 0x3f800000;
	///
	/// Op is the i32 bit representation of a floating point value; the result
	/// is that bit pattern reinterpreted as f32.
	static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
	  SDValue SignificandMask = DAG.getConstant(0x007fffff, dl, MVT::i32);
	  SDValue SignificandBits =
	      DAG.getNode(ISD::AND, dl, MVT::i32, Op, SignificandMask);

	  SDValue OneBits = DAG.getConstant(0x3f800000, dl, MVT::i32);
	  SDValue Combined =
	      DAG.getNode(ISD::OR, dl, MVT::i32, SignificandBits, OneBits);

	  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Combined);
	}

	/// GetExponent - Extract the unbiased exponent of Op as an f32 value:
	///
	///   (float)(int)(((Op & 0x7f800000) >> 23) - 127);
	///
	/// Op is the i32 bit representation of a floating point value.
	static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
	                           const TargetLowering &TLI, const SDLoc &dl) {
	  // Isolate the exponent field.
	  SDValue ExponentMask = DAG.getConstant(0x7f800000, dl, MVT::i32);
	  SDValue ExponentField = DAG.getNode(ISD::AND, dl, MVT::i32, Op, ExponentMask);

	  // Shift it down to bit 0; the shift amount uses the target pointer type.
	  SDValue ShiftAmt =
	      DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue BiasedExp =
	      DAG.getNode(ISD::SRL, dl, MVT::i32, ExponentField, ShiftAmt);

	  // Remove the bias of 127 and convert the signed integer to f32.
	  SDValue Bias = DAG.getConstant(127, dl, MVT::i32);
	  SDValue UnbiasedExp = DAG.getNode(ISD::SUB, dl, MVT::i32, BiasedExp, Bias);
	  return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, UnbiasedExp);
	}

	/// getF32Constant - Build an f32 constant node from the 32-bit pattern Flt,
	/// interpreted as an IEEE single-precision value.
	static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
	                              const SDLoc &dl) {
	  APInt RawBits(32, Flt);
	  APFloat Value(APFloat::IEEEsingle(), RawBits);
	  return DAG.getConstantFP(Value, dl, MVT::f32);
	}

	/// getLimitedPrecisionExp2 - Build a limited-precision f32 approximation of
	/// 2^t0 out of DAG nodes.
	///
	/// t0 is converted to an integer (FP_TO_SINT); the difference between t0 and
	/// that integer becomes X. A polynomial in X, whose degree is selected by
	/// LimitFloatPrecision (<= 6, <= 12, otherwise), approximates 2^X and is
	/// evaluated in nested (Horner) form. The integer part, shifted left by 23,
	/// is finally added to the bit pattern of the polynomial result with an
	/// integer ADD and the sum is bitcast back to f32.
	///
	/// The hexadecimal literals passed to getF32Constant are raw 32-bit patterns
	/// of the f32 coefficients; they presumably encode the decimal coefficients
	/// listed in the comment of each branch.
	static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
	SelectionDAG &DAG) {
	// TODO: What fast-math-flags should be set on the floating-point nodes?

	// IntegerPartOfX = (int32_t)t0;
	SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);

	// FractionalPartOfX = t0 - (float)IntegerPartOfX;
	SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
	SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);

	// IntegerPartOfX <<= 23;
	IntegerPartOfX = DAG.getNode(
	ISD::SHL, dl, MVT::i32, IntegerPartOfX,
	DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy(
	DAG.getDataLayout())));

	SDValue TwoToFractionalPartOfX;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// TwoToFractionalPartOfX =
	// 0.997535578f +
	// (0.735607626f + 0.252464424f * x) * x;
	//
	// error 0.0144103317, which is 6 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3e814304, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f3c50c8, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f7f5e7e, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// TwoToFractionalPartOfX =
	// 0.999892986f +
	// (0.696457318f +
	// (0.224338339f + 0.792043434e-1f * x) * x) * x;
	//
	// error 0.000107046256, which is 13 to 14 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3da235e3, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3e65b8f3, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f324b07, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3f7ff8fd, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// TwoToFractionalPartOfX =
	// 0.999999982f +
	// (0.693148872f +
	// (0.240227044f +
	// (0.554906021e-1f +
	// (0.961591928e-2f +
	// (0.136028312e-2f + 0.157059148e-3f * x) * x) * x) * x) * x) * x;
	// error 2.47208000*10^(-7), which is better than 18 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3924b03e, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3ab24b87, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3c1d8c17, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3d634a1d, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x3e75fe14, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x3f317234, dl));
	SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
	getF32Constant(DAG, 0x3f800000, dl));
	}

	// Add the exponent into the result in integer domain.
	// IntegerPartOfX was shifted left by 23 above, so the ADD lands on the bits
	// above the low 23 bits of the f32 pattern.
	SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX);
	return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
	DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX));
	}

	/// expandExp - Lower an exp intrinsic. In limited-precision mode (f32
	/// operand and 0 < LimitFloatPrecision <= 18) the operand is scaled by
	/// log2(e) and handed to getLimitedPrecisionExp2; otherwise a plain
	/// ISD::FEXP node is produced.
	static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	                         const TargetLowering &TLI) {
	  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
	                                   LimitFloatPrecision > 0 &&
	                                   LimitFloatPrecision <= 18;

	  // No special expansion.
	  if (!UseLimitedPrecision)
	    return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);

	  // Put the exponent in the right bit position for later addition to the
	  // final result:
	  //
	  //   #define LOG2OFe 1.4426950f
	  //   t0 = Op * LOG2OFe

	  // TODO: What fast-math-flags should be set here?
	  SDValue Log2OfE = getF32Constant(DAG, 0x3fb8aa3b, dl);
	  SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, Log2OfE);
	  return getLimitedPrecisionExp2(Scaled, dl, DAG);
	}

	/// expandLog - Lower a log intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// For an f32 operand with 0 < LimitFloatPrecision <= 18 the result is built
	/// as GetExponent(Op) * log(2) + P(GetSignificand(Op)), where P is a
	/// polynomial whose degree is selected by LimitFloatPrecision and which is
	/// evaluated in nested (Horner) form. In every other case a plain ISD::FLOG
	/// node is emitted.
	///
	/// The hexadecimal literals passed to getF32Constant are raw 32-bit patterns
	/// of the f32 coefficients; they presumably encode the magnitudes of the
	/// decimal coefficients in each branch's comment, with FSUB used where the
	/// coefficient is negative.
	static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {
	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Work on the i32 bit pattern of the operand.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Scale the exponent by log(2) [0.69314718f].
	SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
	SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
	getF32Constant(DAG, 0x3f317218, dl));

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	SDValue LogOfMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// LogofMantissa =
	// -1.1609546f +
	// (1.4034025f - 0.23903021f * x) * x;
	//
	// error 0.0034276066, which is better than 8 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbe74c456, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3fb3a2b1, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f949a29, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// LogOfMantissa =
	// -1.7417939f +
	// (2.8212026f +
	// (-1.4699568f +
	// (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
	//
	// error 0.000061011436, which is 14 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbd67b6d6, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3ee4f4b8, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fbc278b, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40348e95, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3fdef31a, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// LogOfMantissa =
	// -2.1072184f +
	// (4.2372794f +
	// (-3.7029485f +
	// (2.2781945f +
	// (-0.87823314f +
	// (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
	//
	// error 0.0000023660568, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbc91e5ac, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e4350aa, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f60d3e3, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x4011cdf0, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x406cfd1c, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x408797cb, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x4006dcab, dl));
	}

	// log(Op) = exponent * log(2) + log(significand).
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);
	}

	/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// For an f32 operand with 0 < LimitFloatPrecision <= 18 the result is built
	/// as GetExponent(Op) + P(GetSignificand(Op)), where P is a polynomial whose
	/// degree is selected by LimitFloatPrecision and which is evaluated in
	/// nested (Horner) form. In every other case a plain ISD::FLOG2 node is
	/// emitted.
	///
	/// The hexadecimal literals passed to getF32Constant are raw 32-bit patterns
	/// of the f32 coefficients; they presumably encode the magnitudes of the
	/// decimal coefficients in each branch's comment, with FSUB used where the
	/// coefficient is negative.
	static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {
	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Work on the i32 bit pattern of the operand.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Get the exponent.
	// Unlike expandLog/expandLog10, the exponent is used without scaling.
	SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	// Different possible minimax approximations of significand in
	// floating-point for various degrees of accuracy over [1,2].
	SDValue Log2ofMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
	//
	// error 0.0049451742, which is more than 7 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbeb08fe0, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x40019463, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fd6633d, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// Log2ofMantissa =
	// -2.51285454f +
	// (4.07009056f +
	// (-2.12067489f +
	// (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
	//
	// error 0.0000876136000, which is better than 13 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbda7262e, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3f25280b, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x4007b923, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40823e2f, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x4020d29c, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// Log2ofMantissa =
	// -3.0400495f +
	// (6.1129976f +
	// (-5.3420409f +
	// (3.2865683f +
	// (-1.2669343f +
	// (0.27515199f -
	// 0.25691327e-1f * x) * x) * x) * x) * x) * x;
	//
	// error 0.0000018516, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbcd2769e, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e8ce0b9, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fa22ae7, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40525723, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x40aaf200, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x40c39dad, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x4042902c, dl));
	}

	// log2(Op) = exponent + log2(significand).
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);
	}

	/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// For an f32 operand with 0 < LimitFloatPrecision <= 18 the result is built
	/// as GetExponent(Op) * log10(2) + P(GetSignificand(Op)), where P is a
	/// polynomial whose degree is selected by LimitFloatPrecision and which is
	/// evaluated in nested (Horner) form. In every other case a plain
	/// ISD::FLOG10 node is emitted.
	///
	/// The hexadecimal literals passed to getF32Constant are raw 32-bit patterns
	/// of the f32 coefficients; they presumably encode the magnitudes of the
	/// decimal coefficients in each branch's comment, with FSUB used where the
	/// coefficient is negative.
	static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {
	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Work on the i32 bit pattern of the operand.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Scale the exponent by log10(2) [0.30102999f].
	SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
	SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
	getF32Constant(DAG, 0x3e9a209a, dl));

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	SDValue Log10ofMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// Log10ofMantissa =
	// -0.50419619f +
	// (0.60948995f - 0.10380950f * x) * x;
	//
	// error 0.0014886165, which is 6 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbdd49a13, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3f1c0789, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f011300, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// Log10ofMantissa =
	// -0.64831180f +
	// (0.91751397f +
	// (-0.31664806f + 0.47637168e-1f * x) * x) * x;
	//
	// error 0.00019228036, which is better than 12 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3d431f31, dl));
	SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3ea21fb2, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f6ae232, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f25f7c3, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// Log10ofMantissa =
	// -0.84299375f +
	// (1.5327582f +
	// (-1.0688956f +
	// (0.49102474f +
	// (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
	//
	// error 0.0000037995730, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3c5d51ce, dl));
	SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e00685a, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3efb6798, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f88d192, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3fc4316c, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x3f57ce70, dl));
	}

	// log10(Op) = exponent * log10(2) + log10(significand).
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
	}

	/// expandExp2 - Lower an exp2 intrinsic. In limited-precision mode (f32
	/// operand and 0 < LimitFloatPrecision <= 18) the operand goes straight to
	/// getLimitedPrecisionExp2; otherwise a plain ISD::FEXP2 node is produced.
	static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	                          const TargetLowering &TLI) {
	  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
	                                   LimitFloatPrecision > 0 &&
	                                   LimitFloatPrecision <= 18;

	  // No special expansion.
	  if (!UseLimitedPrecision)
	    return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);

	  return getLimitedPrecisionExp2(Op, dl, DAG);
	}

	/// expandPow - Lower a pow intrinsic. Handles the special sequence for
	/// limited-precision mode when the base (LHS) is the constant 10.0f; every
	/// other case becomes a plain ISD::FPOW node.
	static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
	                         SelectionDAG &DAG, const TargetLowering &TLI) {
	  const bool LimitedPrecisionF32 =
	      LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
	      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18;

	  // The special sequence only applies when the base is exactly 10.0f.
	  bool BaseIsTen = false;
	  if (LimitedPrecisionF32)
	    if (ConstantFPSDNode *BaseC = dyn_cast<ConstantFPSDNode>(LHS))
	      BaseIsTen = BaseC->isExactlyValue(APFloat(10.0f));

	  // No special expansion.
	  if (!BaseIsTen)
	    return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);

	  // TODO: What fast-math-flags should be set on the FMUL node?
	  // Put the exponent in the right bit position for later addition to the
	  // final result:
	  //
	  //   #define LOG2OF10 3.3219281f
	  //   t0 = Op * LOG2OF10;
	  SDValue Log2Of10 = getF32Constant(DAG, 0x40549a78, dl);
	  SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS, Log2Of10);
	  return getLimitedPrecisionExp2(Scaled, dl, DAG);
	}

	/// ExpandPowI - Expand a llvm.powi intrinsic.
	///
	/// \param DL  Source location attached to every node created here.
	/// \param LHS The floating-point base.
	/// \param RHS The integer exponent.
	/// \param DAG The DAG that receives the new nodes.
	/// \returns For a constant exponent, either the constant 1.0 (exponent 0) or
	///          a tree of FMUL nodes, wrapped in 1.0 / x when the exponent is
	///          negative. Otherwise an ISD::FPOWI node.
	static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
	                          SelectionDAG &DAG) {
	  // If RHS is a constant, we can expand this out to a multiplication tree,
	  // otherwise we end up lowering to a call to __powidf2 (for example). When
	  // optimizing for size, we only want to do this if the expansion would
	  // produce a small number of multiplies, otherwise we do the full expansion.
	  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	    // Get the exponent as a positive value. The negation is performed on an
	    // unsigned value, so it is well defined for every input.
	    unsigned Val = RHSC->getSExtValue();
	    if ((int)Val < 0)
	      Val = -Val;

	    // powi(x, 0) -> 1.0
	    if (Val == 0)
	      return DAG.getConstantFP(1.0, DL, LHS.getValueType());

	    const Function &F = DAG.getMachineFunction().getFunction();
	    // If optimizing for size, don't insert too many multiplies.
	    // This inserts up to 5 multiplies.
	    if (!F.hasOptSize() || countPopulation(Val) + Log2_32(Val) < 7) {
	      // We use the simple binary decomposition method to generate the
	      // multiply sequence. There are more optimal ways to do this (for
	      // example, powi(x,15) generates one more multiply than it should), but
	      // this has the benefit of being both really simple and much better
	      // than a libcall.
	      SDValue Res; // Logically starts equal to 1.0
	      SDValue CurSquare = LHS;
	      // TODO: Intrinsics should have fast-math-flags that propagate to these
	      // nodes.
	      while (Val) {
	        if (Val & 1) {
	          if (Res.getNode())
	            Res = DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res,
	                              CurSquare);
	          else
	            Res = CurSquare; // 1.0*CurSquare.
	        }

	        CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
	                                CurSquare, CurSquare);
	        Val >>= 1;
	      }

	      // If the original was negative, invert the result, producing 1/(xxx).
	      if (RHSC->getSExtValue() < 0)
	        Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
	                          DAG.getConstantFP(1.0, DL, LHS.getValueType()), Res);
	      return Res;
	    }
	  }

	  // Otherwise, expand to a libcall.
	  return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
	}

	// getUnderlyingArgReg - Find underlying register used for a truncated or
	// bitcasted argument.
	static unsigned getUnderlyingArgReg(const SDValue &N) {
	switch (N.getOpcode()) {
	case ISD::CopyFromReg:
	return cast<RegisterSDNode>(N.getOperand(1))->getReg();
	case ISD::BITCAST:
	case ISD::AssertZext:
	case ISD::AssertSext:
	case ISD::TRUNCATE:
	return getUnderlyingArgReg(N.getOperand(0));
	default:
	return 0;
	}
	}

	/// If the DbgValueInst is a dbg_value of a function argument, create the
	/// corresponding DBG_VALUE machine instruction for it now. At the end of
	/// instruction selection, they will be inserted to the entry BB.
	bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
	const Value V, DILocalVariable Variable, DIExpression *Expr,
	DILocation *DL, bool IsDbgDeclare, const SDValue &N) {
	const Argument *Arg = dyn_cast<Argument>(V);
	if (!Arg)
	return false;

	if (!IsDbgDeclare) {
	// ArgDbgValues are hoisted to the beginning of the entry block. So we
	// should only emit as ArgDbgValue if the dbg.value intrinsic is found in
	// the entry block.
	bool IsInEntryBlock = FuncInfo.MBB == &FuncInfo.MF->front();
	if (!IsInEntryBlock)
	return false;

	// ArgDbgValues are hoisted to the beginning of the entry block. So we
	// should only emit as ArgDbgValue if the dbg.value intrinsic describes a
	// variable that also is a param.
	//
	// Although, if we are at the top of the entry block already, we can still
	// emit using ArgDbgValue. This might catch some situations when the
	// dbg.value refers to an argument that isn't used in the entry block, so
	// any CopyToReg node would be optimized out and the only way to express
	// this DBG_VALUE is by using the physical reg (or FI) as done in this
	// method. ArgDbgValues are hoisted to the beginning of the entry block. So
	// we should only emit as ArgDbgValue if the Variable is an argument to the
	// current function, and the dbg.value intrinsic is found in the entry
	// block.
	bool VariableIsFunctionInputArg = Variable->isParameter() &&
	!DL->getInlinedAt();
	bool IsInPrologue = SDNodeOrder == LowestSDNodeOrder;
	if (!IsInPrologue && !VariableIsFunctionInputArg)
	return false;

	// Here we assume that a function argument on IR level only can be used to
	// describe one input parameter on source level. If we for example have
	// source code like this
	//
	// struct A { long x, y; };
	// void foo(struct A a, long b) {
	// ...
	// b = a.x;
	// ...
	// }
	//
	// and IR like this
	//
	// define void @foo(i32 %a1, i32 %a2, i32 %b) {
	// entry:
	// call void @llvm.dbg.value(metadata i32 %a1, "a", DW_OP_LLVM_fragment
	// call void @llvm.dbg.value(metadata i32 %a2, "a", DW_OP_LLVM_fragment
	// call void @llvm.dbg.value(metadata i32 %b, "b",
	// ...
	// call void @llvm.dbg.value(metadata i32 %a1, "b"
	// ...
	//
	// then the last dbg.value is describing a parameter "b" using a value that
	// is an argument. But since we already has used %a1 to describe a parameter
	// we should not handle that last dbg.value here (that would result in an
	// incorrect hoisting of the DBG_VALUE to the function entry).
	// Notice that we allow one dbg.value per IR level argument, to accomodate
	// for the situation with fragments above.
	if (VariableIsFunctionInputArg) {
	unsigned ArgNo = Arg->getArgNo();
	if (ArgNo >= FuncInfo.DescribedArgs.size())
	FuncInfo.DescribedArgs.resize(ArgNo + 1, false);
	else if (!IsInPrologue && FuncInfo.DescribedArgs.test(ArgNo))
	return false;
	FuncInfo.DescribedArgs.set(ArgNo);
	}
	}

	MachineFunction &MF = DAG.getMachineFunction();
	const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

	bool IsIndirect = false;
	Optional<MachineOperand> Op;
	// Some arguments' frame index is recorded during argument lowering.
	int FI = FuncInfo.getArgumentFrameIndex(Arg);
	if (FI != std::numeric_limits<int>::max())
	Op = MachineOperand::CreateFI(FI);

	if (!Op && N.getNode()) {
	unsigned Reg = getUnderlyingArgReg(N);
	if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
	MachineRegisterInfo &RegInfo = MF.getRegInfo();
	unsigned PR = RegInfo.getLiveInPhysReg(Reg);
	if (PR)
	Reg = PR;
	}
	if (Reg) {
	Op = MachineOperand::CreateReg(Reg, false);
	IsIndirect = IsDbgDeclare;
	}
	}

	if (!Op && N.getNode()) {
	// Check if frame index is available.
	SDValue LCandidate = peekThroughBitcasts(N);
	if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(LCandidate.getNode()))
	if (FrameIndexSDNode *FINode =
	dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
	Op = MachineOperand::CreateFI(FINode->getIndex());
	}

	if (!Op) {
	// Check if ValueMap has reg number.
	DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
	if (VMI != FuncInfo.ValueMap.end()) {
	const auto &TLI = DAG.getTargetLoweringInfo();
	RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
	V->getType(), getABIRegCopyCC(V));
	if (RFV.occupiesMultipleRegs()) {
	unsigned Offset = 0;
	for (auto RegAndSize : RFV.getRegsAndSizes()) {
	Op = MachineOperand::CreateReg(RegAndSize.first, false);
	auto FragmentExpr = DIExpression::createFragmentExpression(
	Expr, Offset, RegAndSize.second);
	if (!FragmentExpr)
	continue;
	FuncInfo.ArgDbgValues.push_back(
	BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
	Op->getReg(), Variable, *FragmentExpr));
	Offset += RegAndSize.second;
	}
	return true;
	}
	Op = MachineOperand::CreateReg(VMI->second, false);
	IsIndirect = IsDbgDeclare;
	}
	}

	if (!Op)
	return false;

	assert(Variable->isValidLocationForIntrinsic(DL) &&
	"Expected inlined-at fields to agree");
	IsIndirect = (Op->isReg()) ? IsIndirect : true;
	FuncInfo.ArgDbgValues.push_back(
	BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
	*Op, Variable, Expr));

	return true;
	}

	/// Return the appropriate SDDbgValue based on N.
	///
	/// \param N              The value to describe.
	/// \param Variable       The source variable being described.
	/// \param Expr           The debug expression to attach.
	/// \param dl             The debug location of the intrinsic.
	/// \param DbgSDNodeOrder Ordering number forwarded to the created
	///                       SDDbgValue.
	/// \returns A frame-index debug value when \p N is a FrameIndexSDNode,
	///          otherwise a debug value referring to N's node and result
	///          number. Both are created as direct (non-indirect) values.
	SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
	                                             DILocalVariable *Variable,
	                                             DIExpression *Expr,
	                                             const DebugLoc &dl,
	                                             unsigned DbgSDNodeOrder) {
	  if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
	    // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
	    // stack slot locations.
	    //
	    // Consider "int x = 0; int *px = &x;". There are two kinds of interesting
	    // debug values here after optimization:
	    //
	    //   dbg.value(i32* %px, !"int *px", !DIExpression()), and
	    //   dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
	    //
	    // Both describe the direct values of their associated variables.
	    return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(),
	                                     /*IsIndirect=*/false, dl, DbgSDNodeOrder);
	  }
	  return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(),
	                         /*IsIndirect=*/false, dl, DbgSDNodeOrder);
	}

	// VisualStudio defines setjmp as _setjmp
	#if defined(_MSC_VER) && defined(setjmp) && \
	!defined(setjmp_undefined_for_msvc)
	# pragma push_macro("setjmp")
	# undef setjmp
	# define setjmp_undefined_for_msvc
	#endif

	/// Map a fixed-point multiply intrinsic ID to its ISD opcode:
	/// smul_fix -> ISD::SMULFIX and umul_fix -> ISD::UMULFIX. Any other
	/// intrinsic ID is a caller error and hits llvm_unreachable.
	static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
	  if (Intrinsic == Intrinsic::smul_fix)
	    return ISD::SMULFIX;
	  if (Intrinsic == Intrinsic::umul_fix)
	    return ISD::UMULFIX;
	  llvm_unreachable("Unhandled fixed point intrinsic");
	}

	void SelectionDAGBuilder::lowerCallToExternalSymbol(const CallInst &I,
	const char *FunctionName) {
	assert(FunctionName && "FunctionName must not be nullptr");
	SDValue Callee = DAG.getExternalSymbol(
	FunctionName,
	DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
	LowerCallTo(&I, Callee, I.isTailCall());
	}

	/// Lower the call to the specified intrinsic function.
	void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
	unsigned Intrinsic) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDLoc sdl = getCurSDLoc();
	DebugLoc dl = getCurDebugLoc();
	SDValue Res;

	switch (Intrinsic) {
	default:
	// By default, turn this into a target intrinsic node.
	visitTargetIntrinsic(I, Intrinsic);
	return;
	case Intrinsic::vastart: visitVAStart(I); return;
	case Intrinsic::vaend: visitVAEnd(I); return;
	case Intrinsic::vacopy: visitVACopy(I); return;
	case Intrinsic::returnaddress:
	setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::addressofreturnaddress:
	setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout())));
	return;
	case Intrinsic::sponentry:
	setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
	TLI.getPointerTy(DAG.getDataLayout())));
	return;
	case Intrinsic::frameaddress:
	setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::read_register: {
	Value *Reg = I.getArgOperand(0);
	SDValue Chain = getRoot();
	SDValue RegName =
	DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	Res = DAG.getNode(ISD::READ_REGISTER, sdl,
	DAG.getVTList(VT, MVT::Other), Chain, RegName);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return;
	}
	case Intrinsic::write_register: {
	Value *Reg = I.getArgOperand(0);
	Value *RegValue = I.getArgOperand(1);
	SDValue Chain = getRoot();
	SDValue RegName =
	DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
	DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
	RegName, getValue(RegValue)));
	return;
	}
	case Intrinsic::setjmp:
	lowerCallToExternalSymbol(I, &"_setjmp"[!TLI.usesUnderscoreSetJmp()]);
	return;
	case Intrinsic::longjmp:
	lowerCallToExternalSymbol(I, &"_longjmp"[!TLI.usesUnderscoreLongJmp()]);
	return;
	case Intrinsic::memcpy: {
	const auto &MCI = cast<MemCpyInst>(I);
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	// @llvm.memcpy defines 0 and 1 to both mean no alignment.
	unsigned DstAlign = std::max<unsigned>(MCI.getDestAlignment(), 1);
	unsigned SrcAlign = std::max<unsigned>(MCI.getSourceAlignment(), 1);
	unsigned Align = MinAlign(DstAlign, SrcAlign);
	bool isVol = MCI.isVolatile();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	// FIXME: Support passing different dest/src alignments to the memcpy DAG
	// node.
	SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	false, isTC,
	MachinePointerInfo(I.getArgOperand(0)),
	MachinePointerInfo(I.getArgOperand(1)));
	updateDAGForMaybeTailCall(MC);
	return;
	}
	case Intrinsic::memset: {
	const auto &MSI = cast<MemSetInst>(I);
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	// @llvm.memset defines 0 and 1 to both mean no alignment.
	unsigned Align = std::max<unsigned>(MSI.getDestAlignment(), 1);
	bool isVol = MSI.isVolatile();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	isTC, MachinePointerInfo(I.getArgOperand(0)));
	updateDAGForMaybeTailCall(MS);
	return;
	}
	case Intrinsic::memmove: {
	const auto &MMI = cast<MemMoveInst>(I);
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	// @llvm.memmove defines 0 and 1 to both mean no alignment.
	unsigned DstAlign = std::max<unsigned>(MMI.getDestAlignment(), 1);
	unsigned SrcAlign = std::max<unsigned>(MMI.getSourceAlignment(), 1);
	unsigned Align = MinAlign(DstAlign, SrcAlign);
	bool isVol = MMI.isVolatile();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	// FIXME: Support passing different dest/src alignments to the memmove DAG
	// node.
	SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	isTC, MachinePointerInfo(I.getArgOperand(0)),
	MachinePointerInfo(I.getArgOperand(1)));
	updateDAGForMaybeTailCall(MM);
	return;
	}
	case Intrinsic::memcpy_element_unordered_atomic: {
	const AtomicMemCpyInst &MI = cast<AtomicMemCpyInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Src = getValue(MI.getRawSource());
	SDValue Length = getValue(MI.getLength());

	unsigned DstAlign = MI.getDestAlignment();
	unsigned SrcAlign = MI.getSourceAlignment();
	Type *LengthTy = MI.getLength()->getType();
	unsigned ElemSz = MI.getElementSizeInBytes();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src,
	SrcAlign, Length, LengthTy, ElemSz, isTC,
	MachinePointerInfo(MI.getRawDest()),
	MachinePointerInfo(MI.getRawSource()));
	updateDAGForMaybeTailCall(MC);
	return;
	}
	case Intrinsic::memmove_element_unordered_atomic: {
	auto &MI = cast<AtomicMemMoveInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Src = getValue(MI.getRawSource());
	SDValue Length = getValue(MI.getLength());

	unsigned DstAlign = MI.getDestAlignment();
	unsigned SrcAlign = MI.getSourceAlignment();
	Type *LengthTy = MI.getLength()->getType();
	unsigned ElemSz = MI.getElementSizeInBytes();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src,
	SrcAlign, Length, LengthTy, ElemSz, isTC,
	MachinePointerInfo(MI.getRawDest()),
	MachinePointerInfo(MI.getRawSource()));
	updateDAGForMaybeTailCall(MC);
	return;
	}
	case Intrinsic::memset_element_unordered_atomic: {
	auto &MI = cast<AtomicMemSetInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Val = getValue(MI.getValue());
	SDValue Length = getValue(MI.getLength());

	unsigned DstAlign = MI.getDestAlignment();
	Type *LengthTy = MI.getLength()->getType();
	unsigned ElemSz = MI.getElementSizeInBytes();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length,
	LengthTy, ElemSz, isTC,
	MachinePointerInfo(MI.getRawDest()));
	updateDAGForMaybeTailCall(MC);
	return;
	}
	case Intrinsic::dbg_addr:
	case Intrinsic::dbg_declare: {
	const auto &DI = cast<DbgVariableIntrinsic>(I);
	DILocalVariable *Variable = DI.getVariable();
	DIExpression *Expression = DI.getExpression();
	dropDanglingDebugInfo(Variable, Expression);
	assert(Variable && "Missing variable");

	// Check if address has undef value.
	const Value *Address = DI.getVariableLocation();
	if (!Address \|\| isa<UndefValue>(Address) \|\|
	(Address->use_empty() && !isa<Argument>(Address))) {
	LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
	return;
	}

	bool isParameter = Variable->isParameter() \|\| isa<Argument>(Address);

	// Check if this variable can be described by a frame index, typically
	// either as a static alloca or a byval parameter.
	int FI = std::numeric_limits<int>::max();
	if (const auto *AI =
	dyn_cast<AllocaInst>(Address->stripInBoundsConstantOffsets())) {
	if (AI->isStaticAlloca()) {
	auto I = FuncInfo.StaticAllocaMap.find(AI);
	if (I != FuncInfo.StaticAllocaMap.end())
	FI = I->second;
	}
	} else if (const auto *Arg = dyn_cast<Argument>(
	Address->stripInBoundsConstantOffsets())) {
	FI = FuncInfo.getArgumentFrameIndex(Arg);
	}

	// llvm.dbg.addr is control dependent and always generates indirect
	// DBG_VALUE instructions. llvm.dbg.declare is handled as a frame index in
	// the MachineFunction variable table.
	if (FI != std::numeric_limits<int>::max()) {
	if (Intrinsic == Intrinsic::dbg_addr) {
	SDDbgValue *SDV = DAG.getFrameIndexDbgValue(
	Variable, Expression, FI, /IsIndirect/ true, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, getRoot().getNode(), isParameter);
	}
	return;
	}

	SDValue &N = NodeMap[Address];
	if (!N.getNode() && isa<Argument>(Address))
	// Check unused arguments map.
	N = UnusedArgNodeMap[Address];
	SDDbgValue *SDV;
	if (N.getNode()) {
	if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
	Address = BCI->getOperand(0);
	// Parameters are handled specially.
	auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
	if (isParameter && FINode) {
	// Byval parameter. We have a frame index at this point.
	SDV =
	DAG.getFrameIndexDbgValue(Variable, Expression, FINode->getIndex(),
	/IsIndirect/ true, dl, SDNodeOrder);
	} else if (isa<Argument>(Address)) {
	// Address is an argument, so try to emit its dbg value using
	// virtual register info from the FuncInfo.ValueMap.
	EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true, N);
	return;
	} else {
	SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
	true, dl, SDNodeOrder);
	}
	DAG.AddDbgValue(SDV, N.getNode(), isParameter);
	} else {
	// If Address is an argument then try to emit its dbg value using
	// virtual register info from the FuncInfo.ValueMap.
	if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true,
	N)) {
	LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
	}
	}
	return;
	}
	case Intrinsic::dbg_label: {
	const DbgLabelInst &DI = cast<DbgLabelInst>(I);
	DILabel *Label = DI.getLabel();
	assert(Label && "Missing label");

	SDDbgLabel *SDV;
	SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder);
	DAG.AddDbgLabel(SDV);
	return;
	}
	case Intrinsic::dbg_value: {
	const DbgValueInst &DI = cast<DbgValueInst>(I);
	assert(DI.getVariable() && "Missing variable");

	DILocalVariable *Variable = DI.getVariable();
	DIExpression *Expression = DI.getExpression();
	dropDanglingDebugInfo(Variable, Expression);
	const Value *V = DI.getValue();
	if (!V)
	return;

	if (handleDebugValue(V, Variable, Expression, dl, DI.getDebugLoc(),
	SDNodeOrder))
	return;

	// TODO: Dangling debug info will eventually either be resolved or produce
	// an Undef DBG_VALUE. However in the resolution case, a gap may appear
	// between the original dbg.value location and its resolved DBG_VALUE, which
	// we should ideally fill with an extra Undef DBG_VALUE.

	DanglingDebugInfoMap[V].emplace_back(&DI, dl, SDNodeOrder);
	return;
	}

	case Intrinsic::eh_typeid_for: {
	// Find the type id for the given typeinfo.
	GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
	unsigned TypeID = DAG.getMachineFunction().getTypeIDFor(GV);
	Res = DAG.getConstant(TypeID, sdl, MVT::i32);
	setValue(&I, Res);
	return;
	}

	case Intrinsic::eh_return_i32:
	case Intrinsic::eh_return_i64:
	DAG.getMachineFunction().setCallsEHReturn(true);
	DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
	MVT::Other,
	getControlRoot(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::eh_unwind_init:
	DAG.getMachineFunction().setCallsUnwindInit(true);
	return;
	case Intrinsic::eh_dwarf_cfa:
	setValue(&I, DAG.getNode(ISD::EH_DWARF_CFA, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::eh_sjlj_callsite: {
	MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
	ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0));
	assert(CI && "Non-constant call site value in eh.sjlj.callsite!");
	assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");

	MMI.setCurrentCallSite(CI->getZExtValue());
	return;
	}
	case Intrinsic::eh_sjlj_functioncontext: {
	// Get and store the index of the function context.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	AllocaInst *FnCtx =
	cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
	int FI = FuncInfo.StaticAllocaMap[FnCtx];
	MFI.setFunctionContextIndex(FI);
	return;
	}
	case Intrinsic::eh_sjlj_setjmp: {
	SDValue Ops[2];
	Ops[0] = getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
	DAG.getVTList(MVT::i32, MVT::Other), Ops);
	setValue(&I, Op.getValue(0));
	DAG.setRoot(Op.getValue(1));
	return;
	}
	case Intrinsic::eh_sjlj_longjmp:
	DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
	getRoot(), getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::eh_sjlj_setup_dispatch:
	DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other,
	getRoot()));
	return;
	case Intrinsic::masked_gather:
	visitMaskedGather(I);
	return;
	case Intrinsic::masked_load:
	visitMaskedLoad(I);
	return;
	case Intrinsic::masked_scatter:
	visitMaskedScatter(I);
	return;
	case Intrinsic::masked_store:
	visitMaskedStore(I);
	return;
	case Intrinsic::masked_expandload:
	visitMaskedLoad(I, true /* IsExpanding */);
	return;
	case Intrinsic::masked_compressstore:
	visitMaskedStore(I, true /* IsCompressing */);
	return;
	case Intrinsic::x86_mmx_pslli_w:
	case Intrinsic::x86_mmx_pslli_d:
	case Intrinsic::x86_mmx_pslli_q:
	case Intrinsic::x86_mmx_psrli_w:
	case Intrinsic::x86_mmx_psrli_d:
	case Intrinsic::x86_mmx_psrli_q:
	case Intrinsic::x86_mmx_psrai_w:
	case Intrinsic::x86_mmx_psrai_d: {
	SDValue ShAmt = getValue(I.getArgOperand(1));
	if (isa<ConstantSDNode>(ShAmt)) {
	visitTargetIntrinsic(I, Intrinsic);
	return;
	}
	unsigned NewIntrinsic = 0;
	EVT ShAmtVT = MVT::v2i32;
	switch (Intrinsic) {
	case Intrinsic::x86_mmx_pslli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psll_w;
	break;
	case Intrinsic::x86_mmx_pslli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psll_d;
	break;
	case Intrinsic::x86_mmx_pslli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psll_q;
	break;
	case Intrinsic::x86_mmx_psrli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
	break;
	case Intrinsic::x86_mmx_psrli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
	break;
	case Intrinsic::x86_mmx_psrli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
	break;
	case Intrinsic::x86_mmx_psrai_w:
	NewIntrinsic = Intrinsic::x86_mmx_psra_w;
	break;
	case Intrinsic::x86_mmx_psrai_d:
	NewIntrinsic = Intrinsic::x86_mmx_psra_d;
	break;
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	}

	// The vector shift intrinsics with scalars uses 32b shift amounts but
	// the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
	// to be zero.
	// We must do this early because v2i32 is not a legal type.
	SDValue ShOps[2];
	ShOps[0] = ShAmt;
	ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
	ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
	Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
	DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
	getValue(I.getArgOperand(0)), ShAmt);
	setValue(&I, Res);
	return;
	}
	case Intrinsic::powi:
	setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)), DAG));
	return;
	case Intrinsic::log:
	setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return;
	case Intrinsic::log2:
	setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return;
	case Intrinsic::log10:
	setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return;
	case Intrinsic::exp:
	setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return;
	case Intrinsic::exp2:
	setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return;
	case Intrinsic::pow:
	setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)), DAG, TLI));
	return;
	case Intrinsic::sqrt:
	case Intrinsic::fabs:
	case Intrinsic::sin:
	case Intrinsic::cos:
	case Intrinsic::floor:
	case Intrinsic::ceil:
	case Intrinsic::trunc:
	case Intrinsic::rint:
	case Intrinsic::nearbyint:
	case Intrinsic::round:
	case Intrinsic::canonicalize: {
	unsigned Opcode;
	switch (Intrinsic) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
	case Intrinsic::fabs: Opcode = ISD::FABS; break;
	case Intrinsic::sin: Opcode = ISD::FSIN; break;
	case Intrinsic::cos: Opcode = ISD::FCOS; break;
	case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
	case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
	case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
	case Intrinsic::rint: Opcode = ISD::FRINT; break;
	case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
	case Intrinsic::round: Opcode = ISD::FROUND; break;
	case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
	}

	setValue(&I, DAG.getNode(Opcode, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return;
	}
	case Intrinsic::lround:
	case Intrinsic::llround:
	case Intrinsic::lrint:
	case Intrinsic::llrint: {
	unsigned Opcode;
	switch (Intrinsic) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::lround: Opcode = ISD::LROUND; break;
	case Intrinsic::llround: Opcode = ISD::LLROUND; break;
	case Intrinsic::lrint: Opcode = ISD::LRINT; break;
	case Intrinsic::llrint: Opcode = ISD::LLRINT; break;
	}

	EVT RetVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	setValue(&I, DAG.getNode(Opcode, sdl, RetVT,
	getValue(I.getArgOperand(0))));
	return;
	}
	case Intrinsic::minnum:
	setValue(&I, DAG.getNode(ISD::FMINNUM, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::maxnum:
	setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::minimum:
	setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::maximum:
	setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::copysign:
	setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return;
	case Intrinsic::fma:
	setValue(&I, DAG.getNode(ISD::FMA, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	getValue(I.getArgOperand(2))));
	return;
	case Intrinsic::experimental_constrained_fadd:
	case Intrinsic::experimental_constrained_fsub:
	case Intrinsic::experimental_constrained_fmul:
	case Intrinsic::experimental_constrained_fdiv:
	case Intrinsic::experimental_constrained_frem:
	case Intrinsic::experimental_constrained_fma:
	case Intrinsic::experimental_constrained_fptrunc:
	case Intrinsic::experimental_constrained_fpext:
	case Intrinsic::experimental_constrained_sqrt:
	case Intrinsic::experimental_constrained_pow:
	case Intrinsic::experimental_constrained_powi:
	case Intrinsic::experimental_constrained_sin:
	case Intrinsic::experimental_constrained_cos:
	case Intrinsic::experimental_constrained_exp:
	case Intrinsic::experimental_constrained_exp2:
	case Intrinsic::experimental_constrained_log:
	case Intrinsic::experimental_constrained_log10:
	case Intrinsic::experimental_constrained_log2:
	case Intrinsic::experimental_constrained_rint:
	case Intrinsic::experimental_constrained_nearbyint:
	case Intrinsic::experimental_constrained_maxnum:
	case Intrinsic::experimental_constrained_minnum:
	case Intrinsic::experimental_constrained_ceil:
	case Intrinsic::experimental_constrained_floor:
	case Intrinsic::experimental_constrained_round:
	case Intrinsic::experimental_constrained_trunc:
	visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
	return;
	case Intrinsic::fmuladd: {
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
	TLI.isFMAFasterThanFMulAndFAdd(VT)) {
	setValue(&I, DAG.getNode(ISD::FMA, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	getValue(I.getArgOperand(2))));
	} else {
	// TODO: Intrinsic calls should have fast-math-flags.
	SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)));
	SDValue Add = DAG.getNode(ISD::FADD, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	Mul,
	getValue(I.getArgOperand(2)));
	setValue(&I, Add);
	}
	return;
	}
	case Intrinsic::convert_to_fp16:
	setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
	DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
	getValue(I.getArgOperand(0)),
	DAG.getTargetConstant(0, sdl,
	MVT::i32))));
	return;
	case Intrinsic::convert_from_fp16:
	setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
	getValue(I.getArgOperand(0)))));
	return;
	case Intrinsic::pcmarker: {
	SDValue Tmp = getValue(I.getArgOperand(0));
	DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
	return;
	}
	case Intrinsic::readcyclecounter: {
	SDValue Op = getRoot();
	Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
	DAG.getVTList(MVT::i64, MVT::Other), Op);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return;
	}
	case Intrinsic::bitreverse:
	setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::bswap:
	setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::cttz: {
	SDValue Arg = getValue(I.getArgOperand(0));
	ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
	sdl, Ty, Arg));
	return;
	}
	case Intrinsic::ctlz: {
	SDValue Arg = getValue(I.getArgOperand(0));
	ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
	sdl, Ty, Arg));
	return;
	}
	case Intrinsic::ctpop: {
	SDValue Arg = getValue(I.getArgOperand(0));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
	return;
	}
	case Intrinsic::fshl:
	case Intrinsic::fshr: {
	bool IsFSHL = Intrinsic == Intrinsic::fshl;
	SDValue X = getValue(I.getArgOperand(0));
	SDValue Y = getValue(I.getArgOperand(1));
	SDValue Z = getValue(I.getArgOperand(2));
	EVT VT = X.getValueType();
	SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT);
	SDValue Zero = DAG.getConstant(0, sdl, VT);
	SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);

	auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
	if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
	setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
	return;
	}

	// When X == Y, this is rotate. If the data type has a power-of-2 size, we
	// avoid the select that is necessary in the general case to filter out
	// the 0-shift possibility that leads to UB.
	if (X == Y && isPowerOf2_32(VT.getScalarSizeInBits())) {
	auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
	if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
	setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
	return;
	}

	// Some targets only rotate one way. Try the opposite direction.
	RotateOpcode = IsFSHL ? ISD::ROTR : ISD::ROTL;
	if (TLI.isOperationLegalOrCustom(RotateOpcode, VT)) {
	// Negate the shift amount because it is safe to ignore the high bits.
	SDValue NegShAmt = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
	setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, NegShAmt));
	return;
	}

	// fshl (rotl): (X << (Z % BW)) | (X >> ((0 - Z) % BW))
	// fshr (rotr): (X << ((0 - Z) % BW)) | (X >> (Z % BW))
	SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, Zero, Z);
	SDValue NShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC);
	SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : NShAmt);
	SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, X, IsFSHL ? NShAmt : ShAmt);
	setValue(&I, DAG.getNode(ISD::OR, sdl, VT, ShX, ShY));
	return;
	}

	// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
	// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
	SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt);
	SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt);
	SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt);
	SDValue Or = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY);

	// If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
	// and that is undefined. We must compare and select to avoid UB.
	EVT CCVT = MVT::i1;
	if (VT.isVector())
	CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements());

	// For fshl, 0-shift returns the 1st arg (X).
	// For fshr, 0-shift returns the 2nd arg (Y).
	SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ);
	setValue(&I, DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Or));
	return;
	}
	case Intrinsic::sadd_sat: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
	return;
	}
	case Intrinsic::uadd_sat: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
	return;
	}
	case Intrinsic::ssub_sat: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::SSUBSAT, sdl, Op1.getValueType(), Op1, Op2));
	return;
	}
	case Intrinsic::usub_sat: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
	return;
	}
	case Intrinsic::smul_fix:
	case Intrinsic::umul_fix: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	setValue(&I, DAG.getNode(FixedPointIntrinsicToOpcode(Intrinsic), sdl,
	Op1.getValueType(), Op1, Op2, Op3));
	return;
	}
	case Intrinsic::smul_fix_sat: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	setValue(&I, DAG.getNode(ISD::SMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
	Op3));
	return;
	}
	case Intrinsic::stacksave: {
	SDValue Op = getRoot();
	Res = DAG.getNode(
	ISD::STACKSAVE, sdl,
	DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return;
	}
	case Intrinsic::stackrestore:
	Res = getValue(I.getArgOperand(0));
	DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
	return;
	case Intrinsic::get_dynamic_area_offset: {
	SDValue Op = getRoot();
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
	// Result type for @llvm.get.dynamic.area.offset should match PtrTy for
	// target.
	if (PtrTy.getSizeInBits() < ResTy.getSizeInBits())
	report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
	" intrinsic!");
	Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
	Op);
	DAG.setRoot(Op);
	setValue(&I, Res);
	return;
	}
	case Intrinsic::stackguard: {
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	MachineFunction &MF = DAG.getMachineFunction();
	const Module &M = *MF.getFunction().getParent();
	SDValue Chain = getRoot();
	if (TLI.useLoadStackGuardNode()) {
	Res = getLoadStackGuard(DAG, sdl, Chain);
	} else {
	const Value *Global = TLI.getSDagStackGuard(M);
	unsigned Align = DL->getPrefTypeAlignment(Global->getType());
	Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
	MachinePointerInfo(Global, 0), Align,
	MachineMemOperand::MOVolatile);
	}
	if (TLI.useStackGuardXorFP())
	Res = TLI.emitStackGuardXorFP(DAG, Res, sdl);
	DAG.setRoot(Chain);
	setValue(&I, Res);
	return;
	}
	case Intrinsic::stackprotector: {
	// Emit code into the DAG to store the stack guard onto the stack.
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	SDValue Src, Chain = getRoot();

	if (TLI.useLoadStackGuardNode())
	Src = getLoadStackGuard(DAG, sdl, Chain);
	else
	Src = getValue(I.getArgOperand(0)); // The guard's value.

	AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));

	int FI = FuncInfo.StaticAllocaMap[Slot];
	MFI.setStackProtectorIndex(FI);

	SDValue FIN = DAG.getFrameIndex(FI, PtrTy);

	// Store the stack protector onto the stack.
	Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(), FI),
	/* Alignment = */ 0, MachineMemOperand::MOVolatile);
	setValue(&I, Res);
	DAG.setRoot(Res);
	return;
	}
	case Intrinsic::objectsize: {
	// If we don't know by now, we're never going to know.
	ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));

	assert(CI && "Non-constant type in __builtin_object_size?");

	SDValue Arg = getValue(I.getCalledValue());
	EVT Ty = Arg.getValueType();

	if (CI->isZero())
	Res = DAG.getConstant(-1ULL, sdl, Ty);
	else
	Res = DAG.getConstant(0, sdl, Ty);

	setValue(&I, Res);
	return;
	}

	case Intrinsic::is_constant:
	// If this wasn't constant-folded away by now, then it's not a
	// constant.
	setValue(&I, DAG.getConstant(0, sdl, MVT::i1));
	return;

	case Intrinsic::annotation:
	case Intrinsic::ptr_annotation:
	case Intrinsic::launder_invariant_group:
	case Intrinsic::strip_invariant_group:
	// Drop the intrinsic, but forward the value
	setValue(&I, getValue(I.getOperand(0)));
	return;
	case Intrinsic::assume:
	case Intrinsic::var_annotation:
	case Intrinsic::sideeffect:
	// Discard annotate attributes, assumptions, and artificial side-effects.
	return;

	case Intrinsic::codeview_annotation: {
	// Emit a label associated with this metadata.
	MachineFunction &MF = DAG.getMachineFunction();
	MCSymbol *Label =
	MF.getMMI().getContext().createTempSymbol("annotation", true);
	Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata();
	MF.addCodeViewAnnotation(Label, cast<MDNode>(MD));
	Res = DAG.getLabelNode(ISD::ANNOTATION_LABEL, sdl, getRoot(), Label);
	DAG.setRoot(Res);
	return;
	}

	case Intrinsic::init_trampoline: {
	const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());

	SDValue Ops[6];
	Ops[0] = getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	Ops[2] = getValue(I.getArgOperand(1));
	Ops[3] = getValue(I.getArgOperand(2));
	Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
	Ops[5] = DAG.getSrcValue(F);

	Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops);

	DAG.setRoot(Res);
	return;
	}
	case Intrinsic::adjust_trampoline:
	setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return;
	case Intrinsic::gcroot: {
	assert(DAG.getMachineFunction().getFunction().hasGC() &&
	"only valid in functions with gc specified, enforced by Verifier");
	assert(GFI && "implied by previous");
	const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
	const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));

	FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
	GFI->addStackRoot(FI->getIndex(), TypeMap);
	return;
	}
	case Intrinsic::gcread:
	case Intrinsic::gcwrite:
	llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
	case Intrinsic::flt_rounds:
	setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
	return;

	case Intrinsic::expect:
	// Just replace __builtin_expect(exp, c) with EXP.
	setValue(&I, getValue(I.getArgOperand(0)));
	return;

	case Intrinsic::debugtrap:
	case Intrinsic::trap: {
	StringRef TrapFuncName =
	I.getAttributes()
	.getAttribute(AttributeList::FunctionIndex, "trap-func-name")
	.getValueAsString();
	if (TrapFuncName.empty()) {
	ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
	ISD::TRAP : ISD::DEBUGTRAP;
	DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot()));
	return;
	}
	TargetLowering::ArgListTy Args;

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
	CallingConv::C, I.getType(),
	DAG.getExternalSymbol(TrapFuncName.data(),
	TLI.getPointerTy(DAG.getDataLayout())),
	std::move(Args));

	std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
	DAG.setRoot(Result.second);
	return;
	}

	case Intrinsic::uadd_with_overflow:
	case Intrinsic::sadd_with_overflow:
	case Intrinsic::usub_with_overflow:
	case Intrinsic::ssub_with_overflow:
	case Intrinsic::umul_with_overflow:
	case Intrinsic::smul_with_overflow: {
	ISD::NodeType Op;
	switch (Intrinsic) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
	case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
	case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
	case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
	case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
	case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
	}
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));

	EVT ResultVT = Op1.getValueType();
	EVT OverflowVT = MVT::i1;
	if (ResultVT.isVector())
	OverflowVT = EVT::getVectorVT(
	*Context, OverflowVT, ResultVT.getVectorNumElements());

	SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
	setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
	return;
	}
	case Intrinsic::prefetch: {
	SDValue Ops[5];
	unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
	auto Flags = rw == 0 ? MachineMemOperand::MOLoad :MachineMemOperand::MOStore;
	Ops[0] = DAG.getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	Ops[2] = getValue(I.getArgOperand(1));
	Ops[3] = getValue(I.getArgOperand(2));
	Ops[4] = getValue(I.getArgOperand(3));
	SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
	DAG.getVTList(MVT::Other), Ops,
	EVT::getIntegerVT(*Context, 8),
	MachinePointerInfo(I.getArgOperand(0)),
	0, /* align */
	Flags);

	// Chain the prefetch in parallel with any pending loads, to stay out of
	// the way of later optimizations.
	PendingLoads.push_back(Result);
	Result = getRoot();
	DAG.setRoot(Result);
	return;
	}
	case Intrinsic::lifetime_start:
	case Intrinsic::lifetime_end: {
	bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
	// Stack coloring is not enabled in O0, discard region information.
	if (TM.getOptLevel() == CodeGenOpt::None)
	return;

	const int64_t ObjectSize =
	cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
	Value *const ObjectPtr = I.getArgOperand(1);
	SmallVector<const Value *, 4> Allocas;
	GetUnderlyingObjects(ObjectPtr, Allocas, *DL);

	for (SmallVectorImpl<const Value*>::iterator Object = Allocas.begin(),
	E = Allocas.end(); Object != E; ++Object) {
	const AllocaInst LifetimeObject = dyn_cast_or_null<AllocaInst>(Object);

	// Could not find an Alloca.
	if (!LifetimeObject)
	continue;

	// First check that the Alloca is static, otherwise it won't have a
	// valid frame index.
	auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
	if (SI == FuncInfo.StaticAllocaMap.end())
	return;

	const int FrameIndex = SI->second;
	int64_t Offset;
	if (GetPointerBaseWithConstantOffset(
	ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject)
	Offset = -1; // Cannot determine offset from alloca to lifetime object.
	Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize,
	Offset);
	DAG.setRoot(Res);
	}
	return;
	}
	case Intrinsic::invariant_start:
	// Discard region information.
	setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
	return;
	case Intrinsic::invariant_end:
	// Discard region information.
	return;
	case Intrinsic::clear_cache:
	/// FunctionName may be null.
	if (const char *FunctionName = TLI.getClearCacheBuiltinName())
	lowerCallToExternalSymbol(I, FunctionName);
	return;
	case Intrinsic::donothing:
	// ignore
	return;
	case Intrinsic::experimental_stackmap:
	visitStackmap(I);
	return;
	case Intrinsic::experimental_patchpoint_void:
	case Intrinsic::experimental_patchpoint_i64:
	visitPatchpoint(&I);
	return;
	case Intrinsic::experimental_gc_statepoint:
	LowerStatepoint(ImmutableStatepoint(&I));
	return;
	case Intrinsic::experimental_gc_result:
	visitGCResult(cast<GCResultInst>(I));
	return;
	case Intrinsic::experimental_gc_relocate:
	visitGCRelocate(cast<GCRelocateInst>(I));
	return;
	case Intrinsic::instrprof_increment:
	llvm_unreachable("instrprof failed to lower an increment");
	case Intrinsic::instrprof_value_profile:
	llvm_unreachable("instrprof failed to lower a value profiling call");
	case Intrinsic::localescape: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

	// Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
	// is the same on all targets.
	for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
	Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
	if (isa<ConstantPointerNull>(Arg))
	continue; // Skip null pointers. They represent a hole in index space.
	AllocaInst *Slot = cast<AllocaInst>(Arg);
	assert(FuncInfo.StaticAllocaMap.count(Slot) &&
	"can only escape static allocas");
	int FI = FuncInfo.StaticAllocaMap[Slot];
	MCSymbol *FrameAllocSym =
	MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
	GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
	TII->get(TargetOpcode::LOCAL_ESCAPE))
	.addSym(FrameAllocSym)
	.addFrameIndex(FI);
	}

	return;
	}

	case Intrinsic::localrecover: {
	// i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
	MachineFunction &MF = DAG.getMachineFunction();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);

	// Get the symbol that defines the frame offset.
	auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
	auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
	unsigned IdxVal =
	unsigned(Idx->getLimitedValue(std::numeric_limits<int>::max()));
	MCSymbol *FrameAllocSym =
	MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);

	// Create a MCSymbol for the label to avoid any target lowering
	// that would make this PC relative.
	SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
	SDValue OffsetVal =
	DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);

	// Add the offset to the FP.
	Value *FP = I.getArgOperand(1);
	SDValue FPVal = getValue(FP);
	SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
	setValue(&I, Add);

	return;
	}

	case Intrinsic::eh_exceptionpointer:
	case Intrinsic::eh_exceptioncode: {
	// Get the exception pointer vreg, copy from it, and resize it to fit.
	const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0));
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
	unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC);
	SDValue N =
	DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT);
	if (Intrinsic == Intrinsic::eh_exceptioncode)
	N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32);
	setValue(&I, N);
	return;
	}
	case Intrinsic::xray_customevent: {
	// Here we want to make sure that the intrinsic behaves as if it has a
	// specific calling convention, and only for x86_64.
	// FIXME: Support other platforms later.
	const auto &Triple = DAG.getTarget().getTargetTriple();
	if (Triple.getArch() != Triple::x86_64 \|\| !Triple.isOSLinux())
	return;

	SDLoc DL = getCurSDLoc();
	SmallVector<SDValue, 8> Ops;

	// We want to say that we always want the arguments in registers.
	SDValue LogEntryVal = getValue(I.getArgOperand(0));
	SDValue StrSizeVal = getValue(I.getArgOperand(1));
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SDValue Chain = getRoot();
	Ops.push_back(LogEntryVal);
	Ops.push_back(StrSizeVal);
	Ops.push_back(Chain);

	// We need to enforce the calling convention for the callsite, so that
	// argument ordering is enforced correctly, and that register allocation can
	// see that some registers may be assumed clobbered and have to preserve
	// them across calls to the intrinsic.
	MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL,
	DL, NodeTys, Ops);
	SDValue patchableNode = SDValue(MN, 0);
	DAG.setRoot(patchableNode);
	setValue(&I, patchableNode);
	return;
	}
	case Intrinsic::xray_typedevent: {
	// Here we want to make sure that the intrinsic behaves as if it has a
	// specific calling convention, and only for x86_64.
	// FIXME: Support other platforms later.
	const auto &Triple = DAG.getTarget().getTargetTriple();
	if (Triple.getArch() != Triple::x86_64 \|\| !Triple.isOSLinux())
	return;

	SDLoc DL = getCurSDLoc();
	SmallVector<SDValue, 8> Ops;

	// We want to say that we always want the arguments in registers.
	// It's unclear to me how manipulating the selection DAG here forces callers
	// to provide arguments in registers instead of on the stack.
	SDValue LogTypeId = getValue(I.getArgOperand(0));
	SDValue LogEntryVal = getValue(I.getArgOperand(1));
	SDValue StrSizeVal = getValue(I.getArgOperand(2));
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SDValue Chain = getRoot();
	Ops.push_back(LogTypeId);
	Ops.push_back(LogEntryVal);
	Ops.push_back(StrSizeVal);
	Ops.push_back(Chain);

	// We need to enforce the calling convention for the callsite, so that
	// argument ordering is enforced correctly, and that register allocation can
	// see that some registers may be assumed clobbered and have to preserve
	// them across calls to the intrinsic.
	MachineSDNode *MN = DAG.getMachineNode(
	TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, DL, NodeTys, Ops);
	SDValue patchableNode = SDValue(MN, 0);
	DAG.setRoot(patchableNode);
	setValue(&I, patchableNode);
	return;
	}
	case Intrinsic::experimental_deoptimize:
	LowerDeoptimizeCall(&I);
	return;

	case Intrinsic::experimental_vector_reduce_v2_fadd:
	case Intrinsic::experimental_vector_reduce_v2_fmul:
	case Intrinsic::experimental_vector_reduce_add:
	case Intrinsic::experimental_vector_reduce_mul:
	case Intrinsic::experimental_vector_reduce_and:
	case Intrinsic::experimental_vector_reduce_or:
	case Intrinsic::experimental_vector_reduce_xor:
	case Intrinsic::experimental_vector_reduce_smax:
	case Intrinsic::experimental_vector_reduce_smin:
	case Intrinsic::experimental_vector_reduce_umax:
	case Intrinsic::experimental_vector_reduce_umin:
	case Intrinsic::experimental_vector_reduce_fmax:
	case Intrinsic::experimental_vector_reduce_fmin:
	visitVectorReduce(I, Intrinsic);
	return;

	case Intrinsic::icall_branch_funnel: {
	SmallVector<SDValue, 16> Ops;
	Ops.push_back(getValue(I.getArgOperand(0)));

	int64_t Offset;
	auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
	I.getArgOperand(1), Offset, DAG.getDataLayout()));
	if (!Base)
	report_fatal_error(
	"llvm.icall.branch.funnel operand must be a GlobalValue");
	Ops.push_back(DAG.getTargetGlobalAddress(Base, getCurSDLoc(), MVT::i64, 0));

	struct BranchFunnelTarget {
	int64_t Offset;
	SDValue Target;
	};
	SmallVector<BranchFunnelTarget, 8> Targets;

	for (unsigned Op = 1, N = I.getNumArgOperands(); Op != N; Op += 2) {
	auto *ElemBase = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
	I.getArgOperand(Op), Offset, DAG.getDataLayout()));
	if (ElemBase != Base)
	report_fatal_error("all llvm.icall.branch.funnel operands must refer "
	"to the same GlobalValue");

	SDValue Val = getValue(I.getArgOperand(Op + 1));
	auto *GA = dyn_cast<GlobalAddressSDNode>(Val);
	if (!GA)
	report_fatal_error(
	"llvm.icall.branch.funnel operand must be a GlobalValue");
	Targets.push_back({Offset, DAG.getTargetGlobalAddress(
	GA->getGlobal(), getCurSDLoc(),
	Val.getValueType(), GA->getOffset())});
	}
	llvm::sort(Targets,
	[](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) {
	return T1.Offset < T2.Offset;
	});

	for (auto &T : Targets) {
	Ops.push_back(DAG.getTargetConstant(T.Offset, getCurSDLoc(), MVT::i32));
	Ops.push_back(T.Target);
	}

	Ops.push_back(DAG.getRoot()); // Chain
	SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL,
	getCurSDLoc(), MVT::Other, Ops),
	0);
	DAG.setRoot(N);
	setValue(&I, N);
	HasTailCall = true;
	return;
	}

	case Intrinsic::wasm_landingpad_index:
	// Information this intrinsic contained has been transferred to
	// MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
	// delete it now.
	return;

	case Intrinsic::aarch64_settag:
	case Intrinsic::aarch64_settag_zero: {
	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	bool ZeroMemory = Intrinsic == Intrinsic::aarch64_settag_zero;
	SDValue Val = TSI.EmitTargetCodeForSetTag(
	DAG, getCurSDLoc(), getRoot(), getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)), MachinePointerInfo(I.getArgOperand(0)),
	ZeroMemory);
	DAG.setRoot(Val);
	setValue(&I, Val);
	return;
	}
	}
	}

	/// Lower a constrained floating-point intrinsic call to the corresponding
	/// ISD::STRICT_* node. The node is chained on the current root; its value
	/// types are those computed from the call's type followed by an output chain
	/// (MVT::Other). The output chain becomes the new DAG root and the FP result
	/// is recorded as the value of the call.
	void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
	    const ConstrainedFPIntrinsic &FPI) {
	  SDLoc Loc = getCurSDLoc();

	  // Map the intrinsic ID onto its strict ISD opcode.
	  unsigned StrictOpc;
	  switch (FPI.getIntrinsicID()) {
	  default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	  case Intrinsic::experimental_constrained_fadd: StrictOpc = ISD::STRICT_FADD; break;
	  case Intrinsic::experimental_constrained_fsub: StrictOpc = ISD::STRICT_FSUB; break;
	  case Intrinsic::experimental_constrained_fmul: StrictOpc = ISD::STRICT_FMUL; break;
	  case Intrinsic::experimental_constrained_fdiv: StrictOpc = ISD::STRICT_FDIV; break;
	  case Intrinsic::experimental_constrained_frem: StrictOpc = ISD::STRICT_FREM; break;
	  case Intrinsic::experimental_constrained_fma: StrictOpc = ISD::STRICT_FMA; break;
	  case Intrinsic::experimental_constrained_fptrunc: StrictOpc = ISD::STRICT_FP_ROUND; break;
	  case Intrinsic::experimental_constrained_fpext: StrictOpc = ISD::STRICT_FP_EXTEND; break;
	  case Intrinsic::experimental_constrained_sqrt: StrictOpc = ISD::STRICT_FSQRT; break;
	  case Intrinsic::experimental_constrained_pow: StrictOpc = ISD::STRICT_FPOW; break;
	  case Intrinsic::experimental_constrained_powi: StrictOpc = ISD::STRICT_FPOWI; break;
	  case Intrinsic::experimental_constrained_sin: StrictOpc = ISD::STRICT_FSIN; break;
	  case Intrinsic::experimental_constrained_cos: StrictOpc = ISD::STRICT_FCOS; break;
	  case Intrinsic::experimental_constrained_exp: StrictOpc = ISD::STRICT_FEXP; break;
	  case Intrinsic::experimental_constrained_exp2: StrictOpc = ISD::STRICT_FEXP2; break;
	  case Intrinsic::experimental_constrained_log: StrictOpc = ISD::STRICT_FLOG; break;
	  case Intrinsic::experimental_constrained_log10: StrictOpc = ISD::STRICT_FLOG10; break;
	  case Intrinsic::experimental_constrained_log2: StrictOpc = ISD::STRICT_FLOG2; break;
	  case Intrinsic::experimental_constrained_rint: StrictOpc = ISD::STRICT_FRINT; break;
	  case Intrinsic::experimental_constrained_nearbyint: StrictOpc = ISD::STRICT_FNEARBYINT; break;
	  case Intrinsic::experimental_constrained_maxnum: StrictOpc = ISD::STRICT_FMAXNUM; break;
	  case Intrinsic::experimental_constrained_minnum: StrictOpc = ISD::STRICT_FMINNUM; break;
	  case Intrinsic::experimental_constrained_ceil: StrictOpc = ISD::STRICT_FCEIL; break;
	  case Intrinsic::experimental_constrained_floor: StrictOpc = ISD::STRICT_FFLOOR; break;
	  case Intrinsic::experimental_constrained_round: StrictOpc = ISD::STRICT_FROUND; break;
	  case Intrinsic::experimental_constrained_trunc: StrictOpc = ISD::STRICT_FTRUNC; break;
	  }

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue InChain = getRoot();

	  // Result types of the node: the call's value types plus the out chain.
	  SmallVector<EVT, 4> NodeVTs;
	  ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), NodeVTs);
	  NodeVTs.push_back(MVT::Other); // Out chain
	  SDVTList VTList = DAG.getVTList(NodeVTs);

	  // Every form starts with the chain and the first argument; what follows
	  // depends on the opcode / arity of the intrinsic.
	  SmallVector<SDValue, 4> NodeOps;
	  NodeOps.push_back(InChain);
	  NodeOps.push_back(getValue(FPI.getArgOperand(0)));
	  if (StrictOpc == ISD::STRICT_FP_ROUND) {
	    // FP_ROUND carries an extra target-constant 0 operand.
	    NodeOps.push_back(DAG.getTargetConstant(
	        0, Loc, TLI.getPointerTy(DAG.getDataLayout())));
	  } else if (!FPI.isUnaryOp()) {
	    // Binary and ternary operations both take the second argument;
	    // ternary ones additionally take the third.
	    NodeOps.push_back(getValue(FPI.getArgOperand(1)));
	    if (FPI.isTernaryOp())
	      NodeOps.push_back(getValue(FPI.getArgOperand(2)));
	  }
	  SDValue StrictNode = DAG.getNode(StrictOpc, Loc, VTList, NodeOps);

	  // Mark the node as possibly raising FP exceptions unless the intrinsic
	  // says exceptions are ignored.
	  if (FPI.getExceptionBehavior() !=
	      ConstrainedFPIntrinsic::ExceptionBehavior::ebIgnore) {
	    SDNodeFlags ExceptFlags;
	    ExceptFlags.setFPExcept(true);
	    StrictNode->setFlags(ExceptFlags);
	  }

	  assert(StrictNode.getNode()->getNumValues() == 2);
	  DAG.setRoot(StrictNode.getValue(1));
	  setValue(&FPI, StrictNode.getValue(0));
	}

	/// Lower the call described by \p CLI through the target's LowerCallTo hook.
	///
	/// When \p EHPadBB is non-null, the call is bracketed by a begin and an end
	/// EH label marking the try range. That range is then recorded in the WinEH
	/// function info (when the function has EH funclets and a funclet
	/// personality) or, for personalities that are not scoped, through
	/// MachineFunction::addInvoke.
	///
	/// \returns the pair produced by TLI.LowerCallTo. A null second element
	/// (the chain) means a tail call was emitted: HasTailCall is set and
	/// PendingExports is cleared instead of updating the DAG root.
	std::pair<SDValue, SDValue>
	SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
	                                    const BasicBlock *EHPadBB) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineModuleInfo &MMI = MF.getMMI();
	  MCSymbol *BeginLabel = nullptr;

	  if (EHPadBB) {
	    // Insert a label before the invoke call to mark the try range. This can be
	    // used to detect deletion of the invoke via the MachineModuleInfo.
	    BeginLabel = MMI.getContext().createTempSymbol();

	    // For SjLj, keep track of which landing pads go with which invokes
	    // so as to maintain the ordering of pads in the LSDA.
	    unsigned CallSiteIndex = MMI.getCurrentCallSite();
	    if (CallSiteIndex) {
	      MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
	      LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);

	      // Now that the call site is handled, stop tracking it.
	      MMI.setCurrentCallSite(0);
	    }

	    // Both PendingLoads and PendingExports must be flushed here;
	    // this call might not return.
	    (void)getRoot();
	    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel));

	    // The call must be chained after the begin label.
	    CLI.setChain(getRoot());
	  }
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);

	  assert((CLI.IsTailCall || Result.second.getNode()) &&
	         "Non-null chain expected with non-tail call!");
	  assert((Result.second.getNode() || !Result.first.getNode()) &&
	         "Null value expected with tail call!");

	  if (!Result.second.getNode()) {
	    // As a special case, a null chain means that a tail call has been emitted
	    // and the DAG root is already updated.
	    HasTailCall = true;

	    // Since there's no actual continuation from this block, nothing can be
	    // relying on us setting vregs for them.
	    PendingExports.clear();
	  } else {
	    DAG.setRoot(Result.second);
	  }

	  if (EHPadBB) {
	    // Insert a label at the end of the invoke call to mark the try range. This
	    // can be used to detect deletion of the invoke via the MachineModuleInfo.
	    MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
	    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel));

	    // Inform MachineModuleInfo of range.
	    auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	    // There is a platform (e.g. wasm) that uses funclet style IR but does not
	    // actually use outlined funclets and their LSDA info style.
	    if (MF.hasEHFunclets() && isFuncletEHPersonality(Pers)) {
	      assert(CLI.CS);
	      WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	      EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
	                                BeginLabel, EndLabel);
	    } else if (!isScopedEHPersonality(Pers)) {
	      MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
	    }
	  }

	  return Result;
	}

	/// Lower the call at \p CS to \p Callee.
	///
	/// Builds the target argument list from the call site's arguments (skipping
	/// arguments of empty type), clears \p isTailCall whenever one of the checks
	/// below rules a tail call out, lowers the call via lowerInvokable, and
	/// records the returned value (if any) for the call instruction.
	///
	/// A swifterror argument is passed as a virtual register rather than as the
	/// IR value, and the swifterror result is copied back into a virtual
	/// register after the call.
	///
	/// \param EHPadBB forwarded unchanged to lowerInvokable.
	void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
	                                      bool isTailCall,
	                                      const BasicBlock *EHPadBB) {
	  auto &DL = DAG.getDataLayout();
	  FunctionType *FTy = CS.getFunctionType();
	  Type *RetTy = CS.getType();

	  TargetLowering::ArgListTy Args;
	  Args.reserve(CS.arg_size());

	  const Value *SwiftErrorVal = nullptr;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // We can't tail call inside a function with a swifterror argument. Lowering
	  // does not support this yet. It would have to move into the swifterror
	  // register before the call.
	  auto *Caller = CS.getInstruction()->getParent()->getParent();
	  if (TLI.supportSwiftError() &&
	      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
	    isTailCall = false;

	  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
	       i != e; ++i) {
	    TargetLowering::ArgListEntry Entry;
	    // The iterator yields the argument; everything below works with a
	    // pointer to the IR value.
	    const Value *V = *i;

	    // Skip empty types
	    if (V->getType()->isEmptyTy())
	      continue;

	    SDValue ArgNode = getValue(V);
	    Entry.Node = ArgNode; Entry.Ty = V->getType();

	    // The attribute index is the argument's position in the call site.
	    Entry.setAttributes(&CS, i - CS.arg_begin());

	    // Use swifterror virtual register as input to the call.
	    if (Entry.IsSwiftError && TLI.supportSwiftError()) {
	      SwiftErrorVal = V;
	      // We find the virtual register for the actual swifterror argument.
	      // Instead of using the Value, we use the virtual register instead.
	      Entry.Node = DAG.getRegister(
	          SwiftError.getOrCreateVRegUseAt(CS.getInstruction(), FuncInfo.MBB, V),
	          EVT(TLI.getPointerTy(DL)));
	    }

	    Args.push_back(Entry);

	    // If we have an explicit sret argument that is an Instruction, (i.e., it
	    // might point to function-local memory), we can't meaningfully tail-call.
	    if (Entry.IsSRet && isa<Instruction>(V))
	      isTailCall = false;
	  }

	  // Check if target-independent constraints permit a tail call here.
	  // Target-dependent constraints are checked within TLI->LowerCallTo.
	  if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
	    isTailCall = false;

	  // Disable tail calls if there is an swifterror argument. Targets have not
	  // been updated to support tail calls.
	  if (TLI.supportSwiftError() && SwiftErrorVal)
	    isTailCall = false;

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(getCurSDLoc())
	      .setChain(getRoot())
	      .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
	      .setTailCall(isTailCall)
	      .setConvergent(CS.isConvergent());
	  std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);

	  // A non-null first element is the call's return value.
	  if (Result.first.getNode()) {
	    const Instruction *Inst = CS.getInstruction();
	    Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first);
	    setValue(Inst, Result.first);
	  }

	  // The last element of CLI.InVals has the SDValue for swifterror return.
	  // Here we copy it to a virtual register and update SwiftErrorMap for
	  // book-keeping.
	  if (SwiftErrorVal && TLI.supportSwiftError()) {
	    // Get the last element of InVals.
	    SDValue Src = CLI.InVals.back();
	    unsigned VReg = SwiftError.getOrCreateVRegDefAt(
	        CS.getInstruction(), FuncInfo.MBB, SwiftErrorVal);
	    SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
	    DAG.setRoot(CopyNode);
	  }
	}

	/// Produce the value of a \p LoadVT-typed load from \p PtrVal for the inline
	/// memcmp expansion.
	///
	/// If \p PtrVal is a Constant and the load can be folded, the folded constant
	/// is returned via Builder.getValue() and no load node is created. Otherwise a
	/// load with alignment 1 is emitted. Its chain result is appended to
	/// Builder.PendingLoads unless alias analysis reports that \p PtrVal points to
	/// constant memory, in which case the load is chained to the entry node.
	///
	/// \param PtrVal  IR pointer to load from.
	/// \param LoadVT  Machine value type to load (scalar or vector).
	/// \param Builder The SelectionDAGBuilder whose DAG receives the load.
	/// \return The folded constant value or the value result of the new load.
	static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
	                             SelectionDAGBuilder &Builder) {
	  // Check to see if this load can be trivially constant folded, e.g. if the
	  // input is from a string literal.
	  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
	    // Cast pointer to the type we really want to load.
	    Type *LoadTy =
	        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
	    if (LoadVT.isVector())
	      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());

	    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
	                                         PointerType::getUnqual(LoadTy));

	    if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(
	            const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL))
	      return Builder.getValue(LoadCst);
	  }

	  // Otherwise, we have to emit the load. If the pointer is to unfoldable but
	  // still constant memory, the input chain can be the entry node.
	  SDValue Root;
	  bool ConstantMemory = false;

	  // Do not serialize (non-volatile) loads of constant memory with anything.
	  if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
	    Root = Builder.DAG.getEntryNode();
	    ConstantMemory = true;
	  } else {
	    // Do not serialize non-volatile loads against each other.
	    Root = Builder.DAG.getRoot();
	  }

	  SDValue Ptr = Builder.getValue(PtrVal);
	  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
	                                        Ptr, MachinePointerInfo(PtrVal),
	                                        /* Alignment = */ 1);

	  // Only loads that are not known to read constant memory need to be
	  // tracked as pending.
	  if (!ConstantMemory)
	    Builder.PendingLoads.push_back(LoadVal.getValue(1));
	  return LoadVal;
	}

	/// Record the value for an instruction that produces an integer result,
	/// converting the type where necessary.
	void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
	SDValue Value,
	bool IsSigned) {
	EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType(), true);
	if (IsSigned)
	Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
	else
	Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
	setValue(&I, Value);
	}

	/// See if we can lower a memcmp call into an optimized form. If so, return
	/// true and lower it. Otherwise return false, and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
	const Value LHS = I.getArgOperand(0), RHS = I.getArgOperand(1);
	const Value *Size = I.getArgOperand(2);
	const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
	if (CSize && CSize->getZExtValue() == 0) {
	EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType(), true);
	setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
	return true;
	}

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
	DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
	getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
	if (Res.first.getNode()) {
	processIntegerCallValue(I, Res.first, true);
	PendingLoads.push_back(Res.second);
	return true;
	}

	// memcmp(S1,S2,2) != 0 -> ((short)LHS != (short)RHS) != 0
	// memcmp(S1,S2,4) != 0 -> ((int)LHS != (int)RHS) != 0
	if (!CSize \|\| !isOnlyUsedInZeroEqualityComparison(&I))
	return false;

	// If the target has a fast compare for the given size, it will return a
	// preferred load type for that size. Require that the load VT is legal and
	// that the target supports unaligned loads of that type. Otherwise, return
	// INVALID.
	auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT LVT = TLI.hasFastEqualityCompare(NumBits);
	if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
	// TODO: Handle 5 byte compare as 4-byte + 1 byte.
	// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
	// TODO: Check alignment of src and dest ptrs.
	unsigned DstAS = LHS->getType()->getPointerAddressSpace();
	unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
	if (!TLI.isTypeLegal(LVT) \|\|
	!TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) \|\|
	!TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
	LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	return LVT;
	};

	// This turns into unaligned loads. We only do this if the target natively
	// supports the MVT we'll be loading or if it is small enough (<= 4) that
	// we'll only produce a small number of byte loads.
	MVT LoadVT;
	unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
	switch (NumBitsToCompare) {
	default:
	return false;
	case 16:
	LoadVT = MVT::i16;
	break;
	case 32:
	LoadVT = MVT::i32;
	break;
	case 64:
	case 128:
	case 256:
	LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
	break;
	}

	if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
	return false;

	SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
	SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);

	// Bitcast to a wide integer type if the loads are vectors.
	if (LoadVT.isVector()) {
	EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
	LoadL = DAG.getBitcast(CmpVT, LoadL);
	LoadR = DAG.getBitcast(CmpVT, LoadR);
	}

	SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
	processIntegerCallValue(I, Cmp, false);
	return true;
	}

	/// See if we can lower a memchr call into an optimized form. If so, return
	/// true and lower it. Otherwise return false, and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
	const Value *Src = I.getArgOperand(0);
	const Value *Char = I.getArgOperand(1);
	const Value *Length = I.getArgOperand(2);

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res =
	TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
	getValue(Src), getValue(Char), getValue(Length),
	MachinePointerInfo(Src));
	if (Res.first.getNode()) {
	setValue(&I, Res.first);
	PendingLoads.push_back(Res.second);
	return true;
	}

	return false;
	}

	/// Lower a mempcpy call as a memcpy node followed by an ADD that computes the
	/// returned pointer (destination plus size). This function always lowers the
	/// call and returns true.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
	  SDValue Dst = getValue(I.getArgOperand(0));
	  SDValue Src = getValue(I.getArgOperand(1));
	  SDValue Size = getValue(I.getArgOperand(2));

	  unsigned DstAlign = DAG.InferPtrAlignment(Dst);
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  unsigned Align = std::min(DstAlign, SrcAlign);
	  if (Align == 0) // Alignment of one or both could not be inferred.
	    Align = 1;    // 0 and 1 both specify no alignment, but 0 is reserved.

	  bool isVol = false;
	  SDLoc sdl = getCurSDLoc();

	  // In the mempcpy context we need to pass in a false value for isTailCall
	  // because the return pointer needs to be adjusted by the size of
	  // the copied memory.
	  SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol,
	                             false, /*isTailCall=*/false,
	                             MachinePointerInfo(I.getArgOperand(0)),
	                             MachinePointerInfo(I.getArgOperand(1)));
	  assert(MC.getNode() != nullptr &&
	         " memcpy should not be lowered as TailCall in mempcpy context ");
	  DAG.setRoot(MC);

	  // Check if Size needs to be truncated or extended.
	  Size = DAG.getSExtOrTrunc(Size, sdl, Dst.getValueType());

	  // Adjust return pointer to point just past the last dst byte.
	  SDValue DstPlusSize = DAG.getNode(ISD::ADD, sdl, Dst.getValueType(),
	                                    Dst, Size);
	  setValue(&I, DstPlusSize);
	  return true;
	}

	/// See if we can lower a strcpy call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	///
	/// Lowering is delegated to the target's EmitTargetCodeForStrcpy hook;
	/// \p isStpcpy is forwarded to that hook unchanged. On success the hook's
	/// chain result becomes the new DAG root.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
	                                  getValue(Arg0), getValue(Arg1),
	                                  MachinePointerInfo(Arg0),
	                                  MachinePointerInfo(Arg1), isStpcpy);
	  if (Res.first.getNode()) {
	    setValue(&I, Res.first);
	    DAG.setRoot(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a strcmp call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	///
	/// Lowering is delegated to the target's EmitTargetCodeForStrcmp hook. On
	/// success the result is recorded as a signed integer call value and the
	/// hook's chain result is added to PendingLoads.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
	                                  getValue(Arg0), getValue(Arg1),
	                                  MachinePointerInfo(Arg0),
	                                  MachinePointerInfo(Arg1));
	  if (Res.first.getNode()) {
	    processIntegerCallValue(I, Res.first, true);
	    PendingLoads.push_back(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a strlen call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
	const Value *Arg0 = I.getArgOperand(0);

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res =
	TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
	getValue(Arg0), MachinePointerInfo(Arg0));
	if (Res.first.getNode()) {
	processIntegerCallValue(I, Res.first, false);
	PendingLoads.push_back(Res.second);
	return true;
	}

	return false;
	}

	/// See if we can lower a strnlen call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	///
	/// Lowering is delegated to the target's EmitTargetCodeForStrnlen hook. On
	/// success the result is recorded as an unsigned integer call value and the
	/// hook's chain result is added to PendingLoads.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
	                                   getValue(Arg0), getValue(Arg1),
	                                   MachinePointerInfo(Arg0));
	  if (Res.first.getNode()) {
	    processIntegerCallValue(I, Res.first, false);
	    PendingLoads.push_back(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a unary floating-point operation into an SDNode with
	/// the specified Opcode. If so, return true and lower it, otherwise return
	/// false and it will be lowered like a normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
	unsigned Opcode) {
	// We already checked this call's prototype; verify it doesn't modify errno.
	if (!I.onlyReadsMemory())
	return false;

	SDValue Tmp = getValue(I.getArgOperand(0));
	setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
	return true;
	}

	/// See if we can lower a binary floating-point operation into an SDNode with
	/// the specified Opcode. If so, return true and lower it. Otherwise return
	/// false, and it will be lowered like a normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
	unsigned Opcode) {
	// We already checked this call's prototype; verify it doesn't modify errno.
	if (!I.onlyReadsMemory())
	return false;

	SDValue Tmp0 = getValue(I.getArgOperand(0));
	SDValue Tmp1 = getValue(I.getArgOperand(1));
	EVT VT = Tmp0.getValueType();
	setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
	return true;
	}

	/// Lower the call instruction \p I.
	///
	/// The call is dispatched as follows:
	///  - a call whose callee is InlineAsm goes to visitInlineAsm;
	///  - a call to a declaration that has an intrinsic ID (LLVM or
	///    target-specific) goes to visitIntrinsicCall;
	///  - a call to a recognized library function with optimized codegen is
	///    offered to the matching visit*Call helper, and is done if that helper
	///    succeeds;
	///  - everything else is lowered through LowerCallSiteWithDeoptBundle (when a
	///    deopt operand bundle is present) or LowerCallTo.
	void SelectionDAGBuilder::visitCall(const CallInst &I) {
	// Handle inline assembly differently.
	if (isa<InlineAsm>(I.getCalledValue())) {
	visitInlineAsm(&I);
	return;
	}

	if (Function *F = I.getCalledFunction()) {
	if (F->isDeclaration()) {
	// Is this an LLVM intrinsic or a target-specific intrinsic?
	unsigned IID = F->getIntrinsicID();
	if (!IID)
	if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())
	IID = II->getIntrinsicID(F);

	if (IID) {
	visitIntrinsicCall(I, IID);
	return;
	}
	}

	// Check for well-known libc/libm calls. If the function is internal, it
	// can't be a library call. Don't do the check if marked as nobuiltin for
	// some reason or the call site requires strict floating point semantics.
	LibFunc Func;
	if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() &&
	F->hasName() && LibInfo->getLibFunc(*F, Func) &&
	LibInfo->hasOptimizedCodeGen(Func)) {
	// Each case returns when its lowering succeeds; otherwise it breaks out of
	// the switch so that the call is lowered generically below.
	switch (Func) {
	default: break;
	case LibFunc_copysign:
	case LibFunc_copysignf:
	case LibFunc_copysignl:
	// We already checked this call's prototype; verify it doesn't modify
	// errno.
	if (I.onlyReadsMemory()) {
	SDValue LHS = getValue(I.getArgOperand(0));
	SDValue RHS = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
	LHS.getValueType(), LHS, RHS));
	return;
	}
	break;
	case LibFunc_fabs:
	case LibFunc_fabsf:
	case LibFunc_fabsl:
	if (visitUnaryFloatCall(I, ISD::FABS))
	return;
	break;
	case LibFunc_fmin:
	case LibFunc_fminf:
	case LibFunc_fminl:
	if (visitBinaryFloatCall(I, ISD::FMINNUM))
	return;
	break;
	case LibFunc_fmax:
	case LibFunc_fmaxf:
	case LibFunc_fmaxl:
	if (visitBinaryFloatCall(I, ISD::FMAXNUM))
	return;
	break;
	case LibFunc_sin:
	case LibFunc_sinf:
	case LibFunc_sinl:
	if (visitUnaryFloatCall(I, ISD::FSIN))
	return;
	break;
	case LibFunc_cos:
	case LibFunc_cosf:
	case LibFunc_cosl:
	if (visitUnaryFloatCall(I, ISD::FCOS))
	return;
	break;
	case LibFunc_sqrt:
	case LibFunc_sqrtf:
	case LibFunc_sqrtl:
	case LibFunc_sqrt_finite:
	case LibFunc_sqrtf_finite:
	case LibFunc_sqrtl_finite:
	if (visitUnaryFloatCall(I, ISD::FSQRT))
	return;
	break;
	case LibFunc_floor:
	case LibFunc_floorf:
	case LibFunc_floorl:
	if (visitUnaryFloatCall(I, ISD::FFLOOR))
	return;
	break;
	case LibFunc_nearbyint:
	case LibFunc_nearbyintf:
	case LibFunc_nearbyintl:
	if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
	return;
	break;
	case LibFunc_ceil:
	case LibFunc_ceilf:
	case LibFunc_ceill:
	if (visitUnaryFloatCall(I, ISD::FCEIL))
	return;
	break;
	case LibFunc_rint:
	case LibFunc_rintf:
	case LibFunc_rintl:
	if (visitUnaryFloatCall(I, ISD::FRINT))
	return;
	break;
	case LibFunc_round:
	case LibFunc_roundf:
	case LibFunc_roundl:
	if (visitUnaryFloatCall(I, ISD::FROUND))
	return;
	break;
	case LibFunc_trunc:
	case LibFunc_truncf:
	case LibFunc_truncl:
	if (visitUnaryFloatCall(I, ISD::FTRUNC))
	return;
	break;
	case LibFunc_log2:
	case LibFunc_log2f:
	case LibFunc_log2l:
	if (visitUnaryFloatCall(I, ISD::FLOG2))
	return;
	break;
	case LibFunc_exp2:
	case LibFunc_exp2f:
	case LibFunc_exp2l:
	if (visitUnaryFloatCall(I, ISD::FEXP2))
	return;
	break;
	case LibFunc_memcmp:
	if (visitMemCmpCall(I))
	return;
	break;
	case LibFunc_mempcpy:
	if (visitMemPCpyCall(I))
	return;
	break;
	case LibFunc_memchr:
	if (visitMemChrCall(I))
	return;
	break;
	case LibFunc_strcpy:
	if (visitStrCpyCall(I, false))
	return;
	break;
	case LibFunc_stpcpy:
	if (visitStrCpyCall(I, true))
	return;
	break;
	case LibFunc_strcmp:
	if (visitStrCmpCall(I))
	return;
	break;
	case LibFunc_strlen:
	if (visitStrLenCall(I))
	return;
	break;
	case LibFunc_strnlen:
	if (visitStrNLenCall(I))
	return;
	break;
	}
	}
	}

	// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
	// have to do anything here to lower funclet bundles.
	assert(!I.hasOperandBundlesOtherThan(
	{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
	"Cannot lower calls with arbitrary operand bundles!");

	SDValue Callee = getValue(I.getCalledValue());

	if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
	LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
	else
	// Check if we can potentially perform a tail call. More detailed checking
	// is done within LowerCallTo, after more information about the call is
	// known.
	LowerCallTo(&I, Callee, I.isTailCall());
	}

	namespace {

	/// Per-constraint state used while lowering an inline asm call. It extends
	/// TargetLowering::AsmOperandInfo with the SelectionDAG operand and the
	/// registers assigned to it.
	class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
	public:
	  /// The incoming operand to the CallInst, or null if this is the result
	  /// output operand or a clobber. This gets modified as the asm is processed.
	  SDValue CallOperand;

	  /// For a register or register class operand, the set of registers
	  /// corresponding to the operand.
	  RegsForValue AssignedRegs;

	  explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
	      : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr, 0) {}

	  /// Return true if this operand accesses memory: either it is indirect, or
	  /// one of its constraint codes is classified as C_Memory by \p TLI.
	  bool hasMemory(const TargetLowering &TLI) const {
	    // Indirect operand accesses access memory.
	    if (isIndirect)
	      return true;

	    bool SawMemoryCode = false;
	    for (const auto &Code : Codes) {
	      if (TLI.getConstraintType(Code) == TargetLowering::C_Memory) {
	        SawMemoryCode = true;
	        break;
	      }
	    }
	    return SawMemoryCode;
	  }

	  /// Return the EVT of the Value* that this operand corresponds to, or
	  /// MVT::Other if there is no Value* for this operand.
	  EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
	                           const DataLayout &DL) const {
	    if (CallOperandVal == nullptr)
	      return MVT::Other;

	    // Basic block operands are given the pointer type.
	    if (isa<BasicBlock>(CallOperandVal))
	      return TLI.getPointerTy(DL);

	    llvm::Type *OperandTy = CallOperandVal->getType();

	    // FIXME: code duplicated from TargetLowering::ParseConstraints().
	    // An indirect operand is a pointer to the accessed type, so look through
	    // the pointer.
	    if (isIndirect) {
	      if (PointerType *PointerTy = dyn_cast<PointerType>(OperandTy))
	        OperandTy = PointerTy->getElementType();
	      else
	        report_fatal_error("Indirect operand for inline asm not a pointer!");
	    }

	    // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
	    StructType *Wrapper = dyn_cast<StructType>(OperandTy);
	    if (Wrapper && Wrapper->getNumElements() == 1)
	      OperandTy = Wrapper->getElementType(0);

	    // If the type is not a single value, it may be a struct/union that we
	    // can tile with integers.
	    if (!OperandTy->isSingleValueType() && OperandTy->isSized()) {
	      const unsigned BitSize = DL.getTypeSizeInBits(OperandTy);
	      const bool IsIntegerWidth = BitSize == 1 || BitSize == 8 ||
	                                  BitSize == 16 || BitSize == 32 ||
	                                  BitSize == 64 || BitSize == 128;
	      if (IsIntegerWidth)
	        OperandTy = IntegerType::get(Context, BitSize);
	    }

	    return TLI.getValueType(DL, OperandTy, true);
	  }
	};

	using SDISelAsmOperandInfoVector = SmallVector<SDISelAsmOperandInfo, 16>;

	} // end anonymous namespace

	/// Make sure that the output operand \p OpInfo and its corresponding input
	/// operand \p MatchingOpInfo have compatible constraint types (otherwise error
	/// out).
	///
	/// Nothing is done when both already have the same ConstraintVT. Otherwise
	/// the two are required to agree on integer-ness and to resolve to the same
	/// register class; if they do, \p MatchingOpInfo takes the ConstraintVT of
	/// \p OpInfo, and if they do not, report_fatal_error is called.
	static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo,
	                               SDISelAsmOperandInfo &MatchingOpInfo,
	                               SelectionDAG &DAG) {
	  if (OpInfo.ConstraintVT == MatchingOpInfo.ConstraintVT)
	    return;

	  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
	  const auto &TLI = DAG.getTargetLoweringInfo();

	  std::pair<unsigned, const TargetRegisterClass *> MatchRC =
	      TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
	                                       OpInfo.ConstraintVT);
	  std::pair<unsigned, const TargetRegisterClass *> InputRC =
	      TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode,
	                                       MatchingOpInfo.ConstraintVT);
	  if ((OpInfo.ConstraintVT.isInteger() !=
	       MatchingOpInfo.ConstraintVT.isInteger()) ||
	      (MatchRC.second != InputRC.second)) {
	    // FIXME: error out in a more elegant fashion
	    report_fatal_error("Unsupported asm: input constraint"
	                       " with a matching output constraint of"
	                       " incompatible type!");
	  }
	  MatchingOpInfo.ConstraintVT = OpInfo.ConstraintVT;
	}

	/// Get a direct memory input to behave well as an indirect operand.
	/// This may introduce stores, hence the need for a \p Chain.
	///
	/// On return, OpInfo.CallOperand holds an address: a constant pool entry when
	/// the operand value is a float, integer, or vector constant, otherwise a
	/// newly created stack slot into which the operand has been stored.
	///
	/// \param Chain    Chain that any emitted store is attached to.
	/// \param Location Source location used for the emitted store.
	/// \param OpInfo   Operand to convert; its CallOperand is overwritten.
	/// \param DAG      The DAG that receives the new nodes.
	/// \return The (possibly updated) chain.
	static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
	                                        SDISelAsmOperandInfo &OpInfo,
	                                        SelectionDAG &DAG) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we don't have an indirect input, put it in the constpool if we can,
	  // otherwise spill it to a stack slot.
	  // TODO: This isn't quite right. We need to handle these according to
	  // the addressing mode that the constraint wants. Also, this may take
	  // an additional register for the computation and we don't want that
	  // either.

	  // If the operand is a float, integer, or vector constant, spill to a
	  // constant pool entry to get its address.
	  const Value *OpVal = OpInfo.CallOperandVal;
	  if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
	      isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
	    OpInfo.CallOperand = DAG.getConstantPool(
	        cast<Constant>(OpVal), TLI.getPointerTy(DAG.getDataLayout()));
	    return Chain;
	  }

	  // Otherwise, create a stack slot and emit a store to it before the asm.
	  Type *Ty = OpVal->getType();
	  auto &DL = DAG.getDataLayout();
	  uint64_t TySize = DL.getTypeAllocSize(Ty);
	  unsigned Align = DL.getPrefTypeAlignment(Ty);
	  MachineFunction &MF = DAG.getMachineFunction();
	  int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
	  Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, StackSlot,
	                            MachinePointerInfo::getFixedStack(MF, SSFI),
	                            TLI.getMemValueType(DL, Ty));
	  OpInfo.CallOperand = StackSlot;

	  return Chain;
	}

	/// GetRegistersForValue - Assign registers (virtual or physical) for the
	/// specified operand. We prefer to assign virtual registers, to allow the
	/// register allocator to handle the assignment process. However, if the asm
	/// uses features that we can't model on machineinstrs, we have SDISel do the
	/// allocation. This produces generally horrible, but correct, code.
	///
	/// The function returns without assigning anything for memory constraints,
	/// when no register class can be found for the constraint, and for matching
	/// input constraints. Otherwise the chosen registers are stored in
	/// OpInfo.AssignedRegs. Along the way OpInfo.ConstraintVT (and, for inputs,
	/// OpInfo.CallOperand) may be rewritten to agree with the register class.
	///
	/// OpInfo describes the operand
	/// RefOpInfo describes the matching operand if any, the operand otherwise
	static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
	                                 SDISelAsmOperandInfo &OpInfo,
	                                 SDISelAsmOperandInfo &RefOpInfo) {
	  LLVMContext &Context = *DAG.getContext();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  MachineFunction &MF = DAG.getMachineFunction();
	  SmallVector<unsigned, 4> Regs;
	  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();

	  // No work to do for memory operations.
	  if (OpInfo.ConstraintType == TargetLowering::C_Memory)
	    return;

	  // If this is a constraint for a single physreg, or a constraint for a
	  // register class, find it.
	  unsigned AssignedReg;
	  const TargetRegisterClass *RC;
	  std::tie(AssignedReg, RC) = TLI.getRegForInlineAsmConstraint(
	      &TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
	  // RC is unset only on failure. Return immediately.
	  if (!RC)
	    return;

	  // Get the actual register value type. This is important, because the user
	  // may have asked for (e.g.) the AX register in i32 type. We need to
	  // remember that AX is actually i16 to get the right extension.
	  const MVT RegVT = *TRI.legalclasstypes_begin(*RC);

	  if (OpInfo.ConstraintVT != MVT::Other) {
	    // If this is an FP operand in an integer register (or vice versa), or more
	    // generally if the operand value disagrees with the register class we plan
	    // to stick it in, fix the operand type.
	    //
	    // If this is an input value, the bitcast to the new type is done now.
	    // Bitcast for output value is done at the end of visitInlineAsm().
	    if ((OpInfo.Type == InlineAsm::isOutput ||
	         OpInfo.Type == InlineAsm::isInput) &&
	        !TRI.isTypeLegalForClass(*RC, OpInfo.ConstraintVT)) {
	      // Try to convert to the first EVT that the reg class contains. If the
	      // types are identical size, use a bitcast to convert (e.g. two differing
	      // vector types). Note: output bitcast is done at the end of
	      // visitInlineAsm().
	      if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
	        // Exclude indirect inputs while they are unsupported because the code
	        // to perform the load is missing and thus OpInfo.CallOperand still
	        // refers to the input address rather than the pointed-to value.
	        if (OpInfo.Type == InlineAsm::isInput && !OpInfo.isIndirect)
	          OpInfo.CallOperand =
	              DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand);
	        OpInfo.ConstraintVT = RegVT;
	        // If the operand is an FP value and we want it in integer registers,
	        // use the corresponding integer type. This turns an f64 value into
	        // i64, which can be passed with two i32 values on a 32-bit machine.
	      } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
	        MVT VT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
	        if (OpInfo.Type == InlineAsm::isInput)
	          OpInfo.CallOperand =
	              DAG.getNode(ISD::BITCAST, DL, VT, OpInfo.CallOperand);
	        OpInfo.ConstraintVT = VT;
	      }
	    }
	  }

	  // No need to allocate a matching input constraint since the constraint it's
	  // matching to has already been allocated.
	  if (OpInfo.isMatchingInputConstraint())
	    return;

	  EVT ValueVT = OpInfo.ConstraintVT;
	  if (OpInfo.ConstraintVT == MVT::Other)
	    ValueVT = RegVT;

	  // Initialize NumRegs.
	  unsigned NumRegs = 1;
	  if (OpInfo.ConstraintVT != MVT::Other)
	    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);

	  // If this is a constraint for a specific physical register, like {r17},
	  // assign it now.

	  // If this associated to a specific register, initialize iterator to correct
	  // place. If virtual, make sure we have enough registers

	  // Initialize iterator if necessary
	  TargetRegisterClass::iterator I = RC->begin();
	  MachineRegisterInfo &RegInfo = MF.getRegInfo();

	  // Do not check for single registers.
	  // NOTE(review): the loop condition reads *I before the assert in the loop
	  // body checks I against RC->end().
	  if (AssignedReg) {
	    for (; *I != AssignedReg; ++I)
	      assert(I != RC->end() && "AssignedReg should be member of RC");
	  }

	  // Physical registers are taken consecutively from the class starting at
	  // AssignedReg; otherwise a fresh virtual register is created per slot.
	  for (; NumRegs; --NumRegs, ++I) {
	    assert(I != RC->end() && "Ran out of registers to allocate!");
	    Register R = AssignedReg ? Register(*I) : RegInfo.createVirtualRegister(RC);
	    Regs.push_back(R);
	  }

	  OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
	}

	/// Return the index in \p AsmNodeOperands of the flag word that starts the
	/// operand group for constraint number \p OperandNo.
	///
	/// The scan starts at InlineAsm::Op_FirstOperand and skips \p OperandNo
	/// groups, each consisting of one flag word followed by the number of
	/// registers that flag word encodes. Every skipped group is asserted to be a
	/// register def, early-clobber register def, or memory operand.
	static unsigned
	findMatchingInlineAsmOperand(unsigned OperandNo,
	                             const std::vector<SDValue> &AsmNodeOperands) {
	  // Scan until we find the definition we already emitted of this operand.
	  unsigned CurOp = InlineAsm::Op_FirstOperand;
	  for (; OperandNo; --OperandNo) {
	    // Advance to the next operand.
	    unsigned OpFlag =
	        cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
	    assert((InlineAsm::isRegDefKind(OpFlag) ||
	            InlineAsm::isRegDefEarlyClobberKind(OpFlag) ||
	            InlineAsm::isMemKind(OpFlag)) &&
	           "Skipped past definitions?");
	    CurOp += InlineAsm::getNumOperandRegisters(OpFlag) + 1;
	  }
	  return CurOp;
	}

	namespace {

	/// Accumulates the InlineAsm::Extra_* flag bits for one inline asm call.
	/// The constructor seeds the bits from the asm itself and the call site;
	/// update() adds MayLoad/MayStore bits per constraint.
	class ExtraFlags {
	  unsigned Flags = 0;

	public:
	  /// Seed the flags from the InlineAsm called by \p CS: side effects, stack
	  /// alignment, convergence of the call site, and the asm dialect.
	  explicit ExtraFlags(ImmutableCallSite CS) {
	    const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
	    if (IA->hasSideEffects())
	      Flags |= InlineAsm::Extra_HasSideEffects;
	    if (IA->isAlignStack())
	      Flags |= InlineAsm::Extra_IsAlignStack;
	    if (CS.isConvergent())
	      Flags |= InlineAsm::Extra_IsConvergent;
	    Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
	  }

	  /// Fold the memory behaviour implied by constraint \p OpInfo into the flags.
	  void update(const TargetLowering::AsmOperandInfo &OpInfo) {
	    // Ideally, we would only check against memory constraints. However, the
	    // meaning of an Other constraint can be target-specific and we can't easily
	    // reason about it. Therefore, be conservative and set MayLoad/MayStore
	    // for Other constraints as well.
	    if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
	        OpInfo.ConstraintType == TargetLowering::C_Other) {
	      if (OpInfo.Type == InlineAsm::isInput)
	        Flags |= InlineAsm::Extra_MayLoad;
	      else if (OpInfo.Type == InlineAsm::isOutput)
	        Flags |= InlineAsm::Extra_MayStore;
	      else if (OpInfo.Type == InlineAsm::isClobber)
	        Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
	    }
	  }

	  /// Return the accumulated flag bits.
	  unsigned get() const { return Flags; }
	};

	} // end anonymous namespace

	/// visitInlineAsm - Handle a call to an InlineAsm object.
	void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
	const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());

	/// ConstraintOperands - Information about all of the constraints.
	SDISelAsmOperandInfoVector ConstraintOperands;

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
	DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);

	// First Pass: Calculate HasSideEffects and ExtraFlags (AlignStack,
	// AsmDialect, MayLoad, MayStore).
	bool HasSideEffect = IA->hasSideEffects();
	ExtraFlags ExtraInfo(CS);

	unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
	unsigned ResNo = 0; // ResNo - The result number of the next output.
	for (auto &T : TargetConstraints) {
	ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
	SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();

	// Compute the value type for each operand.
	if (OpInfo.Type == InlineAsm::isInput \|\|
	(OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
	OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));

	// Process the call argument. BasicBlocks are labels, currently appearing
	// only in asm's.
	const Instruction *I = CS.getInstruction();
	if (isa<CallBrInst>(I) &&
	(ArgNo - 1) >= (cast<CallBrInst>(I)->getNumArgOperands() -
	cast<CallBrInst>(I)->getNumIndirectDests())) {
	const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal);
	EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true);
	OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT);
	} else if (const auto *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
	OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
	} else {
	OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
	}

	OpInfo.ConstraintVT =
	OpInfo
	.getCallOperandValEVT(*DAG.getContext(), TLI, DAG.getDataLayout())
	.getSimpleVT();
	} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
	// The return value of the call is this value. As such, there is no
	// corresponding argument.
	assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
	if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
	OpInfo.ConstraintVT = TLI.getSimpleValueType(
	DAG.getDataLayout(), STy->getElementType(ResNo));
	} else {
	assert(ResNo == 0 && "Asm only has one result!");
	OpInfo.ConstraintVT =
	TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
	}
	++ResNo;
	} else {
	OpInfo.ConstraintVT = MVT::Other;
	}

	if (!HasSideEffect)
	HasSideEffect = OpInfo.hasMemory(TLI);

	// Determine if this InlineAsm MayLoad or MayStore based on the constraints.
	// FIXME: Could we compute this on OpInfo rather than T?

	// Compute the constraint code and ConstraintType to use.
	TLI.ComputeConstraintToUse(T, SDValue());

	+ if (T.ConstraintType == TargetLowering::C_Immediate &&
	+ OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
	+ // We've delayed emitting a diagnostic like the "n" constraint because
	+ // inlining could cause an integer showing up.
	+ return emitInlineAsmError(
	+ CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an "
	+ "integer constant expression");
	+
	ExtraInfo.update(T);
	}


	// We won't need to flush pending loads if this asm doesn't touch
	// memory and is nonvolatile.
	SDValue Flag, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot();

	bool IsCallBr = isa<CallBrInst>(CS.getInstruction());
	if (IsCallBr) {
	// If this is a callbr we need to flush pending exports since inlineasm_br
	// is a terminator. We need to do this before nodes are glued to
	// the inlineasm_br node.
	Chain = getControlRoot();
	}

	// Second pass over the constraints: compute which constraint option to use.
	for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
	// If this is an output operand with a matching input operand, look up the
	// matching input. If their types mismatch, e.g. one is an integer, the
	// other is floating point, or their sizes are different, flag it as an
	// error.
	if (OpInfo.hasMatchingInput()) {
	SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
	patchMatchingInput(OpInfo, Input, DAG);
	}

	// Compute the constraint code and ConstraintType to use.
	TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);

	if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	OpInfo.Type == InlineAsm::isClobber)
	continue;

	// If this is a memory input, and if the operand is not indirect, do what we
	// need to provide an address for the memory input.
	if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	!OpInfo.isIndirect) {
	assert((OpInfo.isMultipleAlternative \|\|
	(OpInfo.Type == InlineAsm::isInput)) &&
	"Can only indirectify direct input operands!");

	// Memory operands really want the address of the value.
	Chain = getAddressForMemoryInput(Chain, getCurSDLoc(), OpInfo, DAG);

	// There is no longer a Value* corresponding to this operand.
	OpInfo.CallOperandVal = nullptr;

	// It is now an indirect operand.
	OpInfo.isIndirect = true;
	}

	}

	// AsmNodeOperands - The operands for the ISD::INLINEASM node.
	std::vector<SDValue> AsmNodeOperands;
	AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
	AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
	IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));

	// If we have a !srcloc metadata node associated with it, we want to attach
	// this to the ultimately generated inline asm machineinstr. To do this, we
	// pass in the third operand as this (potentially null) inline asm MDNode.
	const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
	AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));

	// Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
	// bits as operand 3.
	AsmNodeOperands.push_back(DAG.getTargetConstant(
	ExtraInfo.get(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));

	// Third pass: Loop over operands to prepare DAG-level operands. As part of
	// this, assign virtual and physical registers for inputs and outputs.
	for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
	// Assign Registers.
	SDISelAsmOperandInfo &RefOpInfo =
	OpInfo.isMatchingInputConstraint()
	? ConstraintOperands[OpInfo.getMatchedOperand()]
	: OpInfo;
	GetRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);

	switch (OpInfo.Type) {
	case InlineAsm::isOutput:
	if (OpInfo.ConstraintType == TargetLowering::C_Memory \|\|
	- (OpInfo.ConstraintType == TargetLowering::C_Other &&
	+ ((OpInfo.ConstraintType == TargetLowering::C_Immediate \|\|
	+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
	OpInfo.isIndirect)) {
	unsigned ConstraintID =
	TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
	assert(ConstraintID != InlineAsm::Constraint_Unknown &&
	"Failed to convert memory constraint code to constraint id.");

	// Add information to the INLINEASM node to know about this output.
	unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
	OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
	AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(),
	MVT::i32));
	AsmNodeOperands.push_back(OpInfo.CallOperand);
	break;
	- } else if ((OpInfo.ConstraintType == TargetLowering::C_Other &&
	+ } else if (((OpInfo.ConstraintType == TargetLowering::C_Immediate \|\|
	+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
	!OpInfo.isIndirect) \|\|
	OpInfo.ConstraintType == TargetLowering::C_Register \|\|
	OpInfo.ConstraintType == TargetLowering::C_RegisterClass) {
	// Otherwise, this outputs to a register (directly for C_Register /
	- // C_RegisterClass, and a target-defined fashion for C_Other). Find a
	- // register that we can use.
	+ // C_RegisterClass, and a target-defined fashion for
	+ // C_Immediate/C_Other). Find a register that we can use.
	if (OpInfo.AssignedRegs.Regs.empty()) {
	emitInlineAsmError(
	CS, "couldn't allocate output register for constraint '" +
	Twine(OpInfo.ConstraintCode) + "'");
	return;
	}

	// Add information to the INLINEASM node to know that this register is
	// set.
	OpInfo.AssignedRegs.AddInlineAsmOperands(
	OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber
	: InlineAsm::Kind_RegDef,
	false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
	}
	break;

	case InlineAsm::isInput: {
	SDValue InOperandVal = OpInfo.CallOperand;

	if (OpInfo.isMatchingInputConstraint()) {
	// If this is required to match an output register we have already set,
	// just use its register.
	auto CurOp = findMatchingInlineAsmOperand(OpInfo.getMatchedOperand(),
	AsmNodeOperands);
	unsigned OpFlag =
	cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
	if (InlineAsm::isRegDefKind(OpFlag) \|\|
	InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
	// Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
	if (OpInfo.isIndirect) {
	// This happens on gcc/testsuite/gcc.dg/pr8788-1.c
	emitInlineAsmError(CS, "inline asm not supported yet:"
	" don't know how to handle tied "
	"indirect register inputs");
	return;
	}

	MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
	SmallVector<unsigned, 4> Regs;

	if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) {
	unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
	MachineRegisterInfo &RegInfo =
	DAG.getMachineFunction().getRegInfo();
	for (unsigned i = 0; i != NumRegs; ++i)
	Regs.push_back(RegInfo.createVirtualRegister(RC));
	} else {
	emitInlineAsmError(CS, "inline asm error: This value type register "
	"class is not natively supported!");
	return;
	}

	RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType());

	SDLoc dl = getCurSDLoc();
	// Use the produced MatchedRegs object to copy the input value into the
	// new registers and to add them as tied register-use operands.
	MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
	CS.getInstruction());
	MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
	true, OpInfo.getMatchedOperand(), dl,
	DAG, AsmNodeOperands);
	break;
	}

	assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
	assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
	"Unexpected number of operands");
	// Add information to the INLINEASM node to know about this input.
	// See InlineAsm.h isUseOperandTiedToDef.
	OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
	OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
	OpInfo.getMatchedOperand());
	AsmNodeOperands.push_back(DAG.getTargetConstant(
	OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
	AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
	break;
	}

	// Treat indirect 'X' constraint as memory.
	- if (OpInfo.ConstraintType == TargetLowering::C_Other &&
	+ if ((OpInfo.ConstraintType == TargetLowering::C_Immediate \|\|
	+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
	OpInfo.isIndirect)
	OpInfo.ConstraintType = TargetLowering::C_Memory;

	- if (OpInfo.ConstraintType == TargetLowering::C_Other) {
	+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate \|\|
	+ OpInfo.ConstraintType == TargetLowering::C_Other) {
	std::vector<SDValue> Ops;
	TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
	Ops, DAG);
	if (Ops.empty()) {
	+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
	+ if (isa<ConstantSDNode>(InOperandVal)) {
	+ emitInlineAsmError(CS, "value out of range for constraint '" +
	+ Twine(OpInfo.ConstraintCode) + "'");
	+ return;
	+ }
	+
	emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
	Twine(OpInfo.ConstraintCode) + "'");
	return;
	}

	// Add information to the INLINEASM node to know about this input.
	unsigned ResOpType =
	InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
	AsmNodeOperands.push_back(DAG.getTargetConstant(
	ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
	AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
	break;
	}

	if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
	assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
	assert(InOperandVal.getValueType() ==
	TLI.getPointerTy(DAG.getDataLayout()) &&
	"Memory operands expect pointer values");

	unsigned ConstraintID =
	TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
	assert(ConstraintID != InlineAsm::Constraint_Unknown &&
	"Failed to convert memory constraint code to constraint id.");

	// Add information to the INLINEASM node to know about this input.
	unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
	ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
	AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
	getCurSDLoc(),
	MVT::i32));
	AsmNodeOperands.push_back(InOperandVal);
	break;
	}

	assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass \|\|
	- OpInfo.ConstraintType == TargetLowering::C_Register) &&
	+ OpInfo.ConstraintType == TargetLowering::C_Register \|\|
	+ OpInfo.ConstraintType == TargetLowering::C_Immediate) &&
	"Unknown constraint type!");

	// TODO: Support this.
	if (OpInfo.isIndirect) {
	emitInlineAsmError(
	CS, "Don't know how to handle indirect register inputs yet "
	"for constraint '" +
	Twine(OpInfo.ConstraintCode) + "'");
	return;
	}

	// Copy the input into the appropriate registers.
	if (OpInfo.AssignedRegs.Regs.empty()) {
	emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
	Twine(OpInfo.ConstraintCode) + "'");
	return;
	}

	SDLoc dl = getCurSDLoc();

	OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl,
	Chain, &Flag, CS.getInstruction());

	OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
	dl, DAG, AsmNodeOperands);
	break;
	}
	case InlineAsm::isClobber:
	// Add the clobbered value to the operand list, so that the register
	// allocator is aware that the physreg got clobbered.
	if (!OpInfo.AssignedRegs.Regs.empty())
	OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
	false, 0, getCurSDLoc(), DAG,
	AsmNodeOperands);
	break;
	}
	}

	// Finish up input operands. Set the input chain and add the flag last.
	AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
	if (Flag.getNode()) AsmNodeOperands.push_back(Flag);

	unsigned ISDOpc = IsCallBr ? ISD::INLINEASM_BR : ISD::INLINEASM;
	Chain = DAG.getNode(ISDOpc, getCurSDLoc(),
	DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
	Flag = Chain.getValue(1);

	// Do additional work to generate outputs.

	SmallVector<EVT, 1> ResultVTs;
	SmallVector<SDValue, 1> ResultValues;
	SmallVector<SDValue, 8> OutChains;

	llvm::Type *CSResultType = CS.getType();
	ArrayRef<Type *> ResultTypes;
	if (StructType *StructResult = dyn_cast<StructType>(CSResultType))
	ResultTypes = StructResult->elements();
	else if (!CSResultType->isVoidTy())
	ResultTypes = makeArrayRef(CSResultType);

	auto CurResultType = ResultTypes.begin();
	auto handleRegAssign = [&](SDValue V) {
	assert(CurResultType != ResultTypes.end() && "Unexpected value");
	assert((*CurResultType)->isSized() && "Unexpected unsized type");
	EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), *CurResultType);
	++CurResultType;
	// If the type of the inline asm call site return value is different but has
	// same size as the type of the asm output bitcast it. One example of this
	// is for vectors with different width / number of elements. This can
	// happen for register classes that can contain multiple different value
	// types. The preg or vreg allocated may not have the same VT as was
	// expected.
	//
	// This can also happen for a return value that disagrees with the register
	// class it is put in, eg. a double in a general-purpose register on a
	// 32-bit machine.
	if (ResultVT != V.getValueType() &&
	ResultVT.getSizeInBits() == V.getValueSizeInBits())
	V = DAG.getNode(ISD::BITCAST, getCurSDLoc(), ResultVT, V);
	else if (ResultVT != V.getValueType() && ResultVT.isInteger() &&
	V.getValueType().isInteger()) {
	// If a result value was tied to an input value, the computed result
	// may have a wider width than the expected result. Extract the
	// relevant portion.
	V = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultVT, V);
	}
	assert(ResultVT == V.getValueType() && "Asm result value mismatch!");
	ResultVTs.push_back(ResultVT);
	ResultValues.push_back(V);
	};

	// Deal with output operands.
	for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
	if (OpInfo.Type == InlineAsm::isOutput) {
	SDValue Val;
	// Skip trivial output operands.
	if (OpInfo.AssignedRegs.Regs.empty())
	continue;

	switch (OpInfo.ConstraintType) {
	case TargetLowering::C_Register:
	case TargetLowering::C_RegisterClass:
	Val = OpInfo.AssignedRegs.getCopyFromRegs(
	DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction());
	break;
	+ case TargetLowering::C_Immediate:
	case TargetLowering::C_Other:
	Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(),
	OpInfo, DAG);
	break;
	case TargetLowering::C_Memory:
	break; // Already handled.
	case TargetLowering::C_Unknown:
	assert(false && "Unexpected unknown constraint");
	}

	// Indirect output manifest as stores. Record output chains.
	if (OpInfo.isIndirect) {
	const Value *Ptr = OpInfo.CallOperandVal;
	assert(Ptr && "Expected value CallOperandVal for indirect asm operand");
	SDValue Store = DAG.getStore(Chain, getCurSDLoc(), Val, getValue(Ptr),
	MachinePointerInfo(Ptr));
	OutChains.push_back(Store);
	} else {
	// generate CopyFromRegs to associated registers.
	assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
	if (Val.getOpcode() == ISD::MERGE_VALUES) {
	for (const SDValue &V : Val->op_values())
	handleRegAssign(V);
	} else
	handleRegAssign(Val);
	}
	}
	}

	// Set results.
	if (!ResultValues.empty()) {
	assert(CurResultType == ResultTypes.end() &&
	"Mismatch in number of ResultTypes");
	assert(ResultValues.size() == ResultTypes.size() &&
	"Mismatch in number of output operands in asm result");

	SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(ResultVTs), ResultValues);
	setValue(CS.getInstruction(), V);
	}

	// Collect store chains.
	if (!OutChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains);

	// Only Update Root if inline assembly has a memory effect.
	if (ResultValues.empty() \|\| HasSideEffect \|\| !OutChains.empty() \|\| IsCallBr)
	DAG.setRoot(Chain);
	}

	/// Emit \p Message as an error on the instruction behind \p CS and then give
	/// that instruction a placeholder value so the DAG stays well formed.
	///
	/// If the call site's type yields no value types, nothing is bound. Otherwise
	/// the instruction is mapped to a merge of one UNDEF node per value type.
	void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
	                                             const Twine &Message) {
	  const Instruction *AsmInst = CS.getInstruction();
	  DAG.getContext()->emitError(AsmInst, Message);

	  // Make sure we leave the DAG in a valid state: compute the value types the
	  // call site is expected to produce.
	  SmallVector<EVT, 1> ResultVTs;
	  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
	                  CS->getType(), ResultVTs);

	  // No result values means there is nothing to bind.
	  if (ResultVTs.empty())
	    return;

	  // One UNDEF per expected result, merged into a single value.
	  SmallVector<SDValue, 1> Placeholders;
	  for (EVT VT : ResultVTs)
	    Placeholders.push_back(DAG.getUNDEF(VT));

	  setValue(AsmInst, DAG.getMergeValues(Placeholders, getCurSDLoc()));
	}

	void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(0))));
	}

	void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const DataLayout &DL = DAG.getDataLayout();
	SDValue V = DAG.getVAArg(
	TLI.getMemValueType(DAG.getDataLayout(), I.getType()), getCurSDLoc(),
	getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)),
	DL.getABITypeAlignment(I.getType()));
	DAG.setRoot(V.getValue(1));

	if (I.getType()->isPointerTy())
	V = DAG.getPtrExtOrTrunc(
	V, getCurSDLoc(), TLI.getValueType(DAG.getDataLayout(), I.getType()));
	setValue(&I, V);
	}

	void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(0))));
	}

	void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	DAG.getSrcValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(1))));
	}

	/// Wrap \p Op in an AssertZext node derived from the !range metadata on \p I.
	///
	/// \p Op is returned unchanged when \p I has no range metadata, when the
	/// range is full, empty or upper-wrapped, or when the range's unsigned
	/// minimum fails APInt::isMinValue(). Otherwise the first result of \p Op is
	/// asserted to be zero-extended from an integer type just wide enough for the
	/// range's unsigned maximum (but at least IntegerType::MIN_INT_BITS wide).
	/// When \p Op's node produces several values, the remaining ones are passed
	/// through unchanged in a merge node.
	SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
	                                                    const Instruction &I,
	                                                    SDValue Op) {
	  const MDNode *RangeMD = I.getMetadata(LLVMContext::MD_range);
	  if (!RangeMD)
	    return Op;

	  ConstantRange Range = getConstantRangeFromMetadata(*RangeMD);
	  if (Range.isFullSet() || Range.isEmptySet() || Range.isUpperWrapped())
	    return Op;

	  // Only ranges whose unsigned lower bound is the minimum value are handled.
	  APInt LowerBound = Range.getUnsignedMin();
	  if (!LowerBound.isMinValue())
	    return Op;

	  // Width of the narrowest integer type able to hold the upper bound.
	  APInt UpperBound = Range.getUnsignedMax();
	  const unsigned MinWidth = static_cast<unsigned>(IntegerType::MIN_INT_BITS);
	  const unsigned Width = std::max(UpperBound.getActiveBits(), MinWidth);
	  EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Width);

	  SDLoc SL = getCurSDLoc();
	  SDValue Asserted = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op,
	                                 DAG.getValueType(NarrowVT));

	  const unsigned NumResults = Op.getNode()->getNumValues();
	  if (NumResults == 1)
	    return Asserted;

	  // Keep every other result of the original node as-is.
	  SmallVector<SDValue, 4> Merged;
	  Merged.push_back(Asserted);
	  for (unsigned ResIdx = 1; ResIdx != NumResults; ++ResIdx)
	    Merged.push_back(Op.getValue(ResIdx));

	  return DAG.getMergeValues(Merged, SL);
	}

	/// Populate a CallLoweringInfo (into \p CLI) based on the properties of
	/// the call being lowered.
	///
	/// This is a helper for lowering intrinsics that follow a target calling
	/// convention or require stack pointer adjustment. Only a subset of the
	/// intrinsic's operands need to participate in the calling convention:
	/// the \p NumArgs operands of \p Call starting at operand index \p ArgIdx.
	///
	/// \p Callee and \p ReturnTy are forwarded to CLI.setCallee() together with
	/// the call's calling convention; \p IsPatchPoint is forwarded to
	/// CLI.setIsPatchPoint().
	void SelectionDAGBuilder::populateCallLoweringInfo(
	    TargetLowering::CallLoweringInfo &CLI, const CallBase *Call,
	    unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
	    bool IsPatchPoint) {
	  TargetLowering::ArgListTy Args;
	  Args.reserve(NumArgs);

	  // Populate the argument list.
	  // Attributes for args start at offset 1, after the return attribute.
	  const unsigned EndIdx = ArgIdx + NumArgs;
	  for (unsigned OpNo = ArgIdx; OpNo != EndIdx; ++OpNo) {
	    const Value *Arg = Call->getOperand(OpNo);
	    Type *ArgTy = Arg->getType();

	    assert(!ArgTy->isEmptyTy() && "Empty type passed to intrinsic.");

	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = getValue(Arg);
	    Entry.Ty = ArgTy;
	    Entry.setAttributes(Call, OpNo);
	    Args.push_back(Entry);
	  }

	  CLI.setDebugLoc(getCurSDLoc());
	  CLI.setChain(getRoot());
	  CLI.setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args));
	  // The result is discarded when the call has no uses.
	  CLI.setDiscardResult(Call->use_empty());
	  CLI.setIsPatchPoint(IsPatchPoint);
	}

	/// Add a stack map intrinsic call's live variable operands to a stackmap
	/// or patchpoint target node's operand list.
	///
	/// Arguments of \p CS from index \p StartIdx onwards are appended to \p Ops.
	///
	/// Constants are converted to TargetConstants purely as an optimization to
	/// avoid constant materialization and register allocation. Each constant is
	/// emitted as a StackMaps::ConstantOp marker followed by its sign-extended
	/// value, both as i64.
	///
	/// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not
	/// generate address computation nodes, and so FinalizeISel can convert the
	/// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids
	/// address materialization and register allocation, but may also be required
	/// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an
	/// alloca in the entry block, then the runtime may assume that the alloca's
	/// StackMap location can be read immediately after compilation and that the
	/// location is valid at any point during execution (this is similar to the
	/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
	/// only available in a register, then the runtime would need to trap when
	/// execution reaches the StackMap in order to read the alloca's location.
	///
	/// Every other operand is appended unchanged.
	static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
	                                const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
	                                SelectionDAGBuilder &Builder) {
	  SelectionDAG &DAG = Builder.DAG;

	  for (unsigned ArgNo = StartIdx, NumArgs = CS.arg_size(); ArgNo != NumArgs;
	       ++ArgNo) {
	    SDValue LiveVar = Builder.getValue(CS.getArgument(ArgNo));

	    // Constant: marker operand followed by the value itself.
	    if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(LiveVar)) {
	      Ops.push_back(
	          DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
	      Ops.push_back(
	          DAG.getTargetConstant(Const->getSExtValue(), DL, MVT::i64));
	      continue;
	    }

	    // Frame index: switch to the target flavour of the node.
	    if (FrameIndexSDNode *Frame = dyn_cast<FrameIndexSDNode>(LiveVar)) {
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      Ops.push_back(DAG.getTargetFrameIndex(
	          Frame->getIndex(), TLI.getFrameIndexTy(DAG.getDataLayout())));
	      continue;
	    }

	    Ops.push_back(LiveVar);
	  }
	}

	/// Lower llvm.experimental.stackmap directly to its target opcode.
	void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
	// void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
	// [live variables...])

	assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");

	SDValue Chain, InFlag, Callee, NullPtr;
	SmallVector<SDValue, 32> Ops;

	SDLoc DL = getCurSDLoc();
	Callee = getValue(CI.getCalledValue());
	NullPtr = DAG.getIntPtrConstant(0, DL, true);

	// The stackmap intrinsic only records the live variables (the arguemnts
	// passed to it) and emits NOPS (if requested). Unlike the patchpoint
	// intrinsic, this won't be lowered to a function call. This means we don't
	// have to worry about calling conventions and target specific lowering code.
	// Instead we perform the call lowering right here.
	//
	// chain, flag = CALLSEQ_START(chain, 0, 0)
	// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
	// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
	//
	Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
	InFlag = Chain.getValue(1);

	// Add the <id> and <numBytes> constants.
	SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos));
	Ops.push_back(DAG.getTargetConstant(
	cast<ConstantSDNode>(IDVal)->getZExtValue(), DL, MVT::i64));
	SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos));
	Ops.push_back(DAG.getTargetConstant(
	cast<ConstantSDNode>(NBytesVal)->getZExtValue(), DL,
	MVT::i32));

	// Push live variables for the stack map.
	addStackMapLiveVars(&CI, 2, DL, Ops, *this);

	// We are not pushing any register mask info here on the operands list,
	// because the stackmap doesn't clobber anything.

	// Push the chain and the glue flag.
	Ops.push_back(Chain);
	Ops.push_back(InFlag);

	// Create the STACKMAP node.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops);
	Chain = SDValue(SM, 0);
	InFlag = Chain.getValue(1);

	Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL);

	// Stackmaps don't generate values, so nothing goes into the NodeMap.

	// Set the root to the target-lowered call chain.
	DAG.setRoot(Chain);

	// Inform the Frame Information that we have a stackmap in this function.
	FuncInfo.MF->getFrameInfo().setHasStackMap();
	}

	/// Lower llvm.experimental.patchpoint directly to its target opcode.
	///
	/// The patchpoint is first lowered like a regular call via lowerInvokable()
	/// (with no call arguments when the AnyReg calling convention is used). The
	/// target call node found in the resulting call sequence is then replaced by
	/// a TargetOpcode::PATCHPOINT machine node whose operands are, in order:
	///   <id>, <numBytes>, callee, <numArgs>, calling convention,
	///   [AnyReg arguments], the call node's operands from index 2 up to the
	///   register mask, live variables, register mask, chain, [glue].
	///
	/// \param CS      The patchpoint call site.
	/// \param EHPadBB Forwarded unchanged to lowerInvokable().
	void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
	                                          const BasicBlock *EHPadBB) {
	  // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
	  //                                                 i32 <numBytes>,
	  //                                                 i8* <target>,
	  //                                                 i32 <numArgs>,
	  //                                                 [Args...],
	  //                                                 [live variables...])

	  CallingConv::ID CC = CS.getCallingConv();
	  bool IsAnyRegCC = CC == CallingConv::AnyReg;
	  bool HasDef = !CS->getType()->isVoidTy();
	  SDLoc dl = getCurSDLoc();
	  SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos));

	  // Handle immediate and symbolic callees.
	  if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
	    Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl,
	                                   /*isTarget=*/true);
	  else if (auto* SymbolicCallee = dyn_cast<GlobalAddressSDNode>(Callee))
	    Callee = DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(),
	                                        SDLoc(SymbolicCallee),
	                                        SymbolicCallee->getValueType(0));

	  // Get the real number of arguments participating in the call <numArgs>
	  SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
	  unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();

	  // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
	  // Intrinsics include all meta-operands up to but not including CC.
	  unsigned NumMetaOpers = PatchPointOpers::CCPos;
	  assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
	         "Not enough arguments provided to the patchpoint intrinsic");

	  // For AnyRegCC the arguments are lowered later on manually.
	  unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
	  Type *ReturnTy =
	      IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  populateCallLoweringInfo(CLI, cast<CallBase>(CS.getInstruction()),
	                           NumMetaOpers, NumCallArgs, Callee, ReturnTy, true);
	  std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);

	  // Result.second is the chain; when a value is defined it may be a
	  // CopyFromReg, in which case step back to the node it is chained on.
	  SDNode *CallEnd = Result.second.getNode();
	  if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
	    CallEnd = CallEnd->getOperand(0).getNode();

	  // Get a call instruction from the call sequence chain.
	  // Tail calls are not allowed.
	  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
	         "Expected a callseq node.");
	  SDNode *Call = CallEnd->getOperand(0).getNode();
	  bool HasGlue = Call->getGluedNode();

	  // Replace the target specific call node with the patchable intrinsic.
	  SmallVector<SDValue, 8> Ops;

	  // Add the <id> and <numBytes> constants.
	  SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
	  Ops.push_back(DAG.getTargetConstant(
	      cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
	  SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
	  Ops.push_back(DAG.getTargetConstant(
	      cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
	      MVT::i32));

	  // Add the callee.
	  Ops.push_back(Callee);

	  // Adjust <numArgs> to account for any arguments that have been passed on the
	  // stack instead.
	  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
	  unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
	  NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
	  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32));

	  // Add the calling convention
	  Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32));

	  // Add the arguments we omitted previously. The register allocator should
	  // place these in any free register.
	  if (IsAnyRegCC)
	    for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
	      Ops.push_back(getValue(CS.getArgument(i)));

	  // Push the arguments from the call instruction up to the register mask.
	  SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
	  Ops.append(Call->op_begin() + 2, e);

	  // Push live variables for the stack map.
	  addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this);

	  // Push the register mask info.
	  if (HasGlue)
	    Ops.push_back(*(Call->op_end()-2));
	  else
	    Ops.push_back(*(Call->op_end()-1));

	  // Push the chain (this is originally the first operand of the call, but
	  // becomes now the last or second to last operand).
	  Ops.push_back(*(Call->op_begin()));

	  // Push the glue flag (last operand).
	  if (HasGlue)
	    Ops.push_back(*(Call->op_end()-1));

	  SDVTList NodeTys;
	  if (IsAnyRegCC && HasDef) {
	    // Create the return types based on the intrinsic definition
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    SmallVector<EVT, 3> ValueVTs;
	    ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
	    assert(ValueVTs.size() == 1 && "Expected only one return value type.");

	    // There is always a chain and a glue type at the end
	    ValueVTs.push_back(MVT::Other);
	    ValueVTs.push_back(MVT::Glue);
	    NodeTys = DAG.getVTList(ValueVTs);
	  } else
	    NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // Replace the target specific call node with a PATCHPOINT node.
	  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
	                                         dl, NodeTys, Ops);

	  // Update the NodeMap. With AnyReg the value comes from the PATCHPOINT node
	  // itself; otherwise it is the value produced by the lowered call.
	  if (HasDef) {
	    if (IsAnyRegCC)
	      setValue(CS.getInstruction(), SDValue(MN, 0));
	    else
	      setValue(CS.getInstruction(), Result.first);
	  }

	  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
	  // call sequence. Furthermore the location of the chain and glue can change
	  // when the AnyReg calling convention is used and the intrinsic returns a
	  // value.
	  if (IsAnyRegCC && HasDef) {
	    SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
	    SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
	    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
	  } else
	    DAG.ReplaceAllUsesWith(Call, MN);
	  DAG.DeleteNode(Call);

	  // Inform the Frame Information that we have a patchpoint in this function.
	  FuncInfo.MF->getFrameInfo().setHasPatchPoint();
	}

	void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
	unsigned Intrinsic) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2;
	if (I.getNumArgOperands() > 1)
	Op2 = getValue(I.getArgOperand(1));
	SDLoc dl = getCurSDLoc();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	SDValue Res;
	FastMathFlags FMF;
	if (isa<FPMathOperator>(I))
	FMF = I.getFastMathFlags();

	switch (Intrinsic) {
	case Intrinsic::experimental_vector_reduce_v2_fadd:
	if (FMF.allowReassoc())
	Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
	DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2));
	else
	Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
	break;
	case Intrinsic::experimental_vector_reduce_v2_fmul:
	if (FMF.allowReassoc())
	Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
	DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2));
	else
	Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
	break;
	case Intrinsic::experimental_vector_reduce_add:
	Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_mul:
	Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_and:
	Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_or:
	Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_xor:
	Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_smax:
	Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_smin:
	Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_umax:
	Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_umin:
	Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_fmax:
	Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_fmin:
	Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1);
	break;
	default:
	llvm_unreachable("Unhandled vector reduce intrinsic");
	}
	setValue(&I, Res);
	}

	/// Returns an AttributeList representing the attributes applied to the return
	/// value of the given call.
	///
	/// SExt, ZExt and InReg are added (in that order) for each of CLI.RetSExt,
	/// CLI.RetZExt and CLI.IsInReg that is set; the list is created in the
	/// context of CLI.RetTy at AttributeList::ReturnIndex.
	static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
	  SmallVector<Attribute::AttrKind, 2> Kinds;
	  auto AddIf = [&Kinds](bool Enabled, Attribute::AttrKind Kind) {
	    if (Enabled)
	      Kinds.push_back(Kind);
	  };

	  AddIf(CLI.RetSExt, Attribute::SExt);
	  AddIf(CLI.RetZExt, Attribute::ZExt);
	  AddIf(CLI.IsInReg, Attribute::InReg);

	  LLVMContext &Ctx = CLI.RetTy->getContext();
	  return AttributeList::get(Ctx, AttributeList::ReturnIndex, Kinds);
	}

	/// TargetLowering::LowerCallTo - This is the default LowerCallTo
	/// implementation, which just calls LowerCall.
	/// FIXME: When all targets are
	/// migrated to using LowerCall, this hook should be integrated into SDISel.
	///
	/// Populates CLI.Ins (the expected return-value parts), CLI.Outs and
	/// CLI.OutVals (the outgoing argument parts), invokes LowerCall, and then
	/// reassembles the lowered parts into the call's result. When the return
	/// value cannot be lowered directly, a stack slot is created and passed as a
	/// hidden sret argument, and the result is loaded back from that slot.
	///
	/// \returns a (result, chain) pair. For a tail call both values are null; when
	/// there are no return values the result is null and the chain is CLI.Chain;
	/// otherwise the result is a MERGE_VALUES node over the return values.
	std::pair<SDValue, SDValue>
	TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
	  // Handle the incoming return values from the call.
	  CLI.Ins.clear();
	  Type *OrigRetTy = CLI.RetTy;
	  SmallVector<EVT, 4> RetTys;
	  SmallVector<uint64_t, 4> Offsets;
	  auto &DL = CLI.DAG.getDataLayout();
	  ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);

	  if (CLI.IsPostTypeLegalization) {
	    // If we are lowering a libcall after legalization, split the return type.
	    SmallVector<EVT, 4> OldRetTys;
	    SmallVector<uint64_t, 4> OldOffsets;
	    RetTys.swap(OldRetTys);
	    Offsets.swap(OldOffsets);

	    for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
	      EVT RetVT = OldRetTys[i];
	      uint64_t Offset = OldOffsets[i];
	      MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
	      unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
	      unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
	      RetTys.append(NumRegs, RegisterVT);
	      for (unsigned j = 0; j != NumRegs; ++j)
	        Offsets.push_back(Offset + j * RegisterVTByteSZ);
	    }
	  }

	  SmallVector<ISD::OutputArg, 4> Outs;
	  GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);

	  bool CanLowerReturn =
	      this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
	                           CLI.IsVarArg, Outs, CLI.RetTy->getContext());

	  SDValue DemoteStackSlot;
	  // Placeholder value; a real frame index is only assigned (and only read)
	  // on the !CanLowerReturn paths below.
	  int DemoteStackIdx = -100;
	  if (!CanLowerReturn) {
	    // FIXME: equivalent assert?
	    // assert(!CS.hasInAllocaArgument() &&
	    // "sret demotion is incompatible with inalloca");
	    uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
	    unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
	    MachineFunction &MF = CLI.DAG.getMachineFunction();
	    DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
	    Type *StackSlotPtrType = PointerType::get(CLI.RetTy,
	                                              DL.getAllocaAddrSpace());

	    DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
	    ArgListEntry Entry;
	    Entry.Node = DemoteStackSlot;
	    Entry.Ty = StackSlotPtrType;
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Entry.IsInReg = false;
	    Entry.IsSRet = true;
	    Entry.IsNest = false;
	    Entry.IsByVal = false;
	    Entry.IsReturned = false;
	    Entry.IsSwiftSelf = false;
	    Entry.IsSwiftError = false;
	    Entry.Alignment = Align;
	    // The hidden sret pointer becomes the first (fixed) argument, and the
	    // call itself now returns void.
	    CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
	    CLI.NumFixedArgs += 1;
	    CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());

	    // sret demotion isn't compatible with tail-calls, since the sret argument
	    // points into the caller's stack frame.
	    CLI.IsTailCall = false;
	  } else {
	    bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
	        CLI.RetTy, CLI.CallConv, CLI.IsVarArg);
	    for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
	      ISD::ArgFlagsTy Flags;
	      if (NeedsRegBlock) {
	        Flags.setInConsecutiveRegs();
	        if (I == RetTys.size() - 1)
	          Flags.setInConsecutiveRegsLast();
	      }
	      EVT VT = RetTys[I];
	      MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
	                                                     CLI.CallConv, VT);
	      unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
	                                                       CLI.CallConv, VT);
	      // One InputArg per register part of this return value.
	      for (unsigned i = 0; i != NumRegs; ++i) {
	        ISD::InputArg MyFlags;
	        MyFlags.Flags = Flags;
	        MyFlags.VT = RegisterVT;
	        MyFlags.ArgVT = VT;
	        MyFlags.Used = CLI.IsReturnValueUsed;
	        if (CLI.RetTy->isPointerTy()) {
	          MyFlags.Flags.setPointer();
	          MyFlags.Flags.setPointerAddrSpace(
	              cast<PointerType>(CLI.RetTy)->getAddressSpace());
	        }
	        if (CLI.RetSExt)
	          MyFlags.Flags.setSExt();
	        if (CLI.RetZExt)
	          MyFlags.Flags.setZExt();
	        if (CLI.IsInReg)
	          MyFlags.Flags.setInReg();
	        CLI.Ins.push_back(MyFlags);
	      }
	    }
	  }

	  // We push in swifterror return as the last element of CLI.Ins.
	  ArgListTy &Args = CLI.getArgs();
	  if (supportSwiftError()) {
	    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
	      if (Args[i].IsSwiftError) {
	        ISD::InputArg MyFlags;
	        MyFlags.VT = getPointerTy(DL);
	        MyFlags.ArgVT = EVT(getPointerTy(DL));
	        MyFlags.Flags.setSwiftError();
	        CLI.Ins.push_back(MyFlags);
	      }
	    }
	  }

	  // Handle all of the outgoing arguments.
	  CLI.Outs.clear();
	  CLI.OutVals.clear();
	  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
	    SmallVector<EVT, 4> ValueVTs;
	    ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
	    // FIXME: Split arguments if CLI.IsPostTypeLegalization
	    Type *FinalType = Args[i].Ty;
	    if (Args[i].IsByVal)
	      FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
	    bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
	        FinalType, CLI.CallConv, CLI.IsVarArg);
	    for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
	         ++Value) {
	      EVT VT = ValueVTs[Value];
	      Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
	      SDValue Op = SDValue(Args[i].Node.getNode(),
	                           Args[i].Node.getResNo() + Value);
	      ISD::ArgFlagsTy Flags;

	      // Certain targets (such as MIPS), may have a different ABI alignment
	      // for a type depending on the context. Give the target a chance to
	      // specify the alignment it wants.
	      unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);

	      if (Args[i].Ty->isPointerTy()) {
	        Flags.setPointer();
	        Flags.setPointerAddrSpace(
	            cast<PointerType>(Args[i].Ty)->getAddressSpace());
	      }
	      if (Args[i].IsZExt)
	        Flags.setZExt();
	      if (Args[i].IsSExt)
	        Flags.setSExt();
	      if (Args[i].IsInReg) {
	        // If we are using vectorcall calling convention, a structure that is
	        // passed InReg - is surely an HVA
	        if (CLI.CallConv == CallingConv::X86_VectorCall &&
	            isa<StructType>(FinalType)) {
	          // The first value of a structure is marked
	          if (0 == Value)
	            Flags.setHvaStart();
	          Flags.setHva();
	        }
	        // Set InReg Flag
	        Flags.setInReg();
	      }
	      if (Args[i].IsSRet)
	        Flags.setSRet();
	      if (Args[i].IsSwiftSelf)
	        Flags.setSwiftSelf();
	      if (Args[i].IsSwiftError)
	        Flags.setSwiftError();
	      if (Args[i].IsByVal)
	        Flags.setByVal();
	      if (Args[i].IsInAlloca) {
	        Flags.setInAlloca();
	        // Set the byval flag for CCAssignFn callbacks that don't know about
	        // inalloca. This way we can know how many bytes we should've allocated
	        // and how many bytes a callee cleanup function will pop. If we port
	        // inalloca to more targets, we'll have to add custom inalloca handling
	        // in the various CC lowering callbacks.
	        Flags.setByVal();
	      }
	      if (Args[i].IsByVal || Args[i].IsInAlloca) {
	        PointerType *Ty = cast<PointerType>(Args[i].Ty);
	        Type *ElementTy = Ty->getElementType();

	        // Prefer the explicit byval type when one was recorded; otherwise
	        // fall back to the pointee type.
	        unsigned FrameSize = DL.getTypeAllocSize(
	            Args[i].ByValType ? Args[i].ByValType : ElementTy);
	        Flags.setByValSize(FrameSize);

	        // For ByVal, alignment should come from FE. BE will guess if this
	        // info is not there but there are cases it cannot get right.
	        unsigned FrameAlign;
	        if (Args[i].Alignment)
	          FrameAlign = Args[i].Alignment;
	        else
	          FrameAlign = getByValTypeAlignment(ElementTy, DL);
	        Flags.setByValAlign(FrameAlign);
	      }
	      if (Args[i].IsNest)
	        Flags.setNest();
	      if (NeedsRegBlock)
	        Flags.setInConsecutiveRegs();
	      Flags.setOrigAlign(OriginalAlignment);

	      MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
	                                                 CLI.CallConv, VT);
	      unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
	                                                        CLI.CallConv, VT);
	      SmallVector<SDValue, 4> Parts(NumParts);
	      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;

	      if (Args[i].IsSExt)
	        ExtendKind = ISD::SIGN_EXTEND;
	      else if (Args[i].IsZExt)
	        ExtendKind = ISD::ZERO_EXTEND;

	      // Conservatively only handle 'returned' on non-vectors that can be lowered,
	      // for now.
	      if (Args[i].IsReturned && !Op.getValueType().isVector() &&
	          CanLowerReturn) {
	        assert((CLI.RetTy == Args[i].Ty ||
	                (CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() &&
	                 CLI.RetTy->getPointerAddressSpace() ==
	                     Args[i].Ty->getPointerAddressSpace())) &&
	               RetTys.size() == NumValues && "unexpected use of 'returned'");
	        // Before passing 'returned' to the target lowering code, ensure that
	        // either the register MVT and the actual EVT are the same size or that
	        // the return value and argument are extended in the same way; in these
	        // cases it's safe to pass the argument register value unchanged as the
	        // return register value (although it's at the target's option whether
	        // to do so)
	        // TODO: allow code generation to take advantage of partially preserved
	        // registers rather than clobbering the entire register when the
	        // parameter extension method is not compatible with the return
	        // extension method
	        if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
	            (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
	             CLI.RetZExt == Args[i].IsZExt))
	          Flags.setReturned();
	      }

	      getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
	                     CLI.CS.getInstruction(), CLI.CallConv, ExtendKind);

	      for (unsigned j = 0; j != NumParts; ++j) {
	        // if it isn't first piece, alignment must be 1
	        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
	                               i < CLI.NumFixedArgs,
	                               i, j*Parts[j].getValueType().getStoreSize());
	        if (NumParts > 1 && j == 0)
	          MyFlags.Flags.setSplit();
	        else if (j != 0) {
	          MyFlags.Flags.setOrigAlign(1);
	          if (j == NumParts - 1)
	            MyFlags.Flags.setSplitEnd();
	        }

	        CLI.Outs.push_back(MyFlags);
	        CLI.OutVals.push_back(Parts[j]);
	      }

	      if (NeedsRegBlock && Value == NumValues - 1)
	        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
	    }
	  }

	  SmallVector<SDValue, 4> InVals;
	  CLI.Chain = LowerCall(CLI, InVals);

	  // Update CLI.InVals to use outside of this function.
	  CLI.InVals = InVals;

	  // Verify that the target's LowerCall behaved as expected.
	  assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
	         "LowerCall didn't return a valid chain!");
	  assert((!CLI.IsTailCall || InVals.empty()) &&
	         "LowerCall emitted a return value for a tail call!");
	  assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
	         "LowerCall didn't emit the correct number of values!");

	  // For a tail call, the return value is merely live-out and there aren't
	  // any nodes in the DAG representing it. Return a special value to
	  // indicate that a tail call has been emitted and no more Instructions
	  // should be processed in the current block.
	  if (CLI.IsTailCall) {
	    CLI.DAG.setRoot(CLI.Chain);
	    return std::make_pair(SDValue(), SDValue());
	  }

	#ifndef NDEBUG
	  for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
	    assert(InVals[i].getNode() && "LowerCall emitted a null value!");
	    assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
	           "LowerCall emitted a value with the wrong type!");
	  }
	#endif

	  SmallVector<SDValue, 4> ReturnValues;
	  if (!CanLowerReturn) {
	    // The instruction result is the result of loading from the
	    // hidden sret parameter.
	    SmallVector<EVT, 1> PVTs;
	    Type *PtrRetTy = OrigRetTy->getPointerTo(DL.getAllocaAddrSpace());

	    ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
	    assert(PVTs.size() == 1 && "Pointers should fit in one register");
	    EVT PtrVT = PVTs[0];

	    unsigned NumValues = RetTys.size();
	    ReturnValues.resize(NumValues);
	    SmallVector<SDValue, 4> Chains(NumValues);

	    // An aggregate return value cannot wrap around the address space, so
	    // offsets to its parts don't wrap either.
	    SDNodeFlags Flags;
	    Flags.setNoUnsignedWrap(true);

	    // Load each return-value part from its offset within the demoted slot.
	    for (unsigned i = 0; i < NumValues; ++i) {
	      SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
	                                    CLI.DAG.getConstant(Offsets[i], CLI.DL,
	                                                        PtrVT), Flags);
	      SDValue L = CLI.DAG.getLoad(
	          RetTys[i], CLI.DL, CLI.Chain, Add,
	          MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
	                                            DemoteStackIdx, Offsets[i]),
	          /* Alignment = */ 1);
	      ReturnValues[i] = L;
	      Chains[i] = L.getValue(1);
	    }

	    CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
	  } else {
	    // Collect the legal value parts into potentially illegal values
	    // that correspond to the original function's return values.
	    Optional<ISD::NodeType> AssertOp;
	    if (CLI.RetSExt)
	      AssertOp = ISD::AssertSext;
	    else if (CLI.RetZExt)
	      AssertOp = ISD::AssertZext;
	    unsigned CurReg = 0;
	    for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
	      EVT VT = RetTys[I];
	      MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
	                                                     CLI.CallConv, VT);
	      unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
	                                                       CLI.CallConv, VT);

	      ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
	                                              NumRegs, RegisterVT, VT, nullptr,
	                                              CLI.CallConv, AssertOp));
	      CurReg += NumRegs;
	    }

	    // For a function returning void, there is no return value. We can't create
	    // such a node, so we just return a null return value in that case. In
	    // that case, nothing will actually look at the value.
	    if (ReturnValues.empty())
	      return std::make_pair(SDValue(), CLI.Chain);
	  }

	  SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
	                                CLI.DAG.getVTList(RetTys), ReturnValues);
	  return std::make_pair(Res, CLI.Chain);
	}

	/// Lower result 0 of \p N through LowerOperation and, when that yields a
	/// non-null value, append it to \p Results. Nothing is appended otherwise.
	void TargetLowering::LowerOperationWrapper(SDNode *N,
	                                           SmallVectorImpl<SDValue> &Results,
	                                           SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
	  if (!Lowered)
	    return;
	  Results.push_back(Lowered);
	}

	/// Base-class implementation of LowerOperation. It never returns: it
	/// unconditionally hits llvm_unreachable, reporting that the target does not
	/// implement LowerOperation.
	SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	llvm_unreachable("LowerOperation not implemented for this target!");
	}

	void
	SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
	SDValue Op = getNonRegisterValue(V);
	assert((Op.getOpcode() != ISD::CopyFromReg \|\|
	cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
	"Copy from a reg to the same reg!");
	assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// If this is an InlineAsm we have to match the registers required, not the
	// notional registers required by the type.

	RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
	None); // This is not an ABI copy.
	SDValue Chain = DAG.getEntryNode();

	ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
	FuncInfo.PreferredExtendType.end())
	? ISD::ANY_EXTEND
	: FuncInfo.PreferredExtendType[V];
	RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
	PendingExports.push_back(Chain);
	}

	#include "llvm/CodeGen/SelectionDAGISel.h"

	/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
	/// entry block, return true. A use by a switch makes this return false, since
	/// the switch may expand into multiple basic blocks.
	///
	/// When \p FastISel is set the answer is simply whether the argument has no
	/// uses at all.
	static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
	  // With FastISel active, we may be splitting blocks, so force creation
	  // of virtual registers for all non-dead arguments.
	  if (FastISel)
	    return A->use_empty();

	  const BasicBlock &Entry = A->getParent()->front();
	  for (const User *U : A->users())
	    if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U))
	      return false; // Use not in entry block, or used by a switch.

	  return true;
	}

	/// Maps an argument to the (alloca, store) pair that copies it into a local
	/// stack slot. Both pair members are pointers: users of this map read them
	/// into `const AllocaInst *` and `const StoreInst *` variables.
	using ArgCopyElisionMapTy =
	    DenseMap<const Argument *,
	             std::pair<const AllocaInst *, const StoreInst *>>;

	/// Scan the entry block of the function in FuncInfo for arguments that look
	/// like copies into a local alloca. Record any copied arguments in
	/// ArgCopyElisionCandidates.
	///
	/// A store qualifies when its destination is a static alloca that has not yet
	/// been initialized or clobbered, and its value is an argument that is not
	/// inalloca/byval, is not of empty type, has a store size equal to the
	/// alloca's alloc size, and is not already recorded as a candidate.
	static void
	findArgumentCopyElisionCandidates(const DataLayout &DL,
	                                  FunctionLoweringInfo *FuncInfo,
	                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
	  // Record the state of every static alloca used in the entry block. Argument
	  // allocas are all used in the entry block, so we need approximately as many
	  // entries as we have arguments.
	  enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
	  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
	  unsigned NumArgs = FuncInfo->Fn->arg_size();
	  StaticAllocas.reserve(NumArgs * 2);

	  // Returns a pointer to the tracked state of V (after stripping pointer
	  // casts) when it is a static alloca present in StaticAllocaMap, inserting
	  // an Unknown entry on first sight; returns nullptr otherwise.
	  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
	    if (!V)
	      return nullptr;
	    V = V->stripPointerCasts();
	    const auto *AI = dyn_cast<AllocaInst>(V);
	    if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
	      return nullptr;
	    auto Iter = StaticAllocas.insert({AI, Unknown});
	    return &Iter.first->second;
	  };

	  // Look for stores of arguments to static allocas. Look through bitcasts and
	  // GEPs to handle type coercions, as long as the alloca is fully initialized
	  // by the store. Any non-store use of an alloca escapes it and any subsequent
	  // unanalyzed store might write it.
	  // FIXME: Handle structs initialized with multiple stores.
	  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
	    // Look for stores, and handle non-store uses conservatively.
	    const auto *SI = dyn_cast<StoreInst>(&I);
	    if (!SI) {
	      // We will look through cast uses, so ignore them completely.
	      if (I.isCast())
	        continue;
	      // Ignore debug info intrinsics, they don't escape or store to allocas.
	      if (isa<DbgInfoIntrinsic>(I))
	        continue;
	      // This is an unknown instruction. Assume it escapes or writes to all
	      // static alloca operands.
	      for (const Use &U : I.operands()) {
	        if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
	          *Info = StaticAllocaInfo::Clobbered;
	      }
	      continue;
	    }

	    // If the stored value is a static alloca, mark it as escaped.
	    if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
	      *Info = StaticAllocaInfo::Clobbered;

	    // Check if the destination is a static alloca.
	    const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
	    StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
	    if (!Info)
	      continue;
	    const AllocaInst *AI = cast<AllocaInst>(Dst);

	    // Skip allocas that have been initialized or clobbered.
	    if (*Info != StaticAllocaInfo::Unknown)
	      continue;

	    // Check if the stored value is an argument, and that this store fully
	    // initializes the alloca. Don't elide copies from the same argument twice.
	    const Value *Val = SI->getValueOperand()->stripPointerCasts();
	    const auto *Arg = dyn_cast<Argument>(Val);
	    if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
	        Arg->getType()->isEmptyTy() ||
	        DL.getTypeStoreSize(Arg->getType()) !=
	            DL.getTypeAllocSize(AI->getAllocatedType()) ||
	        ArgCopyElisionCandidates.count(Arg)) {
	      *Info = StaticAllocaInfo::Clobbered;
	      continue;
	    }

	    LLVM_DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI
	                      << '\n');

	    // Mark this alloca and store for argument copy elision.
	    *Info = StaticAllocaInfo::Elidable;
	    ArgCopyElisionCandidates.insert({Arg, {AI, SI}});

	    // Stop scanning if we've seen all arguments. This will happen early in -O0
	    // builds, which is useful, because -O0 builds have large entry blocks and
	    // many allocas.
	    if (ArgCopyElisionCandidates.size() == NumArgs)
	      break;
	  }
	}

	/// Attempt to make the alloca that \p Arg is copied into share the fixed stack
	/// object the argument was loaded from, so the copying store need not be
	/// emitted. Does nothing unless \p ArgVal is a load whose base pointer is a
	/// frame index and the fixed object matches the alloca's object in size and
	/// satisfies its alignment.
	///
	/// On success the alloca's old stack object is removed, the old-to-new frame
	/// index mapping is recorded in \p ArgCopyElisionFrameIndexMap, the load's
	/// chain is appended to \p Chains, and the store is added to
	/// \p ElidedArgCopyInstrs.
	static void tryToElideArgumentCopy(
	    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
	    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
	    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
	    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
	    SDValue ArgVal, bool &ArgHasUses) {
	  // The argument value must be a load ...
	  auto *ArgLoad = dyn_cast<LoadSDNode>(ArgVal);
	  if (!ArgLoad)
	    return;
	  // ... whose address is directly a frame index.
	  auto *FrameIdxNode =
	      dyn_cast<FrameIndexSDNode>(ArgLoad->getBasePtr().getNode());
	  if (!FrameIdxNode)
	    return;

	  // Look up the alloca recorded for this argument and the two stack objects
	  // involved: the fixed one holding the incoming argument, and the one
	  // currently assigned to the alloca.
	  auto CandidateIt = ArgCopyElisionCandidates.find(&Arg);
	  assert(CandidateIt != ArgCopyElisionCandidates.end());
	  const AllocaInst *Alloca = CandidateIt->second.first;
	  int ArgSlot = FrameIdxNode->getIndex();
	  int &AllocaSlotRef = FuncInfo->StaticAllocaMap[Alloca];
	  int PrevAllocaSlot = AllocaSlotRef;
	  MachineFrameInfo &FrameInfo = FuncInfo->MF->getFrameInfo();

	  // Sizes must agree exactly.
	  if (FrameInfo.getObjectSize(ArgSlot) !=
	      FrameInfo.getObjectSize(PrevAllocaSlot)) {
	    LLVM_DEBUG(
	        dbgs() << " argument copy elision failed due to bad fixed stack "
	                  "object size\n");
	    return;
	  }

	  // Use the alignment written on the alloca (not the stack object's); an
	  // alignment of zero falls back to the ABI alignment of the allocated type.
	  unsigned NeededAlign = Alloca->getAlignment();
	  if (NeededAlign == 0) {
	    NeededAlign = FuncInfo->MF->getDataLayout().getABITypeAlignment(
	        Alloca->getAllocatedType());
	  }
	  if (FrameInfo.getObjectAlignment(ArgSlot) < NeededAlign) {
	    LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
	                         "greater than stack argument alignment ("
	                      << NeededAlign << " vs "
	                      << FrameInfo.getObjectAlignment(ArgSlot) << ")\n");
	    return;
	  }

	  // Commit: drop the alloca's own stack object, make the fixed object
	  // mutable, and point the alloca at it.
	  LLVM_DEBUG({
	    dbgs() << "Eliding argument copy from " << Arg << " to " << *Alloca << '\n'
	           << " Replacing frame index " << PrevAllocaSlot << " with " << ArgSlot
	           << '\n';
	  });
	  FrameInfo.RemoveStackObject(PrevAllocaSlot);
	  FrameInfo.setIsImmutableObjectIndex(ArgSlot, false);
	  AllocaSlotRef = ArgSlot;
	  ArgCopyElisionFrameIndexMap.insert({PrevAllocaSlot, ArgSlot});
	  Chains.push_back(ArgVal.getValue(1));

	  // The store that performed the copy no longer needs code emitted for it.
	  const StoreInst *CopyStore = CandidateIt->second.second;
	  ElidedArgCopyInstrs.insert(CopyStore);

	  // Re-check the argument's users: any user other than the elided store
	  // marks the argument as used. This only ever sets the flag; it never
	  // clears it.
	  for (const Value *ArgUser : Arg.users()) {
	    if (ArgUser == CopyStore)
	      continue;
	    ArgHasUses = true;
	    break;
	  }
	}

	void SelectionDAGISel::LowerArguments(const Function &F) {
	SelectionDAG &DAG = SDB->DAG;
	SDLoc dl = SDB->getCurSDLoc();
	const DataLayout &DL = DAG.getDataLayout();
	SmallVector<ISD::InputArg, 16> Ins;

	if (!FuncInfo->CanLowerReturn) {
	// Put in an sret pointer parameter before all the other parameters.
	SmallVector<EVT, 1> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(),
	F.getReturnType()->getPointerTo(
	DAG.getDataLayout().getAllocaAddrSpace()),
	ValueVTs);

	// NOTE: Assuming that a pointer will never break down to more than one VT
	// or one register.
	ISD::ArgFlagsTy Flags;
	Flags.setSRet();
	MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
	ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
	ISD::InputArg::NoArgIndex, 0);
	Ins.push_back(RetArg);
	}

	// Look for stores of arguments to static allocas. Mark such arguments with a
	// flag to ask the target to give us the memory location of that argument if
	// available.
	ArgCopyElisionMapTy ArgCopyElisionCandidates;
	findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);

	// Set up the incoming argument description vector.
	for (const Argument &Arg : F.args()) {
	unsigned ArgNo = Arg.getArgNo();
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
	bool isArgValueUsed = !Arg.use_empty();
	unsigned PartBase = 0;
	Type *FinalType = Arg.getType();
	if (Arg.hasAttribute(Attribute::ByVal))
	FinalType = Arg.getParamByValType();
	bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
	FinalType, F.getCallingConv(), F.isVarArg());
	for (unsigned Value = 0, NumValues = ValueVTs.size();
	Value != NumValues; ++Value) {
	EVT VT = ValueVTs[Value];
	Type ArgTy = VT.getTypeForEVT(DAG.getContext());
	ISD::ArgFlagsTy Flags;

	// Certain targets (such as MIPS), may have a different ABI alignment
	// for a type depending on the context. Give the target a chance to
	// specify the alignment it wants.
	unsigned OriginalAlignment =
	TLI->getABIAlignmentForCallingConv(ArgTy, DL);

	if (Arg.getType()->isPointerTy()) {
	Flags.setPointer();
	Flags.setPointerAddrSpace(
	cast<PointerType>(Arg.getType())->getAddressSpace());
	}
	if (Arg.hasAttribute(Attribute::ZExt))
	Flags.setZExt();
	if (Arg.hasAttribute(Attribute::SExt))
	Flags.setSExt();
	if (Arg.hasAttribute(Attribute::InReg)) {
	// If we are using vectorcall calling convention, a structure that is
	// passed InReg - is surely an HVA
	if (F.getCallingConv() == CallingConv::X86_VectorCall &&
	isa<StructType>(Arg.getType())) {
	// The first value of a structure is marked
	if (0 == Value)
	Flags.setHvaStart();
	Flags.setHva();
	}
	// Set InReg Flag
	Flags.setInReg();
	}
	if (Arg.hasAttribute(Attribute::StructRet))
	Flags.setSRet();
	if (Arg.hasAttribute(Attribute::SwiftSelf))
	Flags.setSwiftSelf();
	if (Arg.hasAttribute(Attribute::SwiftError))
	Flags.setSwiftError();
	if (Arg.hasAttribute(Attribute::ByVal))
	Flags.setByVal();
	if (Arg.hasAttribute(Attribute::InAlloca)) {
	Flags.setInAlloca();
	// Set the byval flag for CCAssignFn callbacks that don't know about
	// inalloca. This way we can know how many bytes we should've allocated
	// and how many bytes a callee cleanup function will pop. If we port
	// inalloca to more targets, we'll have to add custom inalloca handling
	// in the various CC lowering callbacks.
	Flags.setByVal();
	}
	if (F.getCallingConv() == CallingConv::X86_INTR) {
	// IA Interrupt passes frame (1st parameter) by value in the stack.
	if (ArgNo == 0)
	Flags.setByVal();
	}
	if (Flags.isByVal() \|\| Flags.isInAlloca()) {
	Type *ElementTy = Arg.getParamByValType();

	// For ByVal, size and alignment should be passed from FE. BE will
	// guess if this info is not there but there are cases it cannot get
	// right.
	unsigned FrameSize = DL.getTypeAllocSize(Arg.getParamByValType());
	Flags.setByValSize(FrameSize);

	unsigned FrameAlign;
	if (Arg.getParamAlignment())
	FrameAlign = Arg.getParamAlignment();
	else
	FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
	Flags.setByValAlign(FrameAlign);
	}
	if (Arg.hasAttribute(Attribute::Nest))
	Flags.setNest();
	if (NeedsRegBlock)
	Flags.setInConsecutiveRegs();
	Flags.setOrigAlign(OriginalAlignment);
	if (ArgCopyElisionCandidates.count(&Arg))
	Flags.setCopyElisionCandidate();

	MVT RegisterVT = TLI->getRegisterTypeForCallingConv(
	*CurDAG->getContext(), F.getCallingConv(), VT);
	unsigned NumRegs = TLI->getNumRegistersForCallingConv(
	*CurDAG->getContext(), F.getCallingConv(), VT);
	for (unsigned i = 0; i != NumRegs; ++i) {
	ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
	ArgNo, PartBase+i*RegisterVT.getStoreSize());
	if (NumRegs > 1 && i == 0)
	MyFlags.Flags.setSplit();
	// if it isn't first piece, alignment must be 1
	else if (i > 0) {
	MyFlags.Flags.setOrigAlign(1);
	if (i == NumRegs - 1)
	MyFlags.Flags.setSplitEnd();
	}
	Ins.push_back(MyFlags);
	}
	if (NeedsRegBlock && Value == NumValues - 1)
	Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
	PartBase += VT.getStoreSize();
	}
	}

	// Call the target to set up the argument values.
	SmallVector<SDValue, 8> InVals;
	SDValue NewRoot = TLI->LowerFormalArguments(
	DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);

	// Verify that the target's LowerFormalArguments behaved as expected.
	assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
	"LowerFormalArguments didn't return a valid chain!");
	assert(InVals.size() == Ins.size() &&
	"LowerFormalArguments didn't emit the correct number of values!");
	LLVM_DEBUG({
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	assert(InVals[i].getNode() &&
	"LowerFormalArguments emitted a null value!");
	assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
	"LowerFormalArguments emitted a value with the wrong type!");
	}
	});

	// Update the DAG with the new chain value resulting from argument lowering.
	DAG.setRoot(NewRoot);

	// Set up the argument values.
	unsigned i = 0;
	if (!FuncInfo->CanLowerReturn) {
	// Create a virtual register for the sret pointer, and put in a copy
	// from the sret argument into it.
	SmallVector<EVT, 1> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(),
	F.getReturnType()->getPointerTo(
	DAG.getDataLayout().getAllocaAddrSpace()),
	ValueVTs);
	MVT VT = ValueVTs[0].getSimpleVT();
	MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
	Optional<ISD::NodeType> AssertOp = None;
	SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT,
	nullptr, F.getCallingConv(), AssertOp);

	MachineFunction& MF = SDB->DAG.getMachineFunction();
	MachineRegisterInfo& RegInfo = MF.getRegInfo();
	unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
	FuncInfo->DemoteRegister = SRetReg;
	NewRoot =
	SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
	DAG.setRoot(NewRoot);

	// i indexes lowered arguments. Bump it past the hidden sret argument.
	++i;
	}

	SmallVector<SDValue, 4> Chains;
	DenseMap<int, int> ArgCopyElisionFrameIndexMap;
	for (const Argument &Arg : F.args()) {
	SmallVector<SDValue, 4> ArgValues;
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0)
	continue;

	bool ArgHasUses = !Arg.use_empty();

	// Elide the copying store if the target loaded this argument from a
	// suitable fixed stack object.
	if (Ins[i].Flags.isCopyElisionCandidate()) {
	tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
	ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
	InVals[i], ArgHasUses);
	}

	// If this argument is unused then remember its value. It is used to generate
	// debugging information.
	bool isSwiftErrorArg =
	TLI->supportSwiftError() &&
	Arg.hasAttribute(Attribute::SwiftError);
	if (!ArgHasUses && !isSwiftErrorArg) {
	SDB->setUnusedArgValue(&Arg, InVals[i]);

	// Also remember any frame index for use in FastISel.
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
	}

	for (unsigned Val = 0; Val != NumValues; ++Val) {
	EVT VT = ValueVTs[Val];
	MVT PartVT = TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(),
	F.getCallingConv(), VT);
	unsigned NumParts = TLI->getNumRegistersForCallingConv(
	*CurDAG->getContext(), F.getCallingConv(), VT);

	// Even an apparant 'unused' swifterror argument needs to be returned. So
	// we do generate a copy for it that can be used on return from the
	// function.
	if (ArgHasUses \|\| isSwiftErrorArg) {
	Optional<ISD::NodeType> AssertOp;
	if (Arg.hasAttribute(Attribute::SExt))
	AssertOp = ISD::AssertSext;
	else if (Arg.hasAttribute(Attribute::ZExt))
	AssertOp = ISD::AssertZext;

	ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
	PartVT, VT, nullptr,
	F.getCallingConv(), AssertOp));
	}

	i += NumParts;
	}

	// We don't need to do anything else for unused arguments.
	if (ArgValues.empty())
	continue;

	// Note down frame index.
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());

	SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
	SDB->getCurSDLoc());

	SDB->setValue(&Arg, Res);
	if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
	// We want to associate the argument with the frame index, among
	// involved operands, that correspond to the lowest address. The
	// getCopyFromParts function, called earlier, is swapping the order of
	// the operands to BUILD_PAIR depending on endianness. The result of
	// that swapping is that the least significant bits of the argument will
	// be in the first operand of the BUILD_PAIR node, and the most
	// significant bits will be in the second operand.
	unsigned LowAddressOp = DAG.getDataLayout().isBigEndian() ? 1 : 0;
	if (LoadSDNode *LNode =
	dyn_cast<LoadSDNode>(Res.getOperand(LowAddressOp).getNode()))
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
	}

	// Update the SwiftErrorVRegDefMap.
	if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
	unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(),
	Reg);
	}

	// If this argument is live outside of the entry block, insert a copy from
	// wherever we got it to the vreg that other BB's will reference it as.
	if (Res.getOpcode() == ISD::CopyFromReg) {
	// If we can, though, try to skip creating an unnecessary vreg.
	// FIXME: This isn't very clean... it would be nice to make this more
	// general.
	unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg)) {
	FuncInfo->ValueMap[&Arg] = Reg;
	continue;
	}
	}
	if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
	FuncInfo->InitializeRegForValue(&Arg);
	SDB->CopyToExportRegsIfNeeded(&Arg);
	}
	}

	if (!Chains.empty()) {
	Chains.push_back(NewRoot);
	NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	DAG.setRoot(NewRoot);

	assert(i == InVals.size() && "Argument register count mismatch!");

	// If any argument copy elisions occurred and we have debug info, update the
	// stale frame indices used in the dbg.declare variable info table.
	MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
	if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
	for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
	auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
	if (I != ArgCopyElisionFrameIndexMap.end())
	VI.Slot = I->second;
	}
	}

	// Finally, if the target has anything special to do, allow it to do so.
	EmitFunctionEntryCode();
	}

	/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
	/// ensure constants are generated when needed. Remember the virtual registers
	/// that need to be added to the Machine PHI nodes as input. We cannot just
	/// directly add them, because expansion might result in multiple MBB's for one
	/// BB. As such, the start of the BB might correspond to a different MBB than
	/// the end.
	void
	SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
	const Instruction *TI = LLVMBB->getTerminator();

	SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;

	// Check PHI nodes in successors that expect a value to be available from this
	// block.
	for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
	const BasicBlock *SuccBB = TI->getSuccessor(succ);
	if (!isa<PHINode>(SuccBB->begin())) continue;
	MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];

	// If this terminator has multiple identical successors (common for
	// switches), only handle each succ once.
	if (!SuccsHandled.insert(SuccMBB).second)
	continue;

	MachineBasicBlock::iterator MBBI = SuccMBB->begin();

	// At this point we know that there is a 1-1 correspondence between LLVM PHI
	// nodes and Machine PHI nodes, but the incoming operands have not been
	// emitted yet.
	for (const PHINode &PN : SuccBB->phis()) {
	// Ignore dead phi's.
	if (PN.use_empty())
	continue;

	// Skip empty types
	if (PN.getType()->isEmptyTy())
	continue;

	unsigned Reg;
	const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB);

	if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
	unsigned &RegOut = ConstantsOut[C];
	if (RegOut == 0) {
	RegOut = FuncInfo.CreateRegs(C);
	CopyValueToVirtualRegister(C, RegOut);
	}
	Reg = RegOut;
	} else {
	DenseMap<const Value *, unsigned>::iterator I =
	FuncInfo.ValueMap.find(PHIOp);
	if (I != FuncInfo.ValueMap.end())
	Reg = I->second;
	else {
	assert(isa<AllocaInst>(PHIOp) &&
	FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
	"Didn't codegen value into a register!??");
	Reg = FuncInfo.CreateRegs(PHIOp);
	CopyValueToVirtualRegister(PHIOp, Reg);
	}
	}

	// Remember that this register needs to added to the machine PHI node as
	// the input for this MBB.
	SmallVector<EVT, 4> ValueVTs;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs);
	for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
	EVT VT = ValueVTs[vti];
	unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
	for (unsigned i = 0, e = NumRegisters; i != e; ++i)
	FuncInfo.PHINodesToUpdate.push_back(
	std::make_pair(&*MBBI++, Reg + i));
	Reg += NumRegisters;
	}
	}
	}

	ConstantsOut.clear();
	}

	/// Add a successor MBB to ParentMBB, creating a new MachineBB for BB if
	/// SuccMBB is null. A newly created block is inserted into the function
	/// directly after ParentMBB. The edge is weighted with the stack-protector
	/// branch probability selected by IsLikely. Returns the successor block.
	MachineBasicBlock *
	SelectionDAGBuilder::StackProtectorDescriptor::
	AddSuccessorMBB(const BasicBlock *BB,
	                MachineBasicBlock *ParentMBB,
	                bool IsLikely,
	                MachineBasicBlock *SuccMBB) {
	  // Lazily create the successor block when the caller did not supply one.
	  if (SuccMBB == nullptr) {
	    MachineFunction *ParentFn = ParentMBB->getParent();
	    MachineFunction::iterator InsertPos(ParentMBB);
	    ++InsertPos;
	    SuccMBB = ParentFn->CreateMachineBasicBlock(BB);
	    ParentFn->insert(InsertPos, SuccMBB);
	  }

	  // Hook the block up as a successor of ParentMBB.
	  auto EdgeProb = BranchProbabilityInfo::getBranchProbStackProtector(IsLikely);
	  ParentMBB->addSuccessor(SuccMBB, EdgeProb);
	  return SuccMBB;
	}

	/// Return the machine basic block that follows \p MBB in the layout of the
	/// current machine function, or nullptr when \p MBB is the last block.
	MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
	  MachineFunction::iterator I(MBB);
	  if (++I == FuncInfo.MF->end())
	    return nullptr;
	  return &*I;
	}

	/// Lowering can create new call nodes (memset and friends) that become new
	/// roots of the current DAG. When such a call is a tail call, the call
	/// lowering has already updated the root, but the builder must still remember
	/// that a tail call was lowered so it does not generate an additional return.
	///
	/// \p MaybeTC is the value produced by the lowering; a null node means the
	/// call was emitted as a tail call.
	void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
	  // Null node: this was a tail call, so only record that fact.
	  if (!MaybeTC.getNode()) {
	    HasTailCall = true;
	    return;
	  }

	  // Ordinary call: its result becomes the new root.
	  DAG.setRoot(MaybeTC);
	}

	/// Lower the clusters of work item \p W as a linear chain of checks.
	///
	/// Every cluster in [W.FirstCluster, W.LastCluster] is handled according to
	/// its Kind (jump table, bit tests or a range/equality comparison). A cluster
	/// that does not match falls through to a freshly created block holding the
	/// next check; the last cluster falls through to \p DefaultMBB. Checks whose
	/// block is \p SwitchMBB are emitted right away; the others are only recorded
	/// in SL (SwitchCases / JTCases / BitTestCases) here.
	///
	/// \param W          Work item to lower; taken by value because the cluster
	///                   range may be reordered locally.
	/// \param Cond       The switch condition.
	/// \param SwitchMBB  Block in which code can be emitted immediately.
	/// \param DefaultMBB Destination when no cluster matches.
	void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
	                                        MachineBasicBlock *SwitchMBB,
	                                        MachineBasicBlock *DefaultMBB) {
	  MachineFunction *CurMF = FuncInfo.MF;
	  MachineBasicBlock *NextMBB = nullptr;
	  MachineFunction::iterator BBI(W.MBB);
	  if (++BBI != FuncInfo.MF->end())
	    NextMBB = &*BBI;

	  unsigned Size = W.LastCluster - W.FirstCluster + 1;

	  BranchProbabilityInfo *BPI = FuncInfo.BPI;

	  if (Size == 2 && W.MBB == SwitchMBB) {
	    // If any two of the cases has the same destination, and if one value
	    // is the same as the other, but has one bit unset that the other has set,
	    // use bit manipulation to do two compares at once. For example:
	    // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
	    // TODO: This could be extended to merge any 2 cases in switches with 3
	    // cases.
	    // TODO: Handle cases where W.CaseBB != SwitchBB.
	    CaseCluster &Small = *W.FirstCluster;
	    CaseCluster &Big = *W.LastCluster;

	    if (Small.Low == Small.High && Big.Low == Big.High &&
	        Small.MBB == Big.MBB) {
	      const APInt &SmallValue = Small.Low->getValue();
	      const APInt &BigValue = Big.Low->getValue();

	      // Check that there is only one bit different.
	      APInt CommonBit = BigValue ^ SmallValue;
	      if (CommonBit.isPowerOf2()) {
	        SDValue CondLHS = getValue(Cond);
	        EVT VT = CondLHS.getValueType();
	        SDLoc DL = getCurSDLoc();

	        SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
	                                 DAG.getConstant(CommonBit, DL, VT));
	        // Named differently from the Cond parameter to avoid shadowing it.
	        SDValue MergedCond = DAG.getSetCC(
	            DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT),
	            ISD::SETEQ);

	        // Update successor info.
	        // Both Small and Big will jump to Small.BB, so we sum up the
	        // probabilities.
	        addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob);
	        if (BPI)
	          addSuccessorWithProb(
	              SwitchMBB, DefaultMBB,
	              // The default destination is the first successor in IR.
	              BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0));
	        else
	          addSuccessorWithProb(SwitchMBB, DefaultMBB);

	        // Insert the true branch.
	        SDValue BrCond =
	            DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(),
	                        MergedCond, DAG.getBasicBlock(Small.MBB));
	        // Insert the false branch.
	        BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
	                             DAG.getBasicBlock(DefaultMBB));

	        DAG.setRoot(BrCond);
	        return;
	      }
	    }
	  }

	  if (TM.getOptLevel() != CodeGenOpt::None) {
	    // Here, we order cases by probability so the most likely case will be
	    // checked first. However, two clusters can have the same probability in
	    // which case their relative ordering is non-deterministic. So we use Low
	    // as a tie-breaker as clusters are guaranteed to never overlap.
	    llvm::sort(W.FirstCluster, W.LastCluster + 1,
	               [](const CaseCluster &a, const CaseCluster &b) {
	      return a.Prob != b.Prob ?
	             a.Prob > b.Prob :
	             a.Low->getValue().slt(b.Low->getValue());
	    });

	    // Rearrange the case blocks so that the last one falls through if possible
	    // without changing the order of probabilities.
	    for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
	      --I;
	      if (I->Prob > W.LastCluster->Prob)
	        break;
	      if (I->Kind == CC_Range && I->MBB == NextMBB) {
	        // Swap the clusters themselves, not the iterators: swapping the
	        // iterators would shrink the range covered by W.
	        std::swap(*I, *W.LastCluster);
	        break;
	      }
	    }
	  }

	  // Compute total probability.
	  BranchProbability DefaultProb = W.DefaultProb;
	  BranchProbability UnhandledProbs = DefaultProb;
	  for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
	    UnhandledProbs += I->Prob;

	  MachineBasicBlock *CurMBB = W.MBB;
	  for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
	    bool FallthroughUnreachable = false;
	    MachineBasicBlock *Fallthrough;
	    if (I == W.LastCluster) {
	      // For the last cluster, fall through to the default destination.
	      Fallthrough = DefaultMBB;
	      FallthroughUnreachable = isa<UnreachableInst>(
	          DefaultMBB->getBasicBlock()->getFirstNonPHIOrDbg());
	    } else {
	      Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
	      CurMF->insert(BBI, Fallthrough);
	      // Put Cond in a virtual register to make it available from the new blocks.
	      ExportFromCurrentBlock(Cond);
	    }
	    UnhandledProbs -= I->Prob;

	    switch (I->Kind) {
	      case CC_JumpTable: {
	        // FIXME: Optimize away range check based on pivot comparisons.
	        JumpTableHeader *JTH = &SL->JTCases[I->JTCasesIndex].first;
	        SwitchCG::JumpTable *JT = &SL->JTCases[I->JTCasesIndex].second;

	        // The jump block hasn't been inserted yet; insert it here.
	        MachineBasicBlock *JumpMBB = JT->MBB;
	        CurMF->insert(BBI, JumpMBB);

	        auto JumpProb = I->Prob;
	        auto FallthroughProb = UnhandledProbs;

	        // If the default statement is a target of the jump table, we evenly
	        // distribute the default probability to successors of CurMBB. Also
	        // update the probability on the edge from JumpMBB to Fallthrough.
	        for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
	                                              SE = JumpMBB->succ_end();
	             SI != SE; ++SI) {
	          if (*SI == DefaultMBB) {
	            JumpProb += DefaultProb / 2;
	            FallthroughProb -= DefaultProb / 2;
	            JumpMBB->setSuccProbability(SI, DefaultProb / 2);
	            JumpMBB->normalizeSuccProbs();
	            break;
	          }
	        }

	        if (FallthroughUnreachable) {
	          // Skip the range check if the fallthrough block is unreachable.
	          JTH->OmitRangeCheck = true;
	        }

	        if (!JTH->OmitRangeCheck)
	          addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
	        addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
	        CurMBB->normalizeSuccProbs();

	        // The jump table header will be inserted in our current block, do the
	        // range check, and fall through to our fallthrough block.
	        JTH->HeaderBB = CurMBB;
	        JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.

	        // If we're in the right place, emit the jump table header right now.
	        if (CurMBB == SwitchMBB) {
	          visitJumpTableHeader(*JT, *JTH, SwitchMBB);
	          JTH->Emitted = true;
	        }
	        break;
	      }
	      case CC_BitTests: {
	        // FIXME: If Fallthrough is unreachable, skip the range check.

	        // FIXME: Optimize away range check based on pivot comparisons.
	        BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex];

	        // The bit test blocks haven't been inserted yet; insert them here.
	        for (BitTestCase &BTC : BTB->Cases)
	          CurMF->insert(BBI, BTC.ThisBB);

	        // Fill in fields of the BitTestBlock.
	        BTB->Parent = CurMBB;
	        BTB->Default = Fallthrough;

	        BTB->DefaultProb = UnhandledProbs;
	        // If the cases in bit test don't form a contiguous range, we evenly
	        // distribute the probability on the edge to Fallthrough to two
	        // successors of CurMBB.
	        if (!BTB->ContiguousRange) {
	          BTB->Prob += DefaultProb / 2;
	          BTB->DefaultProb -= DefaultProb / 2;
	        }

	        // If we're in the right place, emit the bit test header right now.
	        if (CurMBB == SwitchMBB) {
	          visitBitTestHeader(*BTB, SwitchMBB);
	          BTB->Emitted = true;
	        }
	        break;
	      }
	      case CC_Range: {
	        const Value *RHS, *LHS, *MHS;
	        ISD::CondCode CC;
	        if (I->Low == I->High) {
	          // Check Cond == I->Low.
	          CC = ISD::SETEQ;
	          LHS = Cond;
	          RHS = I->Low;
	          MHS = nullptr;
	        } else {
	          // Check I->Low <= Cond <= I->High.
	          CC = ISD::SETLE;
	          LHS = I->Low;
	          MHS = Cond;
	          RHS = I->High;
	        }

	        // If Fallthrough is unreachable, fold away the comparison.
	        if (FallthroughUnreachable)
	          CC = ISD::SETTRUE;

	        // The false probability is the sum of all unhandled cases.
	        CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB,
	                     getCurSDLoc(), I->Prob, UnhandledProbs);

	        if (CurMBB == SwitchMBB)
	          visitSwitchCase(CB, SwitchMBB);
	        else
	          SL->SwitchCases.push_back(CB);

	        break;
	      }
	    }
	    CurMBB = Fallthrough;
	  }
	}

	unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC,
	CaseClusterIt First,
	CaseClusterIt Last) {
	return std::count_if(First, Last + 1, [&](const CaseCluster &X) {
	if (X.Prob != CC.Prob)
	return X.Prob > CC.Prob;

	// Ties are broken by comparing the case value.
	return X.Low->getValue().slt(CC.Low->getValue());
	});
	}

	/// Split work item \p W around a pivot into a left and a right half and emit
	/// (or record) the "Cond < Pivot" branch that selects between them.
	///
	/// The pivot is chosen so that the probability mass on both sides is roughly
	/// balanced. A half that consists of a single range cluster exactly filling
	/// the known bounds is branched to directly; otherwise a new machine block is
	/// created for it and a new work item is pushed onto \p WorkList.
	///
	/// \param WorkList  Receives the work items created for the two halves.
	/// \param W         The work item to split; must hold at least two clusters
	///                  sorted by case value.
	/// \param Cond      The switch condition.
	/// \param SwitchMBB Block in which code can be emitted immediately.
	void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList,
	                                        const SwitchWorkListItem &W,
	                                        Value *Cond,
	                                        MachineBasicBlock *SwitchMBB) {
	  assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) &&
	         "Clusters not sorted?");

	  assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!");

	  // Balance the tree based on branch probabilities to create a near-optimal (in
	  // terms of search time given key frequency) binary search tree. See e.g. Kurt
	  // Mehlhorn "Nearly Optimal Binary Search Trees" (1975).
	  CaseClusterIt LastLeft = W.FirstCluster;
	  CaseClusterIt FirstRight = W.LastCluster;
	  auto LeftProb = LastLeft->Prob + W.DefaultProb / 2;
	  auto RightProb = FirstRight->Prob + W.DefaultProb / 2;

	  // Move LastLeft and FirstRight towards each other from opposite directions to
	  // find a partitioning of the clusters which balances the probability on both
	  // sides. If LeftProb and RightProb are equal, alternate which side is
	  // taken to ensure 0-probability nodes are distributed evenly.
	  unsigned I = 0;
	  while (LastLeft + 1 < FirstRight) {
	    if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1)))
	      LeftProb += (++LastLeft)->Prob;
	    else
	      RightProb += (--FirstRight)->Prob;
	    I++;
	  }

	  while (true) {
	    // Our binary search tree differs from a typical BST in that ours can have up
	    // to three values in each leaf. The pivot selection above doesn't take that
	    // into account, which means the tree might require more nodes and be less
	    // efficient. We compensate for this here.

	    unsigned NumLeft = LastLeft - W.FirstCluster + 1;
	    unsigned NumRight = W.LastCluster - FirstRight + 1;

	    if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) {
	      // If one side has less than 3 clusters, and the other has more than 3,
	      // consider taking a cluster from the other side.

	      if (NumLeft < NumRight) {
	        // Consider moving the first cluster on the right to the left side.
	        CaseCluster &CC = *FirstRight;
	        unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
	        unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
	        if (LeftSideRank <= RightSideRank) {
	          // Moving the cluster to the left does not demote it.
	          ++LastLeft;
	          ++FirstRight;
	          continue;
	        }
	      } else {
	        assert(NumRight < NumLeft);
	        // Consider moving the last element on the left to the right side.
	        CaseCluster &CC = *LastLeft;
	        unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
	        unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
	        if (RightSideRank <= LeftSideRank) {
	          // Moving the cluster to the right does not demote it.
	          --LastLeft;
	          --FirstRight;
	          continue;
	        }
	      }
	    }
	    break;
	  }

	  assert(LastLeft + 1 == FirstRight);
	  assert(LastLeft >= W.FirstCluster);
	  assert(FirstRight <= W.LastCluster);

	  // Use the first element on the right as pivot since we will make less-than
	  // comparisons against it.
	  CaseClusterIt PivotCluster = FirstRight;
	  assert(PivotCluster > W.FirstCluster);
	  assert(PivotCluster <= W.LastCluster);

	  CaseClusterIt FirstLeft = W.FirstCluster;
	  CaseClusterIt LastRight = W.LastCluster;

	  const ConstantInt *Pivot = PivotCluster->Low;

	  // New blocks will be inserted immediately after the current one.
	  MachineFunction::iterator BBI(W.MBB);
	  ++BBI;

	  // We will branch to the LHS if Value < Pivot. If LHS is a single cluster,
	  // we can branch to its destination directly if it's squeezed exactly in
	  // between the known lower bound and Pivot - 1.
	  MachineBasicBlock *LeftMBB;
	  if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range &&
	      FirstLeft->Low == W.GE &&
	      (FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) {
	    LeftMBB = FirstLeft->MBB;
	  } else {
	    LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
	    FuncInfo.MF->insert(BBI, LeftMBB);
	    WorkList.push_back(
	        {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2});
	    // Put Cond in a virtual register to make it available from the new blocks.
	    ExportFromCurrentBlock(Cond);
	  }

	  // Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a
	  // single cluster, RHS.Low == Pivot, and we can branch to its destination
	  // directly if RHS.High equals the current upper bound.
	  MachineBasicBlock *RightMBB;
	  if (FirstRight == LastRight && FirstRight->Kind == CC_Range &&
	      W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) {
	    RightMBB = FirstRight->MBB;
	  } else {
	    RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
	    FuncInfo.MF->insert(BBI, RightMBB);
	    WorkList.push_back(
	        {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2});
	    // Put Cond in a virtual register to make it available from the new blocks.
	    ExportFromCurrentBlock(Cond);
	  }

	  // Create the CaseBlock record that will be used to lower the branch.
	  CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB,
	               getCurSDLoc(), LeftProb, RightProb);

	  if (W.MBB == SwitchMBB)
	    visitSwitchCase(CB, SwitchMBB);
	  else
	    SL->SwitchCases.push_back(CB);
	}

	// Rescale CaseProb after a case with probability PeeledCaseProb has been
	// peeled from the switch statement. The numerator of CaseProb is kept and its
	// denominator is scaled by the complement of PeeledCaseProb; the denominator
	// is never allowed to drop below the numerator. If the peeled case had
	// probability one, the result is zero.
	static BranchProbability scaleCaseProbality(BranchProbability CaseProb,
	                                            BranchProbability PeeledCaseProb) {
	  if (PeeledCaseProb == BranchProbability::getOne())
	    return BranchProbability::getZero();

	  // Probability mass that remains in the switch once the peeled case is gone.
	  const BranchProbability Remaining = PeeledCaseProb.getCompl();

	  const uint32_t Num = CaseProb.getNumerator();
	  const uint32_t ScaledDenom = Remaining.scale(CaseProb.getDenominator());
	  const uint32_t Denom = (Num < ScaledDenom) ? ScaledDenom : Num;
	  return BranchProbability(Num, Denom);
	}

	// Try to peel the top probability case if it exceeds the threshold.
	//
	// If no peeling occurs, return the current MachineBasicBlock for the switch
	// statement and leave Clusters and PeeledCaseProb untouched.
	// If the peeling is performed, return the newly created MachineBasicBlock
	// for the peeled switch statement: the peeled case is lowered in the current
	// block, it is removed from Clusters, the probabilities of the remaining
	// clusters are rescaled, and PeeledCaseProb is set to the BranchProbability
	// of the peeled case.
	MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster(
	    const SwitchInst &SI, CaseClusterVector &Clusters,
	    BranchProbability &PeeledCaseProb) {
	  MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
	  // Don't perform if there is only one cluster or optimizing for size.
	  if (SwitchPeelThreshold > 100 || !FuncInfo.BPI || Clusters.size() < 2 ||
	      TM.getOptLevel() == CodeGenOpt::None ||
	      SwitchMBB->getParent()->getFunction().hasMinSize())
	    return SwitchMBB;

	  // Find the most probable cluster among those that reach the threshold;
	  // TopCaseProb starts at the threshold and is raised as candidates are found.
	  BranchProbability TopCaseProb = BranchProbability(SwitchPeelThreshold, 100);
	  unsigned PeeledCaseIndex = 0;
	  bool SwitchPeeled = false;
	  for (unsigned Index = 0; Index < Clusters.size(); ++Index) {
	    CaseCluster &CC = Clusters[Index];
	    if (CC.Prob < TopCaseProb)
	      continue;
	    TopCaseProb = CC.Prob;
	    PeeledCaseIndex = Index;
	    SwitchPeeled = true;
	  }
	  if (!SwitchPeeled)
	    return SwitchMBB;

	  LLVM_DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: "
	                    << TopCaseProb << "\n");

	  // Record the MBB for the peeled switch statement.
	  MachineFunction::iterator BBI(SwitchMBB);
	  ++BBI;
	  MachineBasicBlock *PeeledSwitchMBB =
	      FuncInfo.MF->CreateMachineBasicBlock(SwitchMBB->getBasicBlock());
	  FuncInfo.MF->insert(BBI, PeeledSwitchMBB);

	  // Lower the peeled case on its own in the current block; everything else
	  // continues in PeeledSwitchMBB, which acts as its default destination.
	  ExportFromCurrentBlock(SI.getCondition());
	  auto PeeledCaseIt = Clusters.begin() + PeeledCaseIndex;
	  SwitchWorkListItem W = {SwitchMBB, PeeledCaseIt, PeeledCaseIt,
	                          nullptr,   nullptr,      TopCaseProb.getCompl()};
	  lowerWorkItem(W, SI.getCondition(), SwitchMBB, PeeledSwitchMBB);

	  Clusters.erase(PeeledCaseIt);
	  for (CaseCluster &CC : Clusters) {
	    LLVM_DEBUG(
	        dbgs() << "Scale the probablity for one cluster, before scaling: "
	               << CC.Prob << "\n");
	    CC.Prob = scaleCaseProbality(CC.Prob, TopCaseProb);
	    LLVM_DEBUG(dbgs() << "After scaling: " << CC.Prob << "\n");
	  }
	  PeeledCaseProb = TopCaseProb;
	  return PeeledSwitchMBB;
	}

	void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
	// Extract cases from the switch.
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	CaseClusterVector Clusters;
	Clusters.reserve(SI.getNumCases());
	for (auto I : SI.cases()) {
	MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()];
	const ConstantInt *CaseVal = I.getCaseValue();
	BranchProbability Prob =
	BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
	: BranchProbability(1, SI.getNumCases() + 1);
	Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
	}

	MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()];

	// Cluster adjacent cases with the same destination. We do this at all
	// optimization levels because it's cheap to do and will make codegen faster
	// if there are many clusters.
	sortAndRangeify(Clusters);

	// The branch probablity of the peeled case.
	BranchProbability PeeledCaseProb = BranchProbability::getZero();
	MachineBasicBlock *PeeledSwitchMBB =
	peelDominantCaseCluster(SI, Clusters, PeeledCaseProb);

	// If there is only the default destination, jump there directly.
	MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
	if (Clusters.empty()) {
	assert(PeeledSwitchMBB == SwitchMBB);
	SwitchMBB->addSuccessor(DefaultMBB);
	if (DefaultMBB != NextBlock(SwitchMBB)) {
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
	getControlRoot(), DAG.getBasicBlock(DefaultMBB)));
	}
	return;
	}

	SL->findJumpTables(Clusters, &SI, DefaultMBB);
	SL->findBitTestClusters(Clusters, &SI);

	LLVM_DEBUG({
	dbgs() << "Case clusters: ";
	for (const CaseCluster &C : Clusters) {
	if (C.Kind == CC_JumpTable)
	dbgs() << "JT:";
	if (C.Kind == CC_BitTests)
	dbgs() << "BT:";

	C.Low->getValue().print(dbgs(), true);
	if (C.Low != C.High) {
	dbgs() << '-';
	C.High->getValue().print(dbgs(), true);
	}
	dbgs() << ' ';
	}
	dbgs() << '\n';
	});

	assert(!Clusters.empty());
	SwitchWorkList WorkList;
	CaseClusterIt First = Clusters.begin();
	CaseClusterIt Last = Clusters.end() - 1;
	auto DefaultProb = getEdgeProbability(PeeledSwitchMBB, DefaultMBB);
	// Scale the branchprobability for DefaultMBB if the peel occurs and
	// DefaultMBB is not replaced.
	if (PeeledCaseProb != BranchProbability::getZero() &&
	DefaultMBB == FuncInfo.MBBMap[SI.getDefaultDest()])
	DefaultProb = scaleCaseProbality(DefaultProb, PeeledCaseProb);
	WorkList.push_back(
	{PeeledSwitchMBB, First, Last, nullptr, nullptr, DefaultProb});

	while (!WorkList.empty()) {
	SwitchWorkListItem W = WorkList.back();
	WorkList.pop_back();
	unsigned NumClusters = W.LastCluster - W.FirstCluster + 1;

	if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None &&
	!DefaultMBB->getParent()->getFunction().hasMinSize()) {
	// For optimized builds, lower large range as a balanced binary tree.
	splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB);
	continue;
	}

	lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB);
	}
	}
	Index: vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/CodeGen/SelectionDAG/TargetLowering.cpp (revision 351303)
	@@ -1,6284 +1,6288 @@
	//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements the TargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetLoweringObjectFile.h"
	#include "llvm/Target/TargetMachine.h"
	#include <cctype>
	using namespace llvm;

	/// Construct the lowering object for target machine \p tm. The argument is
	/// forwarded unchanged to the TargetLoweringBase constructor; nothing else is
	/// initialized here.
	/// NOTE: The TargetMachine owns TLOF.
	TargetLowering::TargetLowering(const TargetMachine &tm)
	: TargetLoweringBase(tm) {}

	/// Base implementation of the target node name lookup: it ignores \p Opcode
	/// and always returns nullptr.
	/// NOTE(review): presumably targets override this to name their own nodes --
	/// confirm against the class declaration.
	const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
	return nullptr;
	}

	bool TargetLowering::isPositionIndependent() const {
	return getTargetMachine().isPositionIndependent();
	}

	/// Check whether a given call node is in tail position within its function. If
	/// so, it sets Chain to the input chain of the tail call.
	///
	/// The check is conservative: it fails when the caller's return value carries
	/// any attribute other than NoAlias / NonNull (which includes ZExt and SExt),
	/// and otherwise defers to isUsedByReturnOnly().
	///
	/// \param DAG   The DAG containing \p Node; used to reach the caller function.
	/// \param Node  The call node to inspect.
	/// \param Chain Passed through to isUsedByReturnOnly(), which sets it.
	/// \returns true if the call may be treated as a tail call.
	bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
	                                          SDValue &Chain) const {
	  const Function &F = DAG.getMachineFunction().getFunction();

	  // Conservatively require the attributes of the call to match those of
	  // the return. Ignore NoAlias and NonNull because they don't affect the
	  // call sequence.
	  AttributeList CallerAttrs = F.getAttributes();
	  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
	          .removeAttribute(Attribute::NoAlias)
	          .removeAttribute(Attribute::NonNull)
	          .hasAttributes())
	    return false;

	  // It's not safe to eliminate the sign / zero extension of the return value.
	  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
	      CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
	    return false;

	  // Check if the only use is a function return node.
	  return isUsedByReturnOnly(Node, Chain);
	}

	bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
	const uint32_t *CallerPreservedMask,
	const SmallVectorImpl<CCValAssign> &ArgLocs,
	const SmallVectorImpl<SDValue> &OutVals) const {
	for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
	const CCValAssign &ArgLoc = ArgLocs[I];
	if (!ArgLoc.isRegLoc())
	continue;
	unsigned Reg = ArgLoc.getLocReg();
	// Only look at callee saved registers.
	if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
	continue;
	// Check that we pass the value used for the caller.
	// (We look for a CopyFromReg reading a virtual register that is used
	// for the function live-in value of register Reg)
	SDValue Value = OutVals[I];
	if (Value->getOpcode() != ISD::CopyFromReg)
	return false;
	unsigned ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
	if (MRI.getLiveInPhysReg(ArgReg) != Reg)
	return false;
	}
	return true;
	}

	/// Set CallLoweringInfo attribute flags based on a call instruction
	/// and called function attributes.
	void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
	unsigned ArgIdx) {
	IsSExt = Call->paramHasAttr(ArgIdx, Attribute::SExt);
	IsZExt = Call->paramHasAttr(ArgIdx, Attribute::ZExt);
	IsInReg = Call->paramHasAttr(ArgIdx, Attribute::InReg);
	IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
	IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
	IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
	IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
	IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
	IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
	IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
	Alignment = Call->getParamAlignment(ArgIdx);
	ByValType = nullptr;
	if (Call->paramHasAttr(ArgIdx, Attribute::ByVal))
	ByValType = Call->getParamByValType(ArgIdx);
	}

	/// Generate a libcall taking the given operands as arguments and returning a
	/// result of type RetVT.
	///
	/// \param DAG                    The DAG the call is built in.
	/// \param LC                     The runtime library routine to call; a fatal
	///                               error is reported for UNKNOWN_LIBCALL.
	/// \param RetVT                  Value type of the call result.
	/// \param Ops                    The call operands, in argument order.
	/// \param isSigned               Passed to shouldSignExtendTypeInLibCall() to
	///                               choose sign or zero extension for each
	///                               operand and for the result.
	/// \param dl                     Debug location for the call.
	/// \param doesNotReturn          Marks the call as noreturn.
	/// \param isReturnValueUsed      When false the result is discarded.
	/// \param isPostTypeLegalization Forwarded to the CallLoweringInfo.
	/// \returns the pair produced by LowerCallTo() for the call.
	std::pair<SDValue, SDValue>
	TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
	                            ArrayRef<SDValue> Ops, bool isSigned,
	                            const SDLoc &dl, bool doesNotReturn,
	                            bool isReturnValueUsed,
	                            bool isPostTypeLegalization) const {
	  TargetLowering::ArgListTy Args;
	  Args.reserve(Ops.size());

	  TargetLowering::ArgListEntry Entry;
	  for (SDValue Op : Ops) {
	    Entry.Node = Op;
	    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
	    Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
	    Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
	    Args.push_back(Entry);
	  }

	  if (LC == RTLIB::UNKNOWN_LIBCALL)
	    report_fatal_error("Unsupported library call operation!");
	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  // getTypeForEVT() takes the LLVMContext by reference and yields a Type *.
	  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
	      .setNoReturn(doesNotReturn)
	      .setDiscardResult(!isReturnValueUsed)
	      .setIsPostTypeLegalization(isPostTypeLegalization)
	      .setSExtResult(signExtend)
	      .setZExtResult(!signExtend);
	  return LowerCallTo(CLI);
	}

	/// Determine the sequence of value types to use for lowering a memory
	/// operation that covers \p Size bytes.
	///
	/// The chosen types are appended to \p MemOps, one per load/store.
	///
	/// \param MemOps       Receives the value type of each memory operation.
	/// \param Limit        Maximum number of memory operations allowed.
	/// \param Size         Number of bytes to cover.
	/// \param DstAlign     Specified destination alignment; zero means the
	///                     destination alignment can be changed.
	/// \param SrcAlign     Inferred source alignment; zero means no load of the
	///                     source is needed (memset, or memcpy from a constant
	///                     string).
	/// \param IsMemset, ZeroMemset, MemcpyStrSrc, FuncAttributes
	///                     Forwarded to getOptimalMemOpType().
	/// \param AllowOverlap Permit an unaligned, overlapping access for the tail.
	/// \param DstAS        Destination address space, used for the misaligned
	///                     access queries.
	/// \param SrcAS        Source address space (not used by this
	///                     implementation).
	/// \returns false if SrcAlign is non-zero and smaller than DstAlign, or if
	/// more than \p Limit operations would be needed; true otherwise. MemOps may
	/// already hold entries when false is returned.
	bool
	TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
	                                         unsigned Limit, uint64_t Size,
	                                         unsigned DstAlign, unsigned SrcAlign,
	                                         bool IsMemset,
	                                         bool ZeroMemset,
	                                         bool MemcpyStrSrc,
	                                         bool AllowOverlap,
	                                         unsigned DstAS, unsigned SrcAS,
	                                         const AttributeList &FuncAttributes) const {
	  // If 'SrcAlign' is zero, that means the memory operation does not need to
	  // load the value, i.e. memset or memcpy from constant string. Otherwise,
	  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
	  // is the specified alignment of the memory operation. If it is zero, that
	  // means it's possible to change the alignment of the destination.
	  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	  // not need to be loaded.
	  if (!(SrcAlign == 0 || SrcAlign >= DstAlign))
	    return false;

	  EVT VT = getOptimalMemOpType(Size, DstAlign, SrcAlign,
	                               IsMemset, ZeroMemset, MemcpyStrSrc,
	                               FuncAttributes);

	  if (VT == MVT::Other) {
	    // Use the largest integer type whose alignment constraints are satisfied.
	    // We only need to check DstAlign here as SrcAlign is always greater or
	    // equal to DstAlign (or zero).
	    VT = MVT::i64;
	    while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
	           !allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
	      VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
	    assert(VT.isInteger());

	    // Find the largest legal integer type.
	    MVT LVT = MVT::i64;
	    while (!isTypeLegal(LVT))
	      LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
	    assert(LVT.isInteger());

	    // If the type we've chosen is larger than the largest legal integer type
	    // then use that instead.
	    if (VT.bitsGT(LVT))
	      VT = LVT;
	  }

	  unsigned NumMemOps = 0;
	  while (Size != 0) {
	    unsigned VTSize = VT.getSizeInBits() / 8;
	    while (VTSize > Size) {
	      // For now, only use non-vector load / store's for the left-over pieces.
	      EVT NewVT = VT;
	      unsigned NewVTSize;

	      bool Found = false;
	      if (VT.isVector() || VT.isFloatingPoint()) {
	        NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
	        if (isOperationLegalOrCustom(ISD::STORE, NewVT) &&
	            isSafeMemOpType(NewVT.getSimpleVT()))
	          Found = true;
	        else if (NewVT == MVT::i64 &&
	                 isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
	                 isSafeMemOpType(MVT::f64)) {
	          // i64 is usually not legal on 32-bit targets, but f64 may be.
	          NewVT = MVT::f64;
	          Found = true;
	        }
	      }

	      if (!Found) {
	        // Step down through the simple value types until a safe one is found;
	        // i8 is accepted unconditionally.
	        do {
	          NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
	          if (NewVT == MVT::i8)
	            break;
	        } while (!isSafeMemOpType(NewVT.getSimpleVT()));
	      }
	      NewVTSize = NewVT.getSizeInBits() / 8;

	      // If the new VT cannot cover all of the remaining bits, then consider
	      // issuing a (or a pair of) unaligned and overlapping load / store.
	      bool Fast;
	      if (NumMemOps && AllowOverlap && NewVTSize < Size &&
	          allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign,
	                                         MachineMemOperand::MONone, &Fast) &&
	          Fast)
	        VTSize = Size;
	      else {
	        VT = NewVT;
	        VTSize = NewVTSize;
	      }
	    }

	    if (++NumMemOps > Limit)
	      return false;

	    MemOps.push_back(VT);
	    Size -= VTSize;
	  }

	  return true;
	}

	/// Soften the operands of a comparison. This code is shared among BR_CC,
	/// SELECT_CC, and SETCC handlers.
	///
	/// The floating-point comparison (NewLHS CCCode NewRHS) on type \p VT is
	/// replaced by one or two soft-fp comparison libcalls.
	///
	/// \param DAG     DAG in which the libcalls and helper nodes are created.
	/// \param VT      Type of the compared operands; must be f32, f64, f128 or
	///                ppcf128.
	/// \param NewLHS  In: left operand. Out: the libcall result, or (when two
	///                libcalls are needed) the OR of the two SETCC results.
	/// \param NewRHS  In: right operand. Out: the constant 0 of the libcall
	///                return type, or a null SDValue when two libcalls were
	///                needed and NewLHS already holds the final boolean.
	/// \param CCCode  In: the FP condition. Out: the condition with which the
	///                first libcall's result is compared against NewRHS.
	/// \param dl      Location attached to every created node.
	void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
	                                         SDValue &NewLHS, SDValue &NewRHS,
	                                         ISD::CondCode &CCCode,
	                                         const SDLoc &dl) const {
	  assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 ||
	          VT == MVT::ppcf128) &&
	         "Unsupported setcc type!");

	  // Pick the variant of a comparison libcall that matches the operand type.
	  auto PickLC = [VT](RTLIB::Libcall F32, RTLIB::Libcall F64,
	                     RTLIB::Libcall F128,
	                     RTLIB::Libcall PPCF128) -> RTLIB::Libcall {
	    return (VT == MVT::f32) ? F32 :
	           (VT == MVT::f64) ? F64 :
	           (VT == MVT::f128) ? F128 : PPCF128;
	  };

	  // Resolve each soft-fp predicate once for VT so the mapping below stays
	  // a plain table from condition code to libcall(s).
	  const RTLIB::Libcall OEQ = PickLC(RTLIB::OEQ_F32, RTLIB::OEQ_F64,
	                                    RTLIB::OEQ_F128, RTLIB::OEQ_PPCF128);
	  const RTLIB::Libcall UNE = PickLC(RTLIB::UNE_F32, RTLIB::UNE_F64,
	                                    RTLIB::UNE_F128, RTLIB::UNE_PPCF128);
	  const RTLIB::Libcall OGE = PickLC(RTLIB::OGE_F32, RTLIB::OGE_F64,
	                                    RTLIB::OGE_F128, RTLIB::OGE_PPCF128);
	  const RTLIB::Libcall OLT = PickLC(RTLIB::OLT_F32, RTLIB::OLT_F64,
	                                    RTLIB::OLT_F128, RTLIB::OLT_PPCF128);
	  const RTLIB::Libcall OLE = PickLC(RTLIB::OLE_F32, RTLIB::OLE_F64,
	                                    RTLIB::OLE_F128, RTLIB::OLE_PPCF128);
	  const RTLIB::Libcall OGT = PickLC(RTLIB::OGT_F32, RTLIB::OGT_F64,
	                                    RTLIB::OGT_F128, RTLIB::OGT_PPCF128);
	  const RTLIB::Libcall UO = PickLC(RTLIB::UO_F32, RTLIB::UO_F64,
	                                   RTLIB::UO_F128, RTLIB::UO_PPCF128);
	  const RTLIB::Libcall O = PickLC(RTLIB::O_F32, RTLIB::O_F64,
	                                  RTLIB::O_F128, RTLIB::O_PPCF128);

	  // Expand into one or more soft-fp libcall(s).
	  RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
	  bool ShouldInvertCC = false;
	  switch (CCCode) {
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    LC1 = OEQ;
	    break;
	  case ISD::SETNE:
	  case ISD::SETUNE:
	    LC1 = UNE;
	    break;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    LC1 = OGE;
	    break;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    LC1 = OLT;
	    break;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    LC1 = OLE;
	    break;
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    LC1 = OGT;
	    break;
	  case ISD::SETUO:
	    LC1 = UO;
	    break;
	  case ISD::SETO:
	    LC1 = O;
	    break;
	  case ISD::SETONE:
	    // SETONE = SETOLT | SETOGT
	    LC1 = OLT;
	    LC2 = OGT;
	    break;
	  case ISD::SETUEQ:
	    // SETUEQ = SETUO | SETOEQ
	    LC1 = UO;
	    LC2 = OEQ;
	    break;
	  // The remaining unordered comparisons are expressed as the inverse of the
	  // complementary ordered comparison.
	  case ISD::SETULT:
	    LC1 = OGE;
	    ShouldInvertCC = true;
	    break;
	  case ISD::SETULE:
	    LC1 = OGT;
	    ShouldInvertCC = true;
	    break;
	  case ISD::SETUGT:
	    LC1 = OLE;
	    ShouldInvertCC = true;
	    break;
	  case ISD::SETUGE:
	    LC1 = OLT;
	    ShouldInvertCC = true;
	    break;
	  default:
	    llvm_unreachable("Do not know how to soften this setcc!");
	  }

	  // Use the target specific return value for comparison lib calls.
	  EVT RetVT = getCmpLibcallReturnType();
	  SDValue Ops[2] = {NewLHS, NewRHS};
	  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/,
	                       dl).first;
	  NewRHS = DAG.getConstant(0, dl, RetVT);

	  CCCode = getCmpLibcallCC(LC1);
	  if (ShouldInvertCC)
	    CCCode = getSetCCInverse(CCCode, /*isInteger=*/true);

	  if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
	    // Two libcalls are required: materialize each as a SETCC against zero
	    // and OR the results. A null NewRHS tells the caller that NewLHS is
	    // already the final boolean value.
	    SDValue Tmp = DAG.getNode(
	        ISD::SETCC, dl,
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
	        NewLHS, NewRHS, DAG.getCondCode(CCCode));
	    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false /*sign irrelevant*/,
	                         dl).first;
	    NewLHS = DAG.getNode(
	        ISD::SETCC, dl,
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
	        NewLHS, NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
	    NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
	    NewRHS = SDValue();
	  }
	}

	/// Select how jump table entries are encoded for the current function. The
	/// result is one of the MachineJumpTableInfo::JTEntryKind enumerators.
	unsigned TargetLowering::getJumpTableEncoding() const {
	  if (isPositionIndependent()) {
	    // PIC: prefer the GPRel32 directive when the target's assembler info
	    // provides one; otherwise encode entries as a 32-bit label difference.
	    const bool HasGPRel32 =
	        getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != nullptr;
	    return HasGPRel32 ? MachineJumpTableInfo::EK_GPRel32BlockAddress
	                      : MachineJumpTableInfo::EK_LabelDifference32;
	  }

	  // Non-PIC: entries are simply the addresses of the blocks.
	  return MachineJumpTableInfo::EK_BlockAddress;
	}

	/// Return the value that PIC jump table entries are relative to.
	///
	/// \param Table  The jump table address node.
	/// \param DAG    DAG used to create the global-offset-table node if needed.
	/// \returns The global offset table node when the jump table encoding is
	///          GP-relative (32- or 64-bit); otherwise \p Table itself.
	SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                 SelectionDAG &DAG) const {
	  switch (getJumpTableEncoding()) {
	  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
	  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
	    // If our PIC model is GP relative, use the global offset table as the
	    // base.
	    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(DAG.getDataLayout()));
	  default:
	    return Table;
	  }
	}

	/// MCExpr counterpart of getPICJumpTableRelocBase: produce the relocation
	/// base for PIC jump table \p JTI of function \p MF as an expression created
	/// in \p Ctx.
	const MCExpr *
	TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
	                                             unsigned JTI,
	                                             MCContext &Ctx) const {
	  // By default entries are relative to the symbol that labels the jump
	  // table itself.
	  auto *JTSymbol = MF->getJTISymbol(JTI, Ctx);
	  return MCSymbolRefExpr::create(JTSymbol, Ctx);
	}

	bool
	TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	const TargetMachine &TM = getTargetMachine();
	const GlobalValue *GV = GA->getGlobal();

	// If the address is not even local to this DSO we will have to load it from
	// a got and then add the offset.
	if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
	return false;

	// If the code is position independent we will have to add a base register.
	if (isPositionIndependent())
	return false;

	// Otherwise we can do it.
	return true;
	}

	//===----------------------------------------------------------------------===//
	// Optimization Methods
	//===----------------------------------------------------------------------===//

	/// For an AND/OR/XOR whose right operand is a constant, drop the constant
	/// bits that are not in \p Demanded. On success the replacement is recorded
	/// in \p TLO and true is returned.
	bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
	                                            TargetLoweringOpt &TLO) const {
	  const unsigned Opc = Op.getOpcode();

	  // Targets get the first chance to optimize the constant.
	  if (targetShrinkDemandedConstant(Op, Demanded, TLO))
	    return TLO.New.getNode();

	  // FIXME: ISD::SELECT, ISD::SELECT_CC
	  // Only the bitwise logic operations are handled.
	  if (Opc != ISD::XOR && Opc != ISD::AND && Opc != ISD::OR)
	    return false;

	  auto *RHSConst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!RHSConst)
	    return false;
	  const APInt &Imm = RHSConst->getAPIntValue();

	  // An xor that flips every demanded bit is a 'not'; that is a canonical
	  // form, so leave it alone.
	  if (Opc == ISD::XOR && Demanded.isSubsetOf(Imm))
	    return false;

	  // Nothing to strip when the constant has no bits outside the demanded
	  // set.
	  if (Imm.isSubsetOf(Demanded))
	    return false;

	  // Rebuild the operation with the constant masked down to the demanded
	  // bits.
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue MaskedImm = TLO.DAG.getConstant(Demanded & Imm, DL, VT);
	  SDValue Replacement =
	      TLO.DAG.getNode(Opc, DL, VT, Op.getOperand(0), MaskedImm);
	  return TLO.CombineTo(Op, Replacement);
	}

	/// Rewrite a binary operation x op y as (VT)((SmallVT)x op (SmallVT)y) when
	/// only the low bits are demanded and both casts are free.
	/// The widening side is tested with isZExtFree; this could be generalized
	/// for targets with other kinds of implicit widening casts.
	bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
	                                      const APInt &Demanded,
	                                      TargetLoweringOpt &TLO) const {
	  assert(Op.getNumOperands() == 2 &&
	         "ShrinkDemandedOp only supports binary operators!");
	  assert(Op.getNode()->getNumValues() == 1 &&
	         "ShrinkDemandedOp only supports nodes with one result!");

	  const EVT WideVT = Op.getValueType();

	  // Vector types are not handled by this function.
	  if (WideVT.isVector())
	    return false;

	  // A second user of the node may need the full-width value.
	  if (!Op.getNode()->hasOneUse())
	    return false;

	  SelectionDAG &DAG = TLO.DAG;
	  SDLoc dl(Op);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Walk the power-of-2 integer widths, starting from the smallest one that
	  // holds every demanded bit, looking for a type with free casts in both
	  // directions.
	  const unsigned DemandedSize = Demanded.getActiveBits();
	  unsigned NarrowBits = isPowerOf2_32(DemandedSize)
	                            ? DemandedSize
	                            : NextPowerOf2(DemandedSize);
	  for (; NarrowBits < BitWidth; NarrowBits = NextPowerOf2(NarrowBits)) {
	    EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits);
	    if (!TLI.isTruncateFree(WideVT, NarrowVT))
	      continue;
	    if (!TLI.isZExtFree(NarrowVT, WideVT))
	      continue;

	    // Free casts found: perform the operation in the narrow type and
	    // any-extend the result back to the original type.
	    SDValue NarrowLHS =
	        DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Op.getOperand(0));
	    SDValue NarrowRHS =
	        DAG.getNode(ISD::TRUNCATE, dl, NarrowVT, Op.getOperand(1));
	    SDValue NarrowOp =
	        DAG.getNode(Op.getOpcode(), dl, NarrowVT, NarrowLHS, NarrowRHS);
	    assert(DemandedSize <= NarrowBits && "Narrowed below demanded bits?");
	    SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, NarrowOp);
	    return TLO.CombineTo(Op, Widened);
	  }

	  return false;
	}

	/// DAG-combiner entry point: try to simplify \p Op given that only
	/// \p DemandedBits of its result are used. If a simplification is found,
	/// \p Op is put back on the combiner worklist and the change is committed
	/// through \p DCI.
	bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
	                                          DAGCombinerInfo &DCI) const {
	  const bool AfterLegalizeTypes = !DCI.isBeforeLegalize();
	  const bool AfterLegalizeOps = !DCI.isBeforeLegalizeOps();
	  TargetLoweringOpt TLO(DCI.DAG, AfterLegalizeTypes, AfterLegalizeOps);

	  KnownBits Known;
	  if (!SimplifyDemandedBits(Op, DemandedBits, Known, TLO))
	    return false;

	  DCI.AddToWorklist(Op.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return true;
	}

	/// Convenience overload that demands every vector element of \p Op (or the
	/// single "element" of a scalar) and forwards to the overload that takes an
	/// explicit demanded-elements mask.
	bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
	                                          KnownBits &Known,
	                                          TargetLoweringOpt &TLO,
	                                          unsigned Depth,
	                                          bool AssumeSingleUse) const {
	  const EVT OpVT = Op.getValueType();

	  // A scalar is described by a one-bit mask with that bit set; a vector by
	  // an all-ones mask with one bit per element.
	  APInt AllElts(1, 1);
	  if (OpVT.isVector())
	    AllElts = APInt::getAllOnesValue(OpVT.getVectorNumElements());

	  return SimplifyDemandedBits(Op, DemandedBits, AllElts, Known, TLO, Depth,
	                              AssumeSingleUse);
	}

	/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
	/// result of Op are ever used downstream. If we can use this information to
	/// simplify Op, create a new simplified DAG node and return true, returning the
	/// original and new nodes in Old and New. Otherwise, analyze the expression and
	/// return a mask of Known bits for the expression (used to simplify the
	/// caller). The Known bits may only be accurate for those bits in the
	/// OriginalDemandedBits and OriginalDemandedElts.
	bool TargetLowering::SimplifyDemandedBits(
	SDValue Op, const APInt &OriginalDemandedBits,
	const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
	unsigned Depth, bool AssumeSingleUse) const {
	unsigned BitWidth = OriginalDemandedBits.getBitWidth();
	assert(Op.getScalarValueSizeInBits() == BitWidth &&
	"Mask size mismatches value type size!");

	unsigned NumElts = OriginalDemandedElts.getBitWidth();
	assert((!Op.getValueType().isVector() \|\|
	NumElts == Op.getValueType().getVectorNumElements()) &&
	"Unexpected vector size");

	APInt DemandedBits = OriginalDemandedBits;
	APInt DemandedElts = OriginalDemandedElts;
	SDLoc dl(Op);
	auto &DL = TLO.DAG.getDataLayout();

	// Don't know anything.
	Known = KnownBits(BitWidth);

	// Undef operand.
	if (Op.isUndef())
	return false;

	if (Op.getOpcode() == ISD::Constant) {
	// We know all of the bits for a constant!
	Known.One = cast<ConstantSDNode>(Op)->getAPIntValue();
	Known.Zero = ~Known.One;
	return false;
	}

	// Other users may use these bits.
	EVT VT = Op.getValueType();
	if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
	if (Depth != 0) {
	// If not at the root, Just compute the Known bits to
	// simplify things downstream.
	Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
	return false;
	}
	// If this is the root being simplified, allow it to have multiple uses,
	// just set the DemandedBits/Elts to all bits.
	DemandedBits = APInt::getAllOnesValue(BitWidth);
	DemandedElts = APInt::getAllOnesValue(NumElts);
	} else if (OriginalDemandedBits == 0 \|\| OriginalDemandedElts == 0) {
	// Not demanding any bits/elts from Op.
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
	} else if (Depth == 6) { // Limit search depth.
	return false;
	}

	KnownBits Known2, KnownOut;
	switch (Op.getOpcode()) {
	case ISD::SCALAR_TO_VECTOR: {
	if (!DemandedElts[0])
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));

	KnownBits SrcKnown;
	SDValue Src = Op.getOperand(0);
	unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
	APInt SrcDemandedBits = DemandedBits.zextOrSelf(SrcBitWidth);
	if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1))
	return true;
	Known = SrcKnown.zextOrTrunc(BitWidth, false);
	break;
	}
	case ISD::BUILD_VECTOR:
	// Collect the known bits that are shared by every demanded element.
	// TODO: Call SimplifyDemandedBits for non-constant demanded elements.
	Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
	return false; // Don't fall through, will infinitely loop.
	case ISD::LOAD: {
	LoadSDNode *LD = cast<LoadSDNode>(Op);
	if (getTargetConstantFromLoad(LD)) {
	Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
	return false; // Don't fall through, will infinitely loop.
	}
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue Vec = Op.getOperand(0);
	SDValue Scl = Op.getOperand(1);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	EVT VecVT = Vec.getValueType();

	// If index isn't constant, assume we need all vector elements AND the
	// inserted element.
	APInt DemandedVecElts(DemandedElts);
	if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
	unsigned Idx = CIdx->getZExtValue();
	DemandedVecElts.clearBit(Idx);

	// Inserted element is not required.
	if (!DemandedElts[Idx])
	return TLO.CombineTo(Op, Vec);
	}

	KnownBits KnownScl;
	unsigned NumSclBits = Scl.getScalarValueSizeInBits();
	APInt DemandedSclBits = DemandedBits.zextOrTrunc(NumSclBits);
	if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
	return true;

	Known = KnownScl.zextOrTrunc(BitWidth, false);

	KnownBits KnownVec;
	if (SimplifyDemandedBits(Vec, DemandedBits, DemandedVecElts, KnownVec, TLO,
	Depth + 1))
	return true;

	if (!!DemandedVecElts) {
	Known.One &= KnownVec.One;
	Known.Zero &= KnownVec.Zero;
	}

	return false;
	}
	case ISD::INSERT_SUBVECTOR: {
	SDValue Base = Op.getOperand(0);
	SDValue Sub = Op.getOperand(1);
	EVT SubVT = Sub.getValueType();
	unsigned NumSubElts = SubVT.getVectorNumElements();

	// If index isn't constant, assume we need the original demanded base
	// elements and ALL the inserted subvector elements.
	APInt BaseElts = DemandedElts;
	APInt SubElts = APInt::getAllOnesValue(NumSubElts);
	if (isa<ConstantSDNode>(Op.getOperand(2))) {
	const APInt &Idx = Op.getConstantOperandAPInt(2);
	if (Idx.ule(NumElts - NumSubElts)) {
	unsigned SubIdx = Idx.getZExtValue();
	SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
	BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
	}
	}

	KnownBits KnownSub, KnownBase;
	if (SimplifyDemandedBits(Sub, DemandedBits, SubElts, KnownSub, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedBits(Base, DemandedBits, BaseElts, KnownBase, TLO,
	Depth + 1))
	return true;

	Known.Zero.setAllBits();
	Known.One.setAllBits();
	if (!!SubElts) {
	Known.One &= KnownSub.One;
	Known.Zero &= KnownSub.Zero;
	}
	if (!!BaseElts) {
	Known.One &= KnownBase.One;
	Known.Zero &= KnownBase.Zero;
	}
	break;
	}
	case ISD::CONCAT_VECTORS: {
	Known.Zero.setAllBits();
	Known.One.setAllBits();
	EVT SubVT = Op.getOperand(0).getValueType();
	unsigned NumSubVecs = Op.getNumOperands();
	unsigned NumSubElts = SubVT.getVectorNumElements();
	for (unsigned i = 0; i != NumSubVecs; ++i) {
	APInt DemandedSubElts =
	DemandedElts.extractBits(NumSubElts, i * NumSubElts);
	if (SimplifyDemandedBits(Op.getOperand(i), DemandedBits, DemandedSubElts,
	Known2, TLO, Depth + 1))
	return true;
	// Known bits are shared by every demanded subvector element.
	if (!!DemandedSubElts) {
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	}
	break;
	}
	case ISD::VECTOR_SHUFFLE: {
	ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();

	// Collect demanded elements from shuffle operands..
	APInt DemandedLHS(NumElts, 0);
	APInt DemandedRHS(NumElts, 0);
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;
	int M = ShuffleMask[i];
	if (M < 0) {
	// For UNDEF elements, we don't know anything about the common state of
	// the shuffle result.
	DemandedLHS.clearAllBits();
	DemandedRHS.clearAllBits();
	break;
	}
	assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
	if (M < (int)NumElts)
	DemandedLHS.setBit(M);
	else
	DemandedRHS.setBit(M - NumElts);
	}

	if (!!DemandedLHS \|\| !!DemandedRHS) {
	Known.Zero.setAllBits();
	Known.One.setAllBits();
	if (!!DemandedLHS) {
	if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS,
	Known2, TLO, Depth + 1))
	return true;
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	if (!!DemandedRHS) {
	if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS,
	Known2, TLO, Depth + 1))
	return true;
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	}
	break;
	}
	case ISD::AND: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// If the RHS is a constant, check to see if the LHS would be zero without
	// using the bits from the RHS. Below, we use knowledge about the RHS to
	// simplify the LHS, here we're using information from the LHS to simplify
	// the RHS.
	if (ConstantSDNode *RHSC = isConstOrConstSplat(Op1)) {
	// Do not increment Depth here; that can cause an infinite loop.
	KnownBits LHSKnown = TLO.DAG.computeKnownBits(Op0, DemandedElts, Depth);
	// If the LHS already has zeros where RHSC does, this 'and' is dead.
	if ((LHSKnown.Zero & DemandedBits) ==
	(~RHSC->getAPIntValue() & DemandedBits))
	return TLO.CombineTo(Op, Op0);

	// If any of the set bits in the RHS are known zero on the LHS, shrink
	// the constant.
	if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & DemandedBits, TLO))
	return true;

	// Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
	// constant, but if this 'and' is only clearing bits that were just set by
	// the xor, then this 'and' can be eliminated by shrinking the mask of
	// the xor. For example, for a 32-bit X:
	// and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
	if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
	LHSKnown.One == ~RHSC->getAPIntValue()) {
	SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, VT, Op0.getOperand(0), Op1);
	return TLO.CombineTo(Op, Xor);
	}
	}

	if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts,
	Known2, TLO, Depth + 1))
	return true;
	assert(!Known2.hasConflict() && "Bits known to be one AND zero?");

	// If all of the demanded bits are known one on one side, return the other.
	// These bits cannot contribute to the result of the 'and'.
	if (DemandedBits.isSubsetOf(Known2.Zero \| Known.One))
	return TLO.CombineTo(Op, Op0);
	if (DemandedBits.isSubsetOf(Known.Zero \| Known2.One))
	return TLO.CombineTo(Op, Op1);
	// If all of the demanded bits in the inputs are known zeros, return zero.
	if (DemandedBits.isSubsetOf(Known.Zero \| Known2.Zero))
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
	// If the RHS is a constant, see if we can simplify it.
	if (ShrinkDemandedConstant(Op, ~Known2.Zero & DemandedBits, TLO))
	return true;
	// If the operation can be done in a smaller type, do so.
	if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
	return true;

	// Output known-1 bits are only known if set in both the LHS & RHS.
	Known.One &= Known2.One;
	// Output known-0 are known to be clear if zero in either the LHS \| RHS.
	Known.Zero \|= Known2.Zero;
	break;
	}
	case ISD::OR: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts,
	Known2, TLO, Depth + 1))
	return true;
	assert(!Known2.hasConflict() && "Bits known to be one AND zero?");

	// If all of the demanded bits are known zero on one side, return the other.
	// These bits cannot contribute to the result of the 'or'.
	if (DemandedBits.isSubsetOf(Known2.One \| Known.Zero))
	return TLO.CombineTo(Op, Op0);
	if (DemandedBits.isSubsetOf(Known.One \| Known2.Zero))
	return TLO.CombineTo(Op, Op1);
	// If the RHS is a constant, see if we can simplify it.
	if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
	return true;
	// If the operation can be done in a smaller type, do so.
	if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
	return true;

	// Output known-0 bits are only known if clear in both the LHS & RHS.
	Known.Zero &= Known2.Zero;
	// Output known-1 are known to be set if set in either the LHS \| RHS.
	Known.One \|= Known2.One;
	break;
	}
	case ISD::XOR: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known2, TLO,
	Depth + 1))
	return true;
	assert(!Known2.hasConflict() && "Bits known to be one AND zero?");

	// If all of the demanded bits are known zero on one side, return the other.
	// These bits cannot contribute to the result of the 'xor'.
	if (DemandedBits.isSubsetOf(Known.Zero))
	return TLO.CombineTo(Op, Op0);
	if (DemandedBits.isSubsetOf(Known2.Zero))
	return TLO.CombineTo(Op, Op1);
	// If the operation can be done in a smaller type, do so.
	if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
	return true;

	// If all of the unknown bits are known to be zero on one side or the other
	// (but not both) turn this into an inclusive or.
	// e.g. (A & C1)^(B & C2) -> (A & C1)\|(B & C2) iff C1&C2 == 0
	if (DemandedBits.isSubsetOf(Known.Zero \| Known2.Zero))
	return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));

	// Output known-0 bits are known if clear or set in both the LHS & RHS.
	KnownOut.Zero = (Known.Zero & Known2.Zero) \| (Known.One & Known2.One);
	// Output known-1 are known to be set if set in only one of the LHS, RHS.
	KnownOut.One = (Known.Zero & Known2.One) \| (Known.One & Known2.Zero);

	if (ConstantSDNode *C = isConstOrConstSplat(Op1)) {
	// If one side is a constant, and all of the known set bits on the other
	// side are also set in the constant, turn this into an AND, as we know
	// the bits will be cleared.
	// e.g. (X \| C1) ^ C2 --> (X \| C1) & ~C2 iff (C1&C2) == C2
	// NB: it is okay if more bits are known than are requested
	if (C->getAPIntValue() == Known2.One) {
	SDValue ANDC =
	TLO.DAG.getConstant(~C->getAPIntValue() & DemandedBits, dl, VT);
	return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, Op0, ANDC));
	}

	// If the RHS is a constant, see if we can change it. Don't alter a -1
	// constant because that's a 'not' op, and that is better for combining
	// and codegen.
	if (!C->isAllOnesValue()) {
	if (DemandedBits.isSubsetOf(C->getAPIntValue())) {
	// We're flipping all demanded bits. Flip the undemanded bits too.
	SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
	return TLO.CombineTo(Op, New);
	}
	// If we can't turn this into a 'not', try to shrink the constant.
	if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
	return true;
	}
	}

	Known = std::move(KnownOut);
	break;
	}
	case ISD::SELECT:
	if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, Known2, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	assert(!Known2.hasConflict() && "Bits known to be one AND zero?");

	// If the operands are constants, see if we can simplify them.
	if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
	return true;

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SELECT_CC:
	if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, Known, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known2, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	assert(!Known2.hasConflict() && "Bits known to be one AND zero?");

	// If the operands are constants, see if we can simplify them.
	if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
	return true;

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	case ISD::SETCC: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	// If (1) we only need the sign-bit, (2) the setcc operands are the same
	// width as the setcc result, and (3) the result of a setcc conforms to 0 or
	// -1, we may be able to bypass the setcc.
	if (DemandedBits.isSignMask() &&
	Op0.getScalarValueSizeInBits() == BitWidth &&
	getBooleanContents(VT) ==
	BooleanContent::ZeroOrNegativeOneBooleanContent) {
	// If we're testing X < 0, then this compare isn't needed - just use X!
	// FIXME: We're limiting to integer types here, but this should also work
	// if we don't care about FP signed-zero. The use of SETLT with FP means
	// that we don't care about NaNs.
	if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
	(isNullConstant(Op1) \|\| ISD::isBuildVectorAllZeros(Op1.getNode())))
	return TLO.CombineTo(Op, Op0);

	// TODO: Should we check for other forms of sign-bit comparisons?
	// Examples: X <= -1, X >= 0
	}
	if (getBooleanContents(Op0.getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent &&
	BitWidth > 1)
	Known.Zero.setBitsFrom(1);
	break;
	}
	case ISD::SHL: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
	// If the shift count is an invalid immediate, don't do anything.
	if (SA->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = SA->getZExtValue();
	if (ShAmt == 0)
	return TLO.CombineTo(Op, Op0);

	// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
	// single shift. We can do this if the bottom bits (which are shifted
	// out) are never demanded.
	// TODO - support non-uniform vector amounts.
	if (Op0.getOpcode() == ISD::SRL) {
	if ((DemandedBits & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
	if (ConstantSDNode *SA2 =
	isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
	if (SA2->getAPIntValue().ult(BitWidth)) {
	unsigned C1 = SA2->getZExtValue();
	unsigned Opc = ISD::SHL;
	int Diff = ShAmt - C1;
	if (Diff < 0) {
	Diff = -Diff;
	Opc = ISD::SRL;
	}

	SDValue NewSA = TLO.DAG.getConstant(Diff, dl, Op1.getValueType());
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
	}
	}
	}
	}

	if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), DemandedElts,
	Known, TLO, Depth + 1))
	return true;

	// Try shrinking the operation as long as the shift amount will still be
	// in range.
	if ((ShAmt < DemandedBits.getActiveBits()) &&
	ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
	return true;

	// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
	// are not demanded. This will likely allow the anyext to be folded away.
	if (Op0.getOpcode() == ISD::ANY_EXTEND) {
	SDValue InnerOp = Op0.getOperand(0);
	EVT InnerVT = InnerOp.getValueType();
	unsigned InnerBits = InnerVT.getScalarSizeInBits();
	if (ShAmt < InnerBits && DemandedBits.getActiveBits() <= InnerBits &&
	isTypeDesirableForOp(ISD::SHL, InnerVT)) {
	EVT ShTy = getShiftAmountTy(InnerVT, DL);
	if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
	ShTy = InnerVT;
	SDValue NarrowShl =
	TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp,
	TLO.DAG.getConstant(ShAmt, dl, ShTy));
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
	}
	// Repeat the SHL optimization above in cases where an extension
	// intervenes: (shl (anyext (shr x, c1)), c2) to
	// (shl (anyext x), c2-c1). This requires that the bottom c1 bits
	// aren't demanded (as above) and that the shifted upper c1 bits of
	// x aren't demanded.
	if (Op0.hasOneUse() && InnerOp.getOpcode() == ISD::SRL &&
	InnerOp.hasOneUse()) {
	if (ConstantSDNode *SA2 =
	isConstOrConstSplat(InnerOp.getOperand(1))) {
	unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
	if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
	DemandedBits.getActiveBits() <=
	(InnerBits - InnerShAmt + ShAmt) &&
	DemandedBits.countTrailingZeros() >= ShAmt) {
	SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
	Op1.getValueType());
	SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
	InnerOp.getOperand(0));
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(ISD::SHL, dl, VT, NewExt, NewSA));
	}
	}
	}
	}

	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// low bits known zero.
	Known.Zero.setLowBits(ShAmt);
	}
	break;
	}
	case ISD::SRL: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
	// If the shift count is an invalid immediate, don't do anything.
	if (SA->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = SA->getZExtValue();
	if (ShAmt == 0)
	return TLO.CombineTo(Op, Op0);

	EVT ShiftVT = Op1.getValueType();
	APInt InDemandedMask = (DemandedBits << ShAmt);

	// If the shift is exact, then it does demand the low bits (and knows that
	// they are zero).
	if (Op->getFlags().hasExact())
	InDemandedMask.setLowBits(ShAmt);

	// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
	// single shift. We can do this if the top bits (which are shifted out)
	// are never demanded.
	// TODO - support non-uniform vector amounts.
	if (Op0.getOpcode() == ISD::SHL) {
	if (ConstantSDNode *SA2 =
	isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
	if ((DemandedBits & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) {
	if (SA2->getAPIntValue().ult(BitWidth)) {
	unsigned C1 = SA2->getZExtValue();
	unsigned Opc = ISD::SRL;
	int Diff = ShAmt - C1;
	if (Diff < 0) {
	Diff = -Diff;
	Opc = ISD::SHL;
	}

	SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
	}
	}
	}
	}

	// Compute the new bits that are at the top now.
	if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	Known.Zero.setHighBits(ShAmt); // High bits known zero.
	}
	break;
	}
	case ISD::SRA: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// If this is an arithmetic shift right and only the low-bit is set, we can
	// always convert this into a logical shr, even if the shift amount is
	// variable. The low bit of the shift cannot be an input sign bit unless
	// the shift amount is >= the size of the datatype, which is undefined.
	if (DemandedBits.isOneValue())
	return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));

	if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
	// If the shift count is an invalid immediate, don't do anything.
	if (SA->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = SA->getZExtValue();
	if (ShAmt == 0)
	return TLO.CombineTo(Op, Op0);

	APInt InDemandedMask = (DemandedBits << ShAmt);

	// If the shift is exact, then it does demand the low bits (and knows that
	// they are zero).
	if (Op->getFlags().hasExact())
	InDemandedMask.setLowBits(ShAmt);

	// If any of the demanded bits are produced by the sign extension, we also
	// demand the input sign bit.
	if (DemandedBits.countLeadingZeros() < ShAmt)
	InDemandedMask.setSignBit();

	if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	// If the input sign bit is known to be zero, or if none of the top bits
	// are demanded, turn this into an unsigned shift right.
	if (Known.Zero[BitWidth - ShAmt - 1] \|\|
	DemandedBits.countLeadingZeros() >= ShAmt) {
	SDNodeFlags Flags;
	Flags.setExact(Op->getFlags().hasExact());
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1, Flags));
	}

	int Log2 = DemandedBits.exactLogBase2();
	if (Log2 >= 0) {
	// The bit must come from the sign.
	SDValue NewSA =
	TLO.DAG.getConstant(BitWidth - 1 - Log2, dl, Op1.getValueType());
	return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, NewSA));
	}

	if (Known.One[BitWidth - ShAmt - 1])
	// New bits are known one.
	Known.One.setHighBits(ShAmt);
	}
	break;
	}
	case ISD::FSHL:
	case ISD::FSHR: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	bool IsFSHL = (Op.getOpcode() == ISD::FSHL);

	if (ConstantSDNode *SA = isConstOrConstSplat(Op2, DemandedElts)) {
	unsigned Amt = SA->getAPIntValue().urem(BitWidth);

	// For fshl, 0-shift returns the 1st arg.
	// For fshr, 0-shift returns the 2nd arg.
	if (Amt == 0) {
	if (SimplifyDemandedBits(IsFSHL ? Op0 : Op1, DemandedBits, DemandedElts,
	Known, TLO, Depth + 1))
	return true;
	break;
	}

	// fshl: (Op0 << Amt) \| (Op1 >> (BW - Amt))
	// fshr: (Op0 << (BW - Amt)) \| (Op1 >> Amt)
	APInt Demanded0 = DemandedBits.lshr(IsFSHL ? Amt : (BitWidth - Amt));
	APInt Demanded1 = DemandedBits << (IsFSHL ? (BitWidth - Amt) : Amt);
	if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedBits(Op1, Demanded1, DemandedElts, Known, TLO,
	Depth + 1))
	return true;

	Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt));
	Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt));
	Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
	Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
	Known.One \|= Known2.One;
	Known.Zero \|= Known2.Zero;
	}
	break;
	}
	case ISD::BITREVERSE: {
	SDValue Src = Op.getOperand(0);
	APInt DemandedSrcBits = DemandedBits.reverseBits();
	if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
	Depth + 1))
	return true;
	Known.One = Known2.One.reverseBits();
	Known.Zero = Known2.Zero.reverseBits();
	break;
	}
	case ISD::SIGN_EXTEND_INREG: {
	SDValue Op0 = Op.getOperand(0);
	EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	unsigned ExVTBits = ExVT.getScalarSizeInBits();

	// If we only care about the highest bit, don't bother shifting right.
	if (DemandedBits.isSignMask()) {
	unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0);
	bool AlreadySignExtended = NumSignBits >= BitWidth - ExVTBits + 1;
	// However if the input is already sign extended we expect the sign
	// extension to be dropped altogether later and do not simplify.
	if (!AlreadySignExtended) {
	// Compute the correct shift amount type, which must be getShiftAmountTy
	// for scalar types after legalization.
	EVT ShiftAmtTy = VT;
	if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
	ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);

	SDValue ShiftAmt =
	TLO.DAG.getConstant(BitWidth - ExVTBits, dl, ShiftAmtTy);
	return TLO.CombineTo(Op,
	TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, ShiftAmt));
	}
	}

	// If none of the extended bits are demanded, eliminate the sextinreg.
	if (DemandedBits.getActiveBits() <= ExVTBits)
	return TLO.CombineTo(Op, Op0);

	APInt InputDemandedBits = DemandedBits.getLoBits(ExVTBits);

	// Since the sign extended bits are demanded, we know that the sign
	// bit is demanded.
	InputDemandedBits.setBit(ExVTBits - 1);

	if (SimplifyDemandedBits(Op0, InputDemandedBits, Known, TLO, Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");

	// If the sign bit of the input is known set or clear, then we know the
	// top bits of the result.

	// If the input sign bit is known zero, convert this into a zero extension.
	if (Known.Zero[ExVTBits - 1])
	return TLO.CombineTo(
	Op, TLO.DAG.getZeroExtendInReg(Op0, dl, ExVT.getScalarType()));

	APInt Mask = APInt::getLowBitsSet(BitWidth, ExVTBits);
	if (Known.One[ExVTBits - 1]) { // Input sign bit known set
	Known.One.setBitsFrom(ExVTBits);
	Known.Zero &= Mask;
	} else { // Input sign bit unknown
	Known.Zero &= Mask;
	Known.One &= Mask;
	}
	break;
	}
	case ISD::BUILD_PAIR: {
	EVT HalfVT = Op.getOperand(0).getValueType();
	unsigned HalfBitWidth = HalfVT.getScalarSizeInBits();

	APInt MaskLo = DemandedBits.getLoBits(HalfBitWidth).trunc(HalfBitWidth);
	APInt MaskHi = DemandedBits.getHiBits(HalfBitWidth).trunc(HalfBitWidth);

	KnownBits KnownLo, KnownHi;

	if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1))
	return true;

	if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1))
	return true;

	Known.Zero = KnownLo.Zero.zext(BitWidth) \|
	KnownHi.Zero.zext(BitWidth).shl(HalfBitWidth);

	Known.One = KnownLo.One.zext(BitWidth) \|
	KnownHi.One.zext(BitWidth).shl(HalfBitWidth);
	break;
	}
	case ISD::ZERO_EXTEND:
	case ISD::ZERO_EXTEND_VECTOR_INREG: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	unsigned InBits = SrcVT.getScalarSizeInBits();
	unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
	bool IsVecInReg = Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;

	// If none of the top bits are demanded, convert this into an any_extend.
	if (DemandedBits.getActiveBits() <= InBits) {
	// If we only need the non-extended bits of the bottom element
	// then we can just bitcast to the result.
	if (IsVecInReg && DemandedElts == 1 &&
	VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	TLO.DAG.getDataLayout().isLittleEndian())
	return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));

	unsigned Opc =
	IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
	if (!TLO.LegalOperations() \|\| isOperationLegal(Opc, VT))
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
	}

	APInt InDemandedBits = DemandedBits.trunc(InBits);
	APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);
	if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	assert(Known.getBitWidth() == InBits && "Src width has changed?");
	Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
	break;
	}
	case ISD::SIGN_EXTEND:
	case ISD::SIGN_EXTEND_VECTOR_INREG: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	unsigned InBits = SrcVT.getScalarSizeInBits();
	unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
	bool IsVecInReg = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;

	// If none of the top bits are demanded, convert this into an any_extend.
	if (DemandedBits.getActiveBits() <= InBits) {
	// If we only need the non-extended bits of the bottom element
	// then we can just bitcast to the result.
	if (IsVecInReg && DemandedElts == 1 &&
	VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	TLO.DAG.getDataLayout().isLittleEndian())
	return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));

	unsigned Opc =
	IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
	if (!TLO.LegalOperations() \|\| isOperationLegal(Opc, VT))
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
	}

	APInt InDemandedBits = DemandedBits.trunc(InBits);
	APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);

	// Since some of the sign extended bits are demanded, we know that the sign
	// bit is demanded.
	InDemandedBits.setBit(InBits - 1);

	if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	assert(Known.getBitWidth() == InBits && "Src width has changed?");

	// If the sign bit is known one, the top bits match.
	Known = Known.sext(BitWidth);

	// If the sign bit is known zero, convert this to a zero extend.
	if (Known.isNonNegative()) {
	unsigned Opc =
	IsVecInReg ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND;
	if (!TLO.LegalOperations() \|\| isOperationLegal(Opc, VT))
	return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
	}
	break;
	}
	case ISD::ANY_EXTEND:
	case ISD::ANY_EXTEND_VECTOR_INREG: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	unsigned InBits = SrcVT.getScalarSizeInBits();
	unsigned InElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
	bool IsVecInReg = Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG;

	// If we only need the bottom element then we can just bitcast.
	// TODO: Handle ANY_EXTEND?
	if (IsVecInReg && DemandedElts == 1 &&
	VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	TLO.DAG.getDataLayout().isLittleEndian())
	return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));

	APInt InDemandedBits = DemandedBits.trunc(InBits);
	APInt InDemandedElts = DemandedElts.zextOrSelf(InElts);
	if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
	Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	assert(Known.getBitWidth() == InBits && "Src width has changed?");
	Known = Known.zext(BitWidth, false /* => any extend */);
	break;
	}
	case ISD::TRUNCATE: {
	SDValue Src = Op.getOperand(0);

	// Simplify the input, using demanded bit information, and compute the known
	// zero/one bits live out.
	unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
	APInt TruncMask = DemandedBits.zext(OperandBitWidth);
	if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
	return true;
	Known = Known.trunc(BitWidth);

	// If the input is only used by this truncate, see if we can shrink it based
	// on the known demanded bits.
	if (Src.getNode()->hasOneUse()) {
	switch (Src.getOpcode()) {
	default:
	break;
	case ISD::SRL:
	// Shrink SRL by a constant if none of the high bits shifted in are
	// demanded.
	if (TLO.LegalTypes() && !isTypeDesirableForOp(ISD::SRL, VT))
	// Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
	// undesirable.
	break;

	auto *ShAmt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
	if (!ShAmt \|\| ShAmt->getAPIntValue().uge(BitWidth))
	break;

	SDValue Shift = Src.getOperand(1);
	uint64_t ShVal = ShAmt->getZExtValue();

	if (TLO.LegalTypes())
	Shift = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));

	APInt HighBits =
	APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
	HighBits.lshrInPlace(ShVal);
	HighBits = HighBits.trunc(BitWidth);

	if (!(HighBits & DemandedBits)) {
	// None of the shifted in bits are needed. Add a truncate of the
	// shift input, then shift it.
	SDValue NewTrunc =
	TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, Src.getOperand(0));
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc, Shift));
	}
	break;
	}
	}

	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	break;
	}
	case ISD::AssertZext: {
	// AssertZext demands all of the high bits, plus any of the low bits
	// demanded by its users.
	EVT ZVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	APInt InMask = APInt::getLowBitsSet(BitWidth, ZVT.getSizeInBits());
	if (SimplifyDemandedBits(Op.getOperand(0), ~InMask \| DemandedBits, Known,
	TLO, Depth + 1))
	return true;
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");

	Known.Zero \|= ~InMask;
	break;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	SDValue Src = Op.getOperand(0);
	SDValue Idx = Op.getOperand(1);
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	unsigned EltBitWidth = Src.getScalarValueSizeInBits();

	// Demand the bits from every vector element without a constant index.
	APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
	if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
	if (CIdx->getAPIntValue().ult(NumSrcElts))
	DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue());

	// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
	// anything about the extended bits.
	APInt DemandedSrcBits = DemandedBits;
	if (BitWidth > EltBitWidth)
	DemandedSrcBits = DemandedSrcBits.trunc(EltBitWidth);

	if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, Known2, TLO,
	Depth + 1))
	return true;

	Known = Known2;
	if (BitWidth > EltBitWidth)
	Known = Known.zext(BitWidth, false /* => any extend */);
	break;
	}
	case ISD::BITCAST: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();

	// If this is an FP->Int bitcast and if the sign bit is the only
	// thing demanded, turn this into a FGETSIGN.
	if (!TLO.LegalOperations() && !VT.isVector() && !SrcVT.isVector() &&
	DemandedBits == APInt::getSignMask(Op.getValueSizeInBits()) &&
	SrcVT.isFloatingPoint()) {
	bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
	bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
	if ((OpVTLegal \|\| i32Legal) && VT.isSimple() && SrcVT != MVT::f16 &&
	SrcVT != MVT::f128) {
	// Cannot eliminate/lower SHL for f128 yet.
	EVT Ty = OpVTLegal ? VT : MVT::i32;
	// Make a FGETSIGN + SHL to move the sign bit into the appropriate
	// place. We expect the SHL to be eliminated by other optimizations.
	SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Src);
	unsigned OpVTSizeInBits = Op.getValueSizeInBits();
	if (!OpVTLegal && OpVTSizeInBits > 32)
	Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
	unsigned ShVal = Op.getValueSizeInBits() - 1;
	SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
	return TLO.CombineTo(Op,
	TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
	}
	}

	// Bitcast from a vector using SimplifyDemanded Bits/VectorElts.
	// Demand the elt/bit if any of the original elts/bits are demanded.
	// TODO - bigendian once we have test coverage.
	// TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
	if (SrcVT.isVector() && NumSrcEltBits > 1 &&
	(BitWidth % NumSrcEltBits) == 0 &&
	TLO.DAG.getDataLayout().isLittleEndian()) {
	unsigned Scale = BitWidth / NumSrcEltBits;
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
	APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
	for (unsigned i = 0; i != Scale; ++i) {
	unsigned Offset = i * NumSrcEltBits;
	APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
	if (!Sub.isNullValue()) {
	DemandedSrcBits \|= Sub;
	for (unsigned j = 0; j != NumElts; ++j)
	if (DemandedElts[j])
	DemandedSrcElts.setBit((j * Scale) + i);
	}
	}

	APInt KnownSrcUndef, KnownSrcZero;
	if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
	KnownSrcZero, TLO, Depth + 1))
	return true;

	KnownBits KnownSrcBits;
	if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
	KnownSrcBits, TLO, Depth + 1))
	return true;
	} else if ((NumSrcEltBits % BitWidth) == 0 &&
	TLO.DAG.getDataLayout().isLittleEndian()) {
	unsigned Scale = NumSrcEltBits / BitWidth;
	unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
	APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
	APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	unsigned Offset = (i % Scale) * BitWidth;
	DemandedSrcBits.insertBits(DemandedBits, Offset);
	DemandedSrcElts.setBit(i / Scale);
	}

	if (SrcVT.isVector()) {
	APInt KnownSrcUndef, KnownSrcZero;
	if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
	KnownSrcZero, TLO, Depth + 1))
	return true;
	}

	KnownBits KnownSrcBits;
	if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
	KnownSrcBits, TLO, Depth + 1))
	return true;
	}

	// If this is a bitcast, let computeKnownBits handle it. Only do this on a
	// recursive call where Known may be useful to the caller.
	if (Depth > 0) {
	Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
	return false;
	}
	break;
	}
	case ISD::ADD:
	case ISD::MUL:
	case ISD::SUB: {
	// Add, Sub, and Mul don't demand any bits in positions beyond that
	// of the highest bit demanded of them.
	SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
	unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros();
	APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
	if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO,
	Depth + 1) \|\|
	SimplifyDemandedBits(Op1, LoMask, DemandedElts, Known2, TLO,
	Depth + 1) \|\|
	// See if the operation should be performed at a smaller bit width.
	ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
	SDNodeFlags Flags = Op.getNode()->getFlags();
	if (Flags.hasNoSignedWrap() \|\| Flags.hasNoUnsignedWrap()) {
	// Disable the nsw and nuw flags. We can no longer guarantee that we
	// won't wrap after simplification.
	Flags.setNoSignedWrap(false);
	Flags.setNoUnsignedWrap(false);
	SDValue NewOp =
	TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags);
	return TLO.CombineTo(Op, NewOp);
	}
	return true;
	}

	// If we have a constant operand, we may be able to turn it into -1 if we
	// do not demand the high bits. This can make the constant smaller to
	// encode, allow more general folding, or match specialized instruction
	// patterns (eg, 'blsr' on x86). Don't bother changing 1 to -1 because that
	// is probably not useful (and could be detrimental).
	ConstantSDNode *C = isConstOrConstSplat(Op1);
	APInt HighMask = APInt::getHighBitsSet(BitWidth, DemandedBitsLZ);
	if (C && !C->isAllOnesValue() && !C->isOne() &&
	(C->getAPIntValue() \| HighMask).isAllOnesValue()) {
	SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
	// We can't guarantee that the new math op doesn't wrap, so explicitly
	// clear those flags to prevent folding with a potential existing node
	// that has those flags set.
	SDNodeFlags Flags;
	Flags.setNoSignedWrap(false);
	Flags.setNoUnsignedWrap(false);
	SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags);
	return TLO.CombineTo(Op, NewOp);
	}

	LLVM_FALLTHROUGH;
	}
	default:
	if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
	if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts,
	Known, TLO, Depth))
	return true;
	break;
	}

	// Just use computeKnownBits to compute output bits.
	Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
	break;
	}

	// If we know the value of all of the demanded bits, return this as a
	// constant.
	if (DemandedBits.isSubsetOf(Known.Zero \| Known.One)) {
	// Avoid folding to a constant if any OpaqueConstant is involved.
	const SDNode *N = Op.getNode();
	for (SDNodeIterator I = SDNodeIterator::begin(N),
	E = SDNodeIterator::end(N);
	I != E; ++I) {
	SDNode Op = I;
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
	if (C->isOpaque())
	return false;
	}
	// TODO: Handle float bits as well.
	if (VT.isInteger())
	return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
	}

	return false;
	}

	/// DAG-combiner entry point for demanded-vector-element simplification.
	///
	/// Builds a TargetLoweringOpt from the state of \p DCI, forwards to the
	/// TargetLoweringOpt-based overload, and, when that overload reports a
	/// simplification, queues Op's node on the combiner worklist and commits the
	/// recorded replacement through \p DCI.
	///
	/// \returns true if the forwarded call simplified something, false otherwise.
	bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
	                                                const APInt &DemandedElts,
	                                                APInt &KnownUndef,
	                                                APInt &KnownZero,
	                                                DAGCombinerInfo &DCI) const {
	  // The two flags passed to the TLO are the negations of the combiner's
	  // "before legalize" / "before legalize ops" phase queries.
	  TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
	                        !DCI.isBeforeLegalizeOps());

	  // Nothing was simplified: leave the worklist and the DAG untouched.
	  if (!SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO))
	    return false;

	  // Revisit Op and publish the replacement captured in TLO.
	  DCI.AddToWorklist(Op.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return true;
	}

	/// For a vector binary operation, work out which result lanes are undefined,
	/// given the lanes already known to be undefined in each of its two operands.
	///
	/// \p UndefOp0 and \p UndefOp1 must each be exactly as wide as the number of
	/// elements in BO's vector type (asserted). The returned mask has bit i set
	/// when building the scalar form of the operation on lane i's undef/constant
	/// inputs yields an undef node.
	static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG,
	                                         const APInt &UndefOp0,
	                                         const APInt &UndefOp1) {
	  EVT VT = BO.getValueType();
	  assert(DAG.getTargetLoweringInfo().isBinOp(BO.getOpcode()) && VT.isVector() &&
	         "Vector binop only");

	  unsigned NumElts = VT.getVectorNumElements();
	  EVT EltVT = VT.getVectorElementType();
	  assert(UndefOp0.getBitWidth() == NumElts &&
	         UndefOp1.getBitWidth() == NumElts && "Bad type for undef analysis");

	  // Yield a scalar stand-in for lane `Lane` of `Vec`: an UNDEF of the element
	  // type when the lane is flagged undefined, the build-vector operand when it
	  // is a usable constant/undef, and an empty SDValue in every other case.
	  auto scalarForLane = [&](SDValue Vec, unsigned Lane,
	                           const APInt &UndefLanes) {
	    if (UndefLanes[Lane])
	      return DAG.getUNDEF(EltVT);

	    auto *BuildVec = dyn_cast<BuildVectorSDNode>(Vec);
	    if (!BuildVec)
	      return SDValue();

	    // Be strict about what is handed back so that the later getNode() call
	    // does not create temporary nodes. Opaque integer constants are rejected
	    // because they do not constant fold.
	    SDValue Scalar = BuildVec->getOperand(Lane);
	    if (isa<ConstantFPSDNode>(Scalar))
	      return Scalar;
	    if (Scalar.isUndef())
	      return Scalar;
	    auto *IntC = dyn_cast<ConstantSDNode>(Scalar);
	    if (IntC && !IntC->isOpaque())
	      return Scalar;

	    return SDValue();
	  };

	  const unsigned Opcode = BO.getOpcode();
	  APInt UndefLanes = APInt::getNullValue(NumElts);
	  for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
	    // Both operands are resolved up front, before any filtering.
	    SDValue LHS = scalarForLane(BO.getOperand(0), Lane, UndefOp0);
	    SDValue RHS = scalarForLane(BO.getOperand(1), Lane, UndefOp1);

	    // Only lanes where both inputs resolved to a constant or undef qualify.
	    if (!(LHS && RHS))
	      continue;

	    // The scalars must also have exactly the vector's element type.
	    // TODO: Ideally we would use FoldConstantArithmetic() here, but that does
	    // not handle FP constants. The code within getNode() should be refactored
	    // to avoid the danger of creating a bogus temporary node here.
	    if (LHS.getValueType() == EltVT && RHS.getValueType() == EltVT) {
	      SDValue Folded = DAG.getNode(Opcode, SDLoc(BO), EltVT, LHS, RHS);
	      if (Folded.isUndef())
	        UndefLanes.setBit(Lane);
	    }
	  }
	  return UndefLanes;
	}

	bool TargetLowering::SimplifyDemandedVectorElts(
	SDValue Op, const APInt &OriginalDemandedElts, APInt &KnownUndef,
	APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
	bool AssumeSingleUse) const {
	EVT VT = Op.getValueType();
	APInt DemandedElts = OriginalDemandedElts;
	unsigned NumElts = DemandedElts.getBitWidth();
	assert(VT.isVector() && "Expected vector op");
	assert(VT.getVectorNumElements() == NumElts &&
	"Mask size mismatches value type element count!");

	KnownUndef = KnownZero = APInt::getNullValue(NumElts);

	// Undef operand.
	if (Op.isUndef()) {
	KnownUndef.setAllBits();
	return false;
	}

	// If Op has other users, assume that all elements are needed.
	if (!Op.getNode()->hasOneUse() && !AssumeSingleUse)
	DemandedElts.setAllBits();

	// Not demanding any elements from Op.
	if (DemandedElts == 0) {
	KnownUndef.setAllBits();
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
	}

	// Limit search depth.
	if (Depth >= 6)
	return false;

	SDLoc DL(Op);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();

	switch (Op.getOpcode()) {
	case ISD::SCALAR_TO_VECTOR: {
	if (!DemandedElts[0]) {
	KnownUndef.setAllBits();
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
	}
	KnownUndef.setHighBits(NumElts - 1);
	break;
	}
	case ISD::BITCAST: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();

	// We only handle vectors here.
	// TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
	if (!SrcVT.isVector())
	break;

	// Fast handling of 'identity' bitcasts.
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	if (NumSrcElts == NumElts)
	return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
	KnownZero, TLO, Depth + 1);

	APInt SrcZero, SrcUndef;
	APInt SrcDemandedElts = APInt::getNullValue(NumSrcElts);

	// Bitcast from 'large element' src vector to 'small element' vector, we
	// must demand a source element if any DemandedElt maps to it.
	if ((NumElts % NumSrcElts) == 0) {
	unsigned Scale = NumElts / NumSrcElts;
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SrcDemandedElts.setBit(i / Scale);

	if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
	TLO, Depth + 1))
	return true;

	// Try calling SimplifyDemandedBits, converting demanded elts to the bits
	// of the large element.
	// TODO - bigendian once we have test coverage.
	if (TLO.DAG.getDataLayout().isLittleEndian()) {
	unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
	APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	unsigned Ofs = (i % Scale) * EltSizeInBits;
	SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
	}

	KnownBits Known;
	if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
	return true;
	}

	// If the src element is zero/undef then all the output elements will be -
	// only demanded elements are guaranteed to be correct.
	for (unsigned i = 0; i != NumSrcElts; ++i) {
	if (SrcDemandedElts[i]) {
	if (SrcZero[i])
	KnownZero.setBits(i * Scale, (i + 1) * Scale);
	if (SrcUndef[i])
	KnownUndef.setBits(i * Scale, (i + 1) * Scale);
	}
	}
	}

	// Bitcast from 'small element' src vector to 'large element' vector, we
	// demand all smaller source elements covered by the larger demanded element
	// of this vector.
	if ((NumSrcElts % NumElts) == 0) {
	unsigned Scale = NumSrcElts / NumElts;
	for (unsigned i = 0; i != NumElts; ++i)
	if (DemandedElts[i])
	SrcDemandedElts.setBits(i * Scale, (i + 1) * Scale);

	if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
	TLO, Depth + 1))
	return true;

	// If all the src elements covering an output element are zero/undef, then
	// the output element will be as well, assuming it was demanded.
	for (unsigned i = 0; i != NumElts; ++i) {
	if (DemandedElts[i]) {
	if (SrcZero.extractBits(Scale, i * Scale).isAllOnesValue())
	KnownZero.setBit(i);
	if (SrcUndef.extractBits(Scale, i * Scale).isAllOnesValue())
	KnownUndef.setBit(i);
	}
	}
	}
	break;
	}
	case ISD::BUILD_VECTOR: {
	// Check all elements and simplify any unused elements with UNDEF.
	if (!DemandedElts.isAllOnesValue()) {
	// Don't simplify BROADCASTS.
	if (llvm::any_of(Op->op_values(),
	[&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
	SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
	bool Updated = false;
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i] && !Ops[i].isUndef()) {
	Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
	KnownUndef.setBit(i);
	Updated = true;
	}
	}
	if (Updated)
	return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
	}
	}
	for (unsigned i = 0; i != NumElts; ++i) {
	SDValue SrcOp = Op.getOperand(i);
	if (SrcOp.isUndef()) {
	KnownUndef.setBit(i);
	} else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
	(isNullConstant(SrcOp) \|\| isNullFPConstant(SrcOp))) {
	KnownZero.setBit(i);
	}
	}
	break;
	}
	case ISD::CONCAT_VECTORS: {
	EVT SubVT = Op.getOperand(0).getValueType();
	unsigned NumSubVecs = Op.getNumOperands();
	unsigned NumSubElts = SubVT.getVectorNumElements();
	for (unsigned i = 0; i != NumSubVecs; ++i) {
	SDValue SubOp = Op.getOperand(i);
	APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
	APInt SubUndef, SubZero;
	if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
	Depth + 1))
	return true;
	KnownUndef.insertBits(SubUndef, i * NumSubElts);
	KnownZero.insertBits(SubZero, i * NumSubElts);
	}
	break;
	}
	case ISD::INSERT_SUBVECTOR: {
	if (!isa<ConstantSDNode>(Op.getOperand(2)))
	break;
	SDValue Base = Op.getOperand(0);
	SDValue Sub = Op.getOperand(1);
	EVT SubVT = Sub.getValueType();
	unsigned NumSubElts = SubVT.getVectorNumElements();
	const APInt &Idx = Op.getConstantOperandAPInt(2);
	if (Idx.ugt(NumElts - NumSubElts))
	break;
	unsigned SubIdx = Idx.getZExtValue();
	APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
	APInt SubUndef, SubZero;
	if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
	Depth + 1))
	return true;
	APInt BaseElts = DemandedElts;
	BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
	if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
	Depth + 1))
	return true;
	KnownUndef.insertBits(SubUndef, SubIdx);
	KnownZero.insertBits(SubZero, SubIdx);
	break;
	}
	case ISD::EXTRACT_SUBVECTOR: {
	SDValue Src = Op.getOperand(0);
	ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
	// Offset the demanded elts by the subvector index.
	uint64_t Idx = SubIdx->getZExtValue();
	APInt SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	KnownUndef = SrcUndef.extractBits(NumElts, Idx);
	KnownZero = SrcZero.extractBits(NumElts, Idx);
	}
	break;
	}
	case ISD::INSERT_VECTOR_ELT: {
	SDValue Vec = Op.getOperand(0);
	SDValue Scl = Op.getOperand(1);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));

	// For a legal, constant insertion index, if we don't need this insertion
	// then strip it, else remove it from the demanded elts.
	if (CIdx && CIdx->getAPIntValue().ult(NumElts)) {
	unsigned Idx = CIdx->getZExtValue();
	if (!DemandedElts[Idx])
	return TLO.CombineTo(Op, Vec);

	APInt DemandedVecElts(DemandedElts);
	DemandedVecElts.clearBit(Idx);
	if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
	KnownZero, TLO, Depth + 1))
	return true;

	KnownUndef.clearBit(Idx);
	if (Scl.isUndef())
	KnownUndef.setBit(Idx);

	KnownZero.clearBit(Idx);
	if (isNullConstant(Scl) \|\| isNullFPConstant(Scl))
	KnownZero.setBit(Idx);
	break;
	}

	APInt VecUndef, VecZero;
	if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO,
	Depth + 1))
	return true;
	// Without knowing the insertion index we can't set KnownUndef/KnownZero.
	break;
	}
	case ISD::VSELECT: {
	// Try to transform the select condition based on the current demanded
	// elements.
	// TODO: If a condition element is undef, we can choose from one arm of the
	// select (and if one arm is undef, then we can propagate that to the
	// result).
	// TODO - add support for constant vselect masks (see IR version of this).
	APInt UnusedUndef, UnusedZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UnusedUndef,
	UnusedZero, TLO, Depth + 1))
	return true;

	// See if we can simplify either vselect operand.
	APInt DemandedLHS(DemandedElts);
	APInt DemandedRHS(DemandedElts);
	APInt UndefLHS, ZeroLHS;
	APInt UndefRHS, ZeroRHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedLHS, UndefLHS,
	ZeroLHS, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedRHS, UndefRHS,
	ZeroRHS, TLO, Depth + 1))
	return true;

	KnownUndef = UndefLHS & UndefRHS;
	KnownZero = ZeroLHS & ZeroRHS;
	break;
	}
	case ISD::VECTOR_SHUFFLE: {
	ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();

	// Collect demanded elements from shuffle operands..
	APInt DemandedLHS(NumElts, 0);
	APInt DemandedRHS(NumElts, 0);
	for (unsigned i = 0; i != NumElts; ++i) {
	int M = ShuffleMask[i];
	if (M < 0 \|\| !DemandedElts[i])
	continue;
	assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
	if (M < (int)NumElts)
	DemandedLHS.setBit(M);
	else
	DemandedRHS.setBit(M - NumElts);
	}

	// See if we can simplify either shuffle operand.
	APInt UndefLHS, ZeroLHS;
	APInt UndefRHS, ZeroRHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS,
	ZeroLHS, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS,
	ZeroRHS, TLO, Depth + 1))
	return true;

	// Simplify mask using undef elements from LHS/RHS.
	bool Updated = false;
	bool IdentityLHS = true, IdentityRHS = true;
	SmallVector<int, 32> NewMask(ShuffleMask.begin(), ShuffleMask.end());
	for (unsigned i = 0; i != NumElts; ++i) {
	int &M = NewMask[i];
	if (M < 0)
	continue;
	if (!DemandedElts[i] \|\| (M < (int)NumElts && UndefLHS[M]) \|\|
	(M >= (int)NumElts && UndefRHS[M - NumElts])) {
	Updated = true;
	M = -1;
	}
	IdentityLHS &= (M < 0) \|\| (M == (int)i);
	IdentityRHS &= (M < 0) \|\| ((M - NumElts) == i);
	}

	// Update legal shuffle masks based on demanded elements if it won't reduce
	// to Identity which can cause premature removal of the shuffle mask.
	if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
	isShuffleMaskLegal(NewMask, VT))
	return TLO.CombineTo(Op,
	TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
	Op.getOperand(1), NewMask));

	// Propagate undef/zero elements from LHS/RHS.
	for (unsigned i = 0; i != NumElts; ++i) {
	int M = ShuffleMask[i];
	if (M < 0) {
	KnownUndef.setBit(i);
	} else if (M < (int)NumElts) {
	if (UndefLHS[M])
	KnownUndef.setBit(i);
	if (ZeroLHS[M])
	KnownZero.setBit(i);
	} else {
	if (UndefRHS[M - NumElts])
	KnownUndef.setBit(i);
	if (ZeroRHS[M - NumElts])
	KnownZero.setBit(i);
	}
	}
	break;
	}
	case ISD::ANY_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG: {
	APInt SrcUndef, SrcZero;
	SDValue Src = Op.getOperand(0);
	unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts);
	if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	KnownZero = SrcZero.zextOrTrunc(NumElts);
	KnownUndef = SrcUndef.zextOrTrunc(NumElts);

	if (Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG &&
	Op.getValueSizeInBits() == Src.getValueSizeInBits() &&
	DemandedSrcElts == 1 && TLO.DAG.getDataLayout().isLittleEndian()) {
	// aext - if we just need the bottom element then we can bitcast.
	return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
	}

	if (Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
	// zext(undef) upper bits are guaranteed to be zero.
	if (DemandedElts.isSubsetOf(KnownUndef))
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
	KnownUndef.clearAllBits();
	}
	break;
	}

	// TODO: There are more binop opcodes that could be handled here - MUL, MIN,
	// MAX, saturated math, etc.
	case ISD::OR:
	case ISD::XOR:
	case ISD::ADD:
	case ISD::SUB:
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	case ISD::FDIV:
	case ISD::FREM: {
	APInt UndefRHS, ZeroRHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
	ZeroRHS, TLO, Depth + 1))
	return true;
	APInt UndefLHS, ZeroLHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
	ZeroLHS, TLO, Depth + 1))
	return true;

	KnownZero = ZeroLHS & ZeroRHS;
	KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
	break;
	}
	case ISD::SHL:
	case ISD::SRL:
	case ISD::SRA:
	case ISD::ROTL:
	case ISD::ROTR: {
	APInt UndefRHS, ZeroRHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
	ZeroRHS, TLO, Depth + 1))
	return true;
	APInt UndefLHS, ZeroLHS;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
	ZeroLHS, TLO, Depth + 1))
	return true;

	KnownZero = ZeroLHS;
	KnownUndef = UndefLHS & UndefRHS; // TODO: use getKnownUndefForVectorBinop?
	break;
	}
	case ISD::MUL:
	case ISD::AND: {
	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
	SrcZero, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
	KnownZero, TLO, Depth + 1))
	return true;

	// If either side has a zero element, then the result element is zero, even
	// if the other is an UNDEF.
	// TODO: Extend getKnownUndefForVectorBinop to also deal with known zeros
	// and then handle 'and' nodes with the rest of the binop opcodes.
	KnownZero \|= SrcZero;
	KnownUndef &= SrcUndef;
	KnownUndef &= ~KnownZero;
	break;
	}
	case ISD::TRUNCATE:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
	KnownZero, TLO, Depth + 1))
	return true;

	if (Op.getOpcode() == ISD::ZERO_EXTEND) {
	// zext(undef) upper bits are guaranteed to be zero.
	if (DemandedElts.isSubsetOf(KnownUndef))
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
	KnownUndef.clearAllBits();
	}
	break;
	default: {
	if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
	if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
	KnownZero, TLO, Depth))
	return true;
	} else {
	KnownBits Known;
	APInt DemandedBits = APInt::getAllOnesValue(EltSizeInBits);
	if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known,
	TLO, Depth, AssumeSingleUse))
	return true;
	}
	break;
	}
	}
	assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");

	// Constant fold all undef cases.
	// TODO: Handle zero cases as well.
	if (DemandedElts.isSubsetOf(KnownUndef))
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));

	return false;
	}

	/// Default known-bits hook for target-specific nodes.
	///
	/// Op must be a target node (opcode >= ISD::BUILTIN_OP_END) or one of the
	/// INTRINSIC_WO_CHAIN / INTRINSIC_W_CHAIN / INTRINSIC_VOID nodes; this is
	/// asserted. This base implementation does not look at DemandedElts, DAG or
	/// Depth: it only resets Known.
	void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	                                                   KnownBits &Known,
	                                                   const APInt &DemandedElts,
	                                                   const SelectionDAG &DAG,
	                                                   unsigned Depth) const {
	  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
	          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
	         "Should use MaskedValueIsZero if you don't know whether Op"
	         " is a target node!");
	  Known.resetAll();
	}

	/// Known-bits hook for frame index nodes.
	///
	/// Asserts that Op is a FrameIndexSDNode, asks the DAG for the pointer
	/// alignment it can infer for Op, and marks the corresponding low bits of
	/// Known as zero. If the inferred alignment is 0, Known is left untouched.
	/// DemandedElts and Depth are not used here.
	void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
	                                                   KnownBits &Known,
	                                                   const APInt &DemandedElts,
	                                                   const SelectionDAG &DAG,
	                                                   unsigned Depth) const {
	  assert(isa<FrameIndexSDNode>(Op) && "expected FrameIndex");

	  const unsigned InferredAlign = DAG.InferPtrAlignment(Op);
	  if (InferredAlign == 0)
	    return;

	  // An aligned pointer has log2(alignment) low bits that are known zero.
	  const unsigned NumLowZeroBits = Log2_32(InferredAlign);
	  Known.Zero.setLowBits(NumLowZeroBits);
	}

	/// This method can be implemented by targets that want to expose additional
	/// information about sign bits to the DAG Combiner.
	///
	/// Op must be a target node (opcode >= ISD::BUILTIN_OP_END) or one of the
	/// INTRINSIC_* nodes; this is asserted. The base implementation ignores the
	/// demanded elements, the DAG and Depth and always returns 1.
	unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
	                                                         const APInt &,
	                                                         const SelectionDAG &,
	                                                         unsigned Depth) const {
	  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
	          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
	         "Should use ComputeNumSignBits if you don't know whether Op"
	         " is a target node!");
	  return 1;
	}

	/// Default demanded-vector-elements hook for target-specific nodes.
	///
	/// Op must be a target node (opcode >= ISD::BUILTIN_OP_END) or one of the
	/// INTRINSIC_* nodes; this is asserted. The base implementation performs no
	/// simplification: it leaves KnownUndef/KnownZero and TLO untouched and
	/// always returns false.
	bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
	    TargetLoweringOpt &TLO, unsigned Depth) const {
	  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
	          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
	         "Should use SimplifyDemandedVectorElts if you don't know whether Op"
	         " is a target node!");
	  return false;
	}

	/// Default demanded-bits hook for target-specific nodes.
	///
	/// Op must be a target node (opcode >= ISD::BUILTIN_OP_END) or one of the
	/// INTRINSIC_* nodes; this is asserted. The base implementation does not
	/// rewrite anything: it fills Known by forwarding to
	/// computeKnownBitsForTargetNode and always returns false. DemandedBits is
	/// not consulted.
	bool TargetLowering::SimplifyDemandedBitsForTargetNode(
	    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
	    KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
	  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
	          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
	         "Should use SimplifyDemandedBits if you don't know whether Op"
	         " is a target node!");
	  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
	  return false;
	}

	/// Default hook for recovering the constant behind a load: the base
	/// implementation ignores its argument and returns nullptr.
	/// NOTE(review): the pointer declarators were missing in this copy of the
	/// source (returning nullptr from a by-value `const Constant` cannot be
	/// right); confirm the signature against the declaration in the header.
	const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode *) const {
	  return nullptr;
	}

	/// Default never-NaN hook for target-specific nodes.
	///
	/// Op must be a target node (opcode >= ISD::BUILTIN_OP_END) or one of the
	/// INTRINSIC_* nodes; this is asserted. The base implementation ignores DAG,
	/// SNaN and Depth and always returns false.
	bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
	                                                  const SelectionDAG &DAG,
	                                                  bool SNaN,
	                                                  unsigned Depth) const {
	  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
	          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
	          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
	         "Should use isKnownNeverNaN if you don't know whether Op"
	         " is a target node!");
	  return false;
	}

	// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
	// work with truncating build vectors and vectors with elements of less than
	// 8 bits.
	bool TargetLowering::isConstTrueVal(const SDNode *N) const {
	if (!N)
	return false;

	APInt CVal;
	if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
	CVal = CN->getAPIntValue();
	} else if (auto *BV = dyn_cast<BuildVectorSDNode>(N)) {
	auto *CN = BV->getConstantSplatNode();
	if (!CN)
	return false;

	// If this is a truncating build vector, truncate the splat value.
	// Otherwise, we may fail to match the expected values below.
	unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits();
	CVal = CN->getAPIntValue();
	if (BVEltWidth < CVal.getBitWidth())
	CVal = CVal.trunc(BVEltWidth);
	} else {
	return false;
	}

	switch (getBooleanContents(N->getValueType(0))) {
	case UndefinedBooleanContent:
	return CVal[0];
	case ZeroOrOneBooleanContent:
	return CVal.isOneValue();
	case ZeroOrNegativeOneBooleanContent:
	return CVal.isAllOnesValue();
	}

	llvm_unreachable("Invalid boolean contents");
	}

	bool TargetLowering::isConstFalseVal(const SDNode *N) const {
	if (!N)
	return false;

	const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
	if (!CN) {
	const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
	if (!BV)
	return false;

	// Only interested in constant splats, we don't care about undef
	// elements in identifying boolean constants and getConstantSplatNode
	// returns NULL if all ops are undef;
	CN = BV->getConstantSplatNode();
	if (!CN)
	return false;
	}

	if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
	return !CN->getAPIntValue()[0];

	return CN->isNullValue();
	}

	/// Return true if the constant N is the "true" value of a boolean of type VT
	/// after extension; SExt says whether the extension was a sign extension
	/// (otherwise it is treated as a zero extension).
	///
	/// For VT == i1 the answer is simply whether N is one. Otherwise the result
	/// depends on the boolean contents the target reports for VT.
	bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
	                                       bool SExt) const {
	  if (VT == MVT::i1)
	    return N->isOne();

	  TargetLowering::BooleanContent Cnt = getBooleanContents(VT);
	  switch (Cnt) {
	  case TargetLowering::ZeroOrOneBooleanContent:
	    // An extended value of 1 is always true, unless its original type is i1,
	    // in which case it will be sign extended to -1.
	    // NOTE(review): the sign-extended arm only checks that N's type is not
	    // i1 and never looks at N's value - confirm this is intended.
	    return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
	  case TargetLowering::UndefinedBooleanContent:
	  case TargetLowering::ZeroOrNegativeOneBooleanContent:
	    return N->isAllOnesValue() && SExt;
	  }
	  llvm_unreachable("Unexpected enumeration.");
	}

	/// This helper function of SimplifySetCC tries to optimize the comparison when
	/// either operand of the SetCC node is a bitwise-and instruction.
	///
	/// Only integer SETEQ/SETNE comparisons of the form (X & Y) ==/!= Y (in any
	/// operand order) are handled. Returns the replacement setcc, or a null
	/// SDValue when no fold applies.
	SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
	                                         ISD::CondCode Cond, const SDLoc &DL,
	                                         DAGCombinerInfo &DCI) const {
	  // Match these patterns in any of their permutations:
	  //   (X & Y) == Y
	  //   (X & Y) != Y
	  // Canonicalize so that the 'and' (if there is exactly one) is N0.
	  if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND)
	    std::swap(N0, N1);

	  EVT OpVT = N0.getValueType();
	  if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() ||
	      (Cond != ISD::SETEQ && Cond != ISD::SETNE))
	    return SDValue();

	  // Y is the 'and' operand that is also the other setcc operand; X is the
	  // remaining 'and' operand.
	  SDValue X, Y;
	  if (N0.getOperand(0) == N1) {
	    X = N0.getOperand(1);
	    Y = N0.getOperand(0);
	  } else if (N0.getOperand(1) == N1) {
	    X = N0.getOperand(0);
	    Y = N0.getOperand(1);
	  } else {
	    return SDValue();
	  }

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Zero = DAG.getConstant(0, DL, OpVT);
	  if (DAG.isKnownToBeAPowerOfTwo(Y)) {
	    // Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set.
	    // Note that where Y is variable and is known to have at most one bit set
	    // (for example, if it is Z & 1) we cannot do this; the expressions are not
	    // equivalent when Y == 0.
	    Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
	    // After operation legalization the inverted condition code must itself
	    // be legal; otherwise fall through and give up.
	    if (DCI.isBeforeLegalizeOps() ||
	        isCondCodeLegal(Cond, N0.getSimpleValueType()))
	      return DAG.getSetCC(DL, VT, N0, Zero, Cond);
	  } else if (N0.hasOneUse() && hasAndNotCompare(Y)) {
	    // If the target supports an 'and-not' or 'and-complement' logic operation,
	    // try to use that to make a comparison operation more efficient.
	    // But don't do this transform if the mask is a single bit because there are
	    // more efficient ways to deal with that case (for example, 'bt' on x86 or
	    // 'rlwinm' on PPC).

	    // Bail out if the compare operand that we want to turn into a zero is
	    // already a zero (otherwise, infinite loop).
	    auto *YConst = dyn_cast<ConstantSDNode>(Y);
	    if (YConst && YConst->isNullValue())
	      return SDValue();

	    // Transform this into: ~X & Y == 0.
	    SDValue NotX = DAG.getNOT(SDLoc(X), X, OpVT);
	    SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, NotX, Y);
	    return DAG.getSetCC(DL, VT, NewAnd, Zero, Cond);
	  }

	  return SDValue();
	}

	/// There are multiple IR patterns that could be checking whether certain
	/// truncation of a signed number would be lossy or not. The pattern which is
	/// best at IR level, may not lower optimally. Thus, we want to unfold it.
	/// We are looking for the following pattern: (KeptBits is a constant)
	///   (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
	/// KeptBits won't be bitwidth(x), that will be constant-folded to true/false.
	/// KeptBits also can't be 1, that would have been folded to %x dstcond 0
	/// We will unfold it into the natural trunc+sext pattern:
	///   ((%x << C) a>> C) dstcond %x
	/// Where C = bitwidth(x) - KeptBits and C u< bitwidth(x)
	///
	/// Returns the unfolded setcc (of type SCCVT), or a null SDValue when the
	/// pattern does not match or the target declines the transform.
	SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
	    EVT SCCVT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI,
	    const SDLoc &DL) const {
	  // We must be comparing with a constant.
	  ConstantSDNode *C1;
	  if (!(C1 = dyn_cast<ConstantSDNode>(N1)))
	    return SDValue();

	  // N0 should be:  add %x, (1 << (KeptBits-1))
	  if (N0->getOpcode() != ISD::ADD)
	    return SDValue();

	  // And we must be 'add'ing a constant.
	  ConstantSDNode *C01;
	  if (!(C01 = dyn_cast<ConstantSDNode>(N0->getOperand(1))))
	    return SDValue();

	  SDValue X = N0->getOperand(0);
	  EVT XVT = X.getValueType();

	  // Validate constants ...

	  APInt I1 = C1->getAPIntValue();

	  // Map the unsigned source predicate onto eq/ne. The non-strict forms
	  // (ule/ugt) are first rewritten as the strict form with constant + 1.
	  ISD::CondCode NewCond;
	  if (Cond == ISD::CondCode::SETULT) {
	    NewCond = ISD::CondCode::SETEQ;
	  } else if (Cond == ISD::CondCode::SETULE) {
	    NewCond = ISD::CondCode::SETEQ;
	    // But need to 'canonicalize' the constant.
	    I1 += 1;
	  } else if (Cond == ISD::CondCode::SETUGT) {
	    NewCond = ISD::CondCode::SETNE;
	    // But need to 'canonicalize' the constant.
	    I1 += 1;
	  } else if (Cond == ISD::CondCode::SETUGE) {
	    NewCond = ISD::CondCode::SETNE;
	  } else
	    return SDValue();

	  APInt I01 = C01->getAPIntValue();

	  // Captures by reference on purpose: the constants may be negated below
	  // and then re-checked.
	  auto checkConstants = [&I1, &I01]() -> bool {
	    // Both of them must be power-of-two, and the constant from setcc is bigger.
	    return I1.ugt(I01) && I1.isPowerOf2() && I01.isPowerOf2();
	  };

	  if (checkConstants()) {
	    // Great, e.g. got  icmp ult i16 (add i16 %x, 128), 256
	  } else {
	    // What if we invert constants? (and the target predicate)
	    I1.negate();
	    I01.negate();
	    NewCond = getSetCCInverse(NewCond, /*isInteger=*/true);
	    if (!checkConstants())
	      return SDValue();
	    // Great, e.g. got  icmp uge i16 (add i16 %x, -128), -256
	  }

	  // They are power-of-two, so which bit is set?
	  const unsigned KeptBits = I1.logBase2();
	  const unsigned KeptBitsMinusOne = I01.logBase2();

	  // Magic!
	  if (KeptBits != (KeptBitsMinusOne + 1))
	    return SDValue();
	  assert(KeptBits > 0 && KeptBits < XVT.getSizeInBits() && "unreachable");

	  // We don't want to do this in every single case.
	  SelectionDAG &DAG = DCI.DAG;
	  if (!DAG.getTargetLoweringInfo().shouldTransformSignedTruncationCheck(
	          XVT, KeptBits))
	    return SDValue();

	  const unsigned MaskedBits = XVT.getSizeInBits() - KeptBits;
	  assert(MaskedBits > 0 && MaskedBits < XVT.getSizeInBits() && "unreachable");

	  // Unfold into:  ((%x << C) a>> C) cond %x
	  // Where 'cond' will be either 'eq' or 'ne'.
	  SDValue ShiftAmt = DAG.getConstant(MaskedBits, DL, XVT);
	  SDValue T0 = DAG.getNode(ISD::SHL, DL, XVT, X, ShiftAmt);
	  SDValue T1 = DAG.getNode(ISD::SRA, DL, XVT, T0, ShiftAmt);
	  SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, X, NewCond);

	  return T2;
	}

	/// Try to fold an equality comparison with a {add/sub/xor} binary operation as
	/// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
	/// handle the commuted versions of these patterns.
	///
	/// N0 must be an ADD, SUB or XOR and Cond must be SETEQ or SETNE (both
	/// asserted). Returns the simplified setcc, or a null SDValue when neither
	/// operand of N0 equals N1 or the remaining fold is not applicable.
	SDValue TargetLowering::foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1,
	                                           ISD::CondCode Cond, const SDLoc &DL,
	                                           DAGCombinerInfo &DCI) const {
	  unsigned BOpcode = N0.getOpcode();
	  assert((BOpcode == ISD::ADD || BOpcode == ISD::SUB || BOpcode == ISD::XOR) &&
	         "Unexpected binop");
	  assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Unexpected condcode");

	  // (X + Y) == X --> Y == 0
	  // (X - Y) == X --> Y == 0
	  // (X ^ Y) == X --> Y == 0
	  SelectionDAG &DAG = DCI.DAG;
	  EVT OpVT = N0.getValueType();
	  SDValue X = N0.getOperand(0);
	  SDValue Y = N0.getOperand(1);
	  if (X == N1)
	    return DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, OpVT), Cond);

	  if (Y != N1)
	    return SDValue();

	  // (X + Y) == Y --> X == 0
	  // (X ^ Y) == Y --> X == 0
	  if (BOpcode == ISD::ADD || BOpcode == ISD::XOR)
	    return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, OpVT), Cond);

	  // Only the SUB case remains. It creates a new shift node, so it is only
	  // done when the subtract has a single use.
	  // The shift would not be valid if the operands are boolean (i1).
	  if (!N0.hasOneUse() || OpVT.getScalarSizeInBits() == 1)
	    return SDValue();

	  // (X - Y) == Y --> X == Y << 1
	  EVT ShiftVT = getShiftAmountTy(OpVT, DAG.getDataLayout(),
	                                 !DCI.isBeforeLegalize());
	  SDValue One = DAG.getConstant(1, DL, ShiftVT);
	  SDValue YShl1 = DAG.getNode(ISD::SHL, DL, N1.getValueType(), Y, One);
	  // Queue the new shift for the combiner unless we are running from the
	  // legalizer.
	  if (!DCI.isCalledByLegalizer())
	    DCI.AddToWorklist(YShl1.getNode());
	  return DAG.getSetCC(DL, VT, X, YShl1, Cond);
	}

	/// Try to simplify a setcc built with the specified operands and cc. If it is
	/// unable to simplify it, return a null SDValue.
	SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
	ISD::CondCode Cond, bool foldBooleans,
	DAGCombinerInfo &DCI,
	const SDLoc &dl) const {
	SelectionDAG &DAG = DCI.DAG;
	EVT OpVT = N0.getValueType();

	// Constant fold or commute setcc.
	if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
	return Fold;

	// Ensure that the constant occurs on the RHS and fold constant comparisons.
	// TODO: Handle non-splat vector constants. All undef causes trouble.
	ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
	if (isConstOrConstSplat(N0) &&
	(DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
	return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);

	// If we have a subtract with the same 2 non-constant operands as this setcc
	// -- but in reverse order -- then try to commute the operands of this setcc
	// to match. A matching pair of setcc (cmp) and sub may be combined into 1
	// instruction on some targets.
	if (!isConstOrConstSplat(N0) && !isConstOrConstSplat(N1) &&
	(DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) &&
	DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N1, N0 } ) &&
	!DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), { N0, N1 } ))
	return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);

	if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
	const APInt &C1 = N1C->getAPIntValue();

	// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
	// equality comparison, then we're just comparing whether X itself is
	// zero.
	if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() \|\| C1.isOneValue()) &&
	N0.getOperand(0).getOpcode() == ISD::CTLZ &&
	N0.getOperand(1).getOpcode() == ISD::Constant) {
	const APInt &ShAmt = N0.getConstantOperandAPInt(1);
	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	ShAmt == Log2_32(N0.getValueSizeInBits())) {
	if ((C1 == 0) == (Cond == ISD::SETEQ)) {
	// (srl (ctlz x), 5) == 0 -> X != 0
	// (srl (ctlz x), 5) != 1 -> X != 0
	Cond = ISD::SETNE;
	} else {
	// (srl (ctlz x), 5) != 0 -> X == 0
	// (srl (ctlz x), 5) == 1 -> X == 0
	Cond = ISD::SETEQ;
	}
	SDValue Zero = DAG.getConstant(0, dl, N0.getValueType());
	return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
	Zero, Cond);
	}
	}

	SDValue CTPOP = N0;
	// Look through truncs that don't change the value of a ctpop.
	if (N0.hasOneUse() && N0.getOpcode() == ISD::TRUNCATE)
	CTPOP = N0.getOperand(0);

	if (CTPOP.hasOneUse() && CTPOP.getOpcode() == ISD::CTPOP &&
	(N0 == CTPOP \|\|
	N0.getValueSizeInBits() > Log2_32_Ceil(CTPOP.getValueSizeInBits()))) {
	EVT CTVT = CTPOP.getValueType();
	SDValue CTOp = CTPOP.getOperand(0);

	// (ctpop x) u< 2 -> (x & x-1) == 0
	// (ctpop x) u> 1 -> (x & x-1) != 0
	if ((Cond == ISD::SETULT && C1 == 2) \|\| (Cond == ISD::SETUGT && C1 == 1)){
	SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp,
	DAG.getConstant(1, dl, CTVT));
	SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub);
	ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
	return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, dl, CTVT), CC);
	}

	// If ctpop is not supported, expand a power-of-2 comparison based on it.
	if (C1 == 1 && !isOperationLegalOrCustom(ISD::CTPOP, CTVT) &&
	(Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
	// (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
	// (ctpop x) != 1 --> (x == 0) \|\| ((x & x-1) != 0)
	SDValue Zero = DAG.getConstant(0, dl, CTVT);
	SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
	ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, true);
	SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
	SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
	SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
	SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
	unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
	return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
	}
	}

	// (zext x) == C --> x == (trunc C)
	// (sext x) == C --> x == (trunc C)
	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	DCI.isBeforeLegalize() && N0->hasOneUse()) {
	unsigned MinBits = N0.getValueSizeInBits();
	SDValue PreExt;
	bool Signed = false;
	if (N0->getOpcode() == ISD::ZERO_EXTEND) {
	// ZExt
	MinBits = N0->getOperand(0).getValueSizeInBits();
	PreExt = N0->getOperand(0);
	} else if (N0->getOpcode() == ISD::AND) {
	// DAGCombine turns costly ZExts into ANDs
	if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
	if ((C->getAPIntValue()+1).isPowerOf2()) {
	MinBits = C->getAPIntValue().countTrailingOnes();
	PreExt = N0->getOperand(0);
	}
	} else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
	// SExt
	MinBits = N0->getOperand(0).getValueSizeInBits();
	PreExt = N0->getOperand(0);
	Signed = true;
	} else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) {
	// ZEXTLOAD / SEXTLOAD
	if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
	MinBits = LN0->getMemoryVT().getSizeInBits();
	PreExt = N0;
	} else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
	Signed = true;
	MinBits = LN0->getMemoryVT().getSizeInBits();
	PreExt = N0;
	}
	}

	// Figure out how many bits we need to preserve this constant.
	unsigned ReqdBits = Signed ?
	C1.getBitWidth() - C1.getNumSignBits() + 1 :
	C1.getActiveBits();

	// Make sure we're not losing bits from the constant.
	if (MinBits > 0 &&
	MinBits < C1.getBitWidth() &&
	MinBits >= ReqdBits) {
	EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
	if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
	// Will get folded away.
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
	if (MinBits == 1 && C1 == 1)
	// Invert the condition.
	return DAG.getSetCC(dl, VT, Trunc, DAG.getConstant(0, dl, MVT::i1),
	Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
	SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT);
	return DAG.getSetCC(dl, VT, Trunc, C, Cond);
	}

	// If truncating the setcc operands is not desirable, we can still
	// simplify the expression in some cases:
	// setcc ([sz]ext (setcc x, y, cc)), 0, setne) -> setcc (x, y, cc)
	// setcc ([sz]ext (setcc x, y, cc)), 0, seteq) -> setcc (x, y, inv(cc))
	// setcc (zext (setcc x, y, cc)), 1, setne) -> setcc (x, y, inv(cc))
	// setcc (zext (setcc x, y, cc)), 1, seteq) -> setcc (x, y, cc)
	// setcc (sext (setcc x, y, cc)), -1, setne) -> setcc (x, y, inv(cc))
	// setcc (sext (setcc x, y, cc)), -1, seteq) -> setcc (x, y, cc)
	SDValue TopSetCC = N0->getOperand(0);
	unsigned N0Opc = N0->getOpcode();
	bool SExt = (N0Opc == ISD::SIGN_EXTEND);
	if (TopSetCC.getValueType() == MVT::i1 && VT == MVT::i1 &&
	TopSetCC.getOpcode() == ISD::SETCC &&
	(N0Opc == ISD::ZERO_EXTEND \|\| N0Opc == ISD::SIGN_EXTEND) &&
	(isConstFalseVal(N1C) \|\|
	isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {

	bool Inverse = (N1C->isNullValue() && Cond == ISD::SETEQ) \|\|
	(!N1C->isNullValue() && Cond == ISD::SETNE);

	if (!Inverse)
	return TopSetCC;

	ISD::CondCode InvCond = ISD::getSetCCInverse(
	cast<CondCodeSDNode>(TopSetCC.getOperand(2))->get(),
	TopSetCC.getOperand(0).getValueType().isInteger());
	return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
	TopSetCC.getOperand(1),
	InvCond);
	}
	}
	}

	// If the LHS is '(and load, const)', the RHS is 0, the test is for
	// equality or unsigned, and all 1 bits of the const are in the same
	// partial word, see if we can shorten the load.
	if (DCI.isBeforeLegalize() &&
	!ISD::isSignedIntSetCC(Cond) &&
	N0.getOpcode() == ISD::AND && C1 == 0 &&
	N0.getNode()->hasOneUse() &&
	isa<LoadSDNode>(N0.getOperand(0)) &&
	N0.getOperand(0).getNode()->hasOneUse() &&
	isa<ConstantSDNode>(N0.getOperand(1))) {
	LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
	APInt bestMask;
	unsigned bestWidth = 0, bestOffset = 0;
	if (!Lod->isVolatile() && Lod->isUnindexed()) {
	unsigned origWidth = N0.getValueSizeInBits();
	unsigned maskWidth = origWidth;
	// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
	// 8 bits, but have to be careful...
	if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
	origWidth = Lod->getMemoryVT().getSizeInBits();
	const APInt &Mask = N0.getConstantOperandAPInt(1);
	for (unsigned width = origWidth / 2; width>=8; width /= 2) {
	APInt newMask = APInt::getLowBitsSet(maskWidth, width);
	for (unsigned offset=0; offset<origWidth/width; offset++) {
	if (Mask.isSubsetOf(newMask)) {
	if (DAG.getDataLayout().isLittleEndian())
	bestOffset = (uint64_t)offset * (width/8);
	else
	bestOffset = (origWidth/width - offset - 1) * (width/8);
	bestMask = Mask.lshr(offset * (width/8) * 8);
	bestWidth = width;
	break;
	}
	newMask <<= width;
	}
	}
	}
	if (bestWidth) {
	EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
	if (newVT.isRound() &&
	shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
	EVT PtrType = Lod->getOperand(1).getValueType();
	SDValue Ptr = Lod->getBasePtr();
	if (bestOffset != 0)
	Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
	DAG.getConstant(bestOffset, dl, PtrType));
	unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
	SDValue NewLoad = DAG.getLoad(
	newVT, dl, Lod->getChain(), Ptr,
	Lod->getPointerInfo().getWithOffset(bestOffset), NewAlign);
	return DAG.getSetCC(dl, VT,
	DAG.getNode(ISD::AND, dl, newVT, NewLoad,
	DAG.getConstant(bestMask.trunc(bestWidth),
	dl, newVT)),
	DAG.getConstant(0LL, dl, newVT), Cond);
	}
	}
	}

	// If the LHS is a ZERO_EXTEND, perform the comparison on the input.
	if (N0.getOpcode() == ISD::ZERO_EXTEND) {
	unsigned InSize = N0.getOperand(0).getValueSizeInBits();

	// If the comparison constant has bits in the upper part, the
	// zero-extended value could never match.
	if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
	C1.getBitWidth() - InSize))) {
	switch (Cond) {
	case ISD::SETUGT:
	case ISD::SETUGE:
	case ISD::SETEQ:
	return DAG.getConstant(0, dl, VT);
	case ISD::SETULT:
	case ISD::SETULE:
	case ISD::SETNE:
	return DAG.getConstant(1, dl, VT);
	case ISD::SETGT:
	case ISD::SETGE:
	// True if the sign bit of C1 is set.
	return DAG.getConstant(C1.isNegative(), dl, VT);
	case ISD::SETLT:
	case ISD::SETLE:
	// True if the sign bit of C1 isn't set.
	return DAG.getConstant(C1.isNonNegative(), dl, VT);
	default:
	break;
	}
	}

	// Otherwise, we can perform the comparison with the low bits.
	switch (Cond) {
	case ISD::SETEQ:
	case ISD::SETNE:
	case ISD::SETUGT:
	case ISD::SETUGE:
	case ISD::SETULT:
	case ISD::SETULE: {
	EVT newVT = N0.getOperand(0).getValueType();
	if (DCI.isBeforeLegalizeOps() \|\|
	(isOperationLegal(ISD::SETCC, newVT) &&
	isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
	EVT NewSetCCVT =
	getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), newVT);
	SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);

	SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
	NewConst, Cond);
	return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType());
	}
	break;
	}
	default:
	break; // todo, be more careful with signed comparisons
	}
	} else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
	(Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
	EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
	unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
	EVT ExtDstTy = N0.getValueType();
	unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();

	// If the constant doesn't fit into the number of bits for the source of
	// the sign extension, it is impossible for both sides to be equal.
	if (C1.getMinSignedBits() > ExtSrcTyBits)
	return DAG.getConstant(Cond == ISD::SETNE, dl, VT);

	SDValue ZextOp;
	EVT Op0Ty = N0.getOperand(0).getValueType();
	if (Op0Ty == ExtSrcTy) {
	ZextOp = N0.getOperand(0);
	} else {
	APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
	ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
	DAG.getConstant(Imm, dl, Op0Ty));
	}
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(ZextOp.getNode());
	// Otherwise, make this a use of a zext.
	return DAG.getSetCC(dl, VT, ZextOp,
	DAG.getConstant(C1 & APInt::getLowBitsSet(
	ExtDstTyBits,
	ExtSrcTyBits),
	dl, ExtDstTy),
	Cond);
	} else if ((N1C->isNullValue() \|\| N1C->isOne()) &&
	(Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
	// SETCC (SETCC), [0\|1], [EQ\|NE] -> SETCC
	if (N0.getOpcode() == ISD::SETCC &&
	isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
	bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne());
	if (TrueWhenTrue)
	return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
	// Invert the condition.
	ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	CC = ISD::getSetCCInverse(CC,
	N0.getOperand(0).getValueType().isInteger());
	if (DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
	}

	if ((N0.getOpcode() == ISD::XOR \|\|
	(N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::XOR &&
	N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
	isa<ConstantSDNode>(N0.getOperand(1)) &&
	cast<ConstantSDNode>(N0.getOperand(1))->isOne()) {
	// If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
	// can only do this if the top bits are known zero.
	unsigned BitWidth = N0.getValueSizeInBits();
	if (DAG.MaskedValueIsZero(N0,
	APInt::getHighBitsSet(BitWidth,
	BitWidth-1))) {
	// Okay, get the un-inverted input value.
	SDValue Val;
	if (N0.getOpcode() == ISD::XOR) {
	Val = N0.getOperand(0);
	} else {
	assert(N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::XOR);
	// ((X^1)&1)^1 -> X & 1
	Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
	N0.getOperand(0).getOperand(0),
	N0.getOperand(1));
	}

	return DAG.getSetCC(dl, VT, Val, N1,
	Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
	}
	} else if (N1C->isOne() &&
	(VT == MVT::i1 \|\|
	getBooleanContents(N0->getValueType(0)) ==
	ZeroOrOneBooleanContent)) {
	SDValue Op0 = N0;
	if (Op0.getOpcode() == ISD::TRUNCATE)
	Op0 = Op0.getOperand(0);

	if ((Op0.getOpcode() == ISD::XOR) &&
	Op0.getOperand(0).getOpcode() == ISD::SETCC &&
	Op0.getOperand(1).getOpcode() == ISD::SETCC) {
	// (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
	Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
	return DAG.getSetCC(dl, VT, Op0.getOperand(0), Op0.getOperand(1),
	Cond);
	}
	if (Op0.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	cast<ConstantSDNode>(Op0.getOperand(1))->isOne()) {
	// If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
	if (Op0.getValueType().bitsGT(VT))
	Op0 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
	DAG.getConstant(1, dl, VT));
	else if (Op0.getValueType().bitsLT(VT))
	Op0 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
	DAG.getConstant(1, dl, VT));

	return DAG.getSetCC(dl, VT, Op0,
	DAG.getConstant(0, dl, Op0.getValueType()),
	Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
	}
	if (Op0.getOpcode() == ISD::AssertZext &&
	cast<VTSDNode>(Op0.getOperand(1))->getVT() == MVT::i1)
	return DAG.getSetCC(dl, VT, Op0,
	DAG.getConstant(0, dl, Op0.getValueType()),
	Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
	}
	}

	// Given:
	// icmp eq/ne (urem %x, %y), 0
	// Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
	// icmp eq/ne %x, 0
	if (N0.getOpcode() == ISD::UREM && N1C->isNullValue() &&
	(Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
	KnownBits XKnown = DAG.computeKnownBits(N0.getOperand(0));
	KnownBits YKnown = DAG.computeKnownBits(N0.getOperand(1));
	if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
	return DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
	}

	if (SDValue V =
	optimizeSetCCOfSignedTruncationCheck(VT, N0, N1, Cond, DCI, dl))
	return V;
	}

	// These simplifications apply to splat vectors as well.
	// TODO: Handle more splat vector cases.
	if (auto *N1C = isConstOrConstSplat(N1)) {
	const APInt &C1 = N1C->getAPIntValue();

	APInt MinVal, MaxVal;
	unsigned OperandBitSize = N1C->getValueType(0).getScalarSizeInBits();
	if (ISD::isSignedIntSetCC(Cond)) {
	MinVal = APInt::getSignedMinValue(OperandBitSize);
	MaxVal = APInt::getSignedMaxValue(OperandBitSize);
	} else {
	MinVal = APInt::getMinValue(OperandBitSize);
	MaxVal = APInt::getMaxValue(OperandBitSize);
	}

	// Canonicalize GE/LE comparisons to use GT/LT comparisons.
	if (Cond == ISD::SETGE \|\| Cond == ISD::SETUGE) {
	// X >= MIN --> true
	if (C1 == MinVal)
	return DAG.getBoolConstant(true, dl, VT, OpVT);

	if (!VT.isVector()) { // TODO: Support this for vectors.
	// X >= C0 --> X > (C0 - 1)
	APInt C = C1 - 1;
	ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
	if ((DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
	(!N1C->isOpaque() \|\| (C.getBitWidth() <= 64 &&
	isLegalICmpImmediate(C.getSExtValue())))) {
	return DAG.getSetCC(dl, VT, N0,
	DAG.getConstant(C, dl, N1.getValueType()),
	NewCC);
	}
	}
	}

	if (Cond == ISD::SETLE \|\| Cond == ISD::SETULE) {
	// X <= MAX --> true
	if (C1 == MaxVal)
	return DAG.getBoolConstant(true, dl, VT, OpVT);

	// X <= C0 --> X < (C0 + 1)
	if (!VT.isVector()) { // TODO: Support this for vectors.
	APInt C = C1 + 1;
	ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
	if ((DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
	(!N1C->isOpaque() \|\| (C.getBitWidth() <= 64 &&
	isLegalICmpImmediate(C.getSExtValue())))) {
	return DAG.getSetCC(dl, VT, N0,
	DAG.getConstant(C, dl, N1.getValueType()),
	NewCC);
	}
	}
	}

	if (Cond == ISD::SETLT \|\| Cond == ISD::SETULT) {
	if (C1 == MinVal)
	return DAG.getBoolConstant(false, dl, VT, OpVT); // X < MIN --> false

	// TODO: Support this for vectors after legalize ops.
	if (!VT.isVector() \|\| DCI.isBeforeLegalizeOps()) {
	// Canonicalize setlt X, Max --> setne X, Max
	if (C1 == MaxVal)
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);

	// If we have setult X, 1, turn it into seteq X, 0
	if (C1 == MinVal+1)
	return DAG.getSetCC(dl, VT, N0,
	DAG.getConstant(MinVal, dl, N0.getValueType()),
	ISD::SETEQ);
	}
	}

	if (Cond == ISD::SETGT \|\| Cond == ISD::SETUGT) {
	if (C1 == MaxVal)
	return DAG.getBoolConstant(false, dl, VT, OpVT); // X > MAX --> false

	// TODO: Support this for vectors after legalize ops.
	if (!VT.isVector() \|\| DCI.isBeforeLegalizeOps()) {
	// Canonicalize setgt X, Min --> setne X, Min
	if (C1 == MinVal)
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);

	// If we have setugt X, Max-1, turn it into seteq X, Max
	if (C1 == MaxVal-1)
	return DAG.getSetCC(dl, VT, N0,
	DAG.getConstant(MaxVal, dl, N0.getValueType()),
	ISD::SETEQ);
	}
	}

	// If we have "setcc X, C0", check to see if we can shrink the immediate
	// by changing cc.
	// TODO: Support this for vectors after legalize ops.
	if (!VT.isVector() \|\| DCI.isBeforeLegalizeOps()) {
	// SETUGT X, SINTMAX -> SETLT X, 0
	if (Cond == ISD::SETUGT &&
	C1 == APInt::getSignedMaxValue(OperandBitSize))
	return DAG.getSetCC(dl, VT, N0,
	DAG.getConstant(0, dl, N1.getValueType()),
	ISD::SETLT);

	// SETULT X, SINTMIN -> SETGT X, -1
	if (Cond == ISD::SETULT &&
	C1 == APInt::getSignedMinValue(OperandBitSize)) {
	SDValue ConstMinusOne =
	DAG.getConstant(APInt::getAllOnesValue(OperandBitSize), dl,
	N1.getValueType());
	return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
	}
	}
	}

	// Back to non-vector simplifications.
	// TODO: Can we do these for vector splats?
	if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
	const APInt &C1 = N1C->getAPIntValue();

	// Fold bit comparisons when we can.
	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	(VT == N0.getValueType() \|\|
	(isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
	N0.getOpcode() == ISD::AND) {
	auto &DL = DAG.getDataLayout();
	if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
	!DCI.isBeforeLegalize());
	if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
	// Perform the xform if the AND RHS is a single bit.
	if (AndRHS->getAPIntValue().isPowerOf2()) {
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
	DAG.getConstant(AndRHS->getAPIntValue().logBase2(), dl,
	ShiftTy)));
	}
	} else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
	// (X & 8) == 8 --> (X & 8) >> 3
	// Perform the xform if C1 is a single bit.
	if (C1.isPowerOf2()) {
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
	DAG.getConstant(C1.logBase2(), dl,
	ShiftTy)));
	}
	}
	}
	}

	if (C1.getMinSignedBits() <= 64 &&
	!isLegalICmpImmediate(C1.getSExtValue())) {
	// (X & -256) == 256 -> (X >> 8) == 1
	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
	if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	const APInt &AndRHSC = AndRHS->getAPIntValue();
	if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
	unsigned ShiftBits = AndRHSC.countTrailingZeros();
	auto &DL = DAG.getDataLayout();
	EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
	!DCI.isBeforeLegalize());
	EVT CmpTy = N0.getValueType();
	SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
	DAG.getConstant(ShiftBits, dl,
	ShiftTy));
	SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, CmpTy);
	return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
	}
	}
	} else if (Cond == ISD::SETULT \|\| Cond == ISD::SETUGE \|\|
	Cond == ISD::SETULE \|\| Cond == ISD::SETUGT) {
	bool AdjOne = (Cond == ISD::SETULE \|\| Cond == ISD::SETUGT);
	// X < 0x100000000 -> (X >> 32) < 1
	// X >= 0x100000000 -> (X >> 32) >= 1
	// X <= 0x0ffffffff -> (X >> 32) < 1
	// X > 0x0ffffffff -> (X >> 32) >= 1
	unsigned ShiftBits;
	APInt NewC = C1;
	ISD::CondCode NewCond = Cond;
	if (AdjOne) {
	ShiftBits = C1.countTrailingOnes();
	NewC = NewC + 1;
	NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	} else {
	ShiftBits = C1.countTrailingZeros();
	}
	NewC.lshrInPlace(ShiftBits);
	if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
	isLegalICmpImmediate(NewC.getSExtValue())) {
	auto &DL = DAG.getDataLayout();
	EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
	!DCI.isBeforeLegalize());
	EVT CmpTy = N0.getValueType();
	SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
	DAG.getConstant(ShiftBits, dl, ShiftTy));
	SDValue CmpRHS = DAG.getConstant(NewC, dl, CmpTy);
	return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
	}
	}
	}
	}

	if (!isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1)) {
	auto *CFP = cast<ConstantFPSDNode>(N1);
	assert(!CFP->getValueAPF().isNaN() && "Unexpected NaN value");

	// Otherwise, we know the RHS is not a NaN. Simplify the node to drop the
	// constant if knowing that the operand is non-nan is enough. We prefer to
	// have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
	// materialize 0.0.
	if (Cond == ISD::SETO \|\| Cond == ISD::SETUO)
	return DAG.getSetCC(dl, VT, N0, N0, Cond);

	// setcc (fneg x), C -> setcc swap(pred) x, -C
	if (N0.getOpcode() == ISD::FNEG) {
	ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond);
	if (DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(SwapCond, N0.getSimpleValueType())) {
	SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1);
	return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond);
	}
	}

	// If the condition is not legal, see if we can find an equivalent one
	// which is legal.
	if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
	// If the comparison was an awkward floating-point == or != and one of
	// the comparison operands is infinity or negative infinity, convert the
	// condition to a less-awkward <= or >=.
	if (CFP->getValueAPF().isInfinity()) {
	if (CFP->getValueAPF().isNegative()) {
	if (Cond == ISD::SETOEQ &&
	isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
	if (Cond == ISD::SETUEQ &&
	isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
	if (Cond == ISD::SETUNE &&
	isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
	if (Cond == ISD::SETONE &&
	isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
	} else {
	if (Cond == ISD::SETOEQ &&
	isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
	if (Cond == ISD::SETUEQ &&
	isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
	if (Cond == ISD::SETUNE &&
	isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
	if (Cond == ISD::SETONE &&
	isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
	return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
	}
	}
	}
	}

	if (N0 == N1) {
	// The sext(setcc()) => setcc() optimization relies on the appropriate
	// constant being emitted.
	assert(!N0.getValueType().isInteger() &&
	"Integer types should be handled by FoldSetCC");

	bool EqTrue = ISD::isTrueWhenEqual(Cond);
	unsigned UOF = ISD::getUnorderedFlavor(Cond);
	if (UOF == 2) // FP operators that are undefined on NaNs.
	return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
	if (UOF == unsigned(EqTrue))
	return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
	// Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
	// if it is not already.
	ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
	if (NewCond != Cond &&
	(DCI.isBeforeLegalizeOps() \|\|
	isCondCodeLegal(NewCond, N0.getSimpleValueType())))
	return DAG.getSetCC(dl, VT, N0, N1, NewCond);
	}

	if ((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	N0.getValueType().isInteger()) {
	if (N0.getOpcode() == ISD::ADD \|\| N0.getOpcode() == ISD::SUB \|\|
	N0.getOpcode() == ISD::XOR) {
	// Simplify (X+Y) == (X+Z) --> Y == Z
	if (N0.getOpcode() == N1.getOpcode()) {
	if (N0.getOperand(0) == N1.getOperand(0))
	return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
	if (N0.getOperand(1) == N1.getOperand(1))
	return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
	if (isCommutativeBinOp(N0.getOpcode())) {
	// If X op Y == Y op X, try other combinations.
	if (N0.getOperand(0) == N1.getOperand(1))
	return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
	Cond);
	if (N0.getOperand(1) == N1.getOperand(0))
	return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
	Cond);
	}
	}

	// If RHS is a legal immediate value for a compare instruction, we need
	// to be careful about increasing register pressure needlessly.
	bool LegalRHSImm = false;

	if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) {
	if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	// Turn (X+C1) == C2 --> X == C2-C1
	if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
	return DAG.getSetCC(dl, VT, N0.getOperand(0),
	DAG.getConstant(RHSC->getAPIntValue()-
	LHSR->getAPIntValue(),
	dl, N0.getValueType()), Cond);
	}

	// Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0.
	if (N0.getOpcode() == ISD::XOR)
	// If we know that all of the inverted bits are zero, don't bother
	// performing the inversion.
	if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
	return
	DAG.getSetCC(dl, VT, N0.getOperand(0),
	DAG.getConstant(LHSR->getAPIntValue() ^
	RHSC->getAPIntValue(),
	dl, N0.getValueType()),
	Cond);
	}

	// Turn (C1-X) == C2 --> X == C1-C2
	if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
	if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
	return
	DAG.getSetCC(dl, VT, N0.getOperand(1),
	DAG.getConstant(SUBC->getAPIntValue() -
	RHSC->getAPIntValue(),
	dl, N0.getValueType()),
	Cond);
	}
	}

	// Could RHSC fold directly into a compare?
	if (RHSC->getValueType(0).getSizeInBits() <= 64)
	LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue());
	}

	// (X+Y) == X --> Y == 0 and similar folds.
	// Don't do this if X is an immediate that can fold into a cmp
	// instruction and X+Y has other uses. It could be an induction variable
	// chain, and the transform would increase register pressure.
	if (!LegalRHSImm \|\| N0.hasOneUse())
	if (SDValue V = foldSetCCWithBinOp(VT, N0, N1, Cond, dl, DCI))
	return V;
	}

	if (N1.getOpcode() == ISD::ADD \|\| N1.getOpcode() == ISD::SUB \|\|
	N1.getOpcode() == ISD::XOR)
	if (SDValue V = foldSetCCWithBinOp(VT, N1, N0, Cond, dl, DCI))
	return V;

	if (SDValue V = foldSetCCWithAnd(VT, N0, N1, Cond, dl, DCI))
	return V;
	}

	// Fold remainder of division by a constant.
	if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
	(Cond == ISD::SETEQ \|\| Cond == ISD::SETNE)) {
	AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

	// When division is cheap or optimizing for minimum size,
	// fall through to DIVREM creation by skipping this fold.
	if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
	if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
	return Folded;
	}

	// Fold away ALL boolean setcc's.
	if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
	SDValue Temp;
	switch (Cond) {
	default: llvm_unreachable("Unknown integer setcc!");
	case ISD::SETEQ: // X == Y -> ~(X^Y)
	Temp = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
	N0 = DAG.getNOT(dl, Temp, OpVT);
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(Temp.getNode());
	break;
	case ISD::SETNE: // X != Y --> (X^Y)
	N0 = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
	break;
	case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
	case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
	Temp = DAG.getNOT(dl, N0, OpVT);
	N0 = DAG.getNode(ISD::AND, dl, OpVT, N1, Temp);
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(Temp.getNode());
	break;
	case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
	case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
	Temp = DAG.getNOT(dl, N1, OpVT);
	N0 = DAG.getNode(ISD::AND, dl, OpVT, N0, Temp);
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(Temp.getNode());
	break;
	case ISD::SETULE: // X <=u Y --> X == 0 \| Y == 1 --> ~X \| Y
	case ISD::SETGE: // X >=s Y --> X == 0 \| Y == 1 --> ~X \| Y
	Temp = DAG.getNOT(dl, N0, OpVT);
	N0 = DAG.getNode(ISD::OR, dl, OpVT, N1, Temp);
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(Temp.getNode());
	break;
	case ISD::SETUGE: // X >=u Y --> X == 1 \| Y == 0 --> ~Y \| X
	case ISD::SETLE: // X <=s Y --> X == 1 \| Y == 0 --> ~Y \| X
	Temp = DAG.getNOT(dl, N1, OpVT);
	N0 = DAG.getNode(ISD::OR, dl, OpVT, N0, Temp);
	break;
	}
	if (VT.getScalarType() != MVT::i1) {
	if (!DCI.isCalledByLegalizer())
	DCI.AddToWorklist(N0.getNode());
	// FIXME: If running after legalize, we probably can't do this.
	ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(OpVT));
	N0 = DAG.getNode(ExtendCode, dl, VT, N0);
	}
	return N0;
	}

	// Could not fold it.
	return SDValue();
	}

	/// Returns true (and the GlobalValue and the offset) if the node is a
	/// GlobalAddress + offset.
	bool TargetLowering::isGAPlusOffset(SDNode WN, const GlobalValue &GA,
	int64_t &Offset) const {

	SDNode *N = unwrapAddress(SDValue(WN, 0)).getNode();

	if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) {
	GA = GASD->getGlobal();
	Offset += GASD->getOffset();
	return true;
	}

	if (N->getOpcode() == ISD::ADD) {
	SDValue N1 = N->getOperand(0);
	SDValue N2 = N->getOperand(1);
	if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
	if (auto *V = dyn_cast<ConstantSDNode>(N2)) {
	Offset += V->getSExtValue();
	return true;
	}
	} else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
	if (auto *V = dyn_cast<ConstantSDNode>(N1)) {
	Offset += V->getSExtValue();
	return true;
	}
	}
	}

	return false;
	}

	SDValue TargetLowering::PerformDAGCombine(SDNode *N,
	                                          DAGCombinerInfo &DCI) const {
	  // Base-class behavior: perform no combine and hand back an empty SDValue.
	  SDValue NoReplacement;
	  return NoReplacement;
	}

	//===----------------------------------------------------------------------===//
	// Inline Assembler Implementation Methods
	//===----------------------------------------------------------------------===//

	/// Classify an inline-asm constraint string into a ConstraintType.
	/// Single-character constraints are classified by the switch below. A
	/// brace-enclosed constraint is C_Memory when it is exactly "{memory}" and
	/// C_Register otherwise. Anything else yields C_Unknown.
	TargetLowering::ConstraintType
	TargetLowering::getConstraintType(StringRef Constraint) const {
	unsigned S = Constraint.size();

	if (S == 1) {
	switch (Constraint[0]) {
	// Letters not listed here leave the switch and end up as C_Unknown, since
	// the brace check below requires more than one character.
	default: break;
	- case 'r': return C_RegisterClass;
	+ case 'r':
	+ return C_RegisterClass;
	case 'm': // memory
	case 'o': // offsetable
	case 'V': // not offsetable
	return C_Memory;
	- case 'i': // Simple Integer or Relocatable Constant
	case 'n': // Simple Integer
	case 'E': // Floating Point Constant
	case 'F': // Floating Point Constant
	+ return C_Immediate;
	+ case 'i': // Simple Integer or Relocatable Constant
	case 's': // Relocatable Constant
	case 'p': // Address.
	case 'X': // Allow ANY value.
	case 'I': // Target registers.
	case 'J':
	case 'K':
	case 'L':
	case 'M':
	case 'N':
	case 'O':
	case 'P':
	case '<':
	case '>':
	return C_Other;
	}
	}

	// Brace-enclosed constraints: "{memory}" is special-cased, every other
	// "{...}" string is treated as naming a specific register.
	if (S > 1 && Constraint[0] == '{' && Constraint[S - 1] == '}') {
	if (S == 8 && Constraint.substr(1, 6) == "memory") // "{memory}"
	return C_Memory;
	return C_Register;
	}
	return C_Unknown;
	}

	/// Map an 'X' (match-anything) constraint to a more specific constraint
	/// string chosen from the value type of the operand: "r" for integer types,
	/// "f" for floating-point types, and nullptr when neither applies.
	const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  const char *Replacement = nullptr;
	  if (ConstraintVT.isInteger())
	    Replacement = "r";
	  else if (ConstraintVT.isFloatingPoint())
	    Replacement = "f"; // works for many targets
	  return Replacement;
	}

	SDValue TargetLowering::LowerAsmOutputForConstraint(
	    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
	    SelectionDAG &DAG) const {
	  // Base-class behavior: nothing is lowered; an empty SDValue is returned and
	  // none of the arguments are touched.
	  SDValue NoOutput;
	  return NoOutput;
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Only single-letter constraints are handled here. For 'X', 'i', 'n' and
	/// 's' the operand is walked through ADD/SUB nodes that have a constant
	/// operand, folding the constants into an offset, until a global address, a
	/// constant or a block address is reached. At most one value is appended to
	/// Ops; every other constraint letter leaves Ops untouched.
	void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                  std::string &Constraint,
	                                                  std::vector<SDValue> &Ops,
	                                                  SelectionDAG &DAG) const {

	  // Multi-letter constraints are not handled by this implementation.
	  if (Constraint.length() > 1) return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default: break;
	  case 'X': // Allows any operand; labels (basic block) use this.
	    if (Op.getOpcode() == ISD::BasicBlock ||
	        Op.getOpcode() == ISD::TargetBlockAddress) {
	      Ops.push_back(Op);
	      return;
	    }
	    LLVM_FALLTHROUGH;
	  case 'i': // Simple Integer or Relocatable Constant
	  case 'n': // Simple Integer
	  case 's': { // Relocatable Constant

	    GlobalAddressSDNode *GA;
	    ConstantSDNode *C;
	    BlockAddressSDNode *BA;
	    uint64_t Offset = 0;

	    // Match (GA) or (C) or (GA+C) or (GA-C) or ((GA+C)+C) or (((GA+C)+C)+C),
	    // etc., since getelementpointer is variadic. We can't use
	    // SelectionDAG::FoldSymbolOffset because it expects the GA to be accessible
	    // while in this case the GA may be furthest from the root node which is
	    // likely an ISD::ADD.
	    while (1) {
	      // 'n' accepts only plain integers, so symbols are rejected for it.
	      if ((GA = dyn_cast<GlobalAddressSDNode>(Op)) && ConstraintLetter != 'n') {
	        Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
	                                                 GA->getValueType(0),
	                                                 Offset + GA->getOffset()));
	        return;
	      } else if ((C = dyn_cast<ConstantSDNode>(Op)) &&
	                 ConstraintLetter != 's') {
	        // gcc prints these as sign extended. Sign extend value to 64 bits
	        // now; without this it would get ZExt'd later in
	        // ScheduleDAGSDNodes::EmitNode, which is very generic.
	        // 1-bit constants instead follow the target's boolean contents.
	        bool IsBool = C->getConstantIntValue()->getBitWidth() == 1;
	        BooleanContent BCont = getBooleanContents(MVT::i64);
	        ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
	                                      : ISD::SIGN_EXTEND;
	        int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? C->getZExtValue()
	                                                    : C->getSExtValue();
	        Ops.push_back(DAG.getTargetConstant(Offset + ExtVal,
	                                            SDLoc(C), MVT::i64));
	        return;
	      } else if ((BA = dyn_cast<BlockAddressSDNode>(Op)) &&
	                 ConstraintLetter != 'n') {
	        Ops.push_back(DAG.getTargetBlockAddress(
	            BA->getBlockAddress(), BA->getValueType(0),
	            Offset + BA->getOffset(), BA->getTargetFlags()));
	        return;
	      } else {
	        // Peel one ADD/SUB layer that has a constant operand and keep walking
	        // with the other operand.
	        // NOTE(review): for SUB only a constant in operand 0 is accepted, and
	        // it is subtracted from Offset while the walk continues with operand
	        // 1, i.e. (C - X) is treated like (X - C); a SUB whose constant is
	        // operand 1 bails out. Confirm this is the intended handling.
	        const unsigned OpCode = Op.getOpcode();
	        if (OpCode == ISD::ADD || OpCode == ISD::SUB) {
	          if ((C = dyn_cast<ConstantSDNode>(Op.getOperand(0))))
	            Op = Op.getOperand(1);
	          // Subtraction is not commutative.
	          else if (OpCode == ISD::ADD &&
	                   (C = dyn_cast<ConstantSDNode>(Op.getOperand(1))))
	            Op = Op.getOperand(0);
	          else
	            return;
	          Offset += (OpCode == ISD::ADD ? 1 : -1) * C->getSExtValue();
	          continue;
	        }
	      }
	      // Nothing matched: leave Ops untouched.
	      return;
	    }
	    // The loop above only exits through return, so this is not reached.
	    break;
	  }
	  }
	}

	std::pair<unsigned, const TargetRegisterClass *>
	TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
	StringRef Constraint,
	MVT VT) const {
	if (Constraint.empty() \|\| Constraint[0] != '{')
	return std::make_pair(0u, static_cast<TargetRegisterClass *>(nullptr));
	assert(*(Constraint.end() - 1) == '}' && "Not a brace enclosed constraint?");

	// Remove the braces from around the name.
	StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);

	std::pair<unsigned, const TargetRegisterClass *> R =
	std::make_pair(0u, static_cast<const TargetRegisterClass *>(nullptr));

	// Figure out which register class contains this reg.
	for (const TargetRegisterClass *RC : RI->regclasses()) {
	// If none of the value types for this register class are valid, we
	// can't use it. For example, 64-bit reg classes on 32-bit targets.
	if (!isLegalRC(RI, RC))
	continue;

	for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
	I != E; ++I) {
	if (RegName.equals_lower(RI->getRegAsmName(*I))) {
	std::pair<unsigned, const TargetRegisterClass *> S =
	std::make_pair(*I, RC);

	// If this register class has the requested value type, return it,
	// otherwise keep searching and return the first class found
	// if no other is found which explicitly has the requested type.
	if (RI->isTypeLegalForClass(*RC, VT))
	return S;
	if (!R.second)
	R = S;
	}
	}
	}

	return R;
	}

	//===----------------------------------------------------------------------===//
	// Constraint Selection.

	/// Return true if this is an input operand that is a matching constraint like
	/// "4", i.e. its constraint code starts with a decimal digit.
	bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
	  assert(!ConstraintCode.empty() && "No known constraint!");
	  // Route the character through unsigned char before handing it to isdigit().
	  const unsigned char LeadingChar =
	      static_cast<unsigned char>(ConstraintCode[0]);
	  return isdigit(LeadingChar);
	}

	/// For an input matching constraint, return the number of the output operand
	/// it is tied to, obtained by parsing the constraint code with atoi().
	unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
	  assert(!ConstraintCode.empty() && "No known constraint!");
	  const char *CodeText = ConstraintCode.c_str();
	  return atoi(CodeText);
	}

	/// Split up the constraint string from the inline assembly value into the
	/// specific constraints and their prefixes, and also tie in the associated
	/// operand values.
	/// If this returns an empty vector, and if the constraint string itself
	/// isn't empty, there was an error parsing.
	///
	/// The work is done in three passes:
	///  1. Build one AsmOperandInfo per constraint, attach the call operand (if
	///     any) and compute its ConstraintVT.
	///  2. If any operand has multiple alternatives, pick the alternative with
	///     the highest summed match weight and select it on every non-clobber
	///     operand.
	///  3. Check tied (matching) operands; an incompatible pair is reported with
	///     report_fatal_error.
	TargetLowering::AsmOperandInfoVector
	TargetLowering::ParseConstraints(const DataLayout &DL,
	                                 const TargetRegisterInfo *TRI,
	                                 ImmutableCallSite CS) const {
	  /// Information about all of the constraints.
	  AsmOperandInfoVector ConstraintOperands;
	  const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
	  unsigned maCount = 0; // Largest number of multiple alternative constraints.

	  // Do a prepass over the constraints, canonicalizing them, and building up the
	  // ConstraintOperands list.
	  unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
	  unsigned ResNo = 0; // ResNo - The result number of the next output.

	  for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
	    ConstraintOperands.emplace_back(std::move(CI));
	    AsmOperandInfo &OpInfo = ConstraintOperands.back();

	    // Update multiple alternative constraint count.
	    if (OpInfo.multipleAlternatives.size() > maCount)
	      maCount = OpInfo.multipleAlternatives.size();

	    OpInfo.ConstraintVT = MVT::Other;

	    // Compute the value type for each operand.
	    switch (OpInfo.Type) {
	    case InlineAsm::isOutput:
	      // Indirect outputs just consume an argument.
	      if (OpInfo.isIndirect) {
	        OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
	        break;
	      }

	      // The return value of the call is this value. As such, there is no
	      // corresponding argument.
	      assert(!CS.getType()->isVoidTy() &&
	             "Bad inline asm!");
	      if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
	        OpInfo.ConstraintVT =
	            getSimpleValueType(DL, STy->getElementType(ResNo));
	      } else {
	        assert(ResNo == 0 && "Asm only has one result!");
	        OpInfo.ConstraintVT = getSimpleValueType(DL, CS.getType());
	      }
	      ++ResNo;
	      break;
	    case InlineAsm::isInput:
	      OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
	      break;
	    case InlineAsm::isClobber:
	      // Nothing to do.
	      break;
	    }

	    // Operands backed by a call argument derive their value type from the
	    // argument's IR type.
	    if (OpInfo.CallOperandVal) {
	      llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
	      if (OpInfo.isIndirect) {
	        // An indirect operand is described by the type it points to.
	        llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
	        if (!PtrTy)
	          report_fatal_error("Indirect operand for inline asm not a pointer!");
	        OpTy = PtrTy->getElementType();
	      }

	      // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
	      if (StructType *STy = dyn_cast<StructType>(OpTy))
	        if (STy->getNumElements() == 1)
	          OpTy = STy->getElementType(0);

	      // If OpTy is not a single value, it may be a struct/union that we
	      // can tile with integers.
	      if (!OpTy->isSingleValueType() && OpTy->isSized()) {
	        unsigned BitSize = DL.getTypeSizeInBits(OpTy);
	        switch (BitSize) {
	        // Other sizes keep the MVT::Other assigned above.
	        default: break;
	        case 1:
	        case 8:
	        case 16:
	        case 32:
	        case 64:
	        case 128:
	          OpInfo.ConstraintVT =
	              MVT::getVT(IntegerType::get(OpTy->getContext(), BitSize), true);
	          break;
	        }
	      } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
	        // Pointers are modeled as integers of the address space's pointer
	        // width.
	        unsigned PtrSize = DL.getPointerSizeInBits(PT->getAddressSpace());
	        OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
	      } else {
	        OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
	      }
	    }
	  }

	  // If we have multiple alternative constraints, select the best alternative.
	  if (!ConstraintOperands.empty()) {
	    if (maCount) {
	      unsigned bestMAIndex = 0;
	      int bestWeight = -1;
	      // weight: -1 = invalid match, and 0 = so-so match to 5 = good match.
	      int weight = -1;
	      unsigned maIndex;
	      // Compute the sums of the weights for each alternative, keeping track
	      // of the best (highest weight) one so far.
	      for (maIndex = 0; maIndex < maCount; ++maIndex) {
	        int weightSum = 0;
	        for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
	             cIndex != eIndex; ++cIndex) {
	          AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];
	          if (OpInfo.Type == InlineAsm::isClobber)
	            continue;

	          // If this is an output operand with a matching input operand,
	          // look up the matching input. If their types mismatch, e.g. one
	          // is an integer, the other is floating point, or their sizes are
	          // different, flag it as an maCantMatch.
	          if (OpInfo.hasMatchingInput()) {
	            AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
	            if (OpInfo.ConstraintVT != Input.ConstraintVT) {
	              if ((OpInfo.ConstraintVT.isInteger() !=
	                   Input.ConstraintVT.isInteger()) ||
	                  (OpInfo.ConstraintVT.getSizeInBits() !=
	                   Input.ConstraintVT.getSizeInBits())) {
	                weightSum = -1; // Can't match.
	                break;
	              }
	            }
	          }
	          weight = getMultipleConstraintMatchWeight(OpInfo, maIndex);
	          if (weight == -1) {
	            weightSum = -1;
	            break;
	          }
	          weightSum += weight;
	        }
	        // Update best. The comparison is strict, so ties keep the earlier
	        // alternative, and index 0 stays selected if every alternative is
	        // invalid.
	        if (weightSum > bestWeight) {
	          bestWeight = weightSum;
	          bestMAIndex = maIndex;
	        }
	      }

	      // Now select chosen alternative in each constraint.
	      for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
	           cIndex != eIndex; ++cIndex) {
	        AsmOperandInfo &cInfo = ConstraintOperands[cIndex];
	        if (cInfo.Type == InlineAsm::isClobber)
	          continue;
	        cInfo.selectAlternative(bestMAIndex);
	      }
	    }
	  }

	  // Check and hook up tied operands, choose constraint code to use.
	  for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
	       cIndex != eIndex; ++cIndex) {
	    AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];

	    // If this is an output operand with a matching input operand, look up the
	    // matching input. If their types mismatch, e.g. one is an integer, the
	    // other is floating point, or their sizes are different, flag it as an
	    // error.
	    if (OpInfo.hasMatchingInput()) {
	      AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];

	      if (OpInfo.ConstraintVT != Input.ConstraintVT) {
	        // Differing value types are tolerated only when both are (or both
	        // are not) integers and both resolve to the same register class.
	        std::pair<unsigned, const TargetRegisterClass *> MatchRC =
	            getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
	                                         OpInfo.ConstraintVT);
	        std::pair<unsigned, const TargetRegisterClass *> InputRC =
	            getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
	                                         Input.ConstraintVT);
	        if ((OpInfo.ConstraintVT.isInteger() !=
	             Input.ConstraintVT.isInteger()) ||
	            (MatchRC.second != InputRC.second)) {
	          report_fatal_error("Unsupported asm: input constraint"
	                             " with a matching output constraint of"
	                             " incompatible type!");
	        }
	      }
	    }
	  }

	  return ConstraintOperands;
	}

	/// Return an integer indicating how general CT is.
	/// The ranks assigned below are: immediate / other / unknown constraints 0,
	/// a specific register 1, a register class 2, and memory 3.
	static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
	switch (CT) {
	+ case TargetLowering::C_Immediate:
	case TargetLowering::C_Other:
	case TargetLowering::C_Unknown:
	return 0;
	case TargetLowering::C_Register:
	return 1;
	case TargetLowering::C_RegisterClass:
	return 2;
	case TargetLowering::C_Memory:
	return 3;
	}
	// Only reached if CT matches none of the cases listed above.
	llvm_unreachable("Invalid constraint type");
	}

	/// Compute the match weight of one alternative of a multi-alternative
	/// constraint: the highest weight that getSingleConstraintMatchWeight()
	/// reports for any constraint code of that alternative. When \p maIndex is
	/// not below the number of alternatives, the operand's own Codes list is
	/// used instead. CW_Invalid is returned when no code scores higher.
	TargetLowering::ConstraintWeight
	  TargetLowering::getMultipleConstraintMatchWeight(
	    AsmOperandInfo &info, int maIndex) const {
	  const bool PastAlternatives =
	      maIndex >= static_cast<int>(info.multipleAlternatives.size());
	  InlineAsm::ConstraintCodeVector &Candidates =
	      PastAlternatives ? info.Codes
	                       : info.multipleAlternatives[maIndex].Codes;

	  // Keep the largest weight seen across all candidate codes.
	  ConstraintWeight Heaviest = CW_Invalid;
	  for (const auto &Code : Candidates) {
	    const ConstraintWeight Current =
	        getSingleConstraintMatchWeight(info, Code.c_str());
	    if (Current > Heaviest)
	      Heaviest = Current;
	  }
	  return Heaviest;
	}

	/// Compute the match weight of a single constraint for the operand described
	/// by info. Only the first character of constraint is examined.
	/// Returns CW_Default when info has no call operand value, and CW_Invalid
	/// when the letter is recognized but the operand does not satisfy it.
	TargetLowering::ConstraintWeight
	TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  Value *Operand = info.CallOperandVal;
	  // Without a value there is nothing to match against; accept the
	  // constraint at the lowest weight.
	  if (!Operand)
	    return CW_Default;

	  // Dispatch on the constraint letter.
	  switch (*constraint) {
	  case 'i': // immediate integer.
	  case 'n': // immediate integer with a known value.
	    return isa<ConstantInt>(Operand) ? CW_Constant : CW_Invalid;
	  case 's': // non-explicit integral immediate.
	    return isa<GlobalValue>(Operand) ? CW_Constant : CW_Invalid;
	  case 'E': // immediate float if host format.
	  case 'F': // immediate float.
	    return isa<ConstantFP>(Operand) ? CW_Constant : CW_Invalid;
	  case '<': // memory operand with autodecrement.
	  case '>': // memory operand with autoincrement.
	  case 'm': // memory operand.
	  case 'o': // offsettable memory operand
	  case 'V': // non-offsettable memory operand
	    return CW_Memory;
	  case 'r': // general register.
	  case 'g': // general register, memory operand or immediate integer.
	    // note: Clang converts "g" to "imr".
	    return Operand->getType()->isIntegerTy() ? CW_Register : CW_Invalid;
	  case 'X': // any operand.
	  default:
	    return CW_Default;
	  }
	}

	/// If there are multiple different constraints that we could pick for this
	/// operand (e.g. "imr") try to pick the 'best' one.
	/// This is somewhat tricky: constraints fall into four classes:
	/// Other -> immediates and magic values
	/// Register -> one specific register
	/// RegisterClass -> a group of regs
	/// Memory -> memory
	/// Ideally, we would pick the most specific constraint possible: if we have
	/// something that fits into a register, we would pick it. The problem here
	/// is that if we have something that could either be in a register or in
	/// memory that use of the register could cause selection of other
	/// operands to fail: they might only succeed if we pick memory. Because of
	/// this the heuristic we use is:
	///
	/// 1) If there is an 'other' constraint, and if the operand is valid for
	/// that constraint, use it. This makes us take advantage of 'i'
	/// constraints when available.
	/// 2) Otherwise, pick the most general constraint present. This prefers
	/// 'm' over 'r', for example.
	///
	/// The chosen code and type are written to OpInfo.ConstraintCode and
	/// OpInfo.ConstraintType. Step 1 is skipped when Op has no node; DAG is only
	/// dereferenced in step 1.
	static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
	const TargetLowering &TLI,
	SDValue Op, SelectionDAG *DAG) {
	assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
	unsigned BestIdx = 0;
	TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
	// Starts below every value getConstraintGenerality returns, so the first
	// candidate that is not skipped always becomes the current best.
	int BestGenerality = -1;

	// Loop over the options, keeping track of the most general one.
	for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
	TargetLowering::ConstraintType CType =
	TLI.getConstraintType(OpInfo.Codes[i]);

	- // If this is an 'other' constraint, see if the operand is valid for it.
	- // For example, on X86 we might have an 'rI' constraint. If the operand
	- // is an integer in the range [0..31] we want to use I (saving a load
	- // of a register), otherwise we must use 'r'.
	- if (CType == TargetLowering::C_Other && Op.getNode()) {
	+ // If this is an 'other' or 'immediate' constraint, see if the operand is
	+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
	+ // the operand is an integer in the range [0..31] we want to use I (saving a
	+ // load of a register), otherwise we must use 'r'.
	+ if ((CType == TargetLowering::C_Other \|\|
	+ CType == TargetLowering::C_Immediate) && Op.getNode()) {
	assert(OpInfo.Codes[i].size() == 1 &&
	"Unhandled multi-letter 'other' constraint");
	std::vector<SDValue> ResultOps;
	TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
	ResultOps, *DAG);
	// A non-empty result means the operand was accepted for this constraint;
	// take it immediately and stop looking at the remaining codes.
	if (!ResultOps.empty()) {
	BestType = CType;
	BestIdx = i;
	break;
	}
	}

	// Things with matching constraints can only be registers, per gcc
	// documentation. This mainly affects "g" constraints.
	if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
	continue;

	// This constraint letter is more general than the previous one, use it.
	int Generality = getConstraintGenerality(CType);
	if (Generality > BestGenerality) {
	BestType = CType;
	BestIdx = i;
	BestGenerality = Generality;
	}
	}

	OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
	OpInfo.ConstraintType = BestType;
	}

	/// Select the constraint code and constraint type for OpInfo and store them
	/// in OpInfo.ConstraintCode / OpInfo.ConstraintType. A lone code is used
	/// directly; otherwise ChooseConstraint picks among the alternatives. An "X"
	/// constraint with a call operand value may afterwards be replaced by the
	/// code LowerXConstraint suggests for the operand's value type.
	void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
	                                            SDValue Op,
	                                            SelectionDAG *DAG) const {
	  assert(!OpInfo.Codes.empty() && "Must have at least one constraint");

	  // Single-letter constraints ('r') are very common, so they bypass the
	  // selection heuristic.
	  if (OpInfo.Codes.size() != 1) {
	    ChooseConstraint(OpInfo, *this, Op, DAG);
	  } else {
	    OpInfo.ConstraintCode = OpInfo.Codes[0];
	    OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
	  }

	  // Only 'X' (matches anything) with a known operand value needs more work.
	  if (OpInfo.ConstraintCode != "X" || !OpInfo.CallOperandVal)
	    return;

	  // Labels and constants are handled elsewhere ('X' is the only thing
	  // that matches labels). For Functions, the type here is the type of
	  // the result, which is not what we want to look at; leave them alone.
	  Value *Operand = OpInfo.CallOperandVal;
	  if (isa<BasicBlock>(Operand) || isa<ConstantInt>(Operand) ||
	      isa<Function>(Operand))
	    return;

	  if (Op.getNode() && Op.getOpcode() == ISD::TargetBlockAddress)
	    return;

	  // Otherwise, try to resolve it to something we know about by looking at
	  // the actual operand type.
	  const char *Replacement = LowerXConstraint(OpInfo.ConstraintVT);
	  if (!Replacement)
	    return;
	  OpInfo.ConstraintCode = Replacement;
	  OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
	}

	/// Given an exact SDIV by a constant, create a multiplication
	/// with the multiplicative inverse of the constant.
	///
	/// The divisor (operand 1 of N) is visited through ISD::matchUnaryPredicate;
	/// an empty SDValue is returned when that match fails (the predicate itself
	/// rejects zero elements). If any divisor element has trailing zero bits, the
	/// dividend is first shifted right with an exact SRA, and that SRA node is
	/// appended to Created. The final MUL node is returned, not appended.
	static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
	const SDLoc &dl, SelectionDAG &DAG,
	SmallVectorImpl<SDNode *> &Created) {
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
	EVT ShSVT = ShVT.getScalarType();

	// Set when at least one divisor element needs a pre-shift.
	bool UseSRA = false;
	// Per-element shift amounts and multiplicative inverses.
	SmallVector<SDValue, 16> Shifts, Factors;

	auto BuildSDIVPattern = [&](ConstantSDNode *C) {
	// A zero divisor cannot be handled; fail the whole match.
	if (C->isNullValue())
	return false;
	APInt Divisor = C->getAPIntValue();
	// Strip the trailing zero bits from the divisor; they are applied as a
	// right shift of the dividend further down.
	unsigned Shift = Divisor.countTrailingZeros();
	if (Shift) {
	Divisor.ashrInPlace(Shift);
	UseSRA = true;
	}
	// Calculate the multiplicative inverse, using Newton's method.
	// Each step replaces Factor with Factor * (2 - Divisor * Factor) and the
	// loop stops once Divisor * Factor equals 1 in the divisor's bit width.
	APInt t;
	APInt Factor = Divisor;
	while ((t = Divisor * Factor) != 1)
	Factor *= APInt(Divisor.getBitWidth(), 2) - t;
	Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
	Factors.push_back(DAG.getConstant(Factor, dl, SVT));
	return true;
	};

	// Collect all magic values from the build vector.
	if (!ISD::matchUnaryPredicate(Op1, BuildSDIVPattern))
	return SDValue();

	// Vectors get build-vector constants; scalars use the single collected
	// element directly.
	SDValue Shift, Factor;
	if (VT.isVector()) {
	Shift = DAG.getBuildVector(ShVT, dl, Shifts);
	Factor = DAG.getBuildVector(VT, dl, Factors);
	} else {
	Shift = Shifts[0];
	Factor = Factors[0];
	}

	SDValue Res = Op0;

	// Shift the value upfront if it is even, so the LSB is one.
	if (UseSRA) {
	// TODO: For UDIV use SRL instead of SRA.
	SDNodeFlags Flags;
	Flags.setExact(true);
	Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
	Created.push_back(Res.getNode());
	}

	return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
	}

	/// Default handling of a signed divide by a power of two.
	/// When the target lowering reports integer division as cheap for N's value
	/// type (given the current function's attributes), N itself is returned so
	/// the SDIV is kept as an SDIV; otherwise an empty SDValue is returned.
	/// Divisor and Created are not used by this implementation.
	SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                      SelectionDAG &DAG,
	                                      SmallVectorImpl<SDNode *> &Created) const {
	  AttributeList FnAttrs =
	      DAG.getMachineFunction().getFunction().getAttributes();
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
	  const bool DivIsCheap = Lowering.isIntDivCheap(N->getValueType(0), FnAttrs);
	  // Lower SDIV as SDIV when it is cheap.
	  return DivIsCheap ? SDValue(N, 0) : SDValue();
	}

	/// Given an ISD::SDIV node expressing a divide by constant,
	/// return a DAG expression to select that will generate the same value by
	/// multiplying by a magic number.
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	///
	/// Returns an empty SDValue when the value type is not legal, when
	/// ISD::matchUnaryPredicate rejects the divisor (the predicate itself rejects
	/// zero elements), or when neither MULHS nor SMUL_LOHI is usable for the type.
	/// IsAfterLegalization selects isOperationLegal instead of
	/// isOperationLegalOrCustom for that check. Intermediate nodes are appended
	/// to Created; the final ADD is returned without being appended. Divides
	/// carrying the 'exact' flag are forwarded to BuildExactSDIV.
	SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
	bool IsAfterLegalization,
	SmallVectorImpl<SDNode *> &Created) const {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
	EVT ShSVT = ShVT.getScalarType();
	unsigned EltBits = VT.getScalarSizeInBits();

	// Check to see if we can do this.
	// FIXME: We should be more aggressive here.
	if (!isTypeLegal(VT))
	return SDValue();

	// If the sdiv has an 'exact' bit we can use a simpler lowering.
	if (N->getFlags().hasExact())
	return BuildExactSDIV(*this, N, dl, DAG, Created);

	// Per-element constants gathered by the predicate below.
	SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;

	auto BuildSDIVPattern = [&](ConstantSDNode *C) {
	// A zero divisor cannot be handled; fail the whole match.
	if (C->isNullValue())
	return false;

	const APInt &Divisor = C->getAPIntValue();
	APInt::ms magics = Divisor.magic();
	// Multiplier applied to the numerator before it is added to the high
	// product: 0 leaves the product unchanged.
	int NumeratorFactor = 0;
	// All-ones keeps the extracted sign bit in the final correction; zero
	// drops it.
	int ShiftMask = -1;

	if (Divisor.isOneValue() \|\| Divisor.isAllOnesValue()) {
	// If d is +1/-1, we just multiply the numerator by +1/-1.
	NumeratorFactor = Divisor.getSExtValue();
	magics.m = 0;
	magics.s = 0;
	ShiftMask = 0;
	} else if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
	// If d > 0 and m < 0, add the numerator.
	NumeratorFactor = 1;
	} else if (Divisor.isNegative() && magics.m.isStrictlyPositive()) {
	// If d < 0 and m > 0, subtract the numerator.
	NumeratorFactor = -1;
	}

	MagicFactors.push_back(DAG.getConstant(magics.m, dl, SVT));
	Factors.push_back(DAG.getConstant(NumeratorFactor, dl, SVT));
	Shifts.push_back(DAG.getConstant(magics.s, dl, ShSVT));
	ShiftMasks.push_back(DAG.getConstant(ShiftMask, dl, SVT));
	return true;
	};

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	// Collect the shifts / magic values from each element.
	if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern))
	return SDValue();

	// Vectors get build-vector constants; scalars use the single collected
	// element directly.
	SDValue MagicFactor, Factor, Shift, ShiftMask;
	if (VT.isVector()) {
	MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
	Factor = DAG.getBuildVector(VT, dl, Factors);
	Shift = DAG.getBuildVector(ShVT, dl, Shifts);
	ShiftMask = DAG.getBuildVector(VT, dl, ShiftMasks);
	} else {
	MagicFactor = MagicFactors[0];
	Factor = Factors[0];
	Shift = Shifts[0];
	ShiftMask = ShiftMasks[0];
	}

	// Multiply the numerator (operand 0) by the magic value.
	// FIXME: We should support doing a MUL in a wider type.
	SDValue Q;
	if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT)
	: isOperationLegalOrCustom(ISD::MULHS, VT))
	Q = DAG.getNode(ISD::MULHS, dl, VT, N0, MagicFactor);
	else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT)
	: isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
	SDValue LoHi =
	DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), N0, MagicFactor);
	// Result 1 of SMUL_LOHI is the high half of the product.
	Q = SDValue(LoHi.getNode(), 1);
	} else
	return SDValue(); // No mulhs or equivalent.
	Created.push_back(Q.getNode());

	// (Optionally) Add/subtract the numerator using Factor.
	Factor = DAG.getNode(ISD::MUL, dl, VT, N0, Factor);
	Created.push_back(Factor.getNode());
	Q = DAG.getNode(ISD::ADD, dl, VT, Q, Factor);
	Created.push_back(Q.getNode());

	// Shift right algebraic by shift value.
	Q = DAG.getNode(ISD::SRA, dl, VT, Q, Shift);
	Created.push_back(Q.getNode());

	// Extract the sign bit, mask it and add it to the quotient.
	SDValue SignShift = DAG.getConstant(EltBits - 1, dl, ShVT);
	SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q, SignShift);
	Created.push_back(T.getNode());
	T = DAG.getNode(ISD::AND, dl, VT, T, ShiftMask);
	Created.push_back(T.getNode());
	return DAG.getNode(ISD::ADD, dl, VT, Q, T);
	}

	/// Given an ISD::UDIV node expressing a divide by constant,
	/// return a DAG expression to select that will generate the same value by
	/// multiplying by a magic number.
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	///
	/// Returns an empty SDValue when the value type is not legal, when
	/// ISD::matchUnaryPredicate rejects the divisor (the predicate itself rejects
	/// zero elements), or when neither MULHU nor UMUL_LOHI is usable for the type.
	/// IsAfterLegalization selects isOperationLegal instead of
	/// isOperationLegalOrCustom for that check. Intermediate nodes are appended
	/// to Created; the final select is returned without being appended.
	SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
	                                  bool IsAfterLegalization,
	                                  SmallVectorImpl<SDNode *> &Created) const {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  EVT SVT = VT.getScalarType();
	  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
	  EVT ShSVT = ShVT.getScalarType();
	  unsigned EltBits = VT.getScalarSizeInBits();

	  // Check to see if we can do this.
	  // FIXME: We should be more aggressive here.
	  if (!isTypeLegal(VT))
	    return SDValue();

	  // Set when at least one divisor element needs the "NPQ" fixup path.
	  bool UseNPQ = false;
	  // Per-element constants gathered by the predicate below.
	  SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;

	  auto BuildUDIVPattern = [&](ConstantSDNode *C) {
	    // A zero divisor cannot be handled; fail the whole match.
	    if (C->isNullValue())
	      return false;
	    // FIXME: We should use a narrower constant when the upper
	    // bits are known to be zero.
	    APInt Divisor = C->getAPIntValue();
	    APInt::mu magics = Divisor.magicu();
	    unsigned PreShift = 0, PostShift = 0;

	    // If the divisor is even, we can avoid using the expensive fixup by
	    // shifting the divided value upfront.
	    if (magics.a != 0 && !Divisor[0]) {
	      PreShift = Divisor.countTrailingZeros();
	      // Get magic number for the shifted divisor.
	      magics = Divisor.lshr(PreShift).magicu(PreShift);
	      assert(magics.a == 0 && "Should use cheap fixup now");
	    }

	    APInt Magic = magics.m;

	    // Whether this element takes the NPQ fixup path.
	    bool SelNPQ;
	    if (magics.a == 0 || Divisor.isOneValue()) {
	      assert(magics.s < Divisor.getBitWidth() &&
	             "We shouldn't generate an undefined shift!");
	      PostShift = magics.s;
	      SelNPQ = false;
	    } else {
	      PostShift = magics.s - 1;
	      SelNPQ = true;
	    }

	    PreShifts.push_back(DAG.getConstant(PreShift, dl, ShSVT));
	    MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT));
	    // For NPQ elements the factor has only the top bit set, so a MULHU by
	    // it acts as a logical shift right by one; otherwise it is zero.
	    NPQFactors.push_back(
	        DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
	                               : APInt::getNullValue(EltBits),
	                        dl, SVT));
	    PostShifts.push_back(DAG.getConstant(PostShift, dl, ShSVT));
	    UseNPQ |= SelNPQ;
	    return true;
	  };

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  // Collect the shifts/magic values from each element.
	  if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern))
	    return SDValue();

	  // Vectors get build-vector constants; scalars use the single collected
	  // element directly. NPQFactor is only needed (and only set) for vectors.
	  SDValue PreShift, PostShift, MagicFactor, NPQFactor;
	  if (VT.isVector()) {
	    PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
	    MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
	    NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
	    PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
	  } else {
	    PreShift = PreShifts[0];
	    MagicFactor = MagicFactors[0];
	    PostShift = PostShifts[0];
	  }

	  SDValue Q = N0;
	  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
	  Created.push_back(Q.getNode());

	  // Produce the high half of an unsigned multiply, or an empty SDValue when
	  // the target offers no suitable operation.
	  // FIXME: We should support doing a MUL in a wider type.
	  auto GetMULHU = [&](SDValue X, SDValue Y) {
	    if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT)
	                            : isOperationLegalOrCustom(ISD::MULHU, VT))
	      return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
	    if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT)
	                            : isOperationLegalOrCustom(ISD::UMUL_LOHI, VT)) {
	      SDValue LoHi =
	          DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
	      return SDValue(LoHi.getNode(), 1);
	    }
	    return SDValue(); // No mulhu or equivalent
	  };

	  // Multiply the numerator (operand 0) by the magic value.
	  Q = GetMULHU(Q, MagicFactor);
	  if (!Q)
	    return SDValue();

	  Created.push_back(Q.getNode());

	  if (UseNPQ) {
	    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
	    Created.push_back(NPQ.getNode());

	    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
	    // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
	    if (VT.isVector())
	      NPQ = GetMULHU(NPQ, NPQFactor);
	    else
	      NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ, DAG.getConstant(1, dl, ShVT));

	    Created.push_back(NPQ.getNode());

	    Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
	    Created.push_back(Q.getNode());
	  }

	  Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
	  Created.push_back(Q.getNode());

	  // Divisor elements equal to one yield the numerator unchanged. The
	  // comparison must produce the target's setcc result type for VT rather
	  // than VT itself.
	  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	  SDValue One = DAG.getConstant(1, dl, VT);
	  SDValue IsOne = DAG.getSetCC(dl, SetCCVT, N1, One, ISD::SETEQ);
	  return DAG.getSelect(dl, VT, IsOne, N0, Q);
	}

	/// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE
	/// where the divisor is constant and the comparison target is zero,
	/// return a DAG expression that will generate the same comparison result
	/// using only multiplications, additions and shifts/rotations.
	/// Ref: "Hacker's Delight" 10-17.
	///
	/// This is a thin wrapper around prepareUREMEqFold: when the fold succeeds,
	/// every node it created is queued on the combiner worklist before the folded
	/// value is returned; when it fails, an empty SDValue is returned.
	SDValue TargetLowering::buildUREMEqFold(EVT SETCCVT, SDValue REMNode,
	                                        SDValue CompTargetNode,
	                                        ISD::CondCode Cond,
	                                        DAGCombinerInfo &DCI,
	                                        const SDLoc &DL) const {
	  SmallVector<SDNode *, 2> NewNodes;
	  SDValue Folded = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond,
	                                     DCI, DL, NewNodes);
	  if (!Folded)
	    return SDValue();

	  for (SDNode *Node : NewNodes)
	    DCI.AddToWorklist(Node);
	  return Folded;
	}

	/// Build the replacement for (seteq/ne (urem N, D), 0) described below.
	/// Returns an empty SDValue when MUL is not legal or custom for the operand
	/// type, when the divisor or comparison target is not a constant (or constant
	/// splat), when the divisor is zero, when the comparison target is non-zero,
	/// when the divisor is a power of two, or when the divisor is even and ROTR is
	/// not legal or custom. Nodes built along the way are appended to Created.
	SDValue
	TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
	SDValue CompTargetNode, ISD::CondCode Cond,
	DAGCombinerInfo &DCI, const SDLoc &DL,
	SmallVectorImpl<SDNode *> &Created) const {
	// fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
	// - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1
	// - P is the multiplicative inverse of D0 modulo 2^W
	// - Q = floor((2^W - 1) / D)
	// where W is the width of the common type of N and D.
	assert((Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) &&
	"Only applicable for (in)equality comparisons.");

	EVT VT = REMNode.getValueType();

	// If MUL is unavailable, we cannot proceed in any case.
	if (!isOperationLegalOrCustom(ISD::MUL, VT))
	return SDValue();

	// TODO: Add non-uniform constant support.
	ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
	ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
	if (!Divisor \|\| !CompTarget \|\| Divisor->isNullValue() \|\|
	!CompTarget->isNullValue())
	return SDValue();

	const APInt &D = Divisor->getAPIntValue();

	// Decompose D into D0 * 2^K
	unsigned K = D.countTrailingZeros();
	bool DivisorIsEven = (K != 0);
	APInt D0 = D.lshr(K);

	// The fold is invalid when D0 == 1.
	// This is reachable because visitSetCC happens before visitREM.
	if (D0.isOneValue())
	return SDValue();

	// P = inv(D0, 2^W)
	// 2^W requires W + 1 bits, so we have to extend and then truncate.
	unsigned W = D.getBitWidth();
	APInt P = D0.zext(W + 1)
	.multiplicativeInverse(APInt::getSignedMinValue(W + 1))
	.trunc(W);
	assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
	assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");

	// Q = floor((2^W - 1) / D)
	APInt Q = APInt::getAllOnesValue(W).udiv(D);

	SelectionDAG &DAG = DCI.DAG;

	SDValue PVal = DAG.getConstant(P, DL, VT);
	SDValue QVal = DAG.getConstant(Q, DL, VT);
	// (mul N, P)
	SDValue Op1 = DAG.getNode(ISD::MUL, DL, VT, REMNode->getOperand(0), PVal);
	Created.push_back(Op1.getNode());

	// Rotate right only if D was even.
	if (DivisorIsEven) {
	// We need ROTR to do this.
	// NOTE: on this failure path the MUL node above has already been appended
	// to Created.
	if (!isOperationLegalOrCustom(ISD::ROTR, VT))
	return SDValue();
	SDValue ShAmt =
	DAG.getConstant(K, DL, getShiftAmountTy(VT, DAG.getDataLayout()));
	SDNodeFlags Flags;
	Flags.setExact(true);
	// UREM: (rotr (mul N, P), K)
	Op1 = DAG.getNode(ISD::ROTR, DL, VT, Op1, ShAmt, Flags);
	Created.push_back(Op1.getNode());
	}

	// UREM: (setule/setugt (rotr (mul N, P), K), Q)
	// SETEQ maps to SETULE and SETNE maps to SETUGT.
	return DAG.getSetCC(DL, SETCCVT, Op1, QVal,
	((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
	}

	/// Check that operand 0 of Op is a ConstantSDNode.
	/// Returns false when it is; otherwise emits an error through the DAG's
	/// context and returns true.
	bool TargetLowering::
	verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
	  // Constant argument: nothing to report.
	  if (isa<ConstantSDNode>(Op.getOperand(0)))
	    return false;

	  DAG.getContext()->emitError("argument to '__builtin_return_address' must "
	                              "be a constant integer");
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// Legalization Utilities
	//===----------------------------------------------------------------------===//

	/// Expand a MUL, UMUL_LOHI or SMUL_LOHI of type VT into operations on the
	/// narrower type HiLoVT.
	///
	/// On success the HiLoVT-typed pieces of the result are appended to Result,
	/// lowest piece first: two entries for ISD::MUL, four for the *_LOHI opcodes.
	/// LL/LH and RL/RH optionally supply the already-split low/high halves of LHS
	/// and RHS; they must be either all set or all null. When Kind is
	/// MulExpansionKind::Always the MULH*/ *MUL_LOHI operations are assumed to be
	/// available without consulting the target.
	///
	/// Returns false when the expansion cannot be built. NOTE: some of the late
	/// failure paths return false after entries were already appended to Result.
	bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl,
	SDValue LHS, SDValue RHS,
	SmallVectorImpl<SDValue> &Result,
	EVT HiLoVT, SelectionDAG &DAG,
	MulExpansionKind Kind, SDValue LL,
	SDValue LH, SDValue RL, SDValue RH) const {
	assert(Opcode == ISD::MUL \|\| Opcode == ISD::UMUL_LOHI \|\|
	Opcode == ISD::SMUL_LOHI);

	bool HasMULHS = (Kind == MulExpansionKind::Always) \|\|
	isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
	bool HasMULHU = (Kind == MulExpansionKind::Always) \|\|
	isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
	bool HasSMUL_LOHI = (Kind == MulExpansionKind::Always) \|\|
	isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
	bool HasUMUL_LOHI = (Kind == MulExpansionKind::Always) \|\|
	isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);

	// Without any half-width widening multiply there is nothing to build on.
	if (!HasMULHU && !HasMULHS && !HasUMUL_LOHI && !HasSMUL_LOHI)
	return false;

	unsigned OuterBitSize = VT.getScalarSizeInBits();
	unsigned InnerBitSize = HiLoVT.getScalarSizeInBits();
	unsigned LHSSB = DAG.ComputeNumSignBits(LHS);
	unsigned RHSSB = DAG.ComputeNumSignBits(RHS);

	// LL, LH, RL, and RH must be either all NULL or all set to a value.
	assert((LL.getNode() && LH.getNode() && RL.getNode() && RH.getNode()) \|\|
	(!LL.getNode() && !LH.getNode() && !RL.getNode() && !RH.getNode()));

	SDVTList VTs = DAG.getVTList(HiLoVT, HiLoVT);
	// Multiply two HiLoVT values into a (Lo, Hi) pair, preferring a single
	// *MUL_LOHI node and falling back to MUL + MULH*. Returns false if neither
	// form is available for the requested signedness.
	auto MakeMUL_LOHI = [&](SDValue L, SDValue R, SDValue &Lo, SDValue &Hi,
	bool Signed) -> bool {
	if ((Signed && HasSMUL_LOHI) \|\| (!Signed && HasUMUL_LOHI)) {
	Lo = DAG.getNode(Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI, dl, VTs, L, R);
	Hi = SDValue(Lo.getNode(), 1);
	return true;
	}
	if ((Signed && HasMULHS) \|\| (!Signed && HasMULHU)) {
	Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, L, R);
	Hi = DAG.getNode(Signed ? ISD::MULHS : ISD::MULHU, dl, HiLoVT, L, R);
	return true;
	}
	return false;
	};

	SDValue Lo, Hi;

	// Derive the low halves by truncation when the caller did not supply them.
	if (!LL.getNode() && !RL.getNode() &&
	isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
	LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);
	RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RHS);
	}

	if (!LL.getNode())
	return false;

	APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
	if (DAG.MaskedValueIsZero(LHS, HighMask) &&
	DAG.MaskedValueIsZero(RHS, HighMask)) {
	// The inputs are both zero-extended.
	if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
	Result.push_back(Lo);
	Result.push_back(Hi);
	// The *_LOHI opcodes produce four pieces; the upper two are zero here.
	if (Opcode != ISD::MUL) {
	SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
	Result.push_back(Zero);
	Result.push_back(Zero);
	}
	return true;
	}
	}

	if (!VT.isVector() && Opcode == ISD::MUL && LHSSB > InnerBitSize &&
	RHSSB > InnerBitSize) {
	// The input values are both sign-extended.
	// TODO non-MUL case?
	if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {
	Result.push_back(Lo);
	Result.push_back(Hi);
	return true;
	}
	}

	unsigned ShiftAmount = OuterBitSize - InnerBitSize;
	EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout());
	if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) {
	// FIXME getShiftAmountTy does not always return a sensible result when VT
	// is an illegal type, and so the type may be too small to fit the shift
	// amount. Override it with i32. The shift will have to be legalized.
	ShiftAmountTy = MVT::i32;
	}
	SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy);

	// Derive the high halves by shift + truncate when the caller did not supply
	// them.
	if (!LH.getNode() && !RH.getNode() &&
	isOperationLegalOrCustom(ISD::SRL, VT) &&
	isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
	LH = DAG.getNode(ISD::SRL, dl, VT, LHS, Shift);
	LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH);
	RH = DAG.getNode(ISD::SRL, dl, VT, RHS, Shift);
	RH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RH);
	}

	if (!LH.getNode())
	return false;

	// LL * RL supplies the lowest result piece and the first carry-in (Hi).
	if (!MakeMUL_LOHI(LL, RL, Lo, Hi, false))
	return false;

	Result.push_back(Lo);

	if (Opcode == ISD::MUL) {
	// Only the low VT-width of the product is needed: add the low halves of
	// the two cross products to the high half of LL * RL.
	RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
	LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
	Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
	Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
	Result.push_back(Hi);
	return true;
	}

	// Compute the full width result.
	// Merge recombines a (Lo, Hi) pair of HiLoVT values into one VT value.
	auto Merge = [&](SDValue Lo, SDValue Hi) -> SDValue {
	Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
	Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
	Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
	return DAG.getNode(ISD::OR, dl, VT, Lo, Hi);
	};

	SDValue Next = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
	if (!MakeMUL_LOHI(LL, RH, Lo, Hi, false))
	return false;

	// This is effectively the add part of a multiply-add of half-sized operands,
	// so it cannot overflow.
	Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));

	if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false))
	return false;

	SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
	EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

	// Adding the second cross product may carry out; use the glue-based
	// ADDC/ADDE pair when both are available, otherwise ADDCARRY.
	bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) &&
	isOperationLegalOrCustom(ISD::ADDE, VT));
	if (UseGlue)
	Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
	Merge(Lo, Hi));
	else
	Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next,
	Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType));

	SDValue Carry = Next.getValue(1);
	Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
	Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);

	// The high * high product is the only one computed as signed for
	// SMUL_LOHI.
	if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI))
	return false;

	// Fold the carry from the addition above into the high half.
	if (UseGlue)
	Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
	Carry);
	else
	Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi,
	Zero, Carry);

	Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));

	if (Opcode == ISD::SMUL_LOHI) {
	// Signed correction: the cross products were computed as unsigned, so
	// subtract the other operand's low half whenever a high half is negative.
	SDValue NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
	DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RL));
	Next = DAG.getSelectCC(dl, LH, Zero, NextSub, Next, ISD::SETLT);

	NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
	DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LL));
	Next = DAG.getSelectCC(dl, RH, Zero, NextSub, Next, ISD::SETLT);
	}

	// Emit the two uppermost result pieces.
	Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
	Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
	Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
	return true;
	}

	/// Expand the multiply node N into low and high parts of type HiLoVT by
	/// delegating to expandMUL_LOHI with N's opcode, value type and operands.
	/// On success Lo and Hi receive the two result parts and true is returned;
	/// on failure Lo and Hi are left untouched and false is returned.
	bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
	                               SelectionDAG &DAG, MulExpansionKind Kind,
	                               SDValue LL, SDValue LH, SDValue RL,
	                               SDValue RH) const {
	  SmallVector<SDValue, 2> Parts;
	  if (!expandMUL_LOHI(N->getOpcode(), N->getValueType(0), N,
	                      N->getOperand(0), N->getOperand(1), Parts, HiLoVT,
	                      DAG, Kind, LL, LH, RL, RH))
	    return false;

	  assert(Parts.size() == 2);
	  Lo = Parts[0];
	  Hi = Parts[1];
	  return true;
	}

	/// Expand an FSHL/FSHR node into shifts, an OR and a select.
	/// The expansion is stored in Result and true is returned. For vector types
	/// false is returned (and Result is left untouched) unless SHL, SRL and SUB
	/// are legal or custom and OR is legal, custom or promotable.
	bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
	SelectionDAG &DAG) const {
	EVT VT = Node->getValueType(0);

	if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) \|\|
	!isOperationLegalOrCustom(ISD::SRL, VT) \|\|
	!isOperationLegalOrCustom(ISD::SUB, VT) \|\|
	!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
	return false;

	// fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
	// fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
	SDValue X = Node->getOperand(0);
	SDValue Y = Node->getOperand(1);
	SDValue Z = Node->getOperand(2);

	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	bool IsFSHL = Node->getOpcode() == ISD::FSHL;
	SDLoc DL(SDValue(Node, 0));

	EVT ShVT = Z.getValueType();
	SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
	SDValue Zero = DAG.getConstant(0, DL, ShVT);

	// ShAmt = Z % BW; a mask suffices when the bit width is a power of two.
	SDValue ShAmt;
	if (isPowerOf2_32(EltSizeInBits)) {
	SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
	ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
	} else {
	ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
	}

	// InvShAmt = BW - ShAmt is the amount for the opposite-direction shift.
	SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
	SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
	SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
	SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);

	// If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
	// and that is undefined. We must compare and select to avoid UB.
	EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);

	// For fshl, 0-shift returns the 1st arg (X).
	// For fshr, 0-shift returns the 2nd arg (Y).
	SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
	Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
	return true;
	}

	// TODO: Merge with expandFunnelShift.
	/// Expand a ROTL/ROTR node.
	/// If the rotate in the opposite direction is legal, the node becomes that
	/// rotate by (BW - amount). Otherwise it is expanded into two shifts with
	/// masked amounts combined by an OR. The expansion is stored in Result and
	/// true is returned. For vector types on the shift/or path, false is returned
	/// (Result untouched) unless SHL, SRL and SUB are legal or custom and OR and
	/// AND are legal, custom or promotable.
	bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
	SelectionDAG &DAG) const {
	EVT VT = Node->getValueType(0);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	bool IsLeft = Node->getOpcode() == ISD::ROTL;
	SDValue Op0 = Node->getOperand(0);
	SDValue Op1 = Node->getOperand(1);
	SDLoc DL(SDValue(Node, 0));

	EVT ShVT = Op1.getValueType();
	SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);

	// If a rotate in the other direction is legal, use it.
	unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
	if (isOperationLegal(RevRot, VT)) {
	SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
	Result = DAG.getNode(RevRot, DL, VT, Op0, Sub);
	return true;
	}

	if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) \|\|
	!isOperationLegalOrCustom(ISD::SRL, VT) \|\|
	!isOperationLegalOrCustom(ISD::SUB, VT) \|\|
	!isOperationLegalOrCustomOrPromote(ISD::OR, VT) \|\|
	!isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
	return false;

	// Otherwise,
	// (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
	// (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
	//
	assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 &&
	"Expecting the type bitwidth to be a power of 2");
	// ShOpc shifts in the rotate direction, HsOpc in the opposite direction.
	unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
	unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
	SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
	SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
	// Masking both amounts with (w-1) keeps each shift amount below the width.
	SDValue And0 = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC);
	SDValue And1 = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC);
	Result = DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ShOpc, DL, VT, Op0, And0),
	DAG.getNode(HsOpc, DL, VT, Op0, And1));
	return true;
	}

	/// Expand an FP_TO_SINT node into integer operations on the bit pattern of
	/// the source value.
	/// Only the f32 -> i64 conversion is handled; for any other type pair false
	/// is returned and Result is left untouched. On success the expansion is
	/// stored in Result and true is returned. The expansion yields 0 whenever the
	/// unbiased exponent is negative.
	bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
	SelectionDAG &DAG) const {
	SDValue Src = Node->getOperand(0);
	EVT SrcVT = Src.getValueType();
	EVT DstVT = Node->getValueType(0);
	SDLoc dl(SDValue(Node, 0));

	// FIXME: Only f32 to i64 conversions are supported.
	if (SrcVT != MVT::f32 \|\| DstVT != MVT::i64)
	return false;

	// Expand f32 -> i64 conversion
	// This algorithm comes from compiler-rt's implementation of fixsfdi:
	// https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/builtins/fixsfdi.c
	unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
	EVT IntVT = SrcVT.changeTypeToInteger();
	EVT IntShVT = getShiftAmountTy(IntVT, DAG.getDataLayout());

	// Constants describing the f32 bit layout used below.
	SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
	SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
	SDValue Bias = DAG.getConstant(127, dl, IntVT);
	SDValue SignMask = DAG.getConstant(APInt::getSignMask(SrcEltBits), dl, IntVT);
	SDValue SignLowBit = DAG.getConstant(SrcEltBits - 1, dl, IntVT);
	SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);

	// Reinterpret the float as an integer of the same width.
	SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Src);

	// Exponent = ((Bits & ExponentMask) >> 23) - 127.
	SDValue ExponentBits = DAG.getNode(
	ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
	DAG.getZExtOrTrunc(ExponentLoBit, dl, IntShVT));
	SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);

	// Arithmetic shift of the isolated sign bit: 0 for non-negative inputs,
	// all-ones for negative inputs; then widened to the destination type.
	SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
	DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
	DAG.getZExtOrTrunc(SignLowBit, dl, IntShVT));
	Sign = DAG.getSExtOrTrunc(Sign, dl, DstVT);

	// Mantissa bits with bit 23 (0x00800000) set on top.
	SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
	DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
	DAG.getConstant(0x00800000, dl, IntVT));

	R = DAG.getZExtOrTrunc(R, dl, DstVT);

	// Exponent > 23: shift left by (Exponent - 23);
	// otherwise: shift right by (23 - Exponent).
	R = DAG.getSelectCC(
	dl, Exponent, ExponentLoBit,
	DAG.getNode(ISD::SHL, dl, DstVT, R,
	DAG.getZExtOrTrunc(
	DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
	dl, IntShVT)),
	DAG.getNode(ISD::SRL, dl, DstVT, R,
	DAG.getZExtOrTrunc(
	DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
	dl, IntShVT)),
	ISD::SETGT);

	// (R ^ Sign) - Sign negates R when Sign is all-ones and leaves it unchanged
	// when Sign is zero.
	SDValue Ret = DAG.getNode(ISD::SUB, dl, DstVT,
	DAG.getNode(ISD::XOR, dl, DstVT, R, Sign), Sign);

	// A negative exponent selects 0 instead of Ret.
	Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
	DAG.getConstant(0, dl, DstVT), Ret, ISD::SETLT);
	return true;
	}

	/// Expand FP_TO_UINT in terms of FP_TO_SINT.
	///
	/// Returns false (leaving \p Result untouched) for vector types when the
	/// required vector operations are not available; otherwise stores the
	/// expansion in \p Result and returns true.
	bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
	                                      SelectionDAG &DAG) const {
	  SDLoc dl(SDValue(Node, 0));
	  SDValue Src = Node->getOperand(0);

	  EVT SrcVT = Src.getValueType();
	  EVT DstVT = Node->getValueType(0);
	  EVT SetCCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);

	  // Only expand vector types if we have the appropriate vector bit operations.
	  if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) ||
	                           !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
	    return false;

	  // If the maximum float value is smaller then the signed integer range,
	  // the destination signmask can't be represented by the float, so we can
	  // just use FP_TO_SINT directly.
	  const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT);
	  APFloat APF(APFSem, APInt::getNullValue(SrcVT.getScalarSizeInBits()));
	  APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
	  if (APFloat::opOverflow &
	      APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) {
	    Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
	    return true;
	  }

	  // Cst is the destination sign mask as a floating-point value; Sel is true
	  // when Src is below it.
	  SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
	  SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);

	  bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
	  if (Strict) {
	    // Expand based on maximum range of FP_TO_SINT, if the value exceeds the
	    // signmask then offset (the result of which should be fully representable).
	    // Sel = Src < 0x8000000000000000
	    // Val = select Sel, Src, Src - 0x8000000000000000
	    // Ofs = select Sel, 0, 0x8000000000000000
	    // Result = fp_to_sint(Val) ^ Ofs

	    // TODO: Should any fast-math-flags be set for the FSUB?
	    SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src,
	                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
	    SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT),
	                                DAG.getConstant(SignMask, dl, DstVT));
	    Result = DAG.getNode(ISD::XOR, dl, DstVT,
	                         DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs);
	  } else {
	    // Expand based on maximum range of FP_TO_SINT:
	    // True = fp_to_sint(Src)
	    // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
	    // Result = select (Src < 0x8000000000000000), True, False

	    SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
	    // TODO: Should any fast-math-flags be set for the FSUB?
	    SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
	                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
	    False = DAG.getNode(ISD::XOR, dl, DstVT, False,
	                        DAG.getConstant(SignMask, dl, DstVT));
	    Result = DAG.getSelect(dl, DstVT, Sel, True, False);
	  }
	  return true;
	}

	/// Expand UINT_TO_FP for i64 (scalar or vector-of-i64) sources.
	///
	/// Handles f32 and f64 destinations. Returns false (leaving \p Result
	/// untouched) when the source element type is not i64, the destination
	/// element type is neither f32 nor f64, or a vector type lacks the operations
	/// the expansion needs. Otherwise stores the expansion in \p Result and
	/// returns true.
	bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
	                                      SelectionDAG &DAG) const {
	  SDValue Src = Node->getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  EVT DstVT = Node->getValueType(0);

	  if (SrcVT.getScalarType() != MVT::i64)
	    return false;

	  SDLoc dl(SDValue(Node, 0));
	  EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());

	  if (DstVT.getScalarType() == MVT::f32) {
	    // Only expand vector types if we have the appropriate vector bit
	    // operations.
	    if (SrcVT.isVector() &&
	        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
	         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
	         !isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) ||
	         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
	         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
	      return false;

	    // For unsigned conversions, convert them to signed conversions using the
	    // algorithm from the x86_64 __floatundidf in compiler_rt.
	    // Fast path: the value is converted directly as a signed integer.
	    SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);

	    // Slow path: halve the value while OR-ing the dropped low bit back in,
	    // convert as signed, then double the result.
	    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
	    SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst);
	    SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
	    SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst);
	    SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);

	    SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or);
	    SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt);

	    // TODO: This really should be implemented using a branch rather than a
	    // select. We happen to get lucky and machinesink does the right
	    // thing most of the time. This would be a good candidate for a
	    // pseudo-op, or, even better, for whole-function isel.
	    EVT SetCCVT =
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);

	    // Use the slow path only when the source is negative as a signed value.
	    SDValue SignBitTest = DAG.getSetCC(
	        dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
	    Result = DAG.getSelect(dl, DstVT, SignBitTest, Slow, Fast);
	    return true;
	  }

	  if (DstVT.getScalarType() == MVT::f64) {
	    // Only expand vector types if we have the appropriate vector bit
	    // operations.
	    if (SrcVT.isVector() &&
	        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
	         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
	         !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
	         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
	         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
	      return false;

	    // Implementation of unsigned i64 to f64 following the algorithm in
	    // __floatundidf in compiler_rt. This implementation has the advantage
	    // of performing rounding correctly, both in the default rounding mode
	    // and in all alternate rounding modes.
	    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
	    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
	        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
	    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
	    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
	    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);

	    // Split into 32-bit halves, OR each half into the bit pattern of its
	    // constant, reinterpret as floating point, and combine:
	    //   Result = LoFlt + (HiFlt - TwoP84PlusTwoP52)
	    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
	    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
	    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
	    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
	    SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
	    SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
	    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
	    Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
	    return true;
	  }

	  return false;
	}

	/// Re-express FMINNUM/FMAXNUM through another min/max opcode, if one is
	/// legal or custom for the node's type.
	///
	/// The FMINNUM_IEEE/FMAXNUM_IEEE form is tried first; FMINIMUM/FMAXIMUM is
	/// used only when the node carries the no-NaNs flag. Returns an empty SDValue
	/// when neither replacement applies.
	SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
	                                              SelectionDAG &DAG) const {
	  SDLoc DL(Node);
	  const bool IsMin = Node->getOpcode() == ISD::FMINNUM;
	  const bool NoNaNs = Node->getFlags().hasNoNaNs();
	  EVT Ty = Node->getValueType(0);

	  unsigned IEEEOpc = IsMin ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
	  if (isOperationLegalOrCustom(IEEEOpc, Ty)) {
	    SDValue Operands[2] = {Node->getOperand(0), Node->getOperand(1)};

	    // Unless NaNs are ruled out, canonicalize every operand that might be a
	    // signaling NaN so the IEEE variant sees quieted inputs.
	    if (!NoNaNs) {
	      for (SDValue &Operand : Operands) {
	        if (DAG.isKnownNeverSNaN(Operand))
	          continue;
	        Operand = DAG.getNode(ISD::FCANONICALIZE, DL, Ty, Operand,
	                              Node->getFlags());
	      }
	    }

	    return DAG.getNode(IEEEOpc, DL, Ty, Operands[0], Operands[1],
	                       Node->getFlags());
	  }

	  // FMINIMUM/FMAXIMUM are only considered when there are no NaNs.
	  if (!NoNaNs)
	    return SDValue();

	  unsigned IEEE2018Opc = IsMin ? ISD::FMINIMUM : ISD::FMAXIMUM;
	  if (!isOperationLegalOrCustom(IEEE2018Opc, Ty))
	    return SDValue();

	  return DAG.getNode(IEEE2018Opc, DL, Ty, Node->getOperand(0),
	                     Node->getOperand(1), Node->getFlags());
	}

	/// Expand CTPOP into the parallel bit-counting sequence built from shifts,
	/// masks, adds and (for elements wider than 8 bits) a multiply.
	///
	/// Returns false (leaving \p Result untouched) when the element width is
	/// above 128 bits or not a multiple of 8, or when a vector type lacks the
	/// required operations. Otherwise stores the expansion in \p Result and
	/// returns true.
	bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
	                                 SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
	  SDValue Op = Node->getOperand(0);
	  unsigned Len = VT.getScalarSizeInBits();
	  assert(VT.isInteger() && "CTPOP not implemented for this type.");

	  // TODO: Add support for irregular type lengths.
	  if (!(Len <= 128 && Len % 8 == 0))
	    return false;

	  // Only expand vector types if we have the appropriate vector bit operations.
	  // MUL is only needed when the final byte-summing step runs (Len != 8).
	  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
	                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
	                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
	                        (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
	                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
	    return false;

	  // This is the "best" algorithm from
	  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	  // Each mask is its byte pattern replicated across the full element width.
	  SDValue Mask55 =
	      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
	  SDValue Mask33 =
	      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
	  SDValue Mask0F =
	      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
	  SDValue Mask01 =
	      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);

	  // v = v - ((v >> 1) & 0x55555555...)
	  Op = DAG.getNode(ISD::SUB, dl, VT, Op,
	                   DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getNode(ISD::SRL, dl, VT, Op,
	                                           DAG.getConstant(1, dl, ShVT)),
	                               Mask55));
	  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	  Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
	                   DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getNode(ISD::SRL, dl, VT, Op,
	                                           DAG.getConstant(2, dl, ShVT)),
	                               Mask33));
	  // v = (v + (v >> 4)) & 0x0F0F0F0F...
	  Op = DAG.getNode(ISD::AND, dl, VT,
	                   DAG.getNode(ISD::ADD, dl, VT, Op,
	                               DAG.getNode(ISD::SRL, dl, VT, Op,
	                                           DAG.getConstant(4, dl, ShVT))),
	                   Mask0F);
	  // v = (v * 0x01010101...) >> (Len - 8)
	  if (Len > 8)
	    Op =
	        DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
	                    DAG.getConstant(Len - 8, dl, ShVT));

	  Result = Op;
	  return true;
	}

	/// Expand CTLZ / CTLZ_ZERO_UNDEF.
	///
	/// Preference order: the other CTLZ flavour if the target supports it, then
	/// a bit-smearing sequence followed by CTPOP. Returns false (leaving
	/// \p Result untouched) only when a vector type lacks the operations needed
	/// for the smearing sequence; otherwise stores the expansion in \p Result and
	/// returns true.
	bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
	                                SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
	  SDValue Op = Node->getOperand(0);
	  unsigned NumBitsPerElt = VT.getScalarSizeInBits();

	  // If the non-ZERO_UNDEF version is supported we can use that instead.
	  if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
	      isOperationLegalOrCustom(ISD::CTLZ, VT)) {
	    Result = DAG.getNode(ISD::CTLZ, dl, VT, Op);
	    return true;
	  }

	  // If the ZERO_UNDEF version is supported use that and handle the zero case.
	  if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
	    EVT SetCCVT =
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	    SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
	    SDValue Zero = DAG.getConstant(0, dl, VT);
	    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
	    // A zero input selects the element bit width.
	    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
	                         DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
	    return true;
	  }

	  // Only expand vector types if we have the appropriate vector bit operations.
	  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
	                        !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
	                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
	                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
	    return false;

	  // for now, we do this:
	  // x = x | (x >> 1);
	  // x = x | (x >> 2);
	  // ...
	  // x = x | (x >>16);
	  // x = x | (x >>32); // for 64-bit input
	  // return popcount(~x);
	  //
	  // Ref: "Hacker's Delight" by Henry Warren
	  for (unsigned i = 0; (1U << i) <= (NumBitsPerElt / 2); ++i) {
	    SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
	    Op = DAG.getNode(ISD::OR, dl, VT, Op,
	                     DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
	  }
	  Op = DAG.getNOT(dl, Op, VT);
	  Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
	  return true;
	}

	/// Expand CTTZ / CTTZ_ZERO_UNDEF.
	///
	/// Preference order: the other CTTZ flavour if the target supports it, then
	/// a CTPOP- or CTLZ-based formula on (~x & (x - 1)). Returns false (leaving
	/// \p Result untouched) only when a vector type lacks the operations that
	/// formula needs; otherwise stores the expansion in \p Result and returns
	/// true.
	bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
	                                SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  SDValue Op = Node->getOperand(0);
	  unsigned NumBitsPerElt = VT.getScalarSizeInBits();

	  // If the non-ZERO_UNDEF version is supported we can use that instead.
	  if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
	      isOperationLegalOrCustom(ISD::CTTZ, VT)) {
	    Result = DAG.getNode(ISD::CTTZ, dl, VT, Op);
	    return true;
	  }

	  // If the ZERO_UNDEF version is supported use that and handle the zero case.
	  if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
	    EVT SetCCVT =
	        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	    SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
	    SDValue Zero = DAG.getConstant(0, dl, VT);
	    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
	    // A zero input selects the element bit width.
	    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
	                         DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
	    return true;
	  }

	  // Only expand vector types if we have the appropriate vector bit operations.
	  // Either CTPOP or CTLZ is sufficient for the final counting step.
	  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
	                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
	                         !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
	                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
	                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
	                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
	    return false;

	  // for now, we use: { return popcount(~x & (x - 1)); }
	  // unless the target has ctlz but not ctpop, in which case we use:
	  // { return 32 - nlz(~x & (x-1)); }
	  // Ref: "Hacker's Delight" by Henry Warren
	  SDValue Tmp = DAG.getNode(
	      ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT),
	      DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT)));

	  // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
	  if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
	    Result =
	        DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
	                    DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
	    return true;
	  }

	  Result = DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
	  return true;
	}

	/// Expand ABS as (x + s) ^ s, where s = x >> (bitwidth - 1) using an
	/// arithmetic shift.
	///
	/// Returns false (leaving \p Result untouched) when a vector type lacks SRA,
	/// ADD or XOR support; otherwise stores the expansion in \p Result and
	/// returns true.
	bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
	                               SelectionDAG &DAG) const {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
	  SDValue Op = N->getOperand(0);

	  // Only expand vector types if we have the appropriate vector operations.
	  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SRA, VT) ||
	                        !isOperationLegalOrCustom(ISD::ADD, VT) ||
	                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
	    return false;

	  // Shift replicates the sign bit across the whole element.
	  SDValue Shift =
	      DAG.getNode(ISD::SRA, dl, VT, Op,
	                  DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, ShVT));
	  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, Op, Shift);
	  Result = DAG.getNode(ISD::XOR, dl, VT, Add, Shift);
	  return true;
	}

	/// Replace a vector load with one (possibly extending) scalar load per
	/// element.
	///
	/// Returns a merge of two values: the vector rebuilt from the scalar loads
	/// and a TokenFactor over the chains of those loads.
	SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
	                                            SelectionDAG &DAG) const {
	  SDLoc DL(LD);
	  SDValue InChain = LD->getChain();
	  SDValue EltPtr = LD->getBasePtr();
	  EVT MemVT = LD->getMemoryVT();
	  ISD::LoadExtType ExtKind = LD->getExtensionType();

	  unsigned EltCount = MemVT.getVectorNumElements();

	  // Element type as stored in memory vs. as produced by the load.
	  EVT MemEltVT = MemVT.getScalarType();
	  EVT ResEltVT = LD->getValueType(0).getScalarType();

	  // Distance in bytes between consecutive elements.
	  unsigned EltBytes = MemEltVT.getSizeInBits() / 8;
	  assert(MemEltVT.isByteSized());

	  SmallVector<SDValue, 8> EltValues;
	  SmallVector<SDValue, 8> EltChains;

	  for (unsigned I = 0; I != EltCount; ++I) {
	    const unsigned ByteOffset = I * EltBytes;
	    SDValue EltLoad = DAG.getExtLoad(
	        ExtKind, DL, ResEltVT, InChain, EltPtr,
	        LD->getPointerInfo().getWithOffset(ByteOffset), MemEltVT,
	        MinAlign(LD->getAlignment(), ByteOffset),
	        LD->getMemOperand()->getFlags(), LD->getAAInfo());

	    // Step the address to the next element.
	    EltPtr = DAG.getObjectPtrOffset(DL, EltPtr, EltBytes);

	    EltValues.push_back(EltLoad.getValue(0));
	    EltChains.push_back(EltLoad.getValue(1));
	  }

	  SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, EltChains);
	  SDValue Vec = DAG.getBuildVector(LD->getValueType(0), DL, EltValues);

	  return DAG.getMergeValues({Vec, OutChain}, DL);
	}

	/// Replace a vector store with scalar stores.
	///
	/// When the in-memory element type is byte-sized, each element is extracted
	/// and written with its own truncating store, and the stores are joined by a
	/// TokenFactor. Otherwise the elements are packed into one integer as wide
	/// as the stored vector, and that integer is written with a single store.
	SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
	                                             SelectionDAG &DAG) const {
	  SDLoc DL(ST);

	  SDValue InChain = ST->getChain();
	  SDValue Base = ST->getBasePtr();
	  SDValue Vec = ST->getValue();
	  EVT MemVT = ST->getMemoryVT();

	  // Element type of the value being stored (register side).
	  EVT RegEltVT = Vec.getValueType().getScalarType();

	  // Element type as laid out in memory.
	  EVT MemEltVT = MemVT.getScalarType();

	  EVT IndexVT = getVectorIdxTy(DAG.getDataLayout());
	  unsigned EltCount = MemVT.getVectorNumElements();

	  // A vector must always be stored in memory as-is, i.e. without any padding
	  // between the elements, since various code depend on it, e.g. in the
	  // handling of a bitcast of a vector type to int, which may be done with a
	  // vector store followed by an integer load. A vector that does not have
	  // elements that are byte-sized must therefore be stored as an integer
	  // built out of the extracted vector elements.
	  if (!MemEltVT.isByteSized()) {
	    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());

	    SDValue Packed = DAG.getConstant(0, DL, PackedVT);

	    for (unsigned I = 0; I != EltCount; ++I) {
	      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, RegEltVT, Vec,
	                                DAG.getConstant(I, DL, IndexVT));
	      SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MemEltVT, Elt);
	      SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, PackedVT, Narrowed);

	      // On big-endian targets the element order inside the integer is
	      // mirrored.
	      unsigned Slot = I;
	      if (DAG.getDataLayout().isBigEndian())
	        Slot = (EltCount - 1) - I;

	      SDValue SlotShift =
	          DAG.getConstant(Slot * MemEltVT.getSizeInBits(), DL, PackedVT);
	      SDValue Positioned =
	          DAG.getNode(ISD::SHL, DL, PackedVT, Widened, SlotShift);
	      Packed = DAG.getNode(ISD::OR, DL, PackedVT, Packed, Positioned);
	    }

	    return DAG.getStore(InChain, DL, Packed, Base, ST->getPointerInfo(),
	                        ST->getAlignment(), ST->getMemOperand()->getFlags(),
	                        ST->getAAInfo());
	  }

	  // Distance in bytes between consecutive elements in memory.
	  unsigned EltBytes = MemEltVT.getSizeInBits() / 8;
	  assert(EltBytes && "Zero stride!");

	  // Write every element of the original vector to memory on its own.
	  SmallVector<SDValue, 8> EltStores;
	  for (unsigned I = 0; I != EltCount; ++I) {
	    const unsigned ByteOffset = I * EltBytes;

	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, RegEltVT, Vec,
	                              DAG.getConstant(I, DL, IndexVT));

	    SDValue EltPtr = DAG.getObjectPtrOffset(DL, Base, ByteOffset);

	    // This scalar TruncStore may be illegal, but we legalize it later.
	    EltStores.push_back(DAG.getTruncStore(
	        InChain, DL, Elt, EltPtr,
	        ST->getPointerInfo().getWithOffset(ByteOffset), MemEltVT,
	        MinAlign(ST->getAlignment(), ByteOffset),
	        ST->getMemOperand()->getFlags(), ST->getAAInfo()));
	  }

	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, EltStores);
	}

	/// Expand an unaligned, unindexed load into operations the target can
	/// handle.
	///
	/// Floating-point and vector loads become either a same-sized integer load
	/// plus bitcast, a scalarized vector load, or a copy through a stack slot.
	/// Integer loads are split into two half-width loads that are recombined with
	/// a shift and an OR.
	///
	/// \returns a pair of (loaded value, output chain).
	std::pair<SDValue, SDValue>
	TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
	  assert(LD->getAddressingMode() == ISD::UNINDEXED &&
	         "unaligned indexed loads not implemented!");
	  SDValue Chain = LD->getChain();
	  SDValue Ptr = LD->getBasePtr();
	  EVT VT = LD->getValueType(0);
	  EVT LoadedVT = LD->getMemoryVT();
	  SDLoc dl(LD);
	  auto &MF = DAG.getMachineFunction();

	  if (VT.isFloatingPoint() || VT.isVector()) {
	    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
	    if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
	      if (!isOperationLegalOrCustom(ISD::LOAD, intVT) &&
	          LoadedVT.isVector()) {
	        // Scalarize the load and let the individual components be handled.
	        SDValue Scalarized = scalarizeVectorLoad(LD, DAG);
	        if (Scalarized->getOpcode() == ISD::MERGE_VALUES)
	          return std::make_pair(Scalarized.getOperand(0), Scalarized.getOperand(1));
	        return std::make_pair(Scalarized.getValue(0), Scalarized.getValue(1));
	      }

	      // Expand to a (misaligned) integer load of the same size,
	      // then bitconvert to floating point or vector.
	      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
	                                    LD->getMemOperand());
	      SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
	      if (LoadedVT != VT)
	        Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
	                             ISD::ANY_EXTEND, dl, VT, Result);

	      return std::make_pair(Result, newLoad.getValue(1));
	    }

	    // Copy the value to a (aligned) stack slot using (unaligned) integer
	    // loads and stores, then do a (aligned) load from the stack slot.
	    MVT RegVT = getRegisterType(*DAG.getContext(), intVT);
	    unsigned LoadedBytes = LoadedVT.getStoreSize();
	    unsigned RegBytes = RegVT.getSizeInBits() / 8;
	    // Number of register-sized pieces, rounding up.
	    unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;

	    // Make sure the stack slot is also aligned for the register type.
	    SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
	    auto FrameIndex = cast<FrameIndexSDNode>(StackBase.getNode())->getIndex();
	    SmallVector<SDValue, 8> Stores;
	    SDValue StackPtr = StackBase;
	    unsigned Offset = 0;

	    EVT PtrVT = Ptr.getValueType();
	    EVT StackPtrVT = StackPtr.getValueType();

	    SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
	    SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);

	    // Do all but one copies using the full register width.
	    for (unsigned i = 1; i < NumRegs; i++) {
	      // Load one integer register's worth from the original location.
	      SDValue Load = DAG.getLoad(
	          RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset),
	          MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(),
	          LD->getAAInfo());
	      // Follow the load with a store to the stack slot. Remember the store.
	      Stores.push_back(DAG.getStore(
	          Load.getValue(1), dl, Load, StackPtr,
	          MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset)));
	      // Increment the pointers.
	      Offset += RegBytes;

	      Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
	      StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
	    }

	    // The last copy may be partial. Do an extending load.
	    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
	                                  8 * (LoadedBytes - Offset));
	    SDValue Load =
	        DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
	                       LD->getPointerInfo().getWithOffset(Offset), MemVT,
	                       MinAlign(LD->getAlignment(), Offset),
	                       LD->getMemOperand()->getFlags(), LD->getAAInfo());
	    // Follow the load with a store to the stack slot. Remember the store.
	    // On big-endian machines this requires a truncating store to ensure
	    // that the bits end up in the right place.
	    Stores.push_back(DAG.getTruncStore(
	        Load.getValue(1), dl, Load, StackPtr,
	        MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), MemVT));

	    // The order of the stores doesn't matter - say it with a TokenFactor.
	    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

	    // Finally, perform the original load only redirected to the stack slot.
	    Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
	                          MachinePointerInfo::getFixedStack(MF, FrameIndex, 0),
	                          LoadedVT);

	    // Return the reloaded value paired with the TokenFactor of the stores.
	    return std::make_pair(Load, TF);
	  }

	  assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
	         "Unaligned load of unsupported type.");

	  // Compute the new VT that is half the size of the old one. This is an
	  // integer MVT.
	  unsigned NumBits = LoadedVT.getSizeInBits();
	  EVT NewLoadedVT;
	  NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
	  NumBits >>= 1;

	  unsigned Alignment = LD->getAlignment();
	  unsigned IncrementSize = NumBits / 8;
	  ISD::LoadExtType HiExtType = LD->getExtensionType();

	  // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
	  if (HiExtType == ISD::NON_EXTLOAD)
	    HiExtType = ISD::ZEXTLOAD;

	  // Load the value in two parts
	  // The low half is always zero-extended; which half sits at the lower
	  // address depends on the target's endianness.
	  SDValue Lo, Hi;
	  if (DAG.getDataLayout().isLittleEndian()) {
	    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
	                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
	                        LD->getAAInfo());

	    Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
	    Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
	                        LD->getPointerInfo().getWithOffset(IncrementSize),
	                        NewLoadedVT, MinAlign(Alignment, IncrementSize),
	                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
	  } else {
	    Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
	                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
	                        LD->getAAInfo());

	    Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
	    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
	                        LD->getPointerInfo().getWithOffset(IncrementSize),
	                        NewLoadedVT, MinAlign(Alignment, IncrementSize),
	                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
	  }

	  // aggregate the two parts
	  SDValue ShiftAmount =
	      DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(),
	                                                    DAG.getDataLayout()));
	  SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
	  Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);

	  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
	                           Hi.getValue(1));

	  return std::make_pair(Result, TF);
	}

	/// Expand an unaligned, unindexed store into operations the target can
	/// handle.
	///
	/// Floating-point and vector stores become either a bitcast plus same-sized
	/// integer store, a scalarized vector store, or a copy through a stack slot.
	/// Integer stores are split into two half-width truncating stores.
	///
	/// \returns the chain produced by the replacement store(s).
	SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
	                                             SelectionDAG &DAG) const {
	  assert(ST->getAddressingMode() == ISD::UNINDEXED &&
	         "unaligned indexed stores not implemented!");
	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  SDValue Val = ST->getValue();
	  EVT VT = Val.getValueType();
	  int Alignment = ST->getAlignment();
	  auto &MF = DAG.getMachineFunction();
	  EVT StoreMemVT = ST->getMemoryVT();

	  SDLoc dl(ST);
	  if (StoreMemVT.isFloatingPoint() || StoreMemVT.isVector()) {
	    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
	    if (isTypeLegal(intVT)) {
	      if (!isOperationLegalOrCustom(ISD::STORE, intVT) &&
	          StoreMemVT.isVector()) {
	        // Scalarize the store and let the individual components be handled.
	        SDValue Result = scalarizeVectorStore(ST, DAG);
	        return Result;
	      }
	      // Expand to a bitconvert of the value to the integer type of the
	      // same size, then a (misaligned) int store.
	      // FIXME: Does not handle truncating floating point stores!
	      SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
	      Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
	                            Alignment, ST->getMemOperand()->getFlags());
	      return Result;
	    }
	    // Do a (aligned) store to a stack slot, then copy from the stack slot
	    // to the final destination using (unaligned) integer loads and stores.
	    MVT RegVT = getRegisterType(
	        *DAG.getContext(),
	        EVT::getIntegerVT(*DAG.getContext(), StoreMemVT.getSizeInBits()));
	    EVT PtrVT = Ptr.getValueType();
	    unsigned StoredBytes = StoreMemVT.getStoreSize();
	    unsigned RegBytes = RegVT.getSizeInBits() / 8;
	    // Number of register-sized pieces, rounding up.
	    unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;

	    // Make sure the stack slot is also aligned for the register type.
	    SDValue StackPtr = DAG.CreateStackTemporary(StoreMemVT, RegVT);
	    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();

	    // Perform the original store, only redirected to the stack slot.
	    SDValue Store = DAG.getTruncStore(
	        Chain, dl, Val, StackPtr,
	        MachinePointerInfo::getFixedStack(MF, FrameIndex, 0), StoreMemVT);

	    EVT StackPtrVT = StackPtr.getValueType();

	    SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
	    SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
	    SmallVector<SDValue, 8> Stores;
	    unsigned Offset = 0;

	    // Do all but one copies using the full register width.
	    for (unsigned i = 1; i < NumRegs; i++) {
	      // Load one integer register's worth from the stack slot.
	      SDValue Load = DAG.getLoad(
	          RegVT, dl, Store, StackPtr,
	          MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset));
	      // Store it to the final location. Remember the store.
	      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
	                                    ST->getPointerInfo().getWithOffset(Offset),
	                                    MinAlign(ST->getAlignment(), Offset),
	                                    ST->getMemOperand()->getFlags()));
	      // Increment the pointers.
	      Offset += RegBytes;
	      StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
	      Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
	    }

	    // The last store may be partial. Do a truncating store. On big-endian
	    // machines this requires an extending load from the stack slot to ensure
	    // that the bits are in the right place.
	    // The memory type covers exactly the bytes not yet copied.
	    EVT LoadMemVT =
	        EVT::getIntegerVT(*DAG.getContext(), 8 * (StoredBytes - Offset));

	    // Load from the stack slot.
	    SDValue Load = DAG.getExtLoad(
	        ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
	        MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), LoadMemVT);

	    Stores.push_back(
	        DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
	                          ST->getPointerInfo().getWithOffset(Offset), LoadMemVT,
	                          MinAlign(ST->getAlignment(), Offset),
	                          ST->getMemOperand()->getFlags(), ST->getAAInfo()));
	    // The order of the stores doesn't matter - say it with a TokenFactor.
	    SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
	    return Result;
	  }

	  assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
	         "Unaligned store of unknown type.");
	  // Get the half-size VT
	  EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
	  int NumBits = NewStoredVT.getSizeInBits();
	  int IncrementSize = NumBits / 8;

	  // Divide the stored value in two parts.
	  SDValue ShiftAmount = DAG.getConstant(
	      NumBits, dl, getShiftAmountTy(Val.getValueType(), DAG.getDataLayout()));
	  SDValue Lo = Val;
	  SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);

	  // Store the two parts
	  // Little-endian targets put the low half at the lower address; big-endian
	  // targets put the high half there.
	  SDValue Store1, Store2;
	  Store1 = DAG.getTruncStore(Chain, dl,
	                             DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
	                             Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
	                             ST->getMemOperand()->getFlags());

	  Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
	  Alignment = MinAlign(Alignment, IncrementSize);
	  Store2 = DAG.getTruncStore(
	      Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
	      ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
	      ST->getMemOperand()->getFlags(), ST->getAAInfo());

	  SDValue Result =
	      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
	  return Result;
	}

	/// Advance \p Addr past the memory touched by one masked vector access.
	///
	/// For compressed memory the step is (number of set bits in \p Mask) times
	/// the element size in bytes; otherwise it is the store size of \p DataVT.
	/// Returns the ADD node computing the new address.
	SDValue
	TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
	                                       const SDLoc &DL, EVT DataVT,
	                                       SelectionDAG &DAG,
	                                       bool IsCompressedMemory) const {
	  EVT AddrVT = Addr.getValueType();
	  EVT MaskVT = Mask.getValueType();
	  assert(DataVT.getVectorNumElements() == MaskVT.getVectorNumElements() &&
	         "Incompatible types of Data and Mask");

	  // Uncompressed case: the step is simply the store size of the data type.
	  if (!IsCompressedMemory)
	    return DAG.getNode(ISD::ADD, DL, AddrVT, Addr,
	                       DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT));

	  // Compressed case: view the mask as an integer so its set bits can be
	  // counted, widening to i32 when it is narrower than 32 bits.
	  EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
	  SDValue MaskAsInt = DAG.getBitcast(MaskIntVT, Mask);
	  if (MaskIntVT.getSizeInBits() < 32) {
	    MaskAsInt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskAsInt);
	    MaskIntVT = MVT::i32;
	  }

	  // Count '1's with POPCNT.
	  SDValue ActiveElts = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskAsInt);
	  ActiveElts = DAG.getZExtOrTrunc(ActiveElts, DL, AddrVT);

	  // Scale is an element size in bytes.
	  SDValue EltBytes =
	      DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL, AddrVT);
	  SDValue Step = DAG.getNode(ISD::MUL, DL, AddrVT, ActiveElts, EltBytes);

	  return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Step);
	}

	/// Limit a non-constant vector index to the range [0, NumElts - 1].
	///
	/// Constant indices are returned unchanged. A power-of-two element count is
	/// handled with an AND mask of the low log2(NumElts) bits; any other count
	/// uses UMIN against NumElts - 1.
	static SDValue clampDynamicVectorIndex(SelectionDAG &DAG,
	                                       SDValue Idx,
	                                       EVT VecVT,
	                                       const SDLoc &dl) {
	  if (isa<ConstantSDNode>(Idx))
	    return Idx;

	  EVT IndexTy = Idx.getValueType();
	  const unsigned EltCount = VecVT.getVectorNumElements();

	  if (!isPowerOf2_32(EltCount)) {
	    SDValue LastIndex = DAG.getConstant(EltCount - 1, dl, IndexTy);
	    return DAG.getNode(ISD::UMIN, dl, IndexTy, Idx, LastIndex);
	  }

	  APInt LowBits =
	      APInt::getLowBitsSet(IndexTy.getSizeInBits(), Log2_32(EltCount));
	  SDValue MaskC = DAG.getConstant(LowBits, dl, IndexTy);
	  return DAG.getNode(ISD::AND, dl, IndexTy, Idx, MaskC);
	}

	/// Compute the address of element \p Index of a vector of type \p VecVT that
	/// lives in memory at \p VecPtr.
	///
	/// The index is converted to the pointer's type, clamped through
	/// clampDynamicVectorIndex, scaled by the element size in bytes and added to
	/// \p VecPtr.
	SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
	                                                SDValue VecPtr, EVT VecVT,
	                                                SDValue Index) const {
	  SDLoc DL(Index);

	  // Make sure the index type is big enough to compute in.
	  SDValue WideIdx = DAG.getZExtOrTrunc(Index, DL, VecPtr.getValueType());

	  // Element size in whole bytes; the element type must be byte-sized.
	  EVT ElemVT = VecVT.getVectorElementType();
	  unsigned ElemBytes = ElemVT.getSizeInBits() / 8; // FIXME: should be ABI size.
	  assert(ElemBytes * 8 == ElemVT.getSizeInBits() &&
	         "Converting bits to bytes lost precision");

	  SDValue SafeIdx = clampDynamicVectorIndex(DAG, WideIdx, VecVT, DL);
	  EVT OffsetVT = SafeIdx.getValueType();

	  // Byte offset of the element, then the final address.
	  SDValue ByteOffset = DAG.getNode(ISD::MUL, DL, OffsetVT, SafeIdx,
	                                   DAG.getConstant(ElemBytes, DL, OffsetVT));
	  return DAG.getNode(ISD::ADD, DL, OffsetVT, VecPtr, ByteOffset);
	}

	//===----------------------------------------------------------------------===//
	// Implementation of Emulated TLS Model
	//===----------------------------------------------------------------------===//

	/// Lower an access to the address of a TLS variable under the emulated TLS
	/// model.
	///
	/// For a variable named xyz the address is obtained by calling
	///   __emutls_get_address( address of global variable "__emutls_v.xyz" )
	/// The control global is looked up by name in the module that owns the
	/// variable and must already exist.
	///
	/// \param GA   Global address node of the TLS variable; its offset must be 0.
	/// \param DAG  Selection DAG the call is built in.
	/// \returns the first value of the lowered call (the call's result).
	SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
	                                                SelectionDAG &DAG) const {
	  // Access to address of TLS variable xyz is lowered to a function call:
	  //   __emutls_get_address( address of global variable named "__emutls_v.xyz" )
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext());
	  SDLoc dl(GA);

	  ArgListTy Args;
	  ArgListEntry Entry;
	  std::string NameString = ("__emutls_v." + GA->getGlobal()->getName()).str();
	  // getParent() is reached through a const global; the lookup below needs a
	  // non-const Module.
	  Module *VariableModule = const_cast<Module *>(GA->getGlobal()->getParent());
	  StringRef EmuTlsVarName(NameString);
	  GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName);
	  assert(EmuTlsVar && "Cannot find EmuTlsVar ");
	  Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
	  Entry.Ty = VoidPtrType;
	  Args.push_back(Entry);

	  SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
	  CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  // At least for X86 targets, maybe good for other targets too?
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  MFI.setAdjustsStack(true); // Is this only for X86 target?
	  MFI.setHasCalls(true);

	  assert((GA->getOffset() == 0) &&
	         "Emulated TLS must have zero offset in GlobalAddressSDNode");
	  return CallResult.first;
	}

	/// Rewrite a SETCC of the form (X == 0) as a count-leading-zeros followed by
	/// a logical right shift by log2(bit width), truncated to i32. Returns an
	/// empty SDValue when CTLZ is not fast for the target or when the node is
	/// not an equality comparison against the constant zero.
	SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node.");
	  if (!isCtlzFast())
	    return SDValue();

	  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!RHSC || !RHSC->isNullValue() || Cond != ISD::SETEQ)
	    return SDValue();

	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  EVT VT = Src.getValueType();

	  // Operands narrower than 32 bits are zero-extended to i32 first.
	  if (VT.bitsLT(MVT::i32)) {
	    VT = MVT::i32;
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Src);
	  }

	  // ctlz(X) equals the bit width only when X is zero, so shifting right by
	  // log2(bit width) leaves 1 for zero and 0 otherwise.
	  const unsigned ShiftBits = Log2_32(VT.getSizeInBits());
	  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, Src);
	  SDValue Shifted = DAG.getNode(ISD::SRL, dl, VT, LeadingZeros,
	                                DAG.getConstant(ShiftBits, dl, MVT::i32));
	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Shifted);
	}

	/// Expand a saturating add/sub node (SADDSAT, UADDSAT, SSUBSAT, USUBSAT).
	///
	/// Unsigned forms prefer a UMAX/UMIN based sequence when that operation is
	/// legal or custom for the type. Otherwise the matching overflow-reporting
	/// node is emitted and the saturated value is chosen from its overflow bit.
	SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
	  const unsigned Opcode = Node->getOpcode();
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);
	  const EVT VT = LHS.getValueType();
	  SDLoc dl(Node);

	  assert(VT == RHS.getValueType() && "Expected operands to be the same type");
	  assert(VT.isInteger() && "Expected operands to be integers");

	  const bool IsUAdd = Opcode == ISD::UADDSAT;
	  const bool IsUSub = Opcode == ISD::USUBSAT;

	  // usub.sat(a, b) -> umax(a, b) - b
	  if (IsUSub && isOperationLegalOrCustom(ISD::UMAX, VT)) {
	    SDValue Larger = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
	    return DAG.getNode(ISD::SUB, dl, VT, Larger, RHS);
	  }

	  // uadd.sat(a, b) -> umin(a, ~b) + b
	  if (IsUAdd && isOperationLegalOrCustom(ISD::UMIN, VT)) {
	    SDValue NotRHS = DAG.getNOT(dl, RHS, VT);
	    SDValue Clamped = DAG.getNode(ISD::UMIN, dl, VT, LHS, NotRHS);
	    return DAG.getNode(ISD::ADD, dl, VT, Clamped, RHS);
	  }

	  // Pick the overflow-reporting counterpart of the saturating opcode.
	  unsigned OverflowOp;
	  switch (Opcode) {
	  case ISD::SADDSAT:
	    OverflowOp = ISD::SADDO;
	    break;
	  case ISD::UADDSAT:
	    OverflowOp = ISD::UADDO;
	    break;
	  case ISD::SSUBSAT:
	    OverflowOp = ISD::SSUBO;
	    break;
	  case ISD::USUBSAT:
	    OverflowOp = ISD::USUBO;
	    break;
	  default:
	    llvm_unreachable("Expected method to receive signed or unsigned saturation "
	                     "addition or subtraction node.");
	  }

	  const unsigned BitWidth = LHS.getScalarValueSizeInBits();
	  const EVT BoolVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	  SDValue Result =
	      DAG.getNode(OverflowOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
	  SDValue SumDiff = Result.getValue(0);
	  SDValue Overflow = Result.getValue(1);
	  SDValue Zero = DAG.getConstant(0, dl, VT);
	  SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);

	  if (IsUAdd || IsUSub) {
	    const bool BoolIsMask =
	        getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent;
	    if (!BoolIsMask) {
	      // uadd: Overflow ? 0xffff.... : (LHS + RHS)
	      // usub: Overflow ? 0 : (LHS - RHS)
	      return DAG.getSelect(dl, VT, Overflow, IsUAdd ? AllOnes : Zero, SumDiff);
	    }

	    // The overflow flag is all-zeros or all-ones, so it can be used as a
	    // bit mask once brought to the value type.
	    SDValue OverflowMask = DAG.getSExtOrTrunc(Overflow, dl, VT);
	    if (IsUAdd) {
	      // (LHS + RHS) | OverflowMask
	      return DAG.getNode(ISD::OR, dl, VT, SumDiff, OverflowMask);
	    }
	    // (LHS - RHS) & ~OverflowMask
	    SDValue Not = DAG.getNOT(dl, OverflowMask, VT);
	    return DAG.getNode(ISD::AND, dl, VT, SumDiff, Not);
	  }

	  // Signed forms:
	  //   SatMax -> Overflow && SumDiff < 0
	  //   SatMin -> Overflow && SumDiff >= 0
	  const APInt MinVal = APInt::getSignedMinValue(BitWidth);
	  const APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
	  SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
	  SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
	  SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
	  Result = DAG.getSelect(dl, VT, SumNeg, SatMax, SatMin);
	  return DAG.getSelect(dl, VT, Overflow, Result, SumDiff);
	}

	/// Expand a fixed point multiplication node (SMULFIX, UMULFIX or
	/// SMULFIXSAT).
	///
	/// Operand 2 of \p Node is the constant scale. With a zero scale the node
	/// becomes a plain MUL (or an SMULO based saturating sequence) when that
	/// operation is legal or custom. Otherwise the double-width product is
	/// formed as a Hi/Lo pair and funnel-shifted right by the scale; the
	/// saturating form additionally clamps to the signed max/min value.
	///
	/// \returns the expanded value, or an empty SDValue for vector types when
	/// neither a LOHI multiply nor a MULH operation is available. For scalar
	/// types that situation is a fatal error.
	SDValue
	TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
	  assert((Node->getOpcode() == ISD::SMULFIX ||
	          Node->getOpcode() == ISD::UMULFIX ||
	          Node->getOpcode() == ISD::SMULFIXSAT) &&
	         "Expected a fixed point multiplication opcode");

	  SDLoc dl(Node);
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);
	  EVT VT = LHS.getValueType();
	  unsigned Scale = Node->getConstantOperandVal(2);
	  bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
	  EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	  unsigned VTSize = VT.getScalarSizeInBits();

	  if (!Scale) {
	    // [us]mul.fix(a, b, 0) -> mul(a, b)
	    if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
	      return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
	    } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) {
	      SDValue Result =
	          DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
	      SDValue Product = Result.getValue(0);
	      SDValue Overflow = Result.getValue(1);
	      SDValue Zero = DAG.getConstant(0, dl, VT);

	      // On overflow the wrapped product's sign selects the saturation
	      // value: a negative product saturates to max, otherwise to min.
	      APInt MinVal = APInt::getSignedMinValue(VTSize);
	      APInt MaxVal = APInt::getSignedMaxValue(VTSize);
	      SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
	      SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
	      SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
	      Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
	      return DAG.getSelect(dl, VT, Overflow, Result, Product);
	    }
	  }

	  bool Signed =
	      Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
	  assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
	         "Expected scale to be less than the number of bits if signed or at "
	         "most the number of bits if unsigned.");
	  assert(LHS.getValueType() == RHS.getValueType() &&
	         "Expected both operands to be the same type");

	  // Get the upper and lower bits of the result.
	  SDValue Lo, Hi;
	  unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
	  unsigned HiOp = Signed ? ISD::MULHS : ISD::MULHU;
	  if (isOperationLegalOrCustom(LoHiOp, VT)) {
	    SDValue Result = DAG.getNode(LoHiOp, dl, DAG.getVTList(VT, VT), LHS, RHS);
	    Lo = Result.getValue(0);
	    Hi = Result.getValue(1);
	  } else if (isOperationLegalOrCustom(HiOp, VT)) {
	    Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
	    Hi = DAG.getNode(HiOp, dl, VT, LHS, RHS);
	  } else if (VT.isVector()) {
	    return SDValue();
	  } else {
	    report_fatal_error("Unable to expand fixed point multiplication.");
	  }

	  if (Scale == VTSize)
	    // Result is just the top half since we'd be shifting by the width of the
	    // operand.
	    return Hi;

	  // The result will need to be shifted right by the scale since both operands
	  // are scaled. The result is given to us in 2 halves, so we only want part of
	  // both in the result.
	  EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
	  SDValue Result = DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
	                               DAG.getConstant(Scale, dl, ShiftTy));
	  if (!Saturating)
	    return Result;

	  // Saturation: compare the high half against masks built from the number
	  // of bits that must not carry significant data after the shift.
	  unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
	  SDValue HiMask =
	      DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
	  SDValue LoMask = DAG.getConstant(
	      APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
	  APInt MaxVal = APInt::getSignedMaxValue(VTSize);
	  APInt MinVal = APInt::getSignedMinValue(VTSize);

	  // Hi > LoMask -> signed max; then Hi < HiMask -> signed min.
	  Result = DAG.getSelectCC(dl, Hi, LoMask,
	                           DAG.getConstant(MaxVal, dl, VT), Result,
	                           ISD::SETGT);
	  return DAG.getSelectCC(dl, Hi, HiMask,
	                         DAG.getConstant(MinVal, dl, VT), Result,
	                         ISD::SETLT);
	}

	/// Expand an unsigned add/sub-with-overflow node (UADDO, or its subtract
	/// counterpart). \p Result receives the arithmetic result and \p Overflow
	/// the overflow flag in the node's second value type.
	void TargetLowering::expandUADDSUBO(
	    SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);
	  const bool IsAdd = Node->getOpcode() == ISD::UADDO;
	  const EVT ValueVT = Node->getValueType(0);
	  const EVT FlagVT = Node->getValueType(1);

	  // Prefer ADDCARRY/SUBCARRY with a zero carry-in when the target has it.
	  const unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY;
	  if (isOperationLegalOrCustom(OpcCarry, ValueVT)) {
	    SDValue NoCarryIn = DAG.getConstant(0, dl, FlagVT);
	    SDNode *CarryNode = DAG.getNode(OpcCarry, dl, Node->getVTList(),
	                                    { LHS, RHS, NoCarryIn }).getNode();
	    Result = SDValue(CarryNode, 0);
	    Overflow = SDValue(CarryNode, 1);
	    return;
	  }

	  const unsigned ArithOpc = IsAdd ? ISD::ADD : ISD::SUB;
	  Result = DAG.getNode(ArithOpc, dl, LHS.getValueType(), LHS, RHS);

	  // Add overflowed if the result is below LHS; sub overflowed if the result
	  // is above LHS (both unsigned comparisons).
	  const EVT SetCCType =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ValueVT);
	  const ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
	  SDValue Wrapped = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
	  Overflow = DAG.getBoolExtOrTrunc(Wrapped, dl, FlagVT, FlagVT);
	}

	/// Expand a signed add/sub-with-overflow node (SADDO, or its subtract
	/// counterpart). \p Result receives the wrapped arithmetic result and
	/// \p Overflow the overflow flag in the node's second value type.
	void TargetLowering::expandSADDSUBO(
	    SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);
	  const bool IsAdd = Node->getOpcode() == ISD::SADDO;
	  const EVT OpVT = LHS.getValueType();

	  const unsigned ArithOpc = IsAdd ? ISD::ADD : ISD::SUB;
	  Result = DAG.getNode(ArithOpc, dl, OpVT, LHS, RHS);

	  const EVT FlagVT = Node->getValueType(1);
	  const EVT CmpVT = getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));

	  // With a legal saturating operation, overflow happened exactly when the
	  // saturated value differs from the wrapped one.
	  const unsigned OpcSat = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT;
	  if (isOperationLegalOrCustom(OpcSat, OpVT)) {
	    SDValue Saturated = DAG.getNode(OpcSat, dl, OpVT, LHS, RHS);
	    SDValue Differs = DAG.getSetCC(dl, CmpVT, Result, Saturated, ISD::SETNE);
	    Overflow = DAG.getBoolExtOrTrunc(Differs, dl, FlagVT, FlagVT);
	    return;
	  }

	  SDValue Zero = DAG.getConstant(0, dl, OpVT);
	  // "Sign" below means "value >= 0".
	  auto IsNonNegative = [&](SDValue V) {
	    return DAG.getSetCC(dl, CmpVT, V, Zero, ISD::SETGE);
	  };

	  //   Add: Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
	  //   Sub: Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
	  SDValue LHSSign = IsNonNegative(LHS);
	  SDValue RHSSign = IsNonNegative(RHS);
	  SDValue SignsMatch = DAG.getSetCC(dl, CmpVT, LHSSign, RHSSign,
	                                    IsAdd ? ISD::SETEQ : ISD::SETNE);

	  SDValue SumSign = IsNonNegative(Result);
	  SDValue SumSignNE = DAG.getSetCC(dl, CmpVT, LHSSign, SumSign, ISD::SETNE);

	  SDValue Both = DAG.getNode(ISD::AND, dl, CmpVT, SignsMatch, SumSignNE);
	  Overflow = DAG.getBoolExtOrTrunc(Both, dl, FlagVT, FlagVT);
	}

	/// Expand a multiply-with-overflow node (SMULO or its unsigned counterpart).
	///
	/// Strategy, in order:
	///  1. Constant power-of-two right operand: shift left and check that
	///     shifting back reproduces the left operand.
	///  2. MULH[US] or [US]MUL_LOHI legal/custom: form the two halves directly.
	///  3. Double-width integer type legal: extend, multiply, split.
	///  4. Scalar only: call the MUL libcall for the double-width type, passing
	///     each operand as a pair of halves.
	///
	/// \param Node      The overflow multiply node.
	/// \param Result    Receives the low half of the product.
	/// \param Overflow  Receives the overflow flag, sized to the node's second
	///                  value type.
	/// \returns false only for vector types when none of strategies 1-3 apply;
	///          true otherwise.
	bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
	                                SDValue &Overflow, SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  EVT VT = Node->getValueType(0);
	  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);
	  bool isSigned = Node->getOpcode() == ISD::SMULO;

	  // For power-of-two multiplications we can use a simpler shift expansion.
	  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
	    const APInt &C = RHSC->getAPIntValue();
	    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
	    if (C.isPowerOf2()) {
	      // smulo(x, signed_min) is same as umulo(x, signed_min).
	      bool UseArithShift = isSigned && !C.isMinSignedValue();
	      EVT ShiftAmtTy = getShiftAmountTy(VT, DAG.getDataLayout());
	      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy);
	      Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt);
	      Overflow = DAG.getSetCC(dl, SetCCVT,
	          DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
	                      dl, VT, Result, ShiftAmt),
	          LHS, ISD::SETNE);
	      return true;
	    }
	  }

	  // Integer type twice as wide as one element of VT (vectorized below).
	  EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
	  if (VT.isVector())
	    WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
	                              VT.getVectorNumElements());

	  SDValue BottomHalf;
	  SDValue TopHalf;
	  // Row 0: unsigned opcodes, row 1: signed opcodes.
	  static const unsigned Ops[2][3] =
	      { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
	        { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
	  if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
	    BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
	    TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
	  } else if (isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
	    BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
	                             RHS);
	    TopHalf = BottomHalf.getValue(1);
	  } else if (isTypeLegal(WideVT)) {
	    LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
	    RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
	    BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	    SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits(), dl,
	        getShiftAmountTy(WideVT, DAG.getDataLayout()));
	    TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT,
	                          DAG.getNode(ISD::SRL, dl, WideVT, Mul, ShiftAmt));
	  } else {
	    if (VT.isVector())
	      return false;

	    // We can fall back to a libcall with an illegal type for the MUL if we
	    // have a libcall big enough.
	    // Also, we can fall back to a division in some cases, but that's a big
	    // performance hit in the general case.
	    RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
	    if (WideVT == MVT::i16)
	      LC = RTLIB::MUL_I16;
	    else if (WideVT == MVT::i32)
	      LC = RTLIB::MUL_I32;
	    else if (WideVT == MVT::i64)
	      LC = RTLIB::MUL_I64;
	    else if (WideVT == MVT::i128)
	      LC = RTLIB::MUL_I128;
	    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");

	    SDValue HiLHS;
	    SDValue HiRHS;
	    if (isSigned) {
	      // The high part is obtained by SRA'ing all but one of the bits of low
	      // part.
	      unsigned LoSize = VT.getSizeInBits();
	      HiLHS =
	          DAG.getNode(ISD::SRA, dl, VT, LHS,
	                      DAG.getConstant(LoSize - 1, dl,
	                                      getPointerTy(DAG.getDataLayout())));
	      HiRHS =
	          DAG.getNode(ISD::SRA, dl, VT, RHS,
	                      DAG.getConstant(LoSize - 1, dl,
	                                      getPointerTy(DAG.getDataLayout())));
	    } else {
	      HiLHS = DAG.getConstant(0, dl, VT);
	      HiRHS = DAG.getConstant(0, dl, VT);
	    }

	    // Here we're passing the 2 arguments explicitly as 4 arguments that are
	    // pre-lowered to the correct types. This all depends upon WideVT not
	    // being a legal type for the architecture and thus has to be split to
	    // two arguments.
	    SDValue Ret;
	    if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
	      // Halves of WideVT are packed into registers in different order
	      // depending on platform endianness. This is usually handled by
	      // the C calling convention, but we can't defer to it in
	      // the legalizer.
	      SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
	      Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
	          /* doesNotReturn */ false, /* isReturnValueUsed */ true,
	          /* isPostTypeLegalization */ true).first;
	    } else {
	      SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
	      Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
	          /* doesNotReturn */ false, /* isReturnValueUsed */ true,
	          /* isPostTypeLegalization */ true).first;
	    }
	    assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
	           "Ret value is a collection of constituent nodes holding result.");
	    if (DAG.getDataLayout().isLittleEndian()) {
	      // Same as above.
	      BottomHalf = Ret.getOperand(0);
	      TopHalf = Ret.getOperand(1);
	    } else {
	      BottomHalf = Ret.getOperand(1);
	      TopHalf = Ret.getOperand(0);
	    }
	  }

	  Result = BottomHalf;
	  if (isSigned) {
	    // Signed: no overflow iff the top half equals the sign-fill of the
	    // bottom half.
	    SDValue ShiftAmt = DAG.getConstant(
	        VT.getScalarSizeInBits() - 1, dl,
	        getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
	    SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
	    Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, Sign, ISD::SETNE);
	  } else {
	    // Unsigned: no overflow iff the top half is zero.
	    Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf,
	                            DAG.getConstant(0, dl, VT), ISD::SETNE);
	  }

	  // Truncate the result if SetCC returns a larger type than needed.
	  EVT RType = Node->getValueType(1);
	  if (RType.getSizeInBits() < Overflow.getValueSizeInBits())
	    Overflow = DAG.getNode(ISD::TRUNCATE, dl, RType, Overflow);

	  assert(RType.getSizeInBits() == Overflow.getValueSizeInBits() &&
	         "Unexpected result type for S/UMULO legalization");
	  return true;
	}

	/// Expand a VECREDUCE_* node into its underlying binary operation.
	///
	/// Power-of-two vectors are first halved repeatedly (combining the low and
	/// high halves) for as long as the binary operation is legal or custom on
	/// the half-width type. Whatever remains is reduced element by element, and
	/// the scalar is any-extended if the node's result type differs from the
	/// element type.
	SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
	  SDLoc dl(Node);
	  const bool NoNaN = Node->getFlags().hasNoNaNs();

	  // Map the reduction opcode to the binary opcode that implements it.
	  const unsigned BaseOpcode = [&]() -> unsigned {
	    switch (Node->getOpcode()) {
	    case ISD::VECREDUCE_FADD: return ISD::FADD;
	    case ISD::VECREDUCE_FMUL: return ISD::FMUL;
	    case ISD::VECREDUCE_ADD:  return ISD::ADD;
	    case ISD::VECREDUCE_MUL:  return ISD::MUL;
	    case ISD::VECREDUCE_AND:  return ISD::AND;
	    case ISD::VECREDUCE_OR:   return ISD::OR;
	    case ISD::VECREDUCE_XOR:  return ISD::XOR;
	    case ISD::VECREDUCE_SMAX: return ISD::SMAX;
	    case ISD::VECREDUCE_SMIN: return ISD::SMIN;
	    case ISD::VECREDUCE_UMAX: return ISD::UMAX;
	    case ISD::VECREDUCE_UMIN: return ISD::UMIN;
	    case ISD::VECREDUCE_FMAX:
	      return NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM;
	    case ISD::VECREDUCE_FMIN:
	      return NoNaN ? ISD::FMINNUM : ISD::FMINIMUM;
	    default:
	      llvm_unreachable("Expected VECREDUCE opcode");
	    }
	  }();

	  SDValue Vec = Node->getOperand(0);
	  EVT VecVT = Vec.getValueType();

	  // Try to use a shuffle reduction for power of two vectors.
	  if (VecVT.isPow2VectorType()) {
	    while (VecVT.getVectorNumElements() > 1) {
	      const EVT HalfVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext());
	      if (!isOperationLegalOrCustom(BaseOpcode, HalfVT))
	        break;

	      std::pair<SDValue, SDValue> Halves = DAG.SplitVector(Vec, dl);
	      Vec = DAG.getNode(BaseOpcode, dl, HalfVT, Halves.first, Halves.second);
	      VecVT = HalfVT;
	    }
	  }

	  // Reduce the remaining elements one at a time.
	  const EVT EltVT = VecVT.getVectorElementType();
	  const unsigned NumElts = VecVT.getVectorNumElements();

	  SmallVector<SDValue, 8> Elts;
	  DAG.ExtractVectorElements(Vec, Elts, 0, NumElts);

	  SDValue Acc = Elts[0];
	  for (unsigned Idx = 1; Idx != NumElts; ++Idx)
	    Acc = DAG.getNode(BaseOpcode, dl, EltVT, Acc, Elts[Idx], Node->getFlags());

	  // Result type may be wider than element type.
	  const EVT ResVT = Node->getValueType(0);
	  if (EltVT != ResVT)
	    Acc = DAG.getNode(ISD::ANY_EXTEND, dl, ResVT, Acc);
	  return Acc;
	}
	Index: vendor/llvm/dist-release_90/lib/MC/MCContext.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/MC/MCContext.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/MC/MCContext.cpp (revision 351303)
	@@ -1,701 +1,707 @@
	//===- lib/MC/MCContext.cpp - Machine Code Context ------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/MC/MCContext.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/COFF.h"
	#include "llvm/BinaryFormat/ELF.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeView.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCFragment.h"
	#include "llvm/MC/MCLabel.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCSectionCOFF.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/MC/MCSectionMachO.h"
	#include "llvm/MC/MCSectionWasm.h"
	#include "llvm/MC/MCSectionXCOFF.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCSymbolCOFF.h"
	#include "llvm/MC/MCSymbolELF.h"
	#include "llvm/MC/MCSymbolMachO.h"
	#include "llvm/MC/MCSymbolWasm.h"
	#include "llvm/MC/MCSymbolXCOFF.h"
	#include "llvm/MC/SectionKind.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MemoryBuffer.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/Signals.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>
	#include <cstdlib>
	#include <tuple>
	#include <utility>

	using namespace llvm;

	// Hidden command-line option "as-secure-log-file-name". Its initial value is
	// whatever getenv("AS_SECURE_LOG_FILE") returns when this static object is
	// constructed.
	static cl::opt<char*>
	AsSecureLogFileName("as-secure-log-file-name",
	cl::desc("As secure log file name (initialized from "
	"AS_SECURE_LOG_FILE env variable)"),
	cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);

	/// Construct a context around the given assembler, register and object file
	/// descriptions.
	///
	/// \param mai          Stored as MAI.
	/// \param mri          Stored as MRI.
	/// \param mofi         Stored as MOFI.
	/// \param mgr          Source manager, stored as SrcMgr; when it is non-null
	///                     and has at least one buffer, the identifier of its
	///                     main file buffer becomes MainFileName.
	/// \param DoAutoReset  Stored as AutoReset; the destructor calls reset()
	///                     when it is set.
	MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
	                     const MCObjectFileInfo *mofi, const SourceMgr *mgr,
	                     bool DoAutoReset)
	    : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
	      Symbols(Allocator), UsedNames(Allocator),
	+ InlineAsmUsedLabelNames(Allocator),
	      CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
	      AutoReset(DoAutoReset) {
	  // Pick up the secure log file name from the command-line option.
	  SecureLogFile = AsSecureLogFileName;

	  if (SrcMgr && SrcMgr->getNumBuffers())
	    MainFileName =
	        SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())->getBufferIdentifier();
	}

	/// Destroy the context, running reset() first when auto-reset was requested.
	MCContext::~MCContext() {
	  // NOTE: Symbols live in a bump pointer allocator, so they are not freed
	  // individually here.
	  if (AutoReset) {
	    reset();
	  }
	}

	//===----------------------------------------------------------------------===//
	// Module Lifetime Management
	//===----------------------------------------------------------------------===//

	/// Discard everything the context has accumulated: section objects, the
	/// symbol and name tables, DWARF state, the uniquing maps and the error
	/// flag, putting flags and counters back to fixed initial values.
	void MCContext::reset() {
	// Call the destructors so the fragments are freed
	COFFAllocator.DestroyAll();
	ELFAllocator.DestroyAll();
	MachOAllocator.DestroyAll();
	XCOFFAllocator.DestroyAll();

	MCSubtargetAllocator.DestroyAll();
	+ InlineAsmUsedLabelNames.clear();
	// The name and symbol tables are constructed on top of Allocator, so they
	// are emptied before Allocator itself is reset.
	UsedNames.clear();
	Symbols.clear();
	Allocator.Reset();
	Instances.clear();
	CompilationDir.clear();
	MainFileName.clear();
	MCDwarfLineTablesCUMap.clear();
	SectionsForRanges.clear();
	MCGenDwarfLabelEntries.clear();
	DwarfDebugFlags = StringRef();
	DwarfCompileUnitID = 0;
	// Same initial location the constructor uses.
	CurrentDwarfLoc = MCDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0);

	CVContext.reset();

	// Per-object-format section uniquing tables.
	MachOUniquingMap.clear();
	ELFUniquingMap.clear();
	COFFUniquingMap.clear();
	WasmUniquingMap.clear();
	XCOFFUniquingMap.clear();

	NextID.clear();
	AllowTemporaryLabels = true;
	DwarfLocSeen = false;
	GenDwarfForAssembly = false;
	GenDwarfFileNumber = 0;

	HadError = false;
	}

	//===----------------------------------------------------------------------===//
	// Symbol Manipulation
	//===----------------------------------------------------------------------===//

	/// Return the symbol registered under \p Name, creating and registering it
	/// first if the symbol table has no entry yet. \p Name must not be empty.
	MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) {
	  SmallString<128> Storage;
	  const StringRef Key = Name.toStringRef(Storage);

	  assert(!Key.empty() && "Normal symbols cannot be unnamed!");

	  // operator[] default-inserts a null slot for unseen names.
	  MCSymbol *&Slot = Symbols[Key];
	  if (Slot == nullptr)
	    Slot = createSymbol(Key, false, false);

	  return Slot;
	}

	/// Get or create the symbol named
	/// <private-global-prefix><FuncName>$frame_escape_<Idx>.
	MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName,
	                                                 unsigned Idx) {
	  const StringRef Prefix = MAI->getPrivateGlobalPrefix();
	  // The Twine is built and consumed within this single expression.
	  return getOrCreateSymbol(Twine(Prefix) + FuncName + "$frame_escape_" +
	                           Twine(Idx));
	}

	/// Get or create the symbol named
	/// <private-global-prefix><FuncName>$parent_frame_offset.
	MCSymbol *MCContext::getOrCreateParentFrameOffsetSymbol(StringRef FuncName) {
	  const StringRef Prefix = MAI->getPrivateGlobalPrefix();
	  // The Twine is built and consumed within this single expression.
	  return getOrCreateSymbol(Twine(Prefix) + FuncName + "$parent_frame_offset");
	}

	/// Get or create the symbol named
	/// <private-global-prefix>__ehtable$<FuncName>.
	MCSymbol *MCContext::getOrCreateLSDASymbol(StringRef FuncName) {
	  const StringRef Prefix = MAI->getPrivateGlobalPrefix();
	  // The Twine is built and consumed within this single expression.
	  return getOrCreateSymbol(Twine(Prefix) + "__ehtable$" + FuncName);
	}

	/// Allocate a symbol object of the class that matches the object file type
	/// reported by MOFI, or a plain MCSymbol of kind SymbolKindUnset when no
	/// object file info is set (or its type matches none of the cases).
	///
	/// \param Name         Name table entry the symbol refers to; createSymbol()
	///                     passes nullptr for unnamed temporaries.
	/// \param IsTemporary  Forwarded to the symbol constructor.
	MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
	                                      bool IsTemporary) {
	  if (MOFI) {
	    switch (MOFI->getObjectFileType()) {
	    case MCObjectFileInfo::IsCOFF:
	      return new (Name, *this) MCSymbolCOFF(Name, IsTemporary);
	    case MCObjectFileInfo::IsELF:
	      return new (Name, *this) MCSymbolELF(Name, IsTemporary);
	    case MCObjectFileInfo::IsMachO:
	      return new (Name, *this) MCSymbolMachO(Name, IsTemporary);
	    case MCObjectFileInfo::IsWasm:
	      return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
	    case MCObjectFileInfo::IsXCOFF:
	      return new (Name, *this) MCSymbolXCOFF(Name, IsTemporary);
	    }
	  }
	  return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name,
	                                    IsTemporary);
	}

	/// Create a new symbol whose name is derived from \p Name.
	///
	/// \param Name             Base name.
	/// \param AlwaysAddSuffix  Append a numeric suffix even on the first try.
	/// \param CanBeUnnamed     The symbol is a temporary; when names on
	///                         temporary labels are disabled it is created
	///                         without any name at all.
	///
	/// If the candidate name is already used by a non-section symbol, a
	/// per-base-name counter is appended and incremented until a free name is
	/// found. Only temporary symbols may be renamed that way.
	MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
	                                  bool CanBeUnnamed) {
	  if (CanBeUnnamed && !UseNamesOnTempLabels)
	    return createSymbolImpl(nullptr, true);

	  // Determine whether this is a user written assembler temporary or normal
	  // label, if used.
	  bool IsTemporary = CanBeUnnamed;
	  if (AllowTemporaryLabels && !IsTemporary)
	    IsTemporary = Name.startswith(MAI->getPrivateGlobalPrefix());

	  SmallString<128> NewName = Name;
	  bool AddSuffix = AlwaysAddSuffix;
	  unsigned &NextUniqueID = NextID[Name];
	  while (true) {
	    if (AddSuffix) {
	      // Drop any suffix from a previous attempt before appending a new one.
	      NewName.resize(Name.size());
	      raw_svector_ostream(NewName) << NextUniqueID++;
	    }
	    auto NameEntry = UsedNames.insert(std::make_pair(NewName, true));
	    // The name is free if it was newly inserted, or if the existing entry
	    // holds false (createELFSectionImpl inserts section names that way).
	    if (NameEntry.second || !NameEntry.first->second) {
	      // Ok, we found a name.
	      // Mark it as used for a non-section symbol.
	      NameEntry.first->second = true;
	      // Have the MCSymbol object itself refer to the copy of the string that is
	      // embedded in the UsedNames entry.
	      return createSymbolImpl(&*NameEntry.first, IsTemporary);
	    }
	    assert(IsTemporary && "Cannot rename non-temporary symbols");
	    AddSuffix = true;
	  }
	  llvm_unreachable("Infinite loop");
	}

	/// Create a symbol whose name is the private global prefix followed by
	/// \p Name; the two flags are forwarded to createSymbol().
	MCSymbol *MCContext::createTempSymbol(const Twine &Name, bool AlwaysAddSuffix,
	                                      bool CanBeUnnamed) {
	  SmallString<128> PrefixedName;
	  raw_svector_ostream OS(PrefixedName);
	  OS << MAI->getPrivateGlobalPrefix() << Name;
	  return createSymbol(PrefixedName, AlwaysAddSuffix, CanBeUnnamed);
	}

	/// Create a symbol named <linker-private-global-prefix>tmp<N>; a numeric
	/// suffix is always appended and the symbol is never unnamed.
	MCSymbol *MCContext::createLinkerPrivateTempSymbol() {
	  SmallString<128> PrefixedName;
	  raw_svector_ostream OS(PrefixedName);
	  OS << MAI->getLinkerPrivateGlobalPrefix() << "tmp";
	  return createSymbol(PrefixedName, /*AlwaysAddSuffix=*/true,
	                      /*CanBeUnnamed=*/false);
	}

	/// Create a temporary symbol with the base name "tmp" and a numeric suffix
	/// that is always appended.
	MCSymbol *MCContext::createTempSymbol(bool CanBeUnnamed) {
	  const bool AlwaysAddSuffix = true;
	  return createTempSymbol("tmp", AlwaysAddSuffix, CanBeUnnamed);
	}

	unsigned MCContext::NextInstance(unsigned LocalLabelVal) {
	MCLabel *&Label = Instances[LocalLabelVal];
	if (!Label)
	Label = new (*this) MCLabel(0);
	return Label->incInstance();
	}

	unsigned MCContext::GetInstance(unsigned LocalLabelVal) {
	MCLabel *&Label = Instances[LocalLabelVal];
	if (!Label)
	Label = new (*this) MCLabel(0);
	return Label->getInstance();
	}

	/// Return the symbol cached for the (label value, instance) pair, creating
	/// a named temporary symbol for it on first use.
	MCSymbol *MCContext::getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
	                                                       unsigned Instance) {
	  const std::pair<unsigned, unsigned> Key(LocalLabelVal, Instance);
	  MCSymbol *&Slot = LocalSymbols[Key];
	  if (Slot == nullptr)
	    Slot = createTempSymbol(false);
	  return Slot;
	}

	/// Advance the instance counter of \p LocalLabelVal and return the symbol
	/// for the resulting instance.
	MCSymbol *MCContext::createDirectionalLocalSymbol(unsigned LocalLabelVal) {
	  const unsigned FreshInstance = NextInstance(LocalLabelVal);
	  MCSymbol *Sym = getOrCreateDirectionalLocalSymbol(LocalLabelVal, FreshInstance);
	  return Sym;
	}

	/// Return the symbol for the current instance of \p LocalLabelVal when
	/// \p Before is true, or for the following instance otherwise.
	MCSymbol *MCContext::getDirectionalLocalSymbol(unsigned LocalLabelVal,
	                                               bool Before) {
	  const unsigned Current = GetInstance(LocalLabelVal);
	  const unsigned Wanted = Before ? Current : Current + 1;
	  return getOrCreateDirectionalLocalSymbol(LocalLabelVal, Wanted);
	}

	/// Return the result of looking \p Name up in the symbol table; nothing is
	/// created.
	MCSymbol *MCContext::lookupSymbol(const Twine &Name) const {
	  SmallString<128> Storage;
	  return Symbols.lookup(Name.toStringRef(Storage));
	}

	/// Get or create the symbol named \p Sym and have \p Streamer emit an
	/// assignment of the constant \p Val to it.
	void MCContext::setSymbolValue(MCStreamer &Streamer,
	StringRef Sym,
	uint64_t Val) {
	auto Symbol = getOrCreateSymbol(Sym);
	Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this));
	+}
	+
	+void MCContext::registerInlineAsmLabel(MCSymbol *Sym) {
	+ InlineAsmUsedLabelNames[Sym->getName()] = Sym; // index the symbol by its name
	}

	//===----------------------------------------------------------------------===//
	// Section Management
	//===----------------------------------------------------------------------===//

	/// Return the Mach-O section for the given segment/section pair, creating
	/// it on first request.
	///
	/// Sections are uniqued by "<Segment>,<Section>" alone, so an existing
	/// section may carry different flags than the ones requested here; the
	/// client is expected to diagnose that as an error.
	MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
	                                           unsigned TypeAndAttributes,
	                                           unsigned Reserved2, SectionKind Kind,
	                                           const char *BeginSymName) {
	  // Build the lookup key.
	  SmallString<64> Key;
	  (Segment + "," + Section).toVector(Key);

	  MCSectionMachO *&Slot = MachOUniquingMap[Key];
	  if (!Slot) {
	    // First request for this pair: optionally make a begin symbol, then
	    // allocate the section and remember it.
	    MCSymbol *Begin =
	        BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;
	    Slot = new (MachOAllocator.Allocate()) MCSectionMachO(
	        Segment, Section, TypeAndAttributes, Reserved2, Kind, Begin);
	  }
	  return Slot;
	}

	/// Give \p Section the new name \p Name, re-keying its entry in the ELF
	/// uniquing map. The section ends up referring to the name string stored in
	/// the map key.
	void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
	  const MCSymbol *Group = Section->getGroup();
	  const StringRef GroupName = Group ? Group->getName() : StringRef();
	  const unsigned UniqueID = Section->getUniqueID();

	  // Remove the entry keyed by the old name ...
	  ELFUniquingMap.erase(
	      ELFSectionKey{Section->getSectionName(), GroupName, UniqueID});

	  // ... and insert one keyed by the new name (an existing entry is kept).
	  auto Inserted = ELFUniquingMap.insert(
	      std::make_pair(ELFSectionKey{Name, GroupName, UniqueID}, Section));
	  const StringRef CachedName = Inserted.first->first.SectionName;
	  Section->setSectionName(CachedName);
	}

	/// Allocate a new ELF section together with its section symbol and an
	/// initial data fragment.
	///
	/// The section symbol is the existing symbol named \p Section when that
	/// symbol is still undefined; otherwise a fresh MCSymbolELF is created and,
	/// if no symbol of that name was registered yet, recorded in the symbol
	/// table. An error is reported when a defined, non-section symbol of the
	/// same name already exists.
	MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
	                                              unsigned Flags, SectionKind K,
	                                              unsigned EntrySize,
	                                              const MCSymbolELF *Group,
	                                              unsigned UniqueID,
	                                              const MCSymbolELF *Associated) {
	  MCSymbolELF *R;
	  MCSymbol *&Sym = Symbols[Section];
	  // A section symbol can not redefine regular symbols. There may be multiple
	  // sections with the same name, in which case the first such section wins.
	  if (Sym && Sym->isDefined() &&
	      (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
	    reportError(SMLoc(), "invalid symbol redefinition");
	  if (Sym && Sym->isUndefined()) {
	    R = cast<MCSymbolELF>(Sym);
	  } else {
	    // Register the name with value false; createSymbol() treats such an
	    // entry as still available for a regular symbol.
	    auto NameIter = UsedNames.insert(std::make_pair(Section, false)).first;
	    R = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
	    if (!Sym)
	      Sym = R;
	  }
	  R->setBinding(ELF::STB_LOCAL);
	  R->setType(ELF::STT_SECTION);

	  auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
	      Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);

	  // Give the section an initial data fragment and attach the section symbol
	  // to it.
	  auto *F = new MCDataFragment();
	  Ret->getFragmentList().insert(Ret->begin(), F);
	  F->setParent(Ret);
	  R->setFragment(F);

	  return Ret;
	}

	/// Create a read-only ELF section named \p Name that is associated with the
	/// begin symbol of \p RelInfoSection. The name is interned in RelSecNames
	/// and the interned key is what the new section is created with.
	MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
	                                             unsigned Flags, unsigned EntrySize,
	                                             const MCSymbolELF *Group,
	                                             const MCSectionELF *RelInfoSection) {
	  // Whether the name was newly inserted does not matter here.
	  const StringMap<bool>::iterator Interned =
	      RelSecNames.insert(std::make_pair(Name.str(), true)).first;
	  const StringRef StableName = Interned->getKey();

	  const MCSymbolELF *Associated =
	      cast<MCSymbolELF>(RelInfoSection->getBeginSymbol());
	  // NOTE: `true` lands in the UniqueID parameter of createELFSectionImpl.
	  return createELFSectionImpl(StableName, Type, Flags,
	                              SectionKind::getReadOnly(), EntrySize, Group,
	                              true, Associated);
	}

	/// Return the ELF section named "<Prefix>.<Suffix>", passing \p Suffix as
	/// the group argument as well.
	MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
	                                            const Twine &Suffix, unsigned Type,
	                                            unsigned Flags,
	                                            unsigned EntrySize) {
	  const Twine &Group = Suffix;
	  return getELFSection(Prefix + "." + Suffix, Type, Flags, EntrySize, Group);
	}

	/// Overload taking the group as a name: a non-empty \p Group is turned into
	/// its (possibly newly created) symbol before forwarding to the overload
	/// that takes a group symbol.
	MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
	                                       unsigned Flags, unsigned EntrySize,
	                                       const Twine &Group, unsigned UniqueID,
	                                       const MCSymbolELF *Associated) {
	  const bool NoGroup = Group.isTriviallyEmpty() || Group.str().empty();
	  MCSymbolELF *GroupSym =
	      NoGroup ? nullptr : cast<MCSymbolELF>(getOrCreateSymbol(Group));

	  return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
	                       Associated);
	}

	/// Get an ELF section, creating it on first request.
	///
	/// Sections are uniqued in ELFUniquingMap by (section name, group name,
	/// \p UniqueID). When a new section is created its SectionKind is derived
	/// from \p Flags: execute-only for SHF_ARM_PURECODE, text for SHF_EXECINSTR,
	/// read-only otherwise.
	MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
	                                       unsigned Flags, unsigned EntrySize,
	                                       const MCSymbolELF *GroupSym,
	                                       unsigned UniqueID,
	                                       const MCSymbolELF *Associated) {
	  const StringRef GroupName = GroupSym ? GroupSym->getName() : StringRef("");

	  // Insert a placeholder; if the key was already present, the existing
	  // section is the answer.
	  auto Lookup = ELFUniquingMap.insert(std::make_pair(
	      ELFSectionKey{Section.str(), GroupName, UniqueID}, nullptr));
	  auto &Slot = *Lookup.first;
	  if (!Lookup.second)
	    return Slot.second;

	  // Use the name stored in the map key for the new section.
	  const StringRef StoredName = Slot.first.SectionName;

	  const SectionKind Kind =
	      (Flags & ELF::SHF_ARM_PURECODE)
	          ? SectionKind::getExecuteOnly()
	          : (Flags & ELF::SHF_EXECINSTR) ? SectionKind::getText()
	                                         : SectionKind::getReadOnly();

	  MCSectionELF *NewSection = createELFSectionImpl(
	      StoredName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, Associated);
	  Slot.second = NewSection;
	  return NewSection;
	}

	/// Create the ".group" section (type SHT_GROUP) for the group symbol
	/// \p Group.
	///
	/// The section is created with no flags, read-only kind, an entry size of 4,
	/// a unique ID of ~0 and no associated symbol.
	MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
	  return createELFSectionImpl(".group", ELF::SHT_GROUP, 0,
	                              SectionKind::getReadOnly(), 4, Group, ~0,
	                              nullptr);
	}

	/// Get a COFF section, creating it on first request.
	///
	/// Sections are uniqued in COFFUniquingMap by (section name, COMDAT symbol
	/// name, \p Selection, \p UniqueID). A non-empty \p COMDATSymName is resolved
	/// with getOrCreateSymbol(). If \p BeginSymName is non-null, a temporary begin
	/// symbol is created for a newly created section.
	MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
	                                         unsigned Characteristics,
	                                         SectionKind Kind,
	                                         StringRef COMDATSymName, int Selection,
	                                         unsigned UniqueID,
	                                         const char *BeginSymName) {
	  MCSymbol *COMDATSymbol = nullptr;
	  if (!COMDATSymName.empty()) {
	    COMDATSymbol = getOrCreateSymbol(COMDATSymName);
	    // From here on use the name as stored by the symbol itself.
	    COMDATSymName = COMDATSymbol->getName();
	  }

	  // Insert a placeholder; if the key was already present, the existing
	  // section is the answer.
	  COFFSectionKey Key{Section, COMDATSymName, Selection, UniqueID};
	  auto Lookup = COFFUniquingMap.insert(std::make_pair(Key, nullptr));
	  auto &Slot = *Lookup.first;
	  if (!Lookup.second)
	    return Slot.second;

	  MCSymbol *Begin =
	      BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;

	  // Use the name stored in the map key for the new section.
	  const StringRef StoredName = Slot.first.SectionName;
	  MCSectionCOFF *NewSection = new (COFFAllocator.Allocate()) MCSectionCOFF(
	      StoredName, Characteristics, COMDATSymbol, Selection, Kind, Begin);

	  Slot.second = NewSection;
	  return NewSection;
	}

	/// Get a COFF section without a COMDAT symbol.
	///
	/// Forwards to the full overload with an empty COMDAT symbol name, a
	/// selection of 0 and GenericSectionID as the unique ID.
	MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
	                                         unsigned Characteristics,
	                                         SectionKind Kind,
	                                         const char *BeginSymName) {
	  const StringRef NoCOMDATSymName = "";
	  const int NoSelection = 0;
	  return getCOFFSection(Section, Characteristics, Kind, NoCOMDATSymName,
	                        NoSelection, GenericSectionID, BeginSymName);
	}

	/// Get a COFF section with the same name and kind as \p Sec that is
	/// associative to \p KeySym and/or distinguished by \p UniqueID.
	///
	/// \returns \p Sec itself when there is no key symbol and \p UniqueID is
	/// GenericSectionID; otherwise the section obtained from getCOFFSection().
	MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
	                                                    const MCSymbol *KeySym,
	                                                    unsigned UniqueID) {
	  // Return the normal section if we don't have to be associative or unique.
	  if (!KeySym && UniqueID == GenericSectionID)
	    return Sec;

	  // If we have a key symbol, make an associative section with the same name and
	  // kind as the normal section.
	  unsigned Characteristics = Sec->getCharacteristics();
	  if (KeySym) {
	    Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
	    return getCOFFSection(Sec->getSectionName(), Characteristics,
	                          Sec->getKind(), KeySym->getName(),
	                          COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
	  }

	  // No key symbol: only the unique ID differs from the normal section.
	  return getCOFFSection(Sec->getSectionName(), Characteristics, Sec->getKind(),
	                        "", 0, UniqueID);
	}

	/// Get a Wasm section, naming its group by string.
	///
	/// A non-empty \p Group is turned into a symbol via getOrCreateSymbol() and
	/// flagged as a comdat; an empty one yields a null group symbol. The work is
	/// then delegated to the overload that takes the group as a symbol pointer.
	MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind K,
	                                         const Twine &Group, unsigned UniqueID,
	                                         const char *BeginSymName) {
	  MCSymbolWasm *GroupSym = nullptr;
	  const bool HasGroup = !(Group.isTriviallyEmpty() || Group.str().empty());
	  if (HasGroup) {
	    GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
	    GroupSym->setComdat(true);
	  }

	  return getWasmSection(Section, K, GroupSym, UniqueID, BeginSymName);
	}

	/// Get a Wasm section, creating it on first request.
	///
	/// Sections are uniqued in WasmUniquingMap by (section name, group name,
	/// \p UniqueID). A newly created section gets a begin symbol of type
	/// WASM_SYMBOL_TYPE_SECTION and an initial data fragment that the begin
	/// symbol is attached to. \p BeginSymName is not used by this function.
	MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind Kind,
	                                         const MCSymbolWasm *GroupSym,
	                                         unsigned UniqueID,
	                                         const char *BeginSymName) {
	  const StringRef GroupName = GroupSym ? GroupSym->getName() : StringRef("");

	  // Insert a placeholder; if the key was already present, the existing
	  // section is the answer.
	  auto Lookup = WasmUniquingMap.insert(std::make_pair(
	      WasmSectionKey{Section.str(), GroupName, UniqueID}, nullptr));
	  auto &Slot = *Lookup.first;
	  if (!Lookup.second)
	    return Slot.second;

	  // Use the name stored in the map key for the symbol and the section.
	  const StringRef StoredName = Slot.first.SectionName;

	  MCSymbol *Begin = createSymbol(StoredName, false, false);
	  cast<MCSymbolWasm>(Begin)->setType(wasm::WASM_SYMBOL_TYPE_SECTION);

	  MCSectionWasm *NewSection = new (WasmAllocator.Allocate())
	      MCSectionWasm(StoredName, Kind, GroupSym, UniqueID, Begin);
	  Slot.second = NewSection;

	  // Give the section an initial data fragment and bind the begin symbol
	  // to it.
	  auto *Frag = new MCDataFragment();
	  NewSection->getFragmentList().insert(NewSection->begin(), Frag);
	  Frag->setParent(NewSection);
	  Begin->setFragment(Frag);

	  return NewSection;
	}

	/// Get an XCOFF section, creating it on first request.
	///
	/// Sections are uniqued in XCOFFUniquingMap by (section name, \p SMC). A
	/// newly created section gets an initial data fragment; if \p BeginSymName
	/// is non-null, a temporary begin symbol is created and attached to that
	/// fragment.
	MCSectionXCOFF *MCContext::getXCOFFSection(StringRef Section,
	                                           XCOFF::StorageMappingClass SMC,
	                                           SectionKind Kind,
	                                           const char *BeginSymName) {
	  // Insert a placeholder; if the key was already present, the existing
	  // section is the answer.
	  auto Lookup = XCOFFUniquingMap.insert(
	      std::make_pair(XCOFFSectionKey{Section.str(), SMC}, nullptr));
	  auto &Slot = *Lookup.first;
	  if (!Lookup.second)
	    return Slot.second;

	  // Use the name stored in the map key for the new section.
	  const StringRef StoredName = Slot.first.SectionName;

	  MCSymbol *Begin =
	      BeginSymName ? createTempSymbol(BeginSymName, false) : nullptr;

	  MCSectionXCOFF *NewSection = new (XCOFFAllocator.Allocate())
	      MCSectionXCOFF(StoredName, SMC, Kind, Begin);
	  Slot.second = NewSection;

	  auto *Frag = new MCDataFragment();
	  NewSection->getFragmentList().insert(NewSection->begin(), Frag);
	  Frag->setParent(NewSection);

	  if (Begin)
	    Begin->setFragment(Frag);

	  return NewSection;
	}

	/// Copy-construct \p STI into storage obtained from MCSubtargetAllocator and
	/// return a reference to the copy.
	MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
	  auto *Storage = MCSubtargetAllocator.Allocate();
	  return *new (Storage) MCSubtargetInfo(STI);
	}

	/// Record a debug path prefix mapping from \p From to \p To.
	///
	/// An existing entry for \p From is left unchanged.
	void MCContext::addDebugPrefixMapEntry(const std::string &From,
	                                       const std::string &To) {
	  DebugPrefixMap.emplace(From, To);
	}

	/// Apply the entries of DebugPrefixMap to the compilation directory and to
	/// the MCDwarfDirs of every line table in MCDwarfLineTablesCUMap.
	///
	/// For each path, the first map entry (in map iteration order) whose key is a
	/// prefix of the path is applied: that prefix is replaced by the entry's
	/// value. At most one entry is applied per path.
	void MCContext::RemapDebugPaths() {
	  const auto &DebugPrefixMap = this->DebugPrefixMap;
	  const auto RemapDebugPath = [&DebugPrefixMap](std::string &Path) {
	    for (const auto &Entry : DebugPrefixMap)
	      if (StringRef(Path).startswith(Entry.first)) {
	        std::string RemappedPath =
	            (Twine(Entry.second) + Path.substr(Entry.first.size())).str();
	        Path.swap(RemappedPath);
	        // Stop after the first match. Continuing would let a later entry
	        // match the already-remapped path and rewrite it a second time.
	        break;
	      }
	  };

	  // Remap compilation directory.
	  std::string CompDir = CompilationDir.str();
	  RemapDebugPath(CompDir);
	  CompilationDir = CompDir;

	  // Remap MCDwarfDirs in all compilation units.
	  for (auto &CUIDTablePair : MCDwarfLineTablesCUMap)
	    for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs())
	      RemapDebugPath(Dir);
	}

	//===----------------------------------------------------------------------===//
	// Dwarf Management
	//===----------------------------------------------------------------------===//

	/// Set the root file of line table 0 from \p InputFileName.
	///
	/// \param InputFileName Name of the assembler input; empty or "-" is treated
	///        as "<stdin>".
	/// \param Buffer Contents hashed with MD5 to produce the file checksum when
	///        the DWARF version is 5 or higher.
	void MCContext::setGenDwarfRootFile(StringRef InputFileName, StringRef Buffer) {
	  // MCDwarf needs the root file as well as the compilation directory.
	  // If we find a '.file 0' directive that will supersede these values.
	  Optional<MD5::MD5Result> Cksum;
	  if (getDwarfVersion() >= 5) {
	    MD5 Hash;
	    MD5::MD5Result Sum;
	    Hash.update(Buffer);
	    Hash.final(Sum);
	    Cksum = Sum;
	  }
	  // Canonicalize the root filename. It cannot be empty, and should not
	  // repeat the compilation dir.
	  // The MCContext ctor initializes MainFileName to the name associated with
	  // the SrcMgr's main file ID, which might be the same as InputFileName (and
	  // possibly include directory components).
	  // Or, MainFileName might have been overridden by a -main-file-name option,
	  // which is supposed to be just a base filename with no directory component.
	  // So, if the InputFileName and MainFileName are not equal, assume
	  // MainFileName is a substitute basename and replace the last component.
	  SmallString<1024> FileNameBuf = InputFileName;
	  if (FileNameBuf.empty() || FileNameBuf == "-")
	    FileNameBuf = "<stdin>";
	  if (!getMainFileName().empty() && FileNameBuf != getMainFileName()) {
	    llvm::sys::path::remove_filename(FileNameBuf);
	    llvm::sys::path::append(FileNameBuf, getMainFileName());
	  }
	  StringRef FileName = FileNameBuf;
	  // Strip a leading compilation directory and the separator that follows it.
	  // The emptiness check keeps front() from being called on an empty name
	  // when the whole name was consumed.
	  if (FileName.consume_front(getCompilationDir()))
	    if (!FileName.empty() && llvm::sys::path::is_separator(FileName.front()))
	      FileName = FileName.drop_front();
	  assert(!FileName.empty());
	  setMCLineTableRootFile(
	      /*CUID=*/0, getCompilationDir(), FileName, Cksum, None);
	}

	/// getDwarfFile - register a file name and number in the DWARF file and
	/// directory tables of compilation unit \p CUID.
	///
	/// The request is forwarded to MCDwarfLineTable::tryGetFile() on the line
	/// table for \p CUID, together with the context's DwarfVersion, and that
	/// call's result is returned unchanged.
	Expected<unsigned> MCContext::getDwarfFile(StringRef Directory,
	                                           StringRef FileName,
	                                           unsigned FileNumber,
	                                           Optional<MD5::MD5Result> Checksum,
	                                           Optional<StringRef> Source,
	                                           unsigned CUID) {
	  return MCDwarfLineTablesCUMap[CUID].tryGetFile(
	      Directory, FileName, Checksum, Source, DwarfVersion, FileNumber);
	}

	/// isValidDwarfFileNumber - takes a dwarf file number and returns true if it
	/// currently is assigned and false otherwise.
	bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) {
	const MCDwarfLineTable &LineTable = getMCDwarfLineTable(CUID);
	if (FileNumber == 0)
	return getDwarfVersion() >= 5;
	if (FileNumber >= LineTable.getMCDwarfFiles().size())
	return false;

	return !LineTable.getMCDwarfFiles()[FileNumber].Name.empty();
	}

	/// Remove empty sections from SectionsForRanges, to avoid generating
	/// useless debug info for them.
	///
	/// A section is dropped when \p MCOS reports, via mayHaveInstructions(), that
	/// it cannot contain instructions.
	void MCContext::finalizeDwarfSections(MCStreamer &MCOS) {
	  SectionsForRanges.remove_if(
	      [&](MCSection *Sec) { return !MCOS.mayHaveInstructions(*Sec); });
	}

	/// Return the CodeView context, creating it on first use.
	CodeViewContext &MCContext::getCVContext() {
	  if (CVContext.get() == nullptr)
	    CVContext.reset(new CodeViewContext);
	  return *CVContext;
	}

	//===----------------------------------------------------------------------===//
	// Error Reporting
	//===----------------------------------------------------------------------===//

	/// Report an error at \p Loc and remember that one occurred.
	///
	/// HadError is set before anything is printed. The message goes to SrcMgr if
	/// there is one, otherwise to InlineSrcMgr if there is one, and otherwise to
	/// report_fatal_error().
	void MCContext::reportError(SMLoc Loc, const Twine &Msg) {
	  HadError = true;

	  // Preferred: the regular source manager.
	  if (SrcMgr) {
	    SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
	    return;
	  }

	  // Next: the inline source manager.
	  if (InlineSrcMgr) {
	    InlineSrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
	    return;
	  }

	  // Neither is available: use the generic fatal error path.
	  report_fatal_error(Msg, false);
	}

	/// Report an error through reportError(), run the interrupt handlers, and
	/// terminate the process with exit status 1.
	void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) {
	reportError(Loc, Msg);

	// If we reached here, we are failing ungracefully. Run the interrupt handlers
	// to make sure any special cleanups get done, in particular that we remove
	// files registered with RemoveFileOnSignal.
	sys::RunInterruptHandlers();
	exit(1);
	}
	Index: vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/MC/MCParser/AsmParser.cpp (revision 351303)
	@@ -1,5960 +1,5962 @@
	//===- AsmParser.cpp - Parser for Assembly Files --------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This class implements the parser for assembly files.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeView.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCDirectives.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInstPrinter.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCParser/AsmCond.h"
	#include "llvm/MC/MCParser/AsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmParser.h"
	#include "llvm/MC/MCParser/MCAsmParserExtension.h"
	#include "llvm/MC/MCParser/MCAsmParserUtils.h"
	#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCTargetOptions.h"
	#include "llvm/MC/MCValue.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MD5.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/MemoryBuffer.h"
	#include "llvm/Support/SMLoc.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cctype>
	#include <climits>
	#include <cstddef>
	#include <cstdint>
	#include <deque>
	#include <memory>
	#include <sstream>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	// Out-of-line definition of the defaulted MCAsmParserSemaCallback destructor.
	MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;

	// Hidden command-line option "asm-macro-max-nesting-depth"; defaults to 20.
	// NOTE(review): the code that enforces this limit is not in this part of the
	// file.
	static cl::opt<unsigned> AsmMacroMaxNestingDepth(
	"asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
	cl::desc("The maximum nesting depth allowed for assembly macros."));

	namespace {

	/// Helper types for tracking macro definitions.
	typedef std::vector<AsmToken> MCAsmMacroArgument;
	typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;

	/// Bookkeeping for one macro instantiation that is currently active.
	struct MacroInstantiation {
	  /// Where the macro was instantiated.
	  SMLoc InstantiationLoc;

	  /// Buffer in which parsing resumes once the instantiation is finished.
	  int ExitBuffer;

	  /// Location at which parsing resumes once the instantiation is finished.
	  SMLoc ExitLoc;

	  /// Depth of TheCondStack when the instantiation began.
	  size_t CondStackDepth;

	  MacroInstantiation(SMLoc IL, int EB, SMLoc EL, size_t CondStackDepth);
	};

	/// Per-statement state handed to and filled in by the statement parser.
	struct ParseStatementInfo {
	  /// Operands parsed from the most recent statement.
	  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;

	  /// Opcode of the most recently parsed instruction; initially ~0U.
	  unsigned Opcode = ~0U;

	  /// Whether parsing the inline assembly hit an error.
	  bool ParseError = false;

	  /// Rewrite list supplied at construction; defaults to nullptr.
	  SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;

	  ParseStatementInfo() = delete;
	  ParseStatementInfo(SmallVectorImpl<AsmRewrite> *Rewrites)
	      : AsmRewrites(Rewrites) {}
	};

	/// The concrete assembly parser instance.
	///
	/// Implements the MCAsmParser interface and declares the handlers for the
	/// generic (target and platform independent) directives listed in
	/// DirectiveKind.
	class AsmParser : public MCAsmParser {
	private:
	  AsmLexer Lexer;
	  MCContext &Ctx;
	  MCStreamer &Out;
	  const MCAsmInfo &MAI;
	  SourceMgr &SrcMgr;
	  SourceMgr::DiagHandlerTy SavedDiagHandler;
	  void *SavedDiagContext;
	  std::unique_ptr<MCAsmParserExtension> PlatformParser;

	  /// This is the current buffer index we're lexing from as managed by the
	  /// SourceMgr object.
	  unsigned CurBuffer;

	  AsmCond TheCondState;
	  std::vector<AsmCond> TheCondStack;

	  /// maps directive names to handler methods in parser
	  /// extensions. Extensions register themselves in this map by calling
	  /// addDirectiveHandler.
	  StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;

	  /// Stack of active macro instantiations.
	  std::vector<MacroInstantiation*> ActiveMacros;

	  /// List of bodies of anonymous macros.
	  std::deque<MCAsmMacro> MacroLikeBodies;

	  /// Boolean tracking whether macro substitution is enabled.
	  unsigned MacrosEnabledFlag : 1;

	  /// Keeps track of how many .macro's have been instantiated.
	  unsigned NumOfMacroInstantiations;

	  /// The values from the last parsed cpp hash file line comment if any.
	  struct CppHashInfoTy {
	    StringRef Filename;
	    int64_t LineNumber;
	    SMLoc Loc;
	    unsigned Buf;
	    CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {}
	  };
	  CppHashInfoTy CppHashInfo;

	  /// The filename from the first cpp hash file line comment, if any.
	  StringRef FirstCppHashFilename;

	  /// List of forward directional labels for diagnosis at the end.
	  SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;

	  /// AssemblerDialect. ~0U means unset value and use value provided by MAI.
	  unsigned AssemblerDialect = ~0U;

	  /// is Darwin compatibility enabled?
	  bool IsDarwin = false;

	  /// Are we parsing ms-style inline assembly?
	  bool ParsingInlineAsm = false;

	  /// Did we already inform the user about inconsistent MD5 usage?
	  bool ReportedInconsistentMD5 = false;

	  // Is alt macro mode enabled.
	  bool AltMacroMode = false;

	public:
	  AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
	            const MCAsmInfo &MAI, unsigned CB);
	  AsmParser(const AsmParser &) = delete;
	  AsmParser &operator=(const AsmParser &) = delete;
	  ~AsmParser() override;

	  bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;

	  void addDirectiveHandler(StringRef Directive,
	                           ExtensionDirectiveHandler Handler) override {
	    ExtensionDirectiveMap[Directive] = Handler;
	  }

	  /// Make \p Directive map to the same DirectiveKind as \p Alias.
	  void addAliasForDirective(StringRef Directive, StringRef Alias) override {
	    DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
	  }

	  /// @name MCAsmParser Interface
	  /// {

	  SourceMgr &getSourceManager() override { return SrcMgr; }
	  MCAsmLexer &getLexer() override { return Lexer; }
	  MCContext &getContext() override { return Ctx; }
	  MCStreamer &getStreamer() override { return Out; }

	  CodeViewContext &getCVContext() { return Ctx.getCVContext(); }

	  /// Return the explicitly set dialect, or the one from MAI when unset.
	  unsigned getAssemblerDialect() override {
	    if (AssemblerDialect == ~0U)
	      return MAI.getAssemblerDialect();
	    else
	      return AssemblerDialect;
	  }
	  void setAssemblerDialect(unsigned i) override {
	    AssemblerDialect = i;
	  }

	  void Note(SMLoc L, const Twine &Msg, SMRange Range = None) override;
	  bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) override;
	  bool printError(SMLoc L, const Twine &Msg, SMRange Range = None) override;

	  const AsmToken &Lex() override;

	  void setParsingInlineAsm(bool V) override {
	    ParsingInlineAsm = V;
	    // When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
	    // hex integer literals.
	    Lexer.setLexMasmIntegers(V);
	  }
	  bool isParsingInlineAsm() override { return ParsingInlineAsm; }

	  bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
	                        unsigned &NumOutputs, unsigned &NumInputs,
	                        SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
	                        SmallVectorImpl<std::string> &Constraints,
	                        SmallVectorImpl<std::string> &Clobbers,
	                        const MCInstrInfo *MII, const MCInstPrinter *IP,
	                        MCAsmParserSemaCallback &SI) override;

	  bool parseExpression(const MCExpr *&Res);
	  bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
	  bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
	  bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
	  bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
	                             SMLoc &EndLoc) override;
	  bool parseAbsoluteExpression(int64_t &Res) override;

	  /// Parse a floating point expression using the float \p Semantics
	  /// and set \p Res to the value.
	  bool parseRealValue(const fltSemantics &Semantics, APInt &Res);

	  /// Parse an identifier or string (as a quoted identifier)
	  /// and set \p Res to the identifier contents.
	  bool parseIdentifier(StringRef &Res) override;
	  void eatToEndOfStatement() override;

	  bool checkForValidSection() override;

	  /// }

	private:
	  bool parseStatement(ParseStatementInfo &Info,
	                      MCAsmParserSemaCallback *SI);
	  bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
	  bool parseCppHashLineFilenameComment(SMLoc L);

	  void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
	                        ArrayRef<MCAsmMacroParameter> Parameters);
	  bool expandMacro(raw_svector_ostream &OS, StringRef Body,
	                   ArrayRef<MCAsmMacroParameter> Parameters,
	                   ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable,
	                   SMLoc L);

	  /// Are macros enabled in the parser?
	  bool areMacrosEnabled() {return MacrosEnabledFlag;}

	  /// Control a flag in the parser that enables or disables macros.
	  void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}

	  /// Are we inside a macro instantiation?
	  bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}

	  /// Handle entry to macro instantiation.
	  ///
	  /// \param M The macro.
	  /// \param NameLoc Instantiation location.
	  bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);

	  /// Handle exit from macro instantiation.
	  void handleMacroExit();

	  /// Extract AsmTokens for a macro argument.
	  bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg);

	  /// Parse all macro arguments for a given macro.
	  bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);

	  void printMacroInstantiations();
	  /// Forward a diagnostic with a single source range to SrcMgr.
	  void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
	                    SMRange Range = None) const {
	    ArrayRef<SMRange> Ranges(Range);
	    SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
	  }
	  static void DiagHandler(const SMDiagnostic &Diag, void *Context);

	  /// Should we emit DWARF describing this assembler source? (Returns false if
	  /// the source has .file directives, which means we don't want to generate
	  /// info describing the assembler source itself.)
	  bool enabledGenDwarfForAssembly();

	  /// Enter the specified file. This returns true on failure.
	  bool enterIncludeFile(const std::string &Filename);

	  /// Process the specified file for the .incbin directive.
	  /// This returns true on failure.
	  bool processIncbinFile(const std::string &Filename, int64_t Skip = 0,
	                         const MCExpr *Count = nullptr, SMLoc Loc = SMLoc());

	  /// Reset the current lexer position to that given by \p Loc. The
	  /// current token is not set; clients should ensure Lex() is called
	  /// subsequently.
	  ///
	  /// \param InBuffer If not 0, should be the known buffer id that contains the
	  /// location.
	  void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);

	  /// Parse up to the end of statement and return the contents from the
	  /// current token until the end of the statement; the current token on exit
	  /// will be either the EndOfStatement or EOF.
	  StringRef parseStringToEndOfStatement() override;

	  /// Parse until the end of a statement or a comma is encountered,
	  /// return the contents from the current token up to the end or comma.
	  StringRef parseStringToComma();

	  bool parseAssignment(StringRef Name, bool allow_redef,
	                       bool NoDeadStrip = false);

	  unsigned getBinOpPrecedence(AsmToken::TokenKind K,
	                              MCBinaryExpr::Opcode &Kind);

	  bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
	  bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
	  bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);

	  bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);

	  bool parseCVFunctionId(int64_t &FunctionId, StringRef DirectiveName);
	  bool parseCVFileId(int64_t &FileId, StringRef DirectiveName);

	  // Generic (target and platform independent) directive parsing.
	  enum DirectiveKind {
	    DK_NO_DIRECTIVE, // Placeholder
	    DK_SET,
	    DK_EQU,
	    DK_EQUIV,
	    DK_ASCII,
	    DK_ASCIZ,
	    DK_STRING,
	    DK_BYTE,
	    DK_SHORT,
	    DK_RELOC,
	    DK_VALUE,
	    DK_2BYTE,
	    DK_LONG,
	    DK_INT,
	    DK_4BYTE,
	    DK_QUAD,
	    DK_8BYTE,
	    DK_OCTA,
	    DK_DC,
	    DK_DC_A,
	    DK_DC_B,
	    DK_DC_D,
	    DK_DC_L,
	    DK_DC_S,
	    DK_DC_W,
	    DK_DC_X,
	    DK_DCB,
	    DK_DCB_B,
	    DK_DCB_D,
	    DK_DCB_L,
	    DK_DCB_S,
	    DK_DCB_W,
	    DK_DCB_X,
	    DK_DS,
	    DK_DS_B,
	    DK_DS_D,
	    DK_DS_L,
	    DK_DS_P,
	    DK_DS_S,
	    DK_DS_W,
	    DK_DS_X,
	    DK_SINGLE,
	    DK_FLOAT,
	    DK_DOUBLE,
	    DK_ALIGN,
	    DK_ALIGN32,
	    DK_BALIGN,
	    DK_BALIGNW,
	    DK_BALIGNL,
	    DK_P2ALIGN,
	    DK_P2ALIGNW,
	    DK_P2ALIGNL,
	    DK_ORG,
	    DK_FILL,
	    DK_ENDR,
	    DK_BUNDLE_ALIGN_MODE,
	    DK_BUNDLE_LOCK,
	    DK_BUNDLE_UNLOCK,
	    DK_ZERO,
	    DK_EXTERN,
	    DK_GLOBL,
	    DK_GLOBAL,
	    DK_LAZY_REFERENCE,
	    DK_NO_DEAD_STRIP,
	    DK_SYMBOL_RESOLVER,
	    DK_PRIVATE_EXTERN,
	    DK_REFERENCE,
	    DK_WEAK_DEFINITION,
	    DK_WEAK_REFERENCE,
	    DK_WEAK_DEF_CAN_BE_HIDDEN,
	    DK_COLD,
	    DK_COMM,
	    DK_COMMON,
	    DK_LCOMM,
	    DK_ABORT,
	    DK_INCLUDE,
	    DK_INCBIN,
	    DK_CODE16,
	    DK_CODE16GCC,
	    DK_REPT,
	    DK_IRP,
	    DK_IRPC,
	    DK_IF,
	    DK_IFEQ,
	    DK_IFGE,
	    DK_IFGT,
	    DK_IFLE,
	    DK_IFLT,
	    DK_IFNE,
	    DK_IFB,
	    DK_IFNB,
	    DK_IFC,
	    DK_IFEQS,
	    DK_IFNC,
	    DK_IFNES,
	    DK_IFDEF,
	    DK_IFNDEF,
	    DK_IFNOTDEF,
	    DK_ELSEIF,
	    DK_ELSE,
	    DK_ENDIF,
	    DK_SPACE,
	    DK_SKIP,
	    DK_FILE,
	    DK_LINE,
	    DK_LOC,
	    DK_STABS,
	    DK_CV_FILE,
	    DK_CV_FUNC_ID,
	    DK_CV_INLINE_SITE_ID,
	    DK_CV_LOC,
	    DK_CV_LINETABLE,
	    DK_CV_INLINE_LINETABLE,
	    DK_CV_DEF_RANGE,
	    DK_CV_STRINGTABLE,
	    DK_CV_STRING,
	    DK_CV_FILECHECKSUMS,
	    DK_CV_FILECHECKSUM_OFFSET,
	    DK_CV_FPO_DATA,
	    DK_CFI_SECTIONS,
	    DK_CFI_STARTPROC,
	    DK_CFI_ENDPROC,
	    DK_CFI_DEF_CFA,
	    DK_CFI_DEF_CFA_OFFSET,
	    DK_CFI_ADJUST_CFA_OFFSET,
	    DK_CFI_DEF_CFA_REGISTER,
	    DK_CFI_OFFSET,
	    DK_CFI_REL_OFFSET,
	    DK_CFI_PERSONALITY,
	    DK_CFI_LSDA,
	    DK_CFI_REMEMBER_STATE,
	    DK_CFI_RESTORE_STATE,
	    DK_CFI_SAME_VALUE,
	    DK_CFI_RESTORE,
	    DK_CFI_ESCAPE,
	    DK_CFI_RETURN_COLUMN,
	    DK_CFI_SIGNAL_FRAME,
	    DK_CFI_UNDEFINED,
	    DK_CFI_REGISTER,
	    DK_CFI_WINDOW_SAVE,
	    DK_CFI_B_KEY_FRAME,
	    DK_MACROS_ON,
	    DK_MACROS_OFF,
	    DK_ALTMACRO,
	    DK_NOALTMACRO,
	    DK_MACRO,
	    DK_EXITM,
	    DK_ENDM,
	    DK_ENDMACRO,
	    DK_PURGEM,
	    DK_SLEB128,
	    DK_ULEB128,
	    DK_ERR,
	    DK_ERROR,
	    DK_WARNING,
	    DK_PRINT,
	    DK_ADDRSIG,
	    DK_ADDRSIG_SYM,
	    DK_END
	  };

	  /// Maps directive name --> DirectiveKind enum, for
	  /// directives parsed by this class.
	  StringMap<DirectiveKind> DirectiveKindMap;

	  // ".ascii", ".asciz", ".string"
	  bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
	  bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc"
	  bool parseDirectiveValue(StringRef IDVal,
	                           unsigned Size); // ".byte", ".long", ...
	  bool parseDirectiveOctaValue(StringRef IDVal); // ".octa", ...
	  bool parseDirectiveRealValue(StringRef IDVal,
	                               const fltSemantics &); // ".single", ...
	  bool parseDirectiveFill(); // ".fill"
	  bool parseDirectiveZero(); // ".zero"
	  // ".set", ".equ", ".equiv"
	  bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
	  bool parseDirectiveOrg(); // ".org"
	  // ".align{,32}", ".p2align{,w,l}"
	  bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);

	  // ".file", ".line", ".loc", ".stabs"
	  bool parseDirectiveFile(SMLoc DirectiveLoc);
	  bool parseDirectiveLine();
	  bool parseDirectiveLoc();
	  bool parseDirectiveStabs();

	  // ".cv_file", ".cv_func_id", ".cv_inline_site_id", ".cv_loc", ".cv_linetable",
	  // ".cv_inline_linetable", ".cv_def_range", ".cv_string"
	  bool parseDirectiveCVFile();
	  bool parseDirectiveCVFuncId();
	  bool parseDirectiveCVInlineSiteId();
	  bool parseDirectiveCVLoc();
	  bool parseDirectiveCVLinetable();
	  bool parseDirectiveCVInlineLinetable();
	  bool parseDirectiveCVDefRange();
	  bool parseDirectiveCVString();
	  bool parseDirectiveCVStringTable();
	  bool parseDirectiveCVFileChecksums();
	  bool parseDirectiveCVFileChecksumOffset();
	  bool parseDirectiveCVFPOData();

	  // .cfi directives
	  bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIWindowSave();
	  bool parseDirectiveCFISections();
	  bool parseDirectiveCFIStartProc();
	  bool parseDirectiveCFIEndProc();
	  bool parseDirectiveCFIDefCfaOffset();
	  bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIAdjustCfaOffset();
	  bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
	  bool parseDirectiveCFIRememberState();
	  bool parseDirectiveCFIRestoreState();
	  bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
	  bool parseDirectiveCFIEscape();
	  bool parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc);
	  bool parseDirectiveCFISignalFrame();
	  bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);

	  // macro directives
	  bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
	  bool parseDirectiveExitMacro(StringRef Directive);
	  bool parseDirectiveEndMacro(StringRef Directive);
	  bool parseDirectiveMacro(SMLoc DirectiveLoc);
	  bool parseDirectiveMacrosOnOff(StringRef Directive);
	  // alternate macro mode directives
	  bool parseDirectiveAltmacro(StringRef Directive);
	  // ".bundle_align_mode"
	  bool parseDirectiveBundleAlignMode();
	  // ".bundle_lock"
	  bool parseDirectiveBundleLock();
	  // ".bundle_unlock"
	  bool parseDirectiveBundleUnlock();

	  // ".space", ".skip"
	  bool parseDirectiveSpace(StringRef IDVal);

	  // ".dcb"
	  bool parseDirectiveDCB(StringRef IDVal, unsigned Size);
	  bool parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &);
	  // ".ds"
	  bool parseDirectiveDS(StringRef IDVal, unsigned Size);

	  // .sleb128 (Signed=true) and .uleb128 (Signed=false)
	  bool parseDirectiveLEB128(bool Signed);

	  /// Parse a directive like ".globl" which
	  /// accepts a single symbol (which should be a label or an external).
	  bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);

	  bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"

	  bool parseDirectiveAbort(); // ".abort"
	  bool parseDirectiveInclude(); // ".include"
	  bool parseDirectiveIncbin(); // ".incbin"

	  // ".if", ".ifeq", ".ifge", ".ifgt" , ".ifle", ".iflt" or ".ifne"
	  bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
	  // ".ifb" or ".ifnb", depending on ExpectBlank.
	  bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
	  // ".ifc" or ".ifnc", depending on ExpectEqual.
	  bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
	  // ".ifeqs" or ".ifnes", depending on ExpectEqual.
	  bool parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual);
	  // ".ifdef" or ".ifndef", depending on expect_defined
	  bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
	  bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
	  bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
	  bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
	  bool parseEscapedString(std::string &Data) override;

	  const MCExpr *applyModifierToExpr(const MCExpr *E,
	                                    MCSymbolRefExpr::VariantKind Variant);

	  // Macro-like directives
	  MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
	  void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
	                                raw_svector_ostream &OS);
	  bool parseDirectiveRept(SMLoc DirectiveLoc, StringRef Directive);
	  bool parseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
	  bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
	  bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"

	  // "_emit" or "__emit"
	  bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
	                            size_t Len);

	  // "align"
	  bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);

	  // "end"
	  bool parseDirectiveEnd(SMLoc DirectiveLoc);

	  // ".err" or ".error"
	  bool parseDirectiveError(SMLoc DirectiveLoc, bool WithMessage);

	  // ".warning"
	  bool parseDirectiveWarning(SMLoc DirectiveLoc);

	  // .print <double-quotes-string>
	  bool parseDirectivePrint(SMLoc DirectiveLoc);

	  // Directives to support address-significance tables.
	  bool parseDirectiveAddrsig();
	  bool parseDirectiveAddrsigSym();

	  void initializeDirectiveKindMap();
	};

	} // end anonymous namespace

	namespace llvm {

	// Factory functions for the per-object-format parser extensions. They are
	// only declared here; their definitions are not in this part of the file.
	MCAsmParserExtension *createDarwinAsmParser();
	MCAsmParserExtension *createELFAsmParser();
	MCAsmParserExtension *createCOFFAsmParser();
	MCAsmParserExtension *createWasmAsmParser();

	} // end namespace llvm

	// Named constant with value 0.
	// NOTE(review): presumably the default address space; its uses are not in
	// this part of the file.
	enum { DEFAULT_ADDRSPACE = 0 };

	/// Construct a parser that lexes from buffer \p CB of \p SM, or from the main
	/// file of \p SM when \p CB is 0.
	///
	/// The constructor installs DiagHandler on \p SM (saving the previous handler
	/// and context), creates the platform parser extension that matches the
	/// object file type of \p Ctx, initializes it, and fills the directive map.
	/// XCOFF has no platform parser, so that case stops with a fatal error
	/// instead of going on to dereference a null PlatformParser.
	AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
	                     const MCAsmInfo &MAI, unsigned CB = 0)
	    : Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
	      CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
	  HadError = false;
	  // Save the old handler.
	  SavedDiagHandler = SrcMgr.getDiagHandler();
	  SavedDiagContext = SrcMgr.getDiagContext();
	  // Set our own handler which calls the saved handler.
	  SrcMgr.setDiagHandler(DiagHandler, this);
	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());

	  // Initialize the platform / file format parser.
	  switch (Ctx.getObjectFileInfo()->getObjectFileType()) {
	  case MCObjectFileInfo::IsCOFF:
	    PlatformParser.reset(createCOFFAsmParser());
	    break;
	  case MCObjectFileInfo::IsMachO:
	    PlatformParser.reset(createDarwinAsmParser());
	    IsDarwin = true;
	    break;
	  case MCObjectFileInfo::IsELF:
	    PlatformParser.reset(createELFAsmParser());
	    break;
	  case MCObjectFileInfo::IsWasm:
	    PlatformParser.reset(createWasmAsmParser());
	    break;
	  case MCObjectFileInfo::IsXCOFF:
	    // TODO: Need to implement createXCOFFAsmParser for XCOFF format.
	    report_fatal_error(
	        "Need to implement createXCOFFAsmParser for XCOFF format.");
	    break;
	  }

	  PlatformParser->Initialize(*this);
	  initializeDirectiveKindMap();

	  NumOfMacroInstantiations = 0;
	}

	/// Tear down the parser and hand the diagnostic handler back to whoever owned
	/// it before this parser was constructed.
	AsmParser::~AsmParser() {
	  // Unless an error cut parsing short, every macro instantiation must have
	  // been closed by now.
	  assert((HadError || ActiveMacros.empty()) &&
	         "Unexpected active macro instantiation!");

	  // Restore the saved diagnostics handler and context for use during
	  // finalization.
	  SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
	}

	void AsmParser::printMacroInstantiations() {
	// Print the active macro instantiation stack.
	for (std::vector<MacroInstantiation *>::const_reverse_iterator
	it = ActiveMacros.rbegin(),
	ie = ActiveMacros.rend();
	it != ie; ++it)
	printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
	"while in macro instantiation");
	}

	/// Report \p Msg as a note at \p L (optionally highlighting \p Range).
	/// Pending errors are flushed first, and the active macro instantiation stack
	/// is printed afterwards.
	void AsmParser::Note(SMLoc L, const Twine &Msg, SMRange Range) {
	  // Flush queued errors before printing the note.
	  printPendingErrors();

	  printMessage(L, SourceMgr::DK_Note, Msg, Range);

	  // Give the reader the macro context the note was produced in.
	  printMacroInstantiations();
	}

	/// Report \p Msg as a warning at \p L.
	///
	/// Nothing is printed when the target options disable warnings; when warnings
	/// are fatal the message is routed through Error() instead.
	/// \returns the result of Error() in the fatal-warning case, false otherwise.
	bool AsmParser::Warning(SMLoc L, const Twine &Msg, SMRange Range) {
	  const auto &Options = getTargetParser().getTargetOptions();

	  if (Options.MCNoWarn)
	    return false;

	  if (Options.MCFatalWarnings)
	    return Error(L, Msg, Range);

	  printMessage(L, SourceMgr::DK_Warning, Msg, Range);
	  printMacroInstantiations();
	  return false;
	}

	/// Print \p Msg as an error at \p L, followed by the active macro
	/// instantiation stack, and remember that an error occurred.
	/// \returns always true, so callers can write `return printError(...)`.
	bool AsmParser::printError(SMLoc L, const Twine &Msg, SMRange Range) {
	  // Record the failure before emitting anything.
	  HadError = true;

	  printMessage(L, SourceMgr::DK_Error, Msg, Range);
	  printMacroInstantiations();

	  return true;
	}

	/// Register \p Filename as an include file with the source manager and, if
	/// that succeeds, make it the current buffer and point the lexer at it.
	/// \returns true on failure (the file could not be added), false on success.
	bool AsmParser::enterIncludeFile(const std::string &Filename) {
	  std::string ResolvedPath;
	  const unsigned BufferID =
	      SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), ResolvedPath);

	  // AddIncludeFile yields 0 when the file could not be added.
	  if (BufferID == 0)
	    return true;

	  CurBuffer = BufferID;
	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	  return false;
	}

	/// Process the specified .incbin file by searching for it in the include paths
	/// then just emitting the byte contents of the file to the streamer. This
	/// returns true on failure.
	bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip,
	const MCExpr *Count, SMLoc Loc) {
	std::string IncludedFile;
	unsigned NewBuf =
	SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
	if (!NewBuf)
	return true;

	// Pick up the bytes from the file and emit them.
	StringRef Bytes = SrcMgr.getMemoryBuffer(NewBuf)->getBuffer();
	Bytes = Bytes.drop_front(Skip);
	if (Count) {
	int64_t Res;
	if (!Count->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
	return Error(Loc, "expected absolute expression");
	if (Res < 0)
	return Warning(Loc, "negative count has no effect");
	Bytes = Bytes.take_front(Res);
	}
	getStreamer().EmitBytes(Bytes);
	return false;
	}

	/// Reposition the lexer at \p Loc. The buffer is \p InBuffer when it is
	/// non-zero; otherwise the source manager is asked which buffer holds \p Loc.
	void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
	  if (InBuffer)
	    CurBuffer = InBuffer;
	  else
	    CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);

	  StringRef Contents = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
	  Lexer.setBuffer(Contents, Loc.getPointer());
	}

	/// Advance to the next token and return it.
	///
	/// Before advancing, a lexer error on the current token is reported and a
	/// comment attached to an end-of-statement token is forwarded to the streamer
	/// (when the target preserves comments). Comment tokens are consumed here, and
	/// reaching the end of an included file resumes lexing in the parent file.
	const AsmToken &AsmParser::Lex() {
	  if (Lexer.getTok().is(AsmToken::Error))
	    Error(Lexer.getErrLoc(), Lexer.getErr());

	  // An end-of-statement token whose text is not a bare newline carries a line
	  // comment; emit it if comments are being preserved.
	  if (getTok().is(AsmToken::EndOfStatement)) {
	    StringRef Text = getTok().getString();
	    const bool HasComment =
	        !Text.empty() && Text.front() != '\n' && Text.front() != '\r';
	    if (HasComment && MAI.preserveAsmComments())
	      Out.addExplicitComment(Twine(Text));
	  }

	  // Comments found here are deferred until the end of the next statement.
	  const AsmToken *Next = &Lexer.Lex();
	  for (; Next->is(AsmToken::Comment); Next = &Lexer.Lex()) {
	    if (MAI.preserveAsmComments())
	      Out.addExplicitComment(Twine(Next->getString()));
	  }

	  if (!Next->is(AsmToken::Eof))
	    return *Next;

	  // End of an included file: pop back to the including file, if any.
	  SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
	  if (ParentIncludeLoc != SMLoc()) {
	    jumpToLoc(ParentIncludeLoc);
	    return Lex();
	  }

	  return *Next;
	}

	/// Report whether DWARF should be generated for the assembly source itself.
	///
	/// When it should and no DWARF file number has been recorded yet, this also
	/// emits a file directive for the root file and stores the resulting number
	/// in the context.
	/// \returns false when the context does not request DWARF for assembly,
	///          true otherwise.
	bool AsmParser::enabledGenDwarfForAssembly() {
	  // Check whether the user specified -g.
	  if (!getContext().getGenDwarfForAssembly())
	    return false;
	  // If we haven't encountered any .file directives (which would imply that
	  // the assembler source was produced with debug info already) then emit one
	  // describing the assembler source file itself.
	  if (getContext().getGenDwarfFileNumber() == 0) {
	    // Use the first #line directive for this, if any. It's preprocessed, so
	    // there is no checksum, and of course no source directive.
	    if (!FirstCppHashFilename.empty())
	      getContext().setMCLineTableRootFile(/*CUID=*/0,
	                                          getContext().getCompilationDir(),
	                                          FirstCppHashFilename,
	                                          /*Cksum=*/None, /*Source=*/None);
	    const MCDwarfFile &RootFile =
	        getContext().getMCDwarfLineTable(/*CUID=*/0).getRootFile();
	    getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
	        /*CUID=*/0, getContext().getCompilationDir(), RootFile.Name,
	        RootFile.Checksum, RootFile.Source));
	  }
	  return true;
	}

	bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
	// Create the initial section, if requested.
	if (!NoInitialTextSection)
	Out.InitSections(false);

	// Prime the lexer.
	Lex();

	HadError = false;
	AsmCond StartingCondState = TheCondState;
	SmallVector<AsmRewrite, 4> AsmStrRewrites;

	// If we are generating dwarf for assembly source files save the initial text
	// section. (Don't use enabledGenDwarfForAssembly() here, as we aren't
	// emitting any actual debug info yet and haven't had a chance to parse any
	// embedded .file directives.)
	if (getContext().getGenDwarfForAssembly()) {
	MCSection *Sec = getStreamer().getCurrentSectionOnly();
	if (!Sec->getBeginSymbol()) {
	MCSymbol *SectionStartSym = getContext().createTempSymbol();
	getStreamer().EmitLabel(SectionStartSym);
	Sec->setBeginSymbol(SectionStartSym);
	}
	bool InsertResult = getContext().addGenDwarfSection(Sec);
	assert(InsertResult && ".text section should not have debug info yet");
	(void)InsertResult;
	}

	// While we have input, parse each statement.
	while (Lexer.isNot(AsmToken::Eof)) {
	ParseStatementInfo Info(&AsmStrRewrites);
	if (!parseStatement(Info, nullptr))
	continue;

	// If we have a Lexer Error we are on an Error Token. Load in Lexer Error
	// for printing ErrMsg via Lex() only if no (presumably better) parser error
	// exists.
	if (!hasPendingError() && Lexer.getTok().is(AsmToken::Error)) {
	Lex();
	}

	// parseStatement returned true so may need to emit an error.
	printPendingErrors();

	// Skipping to the next line if needed.
	if (!getLexer().isAtStartOfStatement())
	eatToEndOfStatement();
	}

	getTargetParser().onEndOfFile();
	printPendingErrors();

	// All errors should have been emitted.
	assert(!hasPendingError() && "unexpected error from parseStatement");

	getTargetParser().flushPendingInstructions(getStreamer());

	if (TheCondState.TheCond != StartingCondState.TheCond \|\|
	TheCondState.Ignore != StartingCondState.Ignore)
	printError(getTok().getLoc(), "unmatched .ifs or .elses");
	// Check to see there are no empty DwarfFile slots.
	const auto &LineTables = getContext().getMCDwarfLineTables();
	if (!LineTables.empty()) {
	unsigned Index = 0;
	for (const auto &File : LineTables.begin()->second.getMCDwarfFiles()) {
	if (File.Name.empty() && Index != 0)
	printError(getTok().getLoc(), "unassigned file number: " +
	Twine(Index) +
	" for .file directives");
	++Index;
	}
	}

	// Check to see that all assembler local symbols were actually defined.
	// Targets that don't do subsections via symbols may not want this, though,
	// so conservatively exclude them. Only do this if we're finalizing, though,
	// as otherwise we won't necessarilly have seen everything yet.
	if (!NoFinalize) {
	if (MAI.hasSubsectionsViaSymbols()) {
	for (const auto &TableEntry : getContext().getSymbols()) {
	MCSymbol *Sym = TableEntry.getValue();
	// Variable symbols may not be marked as defined, so check those
	// explicitly. If we know it's a variable, we have a definition for
	// the purposes of this check.
	if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
	// FIXME: We would really like to refer back to where the symbol was
	// first referenced for a source location. We need to add something
	// to track that. Currently, we just point to the end of the file.
	printError(getTok().getLoc(), "assembler local symbol '" +
	Sym->getName() + "' not defined");
	}
	}

	// Temporary symbols like the ones for directional jumps don't go in the
	// symbol table. They also need to be diagnosed in all (final) cases.
	for (std::tuple<SMLoc, CppHashInfoTy, MCSymbol *> &LocSym : DirLabels) {
	if (std::get<2>(LocSym)->isUndefined()) {
	// Reset the state of any "# line file" directives we've seen to the
	// context as it was at the diagnostic site.
	CppHashInfo = std::get<1>(LocSym);
	printError(std::get<0>(LocSym), "directional label undefined");
	}
	}
	}

	// Finalize the output stream if there are no errors and if the client wants
	// us to.
	if (!HadError && !NoFinalize)
	Out.Finish();

	return HadError \|\| getContext().hadError();
	}

	/// Make sure there is a current section to emit into.
	///
	/// Outside of inline asm, a missing current section is an error; the initial
	/// sections are created first so that later emission has somewhere to go.
	/// \returns true if the error was reported, false if the section is valid.
	bool AsmParser::checkForValidSection() {
	  if (ParsingInlineAsm || getStreamer().getCurrentSectionOnly())
	    return false;

	  Out.InitSections(false);
	  return Error(getTok().getLoc(),
	               "expected section directive before assembly directive");
	}

	/// Discard tokens up to and including the next end-of-statement token, or up
	/// to (but not including) end of file, whichever comes first.
	void AsmParser::eatToEndOfStatement() {
	  for (;;) {
	    if (Lexer.is(AsmToken::Eof))
	      return;

	    // Consume the current token; stop once the end-of-statement is eaten.
	    const bool AtEndOfStatement = Lexer.is(AsmToken::EndOfStatement);
	    Lexer.Lex();
	    if (AtEndOfStatement)
	      return;
	  }
	}

	/// Consume tokens until end of statement or end of file and return the raw
	/// source text that was skipped. The terminating token is not consumed.
	StringRef AsmParser::parseStringToEndOfStatement() {
	  const char *const Begin = getTok().getLoc().getPointer();

	  while (!(Lexer.is(AsmToken::EndOfStatement) || Lexer.is(AsmToken::Eof)))
	    Lexer.Lex();

	  const char *const Finish = getTok().getLoc().getPointer();
	  return StringRef(Begin, Finish - Begin);
	}

	/// Consume tokens until a comma, end of statement, or end of file and return
	/// the raw source text that was skipped. The terminating token is not
	/// consumed.
	StringRef AsmParser::parseStringToComma() {
	  const char *const Begin = getTok().getLoc().getPointer();

	  for (;;) {
	    const bool AtTerminator = Lexer.is(AsmToken::EndOfStatement) ||
	                              Lexer.is(AsmToken::Comma) ||
	                              Lexer.is(AsmToken::Eof);
	    if (AtTerminator)
	      break;
	    Lexer.Lex();
	  }

	  const char *const Finish = getTok().getLoc().getPointer();
	  return StringRef(Begin, Finish - Begin);
	}

	/// Parse the remainder of a parenthesized expression.
	/// The opening '(' must already have been consumed by the caller.
	///
	/// parenexpr ::= expr)
	///
	/// \returns true on error; on success \p Res holds the expression and
	///          \p EndLoc the end of the closing ')'.
	bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	  if (parseExpression(Res))
	    return true;

	  if (!Lexer.is(AsmToken::RParen))
	    return TokError("expected ')' in parentheses expression");

	  // Record where the ')' ends, then consume it.
	  EndLoc = Lexer.getTok().getEndLoc();
	  Lex();
	  return false;
	}

	/// Parse the remainder of a bracketed expression.
	/// The opening '[' must already have been consumed by the caller.
	///
	/// bracketexpr ::= expr]
	///
	/// \returns true on error; on success \p Res holds the expression.
	bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	  if (parseExpression(Res))
	    return true;

	  // EndLoc is taken from the current token before the ']' check consumes it.
	  EndLoc = getTok().getEndLoc();
	  return parseToken(AsmToken::RBrac, "expected ']' in brackets expression");
	}

	/// Parse a primary expression and return it.
	/// primaryexpr ::= (parenexpr
	/// primaryexpr ::= symbol
	/// primaryexpr ::= number
	/// primaryexpr ::= '.'
	/// primaryexpr ::= ~,+,- primaryexpr
	///
	/// The switch below also handles '!' (logical not), '[' bracket expressions
	/// (only when the platform parser supports them), '$' as the current PC on
	/// targets where that applies, directional labels such as "1b"/"1f", and the
	/// MIPS '%op(expr)' unary operators.
	///
	/// \param Res    receives the parsed expression on success.
	/// \param EndLoc updated by most branches with the end location of what was
	///               consumed.
	/// \returns true on error, false on success.
	///
	/// NOTE(review): this text comes from a rendered diff. The lines starting with
	/// '-' / '+' inside the body are diff hunk markers, and "/SetUsed/" and
	/// "\|\|" look like extraction-garbled forms of a "SetUsed" block comment and
	/// the logical-or operator -- confirm against the real source file.
	bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	SMLoc FirstTokenLoc = getLexer().getLoc();
	AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
	switch (FirstTokenKind) {
	default:
	return TokError("unknown token in expression");
	// If we have an error assume that we've already handled it.
	case AsmToken::Error:
	return true;
	case AsmToken::Exclaim:
	Lex(); // Eat the operator.
	if (parsePrimaryExpr(Res, EndLoc))
	return true;
	Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
	return false;
	case AsmToken::Dollar:
	case AsmToken::At:
	case AsmToken::String:
	case AsmToken::Identifier: {
	StringRef Identifier;
	if (parseIdentifier(Identifier)) {
	// We may have failed but $ may be a valid token.
	if (getTok().is(AsmToken::Dollar)) {
	if (Lexer.getMAI().getDollarIsPC()) {
	Lex();
	// This is a '$' reference, which references the current PC. Emit a
	// temporary label to the streamer and refer to it.
	MCSymbol *Sym = Ctx.createTempSymbol();
	Out.EmitLabel(Sym);
	Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
	getContext());
	EndLoc = FirstTokenLoc;
	return false;
	}
	return Error(FirstTokenLoc, "invalid token in expression");
	}
	}
	// Parse symbol variant
	// Split.first is the symbol name and Split.second the variant name; the
	// variant is written either as "sym@variant" or, on targets that use
	// parentheses for variants, as "sym(variant)".
	std::pair<StringRef, StringRef> Split;
	if (!MAI.useParensForSymbolVariant()) {
	if (FirstTokenKind == AsmToken::String) {
	if (Lexer.is(AsmToken::At)) {
	Lex(); // eat @
	SMLoc AtLoc = getLexer().getLoc();
	StringRef VName;
	if (parseIdentifier(VName))
	return Error(AtLoc, "expected symbol variant after '@'");

	Split = std::make_pair(Identifier, VName);
	}
	} else {
	Split = Identifier.split('@');
	}
	} else if (Lexer.is(AsmToken::LParen)) {
	Lex(); // eat '('.
	StringRef VName;
	// The result of parseIdentifier is not checked here; a failure leaves VName
	// empty and the ')' check below is what reports a problem.
	parseIdentifier(VName);
	// eat ')'.
	if (parseToken(AsmToken::RParen,
	"unexpected token in variant, expected ')'"))
	return true;
	Split = std::make_pair(Identifier, VName);
	}

	EndLoc = SMLoc::getFromPointer(Identifier.end());

	// This is a symbol reference.
	StringRef SymbolName = Identifier;
	if (SymbolName.empty())
	return Error(getLexer().getLoc(), "expected a symbol reference");

	MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;

	// Lookup the symbol variant if used.
	if (!Split.second.empty()) {
	Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
	if (Variant != MCSymbolRefExpr::VK_Invalid) {
	SymbolName = Split.first;
	} else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) {
	// Unknown variant, but '@' may be part of a name on this target: keep the
	// full identifier as the symbol name and use no variant.
	Variant = MCSymbolRefExpr::VK_None;
	} else {
	return Error(SMLoc::getFromPointer(Split.second.begin()),
	"invalid variant '" + Split.second + "'");
	}
	}

	- MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);
	+ MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName);
	+ if (!Sym)
	+ Sym = getContext().getOrCreateSymbol(SymbolName);

	// If this is an absolute variable reference, substitute it now to preserve
	// semantics in the face of reassignment.
	if (Sym->isVariable()) {
	auto V = Sym->getVariableValue(/SetUsed/ false);
	bool DoInline = isa<MCConstantExpr>(V) && !Variant;
	if (auto TV = dyn_cast<MCTargetExpr>(V))
	DoInline = TV->inlineAssignedExpr();
	if (DoInline) {
	if (Variant)
	return Error(EndLoc, "unexpected modifier on variable reference");
	Res = Sym->getVariableValue(/SetUsed/ false);
	return false;
	}
	}

	// Otherwise create a symbol ref.
	Res = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
	return false;
	}
	case AsmToken::BigNum:
	return TokError("literal value out of range for directive");
	case AsmToken::Integer: {
	SMLoc Loc = getTok().getLoc();
	int64_t IntVal = getTok().getIntVal();
	Res = MCConstantExpr::create(IntVal, getContext());
	EndLoc = Lexer.getTok().getEndLoc();
	Lex(); // Eat token.
	// Look for 'b' or 'f' following an Integer as a directional label
	if (Lexer.getKind() == AsmToken::Identifier) {
	StringRef IDVal = getTok().getString();
	// Lookup the symbol variant if used.
	std::pair<StringRef, StringRef> Split = IDVal.split('@');
	MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
	// The sizes differ only when an '@' was actually found in IDVal.
	if (Split.first.size() != IDVal.size()) {
	Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
	if (Variant == MCSymbolRefExpr::VK_Invalid)
	return TokError("invalid variant '" + Split.second + "'");
	IDVal = Split.first;
	}
	if (IDVal == "f" \|\| IDVal == "b") {
	MCSymbol *Sym =
	Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b");
	Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
	if (IDVal == "b" && Sym->isUndefined())
	return Error(Loc, "directional label undefined");
	// Remember the reference so Run() can diagnose labels that never get
	// defined.
	DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym));
	EndLoc = Lexer.getTok().getEndLoc();
	Lex(); // Eat identifier.
	}
	}
	return false;
	}
	case AsmToken::Real: {
	// The constant carries the raw bit pattern of the value parsed as an IEEE
	// double.
	APFloat RealVal(APFloat::IEEEdouble(), getTok().getString());
	uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
	Res = MCConstantExpr::create(IntVal, getContext());
	EndLoc = Lexer.getTok().getEndLoc();
	Lex(); // Eat token.
	return false;
	}
	case AsmToken::Dot: {
	// This is a '.' reference, which references the current PC. Emit a
	// temporary label to the streamer and refer to it.
	MCSymbol *Sym = Ctx.createTempSymbol();
	Out.EmitLabel(Sym);
	Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
	EndLoc = Lexer.getTok().getEndLoc();
	Lex(); // Eat identifier.
	return false;
	}
	case AsmToken::LParen:
	Lex(); // Eat the '('.
	return parseParenExpr(Res, EndLoc);
	case AsmToken::LBrac:
	if (!PlatformParser->HasBracketExpressions())
	return TokError("brackets expression not supported on this target");
	Lex(); // Eat the '['.
	return parseBracketExpr(Res, EndLoc);
	case AsmToken::Minus:
	Lex(); // Eat the operator.
	if (parsePrimaryExpr(Res, EndLoc))
	return true;
	Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
	return false;
	case AsmToken::Plus:
	Lex(); // Eat the operator.
	if (parsePrimaryExpr(Res, EndLoc))
	return true;
	Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
	return false;
	case AsmToken::Tilde:
	Lex(); // Eat the operator.
	if (parsePrimaryExpr(Res, EndLoc))
	return true;
	Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
	return false;
	// MIPS unary expression operators. The lexer won't generate these tokens if
	// MCAsmInfo::HasMipsExpressions is false for the target.
	case AsmToken::PercentCall16:
	case AsmToken::PercentCall_Hi:
	case AsmToken::PercentCall_Lo:
	case AsmToken::PercentDtprel_Hi:
	case AsmToken::PercentDtprel_Lo:
	case AsmToken::PercentGot:
	case AsmToken::PercentGot_Disp:
	case AsmToken::PercentGot_Hi:
	case AsmToken::PercentGot_Lo:
	case AsmToken::PercentGot_Ofst:
	case AsmToken::PercentGot_Page:
	case AsmToken::PercentGottprel:
	case AsmToken::PercentGp_Rel:
	case AsmToken::PercentHi:
	case AsmToken::PercentHigher:
	case AsmToken::PercentHighest:
	case AsmToken::PercentLo:
	case AsmToken::PercentNeg:
	case AsmToken::PercentPcrel_Hi:
	case AsmToken::PercentPcrel_Lo:
	case AsmToken::PercentTlsgd:
	case AsmToken::PercentTlsldm:
	case AsmToken::PercentTprel_Hi:
	case AsmToken::PercentTprel_Lo:
	Lex(); // Eat the operator.
	if (Lexer.isNot(AsmToken::LParen))
	return TokError("expected '(' after operator");
	Lex(); // Eat the operator.
	if (parseExpression(Res, EndLoc))
	return true;
	if (Lexer.isNot(AsmToken::RParen))
	return TokError("expected ')'");
	Lex(); // Eat the operator.
	// The target builds the expression; a null result is reported as failure.
	Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
	return !Res;
	}
	}

	bool AsmParser::parseExpression(const MCExpr *&Res) {
	SMLoc EndLoc;
	return parseExpression(Res, EndLoc);
	}

	/// Rebuild \p E with the symbol variant \p Variant applied to the symbol
	/// references it contains.
	///
	/// The target parser is consulted first; if it returns an expression, that is
	/// the answer. Otherwise the expression tree is walked recursively.
	/// \returns the rewritten expression, or nullptr when \p E contains nothing
	///          the variant could be applied to. A symbol reference that already
	///          has a variant is reported as an error and returned unchanged.
	const MCExpr *
	AsmParser::applyModifierToExpr(const MCExpr *E,
	                               MCSymbolRefExpr::VariantKind Variant) {
	  // Give the target implementation the first chance.
	  if (const MCExpr *FromTarget =
	          getTargetParser().applyModifierToExpr(E, Variant, Ctx))
	    return FromTarget;

	  switch (E->getKind()) {
	  case MCExpr::Target:
	  case MCExpr::Constant:
	    // Nothing in here can take a variant.
	    return nullptr;

	  case MCExpr::SymbolRef: {
	    const auto *SymRef = cast<MCSymbolRefExpr>(E);
	    if (SymRef->getKind() == MCSymbolRefExpr::VK_None)
	      return MCSymbolRefExpr::create(&SymRef->getSymbol(), Variant,
	                                     getContext());

	    TokError("invalid variant on expression '" + getTok().getIdentifier() +
	             "' (already modified)");
	    return E;
	  }

	  case MCExpr::Unary: {
	    const auto *Unary = cast<MCUnaryExpr>(E);
	    const MCExpr *NewOperand =
	        applyModifierToExpr(Unary->getSubExpr(), Variant);
	    if (NewOperand == nullptr)
	      return nullptr;
	    return MCUnaryExpr::create(Unary->getOpcode(), NewOperand, getContext());
	  }

	  case MCExpr::Binary: {
	    const auto *Binary = cast<MCBinaryExpr>(E);
	    const MCExpr *NewLHS = applyModifierToExpr(Binary->getLHS(), Variant);
	    const MCExpr *NewRHS = applyModifierToExpr(Binary->getRHS(), Variant);

	    // Neither side changed, so there is nothing to rebuild.
	    if (NewLHS == nullptr && NewRHS == nullptr)
	      return nullptr;

	    // Keep the original operand on whichever side was left untouched.
	    return MCBinaryExpr::create(Binary->getOpcode(),
	                                NewLHS ? NewLHS : Binary->getLHS(),
	                                NewRHS ? NewRHS : Binary->getRHS(),
	                                getContext());
	  }
	  }

	  llvm_unreachable("Invalid expression kind!");
	}

	/// Check whether the text starting at \p StrLoc is an altmacro <string> or an
	/// arithmetic expression. A string that begins with '<' must end with '>';
	/// anything else is treated as arithmetic.
	///
	/// The scan stops at the first unescaped '>', at a line break, or at the end
	/// of the buffer. A '!' escapes the character that follows it.
	///
	/// \param StrLoc start of the text to scan; its pointer must not be null.
	/// \param EndLoc on a true result, set to the location just past the '>'.
	/// \returns true if a closing '>' was found before the end of the line.
	///
	/// There is a gap between the AltMacro's documentation and the single quote
	/// implementation. GCC does not fully support this feature and so we will not
	/// support it.
	/// TODO: Adding single quote as a string.
	static bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
	  assert((StrLoc.getPointer() != nullptr) &&
	         "Argument to the function cannot be a NULL value");
	  const char *CharPtr = StrLoc.getPointer();
	  while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') &&
	         (*CharPtr != '\0')) {
	    // Skip the escaped character too, but never step over the terminating
	    // NUL: a '!' right before it would otherwise run the scan off the buffer.
	    if (*CharPtr == '!' && CharPtr[1] != '\0')
	      CharPtr++;
	    CharPtr++;
	  }
	  if (*CharPtr == '>') {
	    EndLoc = StrLoc.getFromPointer(CharPtr + 1);
	    return true;
	  }
	  return false;
	}

	/// Return a copy of \p AltMacroStr with the '!' escape characters removed.
	/// Each '!' is dropped and the character after it is copied verbatim (so "!!"
	/// yields a single '!'). A '!' at the very end has nothing to escape and is
	/// dropped instead of indexing past the end of the string.
	static std::string altMacroString(StringRef AltMacroStr) {
	  std::string Res;
	  for (size_t Pos = 0, Size = AltMacroStr.size(); Pos < Size; ++Pos) {
	    if (AltMacroStr[Pos] == '!') {
	      // Dangling escape at end of input: stop before reading out of bounds.
	      if (Pos + 1 == Size)
	        break;
	      ++Pos;
	    }
	    Res += AltMacroStr[Pos];
	  }
	  return Res;
	}

	/// Parse an expression and return it.
	///
	/// expr ::= expr &&,|| expr -> lowest.
	/// expr ::= expr |,^,&,! expr
	/// expr ::= expr ==,!=,<>,<,<=,>,>= expr
	/// expr ::= expr <<,>> expr
	/// expr ::= expr +,- expr
	/// expr ::= expr *,/,% expr -> highest.
	/// expr ::= primaryexpr
	///
	/// \param Res    receives the parsed (and, where possible, constant-folded)
	///               expression.
	/// \param EndLoc updated by the sub-parsers with the end of the expression.
	/// \returns true on error, false on success.
	bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
	  // Parse the expression.
	  Res = nullptr;
	  if (getTargetParser().parsePrimaryExpr(Res, EndLoc) ||
	      parseBinOpRHS(1, Res, EndLoc))
	    return true;

	  // As a special case, we support 'a op b @ modifier' by rewriting the
	  // expression to include the modifier. This is inefficient, but in general we
	  // expect users to use 'a@modifier op b'.
	  if (Lexer.getKind() == AsmToken::At) {
	    Lex();

	    if (Lexer.isNot(AsmToken::Identifier))
	      return TokError("unexpected symbol modifier following '@'");

	    MCSymbolRefExpr::VariantKind Variant =
	        MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
	    if (Variant == MCSymbolRefExpr::VK_Invalid)
	      return TokError("invalid variant '" + getTok().getIdentifier() + "'");

	    const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
	    if (!ModifiedRes) {
	      return TokError("invalid modifier '" + getTok().getIdentifier() +
	                      "' (no symbols present)");
	    }

	    Res = ModifiedRes;
	    Lex();
	  }

	  // Try to constant fold it up front, if possible. Do not exploit
	  // assembler here.
	  int64_t Value;
	  if (Res->evaluateAsAbsolute(Value))
	    Res = MCConstantExpr::create(Value, getContext());

	  return false;
	}

	/// Parse an expression whose opening '(' has already been consumed: first the
	/// parenthesized part, then any binary operators that follow it.
	/// \returns true on error, false on success.
	bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
	  Res = nullptr;
	  return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
	}

	/// Parse a parenthesized expression nested \p ParenDepth levels deep, after
	/// the innermost '(' has been consumed. For each level the trailing binary
	/// operators are parsed; every closing ')' except the outermost is consumed.
	/// \returns true on error, false on success.
	bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
	                                      SMLoc &EndLoc) {
	  if (parseParenExpr(Res, EndLoc))
	    return true;

	  while (ParenDepth > 0) {
	    if (parseBinOpRHS(1, Res, EndLoc))
	      return true;

	    // The last RParen is left for the caller, matching what
	    // parseParenExpression() does.
	    if (ParenDepth > 1) {
	      EndLoc = getTok().getEndLoc();
	      if (parseToken(AsmToken::RParen,
	                     "expected ')' in parentheses expression"))
	        return true;
	    }

	    --ParenDepth;
	  }
	  return false;
	}

	bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
	const MCExpr *Expr;

	SMLoc StartLoc = Lexer.getLoc();
	if (parseExpression(Expr))
	return true;

	if (!Expr->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
	return Error(StartLoc, "expected absolute expression");

	return false;
	}

	/// Darwin operator table: map token \p K to its binary opcode and precedence.
	///
	/// \param Kind set to the opcode when \p K is a binary operator; left
	///             untouched otherwise.
	/// \param ShouldUseLogicalShr selects logical over arithmetic shift for '>>'.
	/// \returns the precedence level (1 = loosest, 6 = tightest), or 0 when \p K
	///          is not a binary operator.
	static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K,
	                                         MCBinaryExpr::Opcode &Kind,
	                                         bool ShouldUseLogicalShr) {
	  switch (K) {
	  default: return 0; // not a binop.

	  // Level 1 (lowest): &&, ||
	  case AsmToken::AmpAmp:   Kind = MCBinaryExpr::LAnd; return 1;
	  case AsmToken::PipePipe: Kind = MCBinaryExpr::LOr;  return 1;

	  // Level 2: |, ^, &
	  //
	  // FIXME: gas seems to support '!' as an infix operator?
	  case AsmToken::Pipe:  Kind = MCBinaryExpr::Or;  return 2;
	  case AsmToken::Caret: Kind = MCBinaryExpr::Xor; return 2;
	  case AsmToken::Amp:   Kind = MCBinaryExpr::And; return 2;

	  // Level 3: ==, !=, <>, <, <=, >, >=
	  case AsmToken::EqualEqual:   Kind = MCBinaryExpr::EQ;  return 3;
	  case AsmToken::ExclaimEqual:
	  case AsmToken::LessGreater:  Kind = MCBinaryExpr::NE;  return 3;
	  case AsmToken::Less:         Kind = MCBinaryExpr::LT;  return 3;
	  case AsmToken::LessEqual:    Kind = MCBinaryExpr::LTE; return 3;
	  case AsmToken::Greater:      Kind = MCBinaryExpr::GT;  return 3;
	  case AsmToken::GreaterEqual: Kind = MCBinaryExpr::GTE; return 3;

	  // Level 4: <<, >>
	  case AsmToken::LessLess: Kind = MCBinaryExpr::Shl; return 4;
	  case AsmToken::GreaterGreater:
	    Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr;
	    return 4;

	  // Level 5: +, -
	  case AsmToken::Plus:  Kind = MCBinaryExpr::Add; return 5;
	  case AsmToken::Minus: Kind = MCBinaryExpr::Sub; return 5;

	  // Level 6 (highest): *, /, %
	  case AsmToken::Star:    Kind = MCBinaryExpr::Mul; return 6;
	  case AsmToken::Slash:   Kind = MCBinaryExpr::Div; return 6;
	  case AsmToken::Percent: Kind = MCBinaryExpr::Mod; return 6;
	  }
	}

	/// GNU operator table: map token \p K to its binary opcode and precedence.
	///
	/// \param Kind set to the opcode when \p K is a binary operator; left
	///             untouched otherwise.
	/// \param ShouldUseLogicalShr selects logical over arithmetic shift for '>>'.
	/// \returns the precedence level (1 = loosest, 6 = tightest), or 0 when \p K
	///          is not a binary operator.
	static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K,
	                                      MCBinaryExpr::Opcode &Kind,
	                                      bool ShouldUseLogicalShr) {
	  switch (K) {
	  default: return 0; // not a binop.

	  // Lowest levels: || binds at 1, && at 2.
	  case AsmToken::AmpAmp:   Kind = MCBinaryExpr::LAnd; return 2;
	  case AsmToken::PipePipe: Kind = MCBinaryExpr::LOr;  return 1;

	  // Level 3: ==, !=, <>, <, <=, >, >=
	  case AsmToken::EqualEqual:   Kind = MCBinaryExpr::EQ;  return 3;
	  case AsmToken::ExclaimEqual:
	  case AsmToken::LessGreater:  Kind = MCBinaryExpr::NE;  return 3;
	  case AsmToken::Less:         Kind = MCBinaryExpr::LT;  return 3;
	  case AsmToken::LessEqual:    Kind = MCBinaryExpr::LTE; return 3;
	  case AsmToken::Greater:      Kind = MCBinaryExpr::GT;  return 3;
	  case AsmToken::GreaterEqual: Kind = MCBinaryExpr::GTE; return 3;

	  // Level 4: +, -
	  case AsmToken::Plus:  Kind = MCBinaryExpr::Add; return 4;
	  case AsmToken::Minus: Kind = MCBinaryExpr::Sub; return 4;

	  // Level 5: |, ^, &
	  //
	  // FIXME: gas seems to support '!' as an infix operator?
	  case AsmToken::Pipe:  Kind = MCBinaryExpr::Or;  return 5;
	  case AsmToken::Caret: Kind = MCBinaryExpr::Xor; return 5;
	  case AsmToken::Amp:   Kind = MCBinaryExpr::And; return 5;

	  // Level 6 (highest): *, /, %, <<, >>
	  case AsmToken::Star:     Kind = MCBinaryExpr::Mul; return 6;
	  case AsmToken::Slash:    Kind = MCBinaryExpr::Div; return 6;
	  case AsmToken::Percent:  Kind = MCBinaryExpr::Mod; return 6;
	  case AsmToken::LessLess: Kind = MCBinaryExpr::Shl; return 6;
	  case AsmToken::GreaterGreater:
	    Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr;
	    return 6;
	  }
	}

	/// Look up the opcode and precedence of token \p K, using the Darwin operator
	/// table when parsing for Darwin and the GNU table otherwise.
	/// \returns the precedence, or 0 if \p K is not a binary operator.
	unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
	                                       MCBinaryExpr::Opcode &Kind) {
	  const bool UseLogicalShr = MAI.shouldUseLogicalShr();
	  if (IsDarwin)
	    return getDarwinBinOpPrecedence(K, Kind, UseLogicalShr);
	  return getGNUBinOpPrecedence(K, Kind, UseLogicalShr);
	}

	/// Parse all binary operators with precedence >= 'Precedence'.
	/// Res contains the LHS of the expression on input.
	bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
	SMLoc &EndLoc) {
	SMLoc StartLoc = Lexer.getLoc();
	while (true) {
	MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
	unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);

	// If the next token is lower precedence than we are allowed to eat, return
	// successfully with what we ate already.
	if (TokPrec < Precedence)
	return false;

	Lex();

	// Eat the next primary expression.
	const MCExpr *RHS;
	if (getTargetParser().parsePrimaryExpr(RHS, EndLoc))
	return true;

	// If BinOp binds less tightly with RHS than the operator after RHS, let
	// the pending operator take RHS as its LHS.
	MCBinaryExpr::Opcode Dummy;
	unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
	if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
	return true;

	// Merge LHS and RHS according to operator.
	Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
	}
	}

	/// ParseStatement:
	/// ::= EndOfStatement
	/// ::= Label* Directive ...Operands... EndOfStatement
	/// ::= Label* Identifier OperandList* EndOfStatement
	bool AsmParser::parseStatement(ParseStatementInfo &Info,
	MCAsmParserSemaCallback *SI) {
	assert(!hasPendingError() && "parseStatement started with pending error");
	// Eat initial spaces and comments
	while (Lexer.is(AsmToken::Space))
	Lex();
	if (Lexer.is(AsmToken::EndOfStatement)) {
	// if this is a line comment we can drop it safely
	if (getTok().getString().empty() \|\| getTok().getString().front() == '\r' \|\|
	getTok().getString().front() == '\n')
	Out.AddBlankLine();
	Lex();
	return false;
	}
	// Statements always start with an identifier.
	AsmToken ID = getTok();
	SMLoc IDLoc = ID.getLoc();
	StringRef IDVal;
	int64_t LocalLabelVal = -1;
	if (Lexer.is(AsmToken::HashDirective))
	return parseCppHashLineFilenameComment(IDLoc);
	// Allow an integer followed by a ':' as a directional local label.
	if (Lexer.is(AsmToken::Integer)) {
	LocalLabelVal = getTok().getIntVal();
	if (LocalLabelVal < 0) {
	if (!TheCondState.Ignore) {
	Lex(); // always eat a token
	return Error(IDLoc, "unexpected token at start of statement");
	}
	IDVal = "";
	} else {
	IDVal = getTok().getString();
	Lex(); // Consume the integer token to be used as an identifier token.
	if (Lexer.getKind() != AsmToken::Colon) {
	if (!TheCondState.Ignore) {
	Lex(); // always eat a token
	return Error(IDLoc, "unexpected token at start of statement");
	}
	}
	}
	} else if (Lexer.is(AsmToken::Dot)) {
	// Treat '.' as a valid identifier in this context.
	Lex();
	IDVal = ".";
	} else if (Lexer.is(AsmToken::LCurly)) {
	// Treat '{' as a valid identifier in this context.
	Lex();
	IDVal = "{";

	} else if (Lexer.is(AsmToken::RCurly)) {
	// Treat '}' as a valid identifier in this context.
	Lex();
	IDVal = "}";
	} else if (Lexer.is(AsmToken::Star) &&
	getTargetParser().starIsStartOfStatement()) {
	// Accept '*' as a valid start of statement.
	Lex();
	IDVal = "*";
	} else if (parseIdentifier(IDVal)) {
	if (!TheCondState.Ignore) {
	Lex(); // always eat a token
	return Error(IDLoc, "unexpected token at start of statement");
	}
	IDVal = "";
	}

	// Handle conditional assembly here before checking for skipping. We
	// have to do this so that .endif isn't skipped in a ".if 0" block for
	// example.
	StringMap<DirectiveKind>::const_iterator DirKindIt =
	DirectiveKindMap.find(IDVal);
	DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
	? DK_NO_DIRECTIVE
	: DirKindIt->getValue();
	switch (DirKind) {
	default:
	break;
	case DK_IF:
	case DK_IFEQ:
	case DK_IFGE:
	case DK_IFGT:
	case DK_IFLE:
	case DK_IFLT:
	case DK_IFNE:
	return parseDirectiveIf(IDLoc, DirKind);
	case DK_IFB:
	return parseDirectiveIfb(IDLoc, true);
	case DK_IFNB:
	return parseDirectiveIfb(IDLoc, false);
	case DK_IFC:
	return parseDirectiveIfc(IDLoc, true);
	case DK_IFEQS:
	return parseDirectiveIfeqs(IDLoc, true);
	case DK_IFNC:
	return parseDirectiveIfc(IDLoc, false);
	case DK_IFNES:
	return parseDirectiveIfeqs(IDLoc, false);
	case DK_IFDEF:
	return parseDirectiveIfdef(IDLoc, true);
	case DK_IFNDEF:
	case DK_IFNOTDEF:
	return parseDirectiveIfdef(IDLoc, false);
	case DK_ELSEIF:
	return parseDirectiveElseIf(IDLoc);
	case DK_ELSE:
	return parseDirectiveElse(IDLoc);
	case DK_ENDIF:
	return parseDirectiveEndIf(IDLoc);
	}

	// Ignore the statement if in the middle of inactive conditional
	// (e.g. ".if 0").
	if (TheCondState.Ignore) {
	eatToEndOfStatement();
	return false;
	}

	// FIXME: Recurse on local labels?

	// See what kind of statement we have.
	switch (Lexer.getKind()) {
	case AsmToken::Colon: {
	if (!getTargetParser().isLabel(ID))
	break;
	if (checkForValidSection())
	return true;

	// identifier ':' -> Label.
	Lex();

	// Diagnose attempt to use '.' as a label.
	if (IDVal == ".")
	return Error(IDLoc, "invalid use of pseudo-symbol '.' as a label");

	// Diagnose attempt to use a variable as a label.
	//
	// FIXME: Diagnostics. Note the location of the definition as a label.
	// FIXME: This doesn't diagnose assignment to a symbol which has been
	// implicitly marked as external.
	MCSymbol *Sym;
	if (LocalLabelVal == -1) {
	if (ParsingInlineAsm && SI) {
	StringRef RewrittenLabel =
	SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
	assert(!RewrittenLabel.empty() &&
	"We should have an internal name here.");
	Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
	RewrittenLabel);
	IDVal = RewrittenLabel;
	}
	Sym = getContext().getOrCreateSymbol(IDVal);
	} else
	Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
	// End of Labels should be treated as end of line for lexing
	// purposes but that information is not available to the Lexer who
	// does not understand Labels. This may cause us to see a Hash
	// here instead of a preprocessor line comment.
	if (getTok().is(AsmToken::Hash)) {
	StringRef CommentStr = parseStringToEndOfStatement();
	Lexer.Lex();
	Lexer.UnLex(AsmToken(AsmToken::EndOfStatement, CommentStr));
	}

	// Consume any end of statement token, if present, to avoid spurious
	// AddBlankLine calls().
	if (getTok().is(AsmToken::EndOfStatement)) {
	Lex();
	}

	getTargetParser().doBeforeLabelEmit(Sym);

	// Emit the label.
	if (!getTargetParser().isParsingInlineAsm())
	Out.EmitLabel(Sym, IDLoc);

	// If we are generating dwarf for assembly source files then gather the
	// info to make a dwarf label entry for this label if needed.
	if (enabledGenDwarfForAssembly())
	MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
	IDLoc);

	getTargetParser().onLabelParsed(Sym);

	return false;
	}

	case AsmToken::Equal:
	if (!getTargetParser().equalIsAsmAssignment())
	break;
	// identifier '=' ... -> assignment statement
	Lex();

	return parseAssignment(IDVal, true);

	default: // Normal instruction or directive.
	break;
	}

	// If macros are enabled, check to see if this is a macro instantiation.
	if (areMacrosEnabled())
	if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
	return handleMacroEntry(M, IDLoc);
	}

	// Otherwise, we have a normal instruction or directive.

	// Directives start with "."
	if (IDVal.startswith(".") && IDVal != ".") {
	// There are several entities interested in parsing directives:
	//
	// 1. The target-specific assembly parser. Some directives are target
	// specific or may potentially behave differently on certain targets.
	// 2. Asm parser extensions. For example, platform-specific parsers
	// (like the ELF parser) register themselves as extensions.
	// 3. The generic directive parser implemented by this class. These are
	// all the directives that behave in a target and platform independent
	// manner, or at least have a default behavior that's shared between
	// all targets and platforms.

	getTargetParser().flushPendingInstructions(getStreamer());

	SMLoc StartTokLoc = getTok().getLoc();
	bool TPDirectiveReturn = getTargetParser().ParseDirective(ID);

	if (hasPendingError())
	return true;
	// Currently the return value should be true if we are
	// uninterested but as this is at odds with the standard parsing
	// convention (return true = error) we have instances of a parsed
	// directive that fails returning true as an error. Catch these
	// cases as best as possible errors here.
	if (TPDirectiveReturn && StartTokLoc != getTok().getLoc())
	return true;
	// Return if we did some parsing or believe we succeeded.
	if (!TPDirectiveReturn \|\| StartTokLoc != getTok().getLoc())
	return false;

	// Next, check the extension directive map to see if any extension has
	// registered itself to parse this directive.
	std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
	ExtensionDirectiveMap.lookup(IDVal);
	if (Handler.first)
	return (*Handler.second)(Handler.first, IDVal, IDLoc);

	// Finally, if no one else is interested in this directive, it must be
	// generic and familiar to this class.
	switch (DirKind) {
	default:
	break;
	case DK_SET:
	case DK_EQU:
	return parseDirectiveSet(IDVal, true);
	case DK_EQUIV:
	return parseDirectiveSet(IDVal, false);
	case DK_ASCII:
	return parseDirectiveAscii(IDVal, false);
	case DK_ASCIZ:
	case DK_STRING:
	return parseDirectiveAscii(IDVal, true);
	case DK_BYTE:
	case DK_DC_B:
	return parseDirectiveValue(IDVal, 1);
	case DK_DC:
	case DK_DC_W:
	case DK_SHORT:
	case DK_VALUE:
	case DK_2BYTE:
	return parseDirectiveValue(IDVal, 2);
	case DK_LONG:
	case DK_INT:
	case DK_4BYTE:
	case DK_DC_L:
	return parseDirectiveValue(IDVal, 4);
	case DK_QUAD:
	case DK_8BYTE:
	return parseDirectiveValue(IDVal, 8);
	case DK_DC_A:
	return parseDirectiveValue(
	IDVal, getContext().getAsmInfo()->getCodePointerSize());
	case DK_OCTA:
	return parseDirectiveOctaValue(IDVal);
	case DK_SINGLE:
	case DK_FLOAT:
	case DK_DC_S:
	return parseDirectiveRealValue(IDVal, APFloat::IEEEsingle());
	case DK_DOUBLE:
	case DK_DC_D:
	return parseDirectiveRealValue(IDVal, APFloat::IEEEdouble());
	case DK_ALIGN: {
	bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
	return parseDirectiveAlign(IsPow2, /ExprSize=/1);
	}
	case DK_ALIGN32: {
	bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
	return parseDirectiveAlign(IsPow2, /ExprSize=/4);
	}
	case DK_BALIGN:
	return parseDirectiveAlign(/IsPow2=/false, /ExprSize=/1);
	case DK_BALIGNW:
	return parseDirectiveAlign(/IsPow2=/false, /ExprSize=/2);
	case DK_BALIGNL:
	return parseDirectiveAlign(/IsPow2=/false, /ExprSize=/4);
	case DK_P2ALIGN:
	return parseDirectiveAlign(/IsPow2=/true, /ExprSize=/1);
	case DK_P2ALIGNW:
	return parseDirectiveAlign(/IsPow2=/true, /ExprSize=/2);
	case DK_P2ALIGNL:
	return parseDirectiveAlign(/IsPow2=/true, /ExprSize=/4);
	case DK_ORG:
	return parseDirectiveOrg();
	case DK_FILL:
	return parseDirectiveFill();
	case DK_ZERO:
	return parseDirectiveZero();
	case DK_EXTERN:
	eatToEndOfStatement(); // .extern is the default, ignore it.
	return false;
	case DK_GLOBL:
	case DK_GLOBAL:
	return parseDirectiveSymbolAttribute(MCSA_Global);
	case DK_LAZY_REFERENCE:
	return parseDirectiveSymbolAttribute(MCSA_LazyReference);
	case DK_NO_DEAD_STRIP:
	return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
	case DK_SYMBOL_RESOLVER:
	return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
	case DK_PRIVATE_EXTERN:
	return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
	case DK_REFERENCE:
	return parseDirectiveSymbolAttribute(MCSA_Reference);
	case DK_WEAK_DEFINITION:
	return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
	case DK_WEAK_REFERENCE:
	return parseDirectiveSymbolAttribute(MCSA_WeakReference);
	case DK_WEAK_DEF_CAN_BE_HIDDEN:
	return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
	case DK_COLD:
	return parseDirectiveSymbolAttribute(MCSA_Cold);
	case DK_COMM:
	case DK_COMMON:
	return parseDirectiveComm(/IsLocal=/false);
	case DK_LCOMM:
	return parseDirectiveComm(/IsLocal=/true);
	case DK_ABORT:
	return parseDirectiveAbort();
	case DK_INCLUDE:
	return parseDirectiveInclude();
	case DK_INCBIN:
	return parseDirectiveIncbin();
	case DK_CODE16:
	case DK_CODE16GCC:
	return TokError(Twine(IDVal) +
	" not currently supported for this target");
	case DK_REPT:
	return parseDirectiveRept(IDLoc, IDVal);
	case DK_IRP:
	return parseDirectiveIrp(IDLoc);
	case DK_IRPC:
	return parseDirectiveIrpc(IDLoc);
	case DK_ENDR:
	return parseDirectiveEndr(IDLoc);
	case DK_BUNDLE_ALIGN_MODE:
	return parseDirectiveBundleAlignMode();
	case DK_BUNDLE_LOCK:
	return parseDirectiveBundleLock();
	case DK_BUNDLE_UNLOCK:
	return parseDirectiveBundleUnlock();
	case DK_SLEB128:
	return parseDirectiveLEB128(true);
	case DK_ULEB128:
	return parseDirectiveLEB128(false);
	case DK_SPACE:
	case DK_SKIP:
	return parseDirectiveSpace(IDVal);
	case DK_FILE:
	return parseDirectiveFile(IDLoc);
	case DK_LINE:
	return parseDirectiveLine();
	case DK_LOC:
	return parseDirectiveLoc();
	case DK_STABS:
	return parseDirectiveStabs();
	case DK_CV_FILE:
	return parseDirectiveCVFile();
	case DK_CV_FUNC_ID:
	return parseDirectiveCVFuncId();
	case DK_CV_INLINE_SITE_ID:
	return parseDirectiveCVInlineSiteId();
	case DK_CV_LOC:
	return parseDirectiveCVLoc();
	case DK_CV_LINETABLE:
	return parseDirectiveCVLinetable();
	case DK_CV_INLINE_LINETABLE:
	return parseDirectiveCVInlineLinetable();
	case DK_CV_DEF_RANGE:
	return parseDirectiveCVDefRange();
	case DK_CV_STRING:
	return parseDirectiveCVString();
	case DK_CV_STRINGTABLE:
	return parseDirectiveCVStringTable();
	case DK_CV_FILECHECKSUMS:
	return parseDirectiveCVFileChecksums();
	case DK_CV_FILECHECKSUM_OFFSET:
	return parseDirectiveCVFileChecksumOffset();
	case DK_CV_FPO_DATA:
	return parseDirectiveCVFPOData();
	case DK_CFI_SECTIONS:
	return parseDirectiveCFISections();
	case DK_CFI_STARTPROC:
	return parseDirectiveCFIStartProc();
	case DK_CFI_ENDPROC:
	return parseDirectiveCFIEndProc();
	case DK_CFI_DEF_CFA:
	return parseDirectiveCFIDefCfa(IDLoc);
	case DK_CFI_DEF_CFA_OFFSET:
	return parseDirectiveCFIDefCfaOffset();
	case DK_CFI_ADJUST_CFA_OFFSET:
	return parseDirectiveCFIAdjustCfaOffset();
	case DK_CFI_DEF_CFA_REGISTER:
	return parseDirectiveCFIDefCfaRegister(IDLoc);
	case DK_CFI_OFFSET:
	return parseDirectiveCFIOffset(IDLoc);
	case DK_CFI_REL_OFFSET:
	return parseDirectiveCFIRelOffset(IDLoc);
	case DK_CFI_PERSONALITY:
	return parseDirectiveCFIPersonalityOrLsda(true);
	case DK_CFI_LSDA:
	return parseDirectiveCFIPersonalityOrLsda(false);
	case DK_CFI_REMEMBER_STATE:
	return parseDirectiveCFIRememberState();
	case DK_CFI_RESTORE_STATE:
	return parseDirectiveCFIRestoreState();
	case DK_CFI_SAME_VALUE:
	return parseDirectiveCFISameValue(IDLoc);
	case DK_CFI_RESTORE:
	return parseDirectiveCFIRestore(IDLoc);
	case DK_CFI_ESCAPE:
	return parseDirectiveCFIEscape();
	case DK_CFI_RETURN_COLUMN:
	return parseDirectiveCFIReturnColumn(IDLoc);
	case DK_CFI_SIGNAL_FRAME:
	return parseDirectiveCFISignalFrame();
	case DK_CFI_UNDEFINED:
	return parseDirectiveCFIUndefined(IDLoc);
	case DK_CFI_REGISTER:
	return parseDirectiveCFIRegister(IDLoc);
	case DK_CFI_WINDOW_SAVE:
	return parseDirectiveCFIWindowSave();
	case DK_MACROS_ON:
	case DK_MACROS_OFF:
	return parseDirectiveMacrosOnOff(IDVal);
	case DK_MACRO:
	return parseDirectiveMacro(IDLoc);
	case DK_ALTMACRO:
	case DK_NOALTMACRO:
	return parseDirectiveAltmacro(IDVal);
	case DK_EXITM:
	return parseDirectiveExitMacro(IDVal);
	case DK_ENDM:
	case DK_ENDMACRO:
	return parseDirectiveEndMacro(IDVal);
	case DK_PURGEM:
	return parseDirectivePurgeMacro(IDLoc);
	case DK_END:
	return parseDirectiveEnd(IDLoc);
	case DK_ERR:
	return parseDirectiveError(IDLoc, false);
	case DK_ERROR:
	return parseDirectiveError(IDLoc, true);
	case DK_WARNING:
	return parseDirectiveWarning(IDLoc);
	case DK_RELOC:
	return parseDirectiveReloc(IDLoc);
	case DK_DCB:
	case DK_DCB_W:
	return parseDirectiveDCB(IDVal, 2);
	case DK_DCB_B:
	return parseDirectiveDCB(IDVal, 1);
	case DK_DCB_D:
	return parseDirectiveRealDCB(IDVal, APFloat::IEEEdouble());
	case DK_DCB_L:
	return parseDirectiveDCB(IDVal, 4);
	case DK_DCB_S:
	return parseDirectiveRealDCB(IDVal, APFloat::IEEEsingle());
	case DK_DC_X:
	case DK_DCB_X:
	return TokError(Twine(IDVal) +
	" not currently supported for this target");
	case DK_DS:
	case DK_DS_W:
	return parseDirectiveDS(IDVal, 2);
	case DK_DS_B:
	return parseDirectiveDS(IDVal, 1);
	case DK_DS_D:
	return parseDirectiveDS(IDVal, 8);
	case DK_DS_L:
	case DK_DS_S:
	return parseDirectiveDS(IDVal, 4);
	case DK_DS_P:
	case DK_DS_X:
	return parseDirectiveDS(IDVal, 12);
	case DK_PRINT:
	return parseDirectivePrint(IDLoc);
	case DK_ADDRSIG:
	return parseDirectiveAddrsig();
	case DK_ADDRSIG_SYM:
	return parseDirectiveAddrsigSym();
	}

	return Error(IDLoc, "unknown directive");
	}

	// __asm _emit or __asm __emit
	if (ParsingInlineAsm && (IDVal == "_emit" \|\| IDVal == "__emit" \|\|
	IDVal == "_EMIT" \|\| IDVal == "__EMIT"))
	return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());

	// __asm align
	if (ParsingInlineAsm && (IDVal == "align" \|\| IDVal == "ALIGN"))
	return parseDirectiveMSAlign(IDLoc, Info);

	if (ParsingInlineAsm && (IDVal == "even" \|\| IDVal == "EVEN"))
	Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
	if (checkForValidSection())
	return true;

	// Canonicalize the opcode to lower case.
	std::string OpcodeStr = IDVal.lower();
	ParseInstructionInfo IInfo(Info.AsmRewrites);
	bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID,
	Info.ParsedOperands);
	Info.ParseError = ParseHadError;

	// Dump the parsed representation, if requested.
	if (getShowParsedOperands()) {
	SmallString<256> Str;
	raw_svector_ostream OS(Str);
	OS << "parsed instruction: [";
	for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
	if (i != 0)
	OS << ", ";
	Info.ParsedOperands[i]->print(OS);
	}
	OS << "]";

	printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
	}

	// Fail even if ParseInstruction erroneously returns false.
	if (hasPendingError() \|\| ParseHadError)
	return true;

	// If we are generating dwarf for the current section then generate a .loc
	// directive for the instruction.
	if (!ParseHadError && enabledGenDwarfForAssembly() &&
	getContext().getGenDwarfSectionSyms().count(
	getStreamer().getCurrentSectionOnly())) {
	unsigned Line;
	if (ActiveMacros.empty())
	Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
	else
	Line = SrcMgr.FindLineNumber(ActiveMacros.front()->InstantiationLoc,
	ActiveMacros.front()->ExitBuffer);

	// If we previously parsed a cpp hash file line comment then make sure the
	// current Dwarf File is for the CppHashFilename if not then emit the
	// Dwarf File table for it and adjust the line number for the .loc.
	if (!CppHashInfo.Filename.empty()) {
	unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
	0, StringRef(), CppHashInfo.Filename);
	getContext().setGenDwarfFileNumber(FileNumber);

	unsigned CppHashLocLineNo =
	SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
	Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
	}

	getStreamer().EmitDwarfLocDirective(
	getContext().getGenDwarfFileNumber(), Line, 0,
	DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
	StringRef());
	}

	// If parsing succeeded, match the instruction.
	if (!ParseHadError) {
	uint64_t ErrorInfo;
	if (getTargetParser().MatchAndEmitInstruction(
	IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
	getTargetParser().isParsingInlineAsm()))
	return true;
	}
	return false;
	}

	/// Consume a '{' or '}' that marks the start/end of a block, along with an
	/// EndOfStatement token that directly follows it, and record an AOK_Skip
	/// rewrite spanning the consumed text so the brace is erased from the output
	/// asm string.
	///
	/// \param AsmStrRewrites list that receives the AOK_Skip rewrite.
	/// \returns true if a brace was consumed; false if the current token is not a
	/// curly brace (nothing is consumed or recorded in that case).
	bool AsmParser::parseCurlyBlockScope(
	    SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
	  // Only '{' and '}' are of interest here.
	  const bool AtBrace =
	      Lexer.is(AsmToken::LCurly) || Lexer.is(AsmToken::RCurly);
	  if (!AtBrace)
	    return false;

	  const SMLoc BraceLoc = Lexer.getLoc();

	  // Step over the brace and an immediately following EndOfStatement.
	  Lex();
	  if (Lexer.is(AsmToken::EndOfStatement))
	    Lex();

	  // The rewrite covers everything from the brace up to the current token.
	  const char *SkipBegin = BraceLoc.getPointer();
	  const char *SkipEnd = Lexer.getLoc().getPointer();
	  AsmStrRewrites.emplace_back(AOK_Skip, BraceLoc, SkipEnd - SkipBegin);
	  return true;
	}

	/// parseCppHashLineFilenameComment
	///   ::= # number "filename"
	///
	/// Consumes the hash, the line number and the quoted filename, then stores
	/// the location \p L, the unquoted filename, the line number and the current
	/// buffer in CppHashInfo. The first filename seen is additionally kept in
	/// FirstCppHashFilename.
	///
	/// \returns false (this function does not report errors).
	bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
	  Lex(); // Step over the hash token.

	  // The lexer is expected to hand out HashDirective only for fully formed
	  // line markers, so a mismatch below is an internal error, not a user one.
	  assert(getTok().is(AsmToken::Integer) &&
	         "Lexing Cpp line comment: Expected Integer");
	  const int64_t MarkerLine = getTok().getIntVal();
	  Lex();

	  assert(getTok().is(AsmToken::String) &&
	         "Lexing Cpp line comment: Expected String");
	  const StringRef QuotedName = getTok().getString();
	  Lex();

	  // Drop the opening and closing quote characters.
	  const StringRef BareName = QuotedName.substr(1, QuotedName.size() - 2);

	  // Remember the marker for diagnostics and, possibly, DWARF file info.
	  CppHashInfo.Buf = CurBuffer;
	  CppHashInfo.LineNumber = MarkerLine;
	  CppHashInfo.Filename = BareName;
	  CppHashInfo.Loc = L;

	  if (FirstCppHashFilename.empty())
	    FirstCppHashFilename = BareName;

	  return false;
	}

	/// Diagnostic handler that rewrites file/line information using the last
	/// parsed cpp hash line filename comment, if any.
	///
	/// \param Diag    the diagnostic to emit.
	/// \param Context the AsmParser this handler belongs to, passed as void *.
	///
	/// When no line marker has been recorded, or the diagnostic comes from a
	/// different source manager or buffer than the marker, the diagnostic is
	/// emitted unchanged. Otherwise a copy carrying the marker's filename and an
	/// adjusted line number is emitted. Emission goes through SavedDiagHandler
	/// when one is set, and to errs() otherwise.
	void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
	  const AsmParser *Parser = static_cast<const AsmParser *>(Context);
	  raw_ostream &OS = errs();

	  const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
	  SMLoc DiagLoc = Diag.getLoc();
	  // Looked up once and reused below for both the include-stack check and
	  // the comparison against the line marker's buffer.
	  unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
	  unsigned CppHashBuf =
	      Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashInfo.Loc);

	  // Like SourceMgr::printMessage() we need to print the include stack if any
	  // before printing the message.
	  if (!Parser->SavedDiagHandler && DiagBuf &&
	      DiagBuf != DiagSrcMgr.getMainFileID()) {
	    SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagBuf);
	    DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
	  }

	  // If we have not parsed a cpp hash line filename comment or the source
	  // manager changed or buffer changed (like in a nested include) then just
	  // print the normal diagnostic using its Filename and LineNo.
	  if (!Parser->CppHashInfo.LineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
	      DiagBuf != CppHashBuf) {
	    if (Parser->SavedDiagHandler)
	      Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
	    else
	      Diag.print(nullptr, OS);
	    return;
	  }

	  // Use the CppHashFilename and calculate a line number based on the
	  // CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
	  // for the diagnostic.
	  const std::string &Filename = Parser->CppHashInfo.Filename;

	  int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
	  int CppHashLocLineNo =
	      Parser->SrcMgr.FindLineNumber(Parser->CppHashInfo.Loc, CppHashBuf);
	  int LineNo =
	      Parser->CppHashInfo.LineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);

	  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
	                       Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
	                       Diag.getLineContents(), Diag.getRanges());

	  if (Parser->SavedDiagHandler)
	    Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
	  else
	    NewDiag.print(nullptr, OS);
	}

	// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
	// difference being that that function accepts '@' as part of identifiers and
	// we can't do that. AsmLexer.cpp should probably be changed to handle
	// '@' as a special case when needed.
	//
	/// \returns true if \p c is alphanumeric or one of '_', '$', '.'.
	static bool isIdentifierChar(char c) {
	  // The cast avoids passing a negative value to isalnum for high-bit chars.
	  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
	         c == '.';
	}

	/// Write \p Body to \p OS with macro parameter references substituted.
	///
	/// \param OS         stream receiving the expanded text.
	/// \param Body       macro body to expand.
	/// \param Parameters formal parameters of the macro.
	/// \param A          actual arguments, one token list per parameter.
	/// \param EnableAtPseudoVariable whether "\@" expands to
	///        NumOfMacroInstantiations.
	/// \param L          location used when reporting an argument count mismatch.
	/// \returns true if an error was reported, false otherwise.
	///
	/// Two substitution syntaxes are handled:
	///  - Darwin macros without parameters: "$$", "$n" and "$0".."$9".
	///  - everything else: "\name", "\@" and the "\()" separator.
	bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
	                            ArrayRef<MCAsmMacroParameter> Parameters,
	                            ArrayRef<MCAsmMacroArgument> A,
	                            bool EnableAtPseudoVariable, SMLoc L) {
	  unsigned NParameters = Parameters.size();
	  bool HasVararg = NParameters ? Parameters.back().Vararg : false;
	  if ((!IsDarwin || NParameters != 0) && NParameters != A.size())
	    return Error(L, "Wrong number of arguments");

	  // A macro without parameters is handled differently on Darwin:
	  // gas accepts no arguments and does no substitutions
	  while (!Body.empty()) {
	    // Scan for the next substitution.
	    std::size_t End = Body.size(), Pos = 0;
	    for (; Pos != End; ++Pos) {
	      // Check for a substitution or escape.
	      if (IsDarwin && !NParameters) {
	        // This macro has no parameters, look for $0, $1, etc.
	        if (Body[Pos] != '$' || Pos + 1 == End)
	          continue;

	        char Next = Body[Pos + 1];
	        if (Next == '$' || Next == 'n' ||
	            isdigit(static_cast<unsigned char>(Next)))
	          break;
	      } else {
	        // This macro has parameters, look for \foo, \bar, etc.
	        if (Body[Pos] == '\\' && Pos + 1 != End)
	          break;
	      }
	    }

	    // Add the prefix.
	    OS << Body.slice(0, Pos);

	    // Check if we reached the end.
	    if (Pos == End)
	      break;

	    // From here on Pos + 1 < End: the scan above only stops on a marker
	    // character that is followed by at least one more character.
	    if (IsDarwin && !NParameters) {
	      switch (Body[Pos + 1]) {
	      // $$ => $
	      case '$':
	        OS << '$';
	        break;

	      // $n => number of arguments
	      case 'n':
	        OS << A.size();
	        break;

	      // $[0-9] => argument
	      default: {
	        // Missing arguments are ignored.
	        unsigned Index = Body[Pos + 1] - '0';
	        if (Index >= A.size())
	          break;

	        // Otherwise substitute with the token values, with spaces eliminated.
	        for (const AsmToken &Token : A[Index])
	          OS << Token.getString();
	        break;
	      }
	      }
	      Pos += 2;
	    } else {
	      std::size_t I = Pos + 1;

	      // Check for the \@ pseudo-variable. Body[I] is valid (see above), so
	      // this also recognizes "\@" when it is the very end of the body.
	      if (EnableAtPseudoVariable && Body[I] == '@')
	        ++I;
	      else
	        // Test the bound before indexing so that an identifier running up to
	        // the end of the body keeps its final character.
	        while (I != End && isIdentifierChar(Body[I]))
	          ++I;

	      const char *Begin = Body.data() + Pos + 1;
	      StringRef Argument(Begin, I - (Pos + 1));
	      unsigned Index = 0;

	      if (Argument == "@") {
	        OS << NumOfMacroInstantiations;
	        Pos += 2;
	      } else {
	        for (; Index < NParameters; ++Index)
	          if (Parameters[Index].Name == Argument)
	            break;

	        if (Index == NParameters) {
	          // Not a parameter name. "\()" is dropped; anything else is copied
	          // through unchanged. Guard the look-ahead so a body ending in "\("
	          // does not read past the end.
	          if (Body[Pos + 1] == '(' && Pos + 2 != End &&
	              Body[Pos + 2] == ')')
	            Pos += 3;
	          else {
	            OS << '\\' << Argument;
	            Pos = I;
	          }
	        } else {
	          bool VarargParameter = HasVararg && Index == (NParameters - 1);
	          for (const AsmToken &Token : A[Index]) {
	            // For altmacro mode, you can write '%expr'.
	            // The prefix '%' evaluates the expression 'expr'
	            // and uses the result as a string (e.g. replace %(1+2) with the
	            // string "3").
	            // Here, we identify the integer token which is the result of the
	            // absolute expression evaluation and replace it with its string
	            // representation.
	            if (AltMacroMode && Token.getString().front() == '%' &&
	                Token.is(AsmToken::Integer)) {
	              // Emit an integer value to the buffer.
	              OS << Token.getIntVal();
	            }
	            // Only Token that was validated as a string and begins with '<'
	            // is considered altMacroString!!!
	            else if (AltMacroMode && Token.getString().front() == '<' &&
	                     Token.is(AsmToken::String)) {
	              OS << altMacroString(Token.getStringContents());
	            }
	            // We expect no quotes around the string's contents when
	            // parsing for varargs.
	            else if (Token.isNot(AsmToken::String) || VarargParameter) {
	              OS << Token.getString();
	            } else {
	              OS << Token.getStringContents();
	            }
	          }

	          Pos += 1 + Argument.size();
	        }
	      }
	    }
	    // Update the scan point.
	    Body = Body.substr(Pos);
	  }

	  return false;
	}

	/// Record the state needed to leave a macro instantiation again.
	///
	/// \param IL location stored as InstantiationLoc.
	/// \param EB buffer id stored as ExitBuffer.
	/// \param EL location stored as ExitLoc.
	/// \param CondStackDepth value stored as CondStackDepth.
	MacroInstantiation::MacroInstantiation(SMLoc IL, int EB, SMLoc EL,
	size_t CondStackDepth)
	: InstantiationLoc(IL), ExitBuffer(EB), ExitLoc(EL),
	CondStackDepth(CondStackDepth) {}

	/// \returns true if \p kind is one of the token kinds treated as an operator
	/// when splitting macro arguments, false for every other kind.
	static bool isOperator(AsmToken::TokenKind kind) {
	  switch (kind) {
	  // Arithmetic.
	  case AsmToken::Plus:
	  case AsmToken::Minus:
	  case AsmToken::Star:
	  case AsmToken::Slash:
	  // Bitwise and logical.
	  case AsmToken::Tilde:
	  case AsmToken::Pipe:
	  case AsmToken::PipePipe:
	  case AsmToken::Caret:
	  case AsmToken::Amp:
	  case AsmToken::AmpAmp:
	  case AsmToken::Exclaim:
	  // Shifts.
	  case AsmToken::LessLess:
	  case AsmToken::GreaterGreater:
	  // Comparisons and assignment.
	  case AsmToken::Equal:
	  case AsmToken::EqualEqual:
	  case AsmToken::ExclaimEqual:
	  case AsmToken::Less:
	  case AsmToken::LessEqual:
	  case AsmToken::LessGreater:
	  case AsmToken::Greater:
	  case AsmToken::GreaterEqual:
	  // Miscellaneous.
	  case AsmToken::Dot:
	    return true;
	  default:
	    return false;
	  }
	}

	namespace {

	class AsmLexerSkipSpaceRAII {
	public:
	AsmLexerSkipSpaceRAII(AsmLexer &Lexer, bool SkipSpace) : Lexer(Lexer) {
	Lexer.setSkipSpace(SkipSpace);
	}

	~AsmLexerSkipSpaceRAII() {
	Lexer.setSkipSpace(true);
	}

	private:
	AsmLexer &Lexer;
	};

	} // end anonymous namespace

	/// Collect the tokens of one macro argument into \p MA.
	///
	/// \param MA     token list the argument's tokens are appended to.
	/// \param Vararg when true, the rest of the statement (if any) is taken as a
	///               single String token.
	/// \returns true if an error was reported, false otherwise.
	///
	/// For non-vararg arguments, collection stops (without consuming the
	/// delimiter) at a top-level comma, at a top-level space that is not followed
	/// by an operator, or at EndOfStatement.
	bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {

	  if (Vararg) {
	    if (Lexer.isNot(AsmToken::EndOfStatement)) {
	      StringRef Str = parseStringToEndOfStatement();
	      MA.emplace_back(AsmToken::String, Str);
	    }
	    return false;
	  }

	  unsigned ParenLevel = 0;

	  // Darwin doesn't use spaces to delimit arguments.
	  AsmLexerSkipSpaceRAII ScopedSkipSpace(Lexer, IsDarwin);

	  while (true) {
	    bool SpaceEaten = false;
	    if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
	      return TokError("unexpected token in macro instantiation");

	    if (ParenLevel == 0) {

	      if (Lexer.is(AsmToken::Comma))
	        break;

	      if (Lexer.is(AsmToken::Space)) {
	        SpaceEaten = true;
	        Lexer.Lex(); // Eat spaces
	      }

	      // Spaces can delimit parameters, but could also be part an expression.
	      // If the token after a space is an operator, add the token and the next
	      // one into this argument
	      if (!IsDarwin) {
	        if (isOperator(Lexer.getKind())) {
	          MA.push_back(getTok());
	          Lexer.Lex();

	          // Whitespace after an operator can be ignored.
	          if (Lexer.is(AsmToken::Space))
	            Lexer.Lex();

	          continue;
	        }
	      }
	      if (SpaceEaten)
	        break;
	    }

	    // handleMacroEntry relies on not advancing the lexer here
	    // to be able to fill in the remaining default parameter values
	    if (Lexer.is(AsmToken::EndOfStatement))
	      break;

	    // Adjust the current parentheses level.
	    if (Lexer.is(AsmToken::LParen))
	      ++ParenLevel;
	    else if (Lexer.is(AsmToken::RParen) && ParenLevel)
	      --ParenLevel;

	    // Append the token to the current argument list.
	    MA.push_back(getTok());
	    Lexer.Lex();
	  }

	  if (ParenLevel != 0)
	    return TokError("unbalanced parentheses in macro argument");
	  return false;
	}

	/// Parse the macro instantiation arguments.
	///
	/// \param M the macro being instantiated; when null, the parameter count is
	///          taken to be zero.
	/// \param A receives one token list per argument, indexed by parameter
	///          position.
	/// \returns true if an error was reported, false otherwise.
	///
	/// Arguments may be positional or of the form name=value, but once a named
	/// argument has been seen all following ones must be named as well. At the
	/// end of the statement, parameters left without a value receive their
	/// default, and an error is reported for each required parameter that is
	/// still empty.
	bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
	                                    MCAsmMacroArguments &A) {
	  const unsigned NParameters = M ? M->Parameters.size() : 0;
	  bool NamedParametersFound = false;
	  SmallVector<SMLoc, 4> FALocs;

	  A.resize(NParameters);
	  FALocs.resize(NParameters);

	  // Parse two kinds of macro invocations:
	  // - macros defined without any parameters accept an arbitrary number of them
	  // - macros defined with parameters accept at most that many of them
	  bool HasVararg = NParameters ? M->Parameters.back().Vararg : false;
	  for (unsigned Parameter = 0; !NParameters || Parameter < NParameters;
	       ++Parameter) {
	    SMLoc IDLoc = Lexer.getLoc();
	    MCAsmMacroParameter FA;

	    if (Lexer.is(AsmToken::Identifier) && Lexer.peekTok().is(AsmToken::Equal)) {
	      if (parseIdentifier(FA.Name))
	        return Error(IDLoc, "invalid argument identifier for formal argument");

	      if (Lexer.isNot(AsmToken::Equal))
	        return TokError("expected '=' after formal parameter identifier");

	      Lex();

	      NamedParametersFound = true;
	    }
	    bool Vararg = HasVararg && Parameter == (NParameters - 1);

	    if (NamedParametersFound && FA.Name.empty())
	      return Error(IDLoc, "cannot mix positional and keyword arguments");

	    SMLoc StrLoc = Lexer.getLoc();
	    SMLoc EndLoc;
	    if (AltMacroMode && Lexer.is(AsmToken::Percent)) {
	      const MCExpr *AbsoluteExp;
	      int64_t Value;
	      /// Eat '%'
	      Lex();
	      // A failed expression parse is an error for the whole instantiation;
	      // propagate it rather than reporting success with a partial argument
	      // list.
	      if (parseExpression(AbsoluteExp, EndLoc))
	        return true;
	      if (!AbsoluteExp->evaluateAsAbsolute(Value,
	                                           getStreamer().getAssemblerPtr()))
	        return Error(StrLoc, "expected absolute expression");
	      const char *StrChar = StrLoc.getPointer();
	      const char *EndChar = EndLoc.getPointer();
	      AsmToken newToken(AsmToken::Integer,
	                        StringRef(StrChar, EndChar - StrChar), Value);
	      FA.Value.push_back(newToken);
	    } else if (AltMacroMode && Lexer.is(AsmToken::Less) &&
	               isAltmacroString(StrLoc, EndLoc)) {
	      const char *StrChar = StrLoc.getPointer();
	      const char *EndChar = EndLoc.getPointer();
	      jumpToLoc(EndLoc, CurBuffer);
	      /// Eat from '<' to '>'
	      Lex();
	      AsmToken newToken(AsmToken::String,
	                        StringRef(StrChar, EndChar - StrChar));
	      FA.Value.push_back(newToken);
	    } else if (parseMacroArgument(FA.Value, Vararg))
	      return true;

	    // Slot the value goes into: positional by default, looked up by name for
	    // keyword arguments.
	    unsigned PI = Parameter;
	    if (!FA.Name.empty()) {
	      unsigned FAI = 0;
	      for (FAI = 0; FAI < NParameters; ++FAI)
	        if (M->Parameters[FAI].Name == FA.Name)
	          break;

	      if (FAI >= NParameters) {
	        assert(M && "expected macro to be defined");
	        return Error(IDLoc, "parameter named '" + FA.Name +
	                                "' does not exist for macro '" + M->Name + "'");
	      }
	      PI = FAI;
	    }

	    if (!FA.Value.empty()) {
	      if (A.size() <= PI)
	        A.resize(PI + 1);
	      A[PI] = FA.Value;

	      if (FALocs.size() <= PI)
	        FALocs.resize(PI + 1);

	      FALocs[PI] = Lexer.getLoc();
	    }

	    // At the end of the statement, fill in remaining arguments that have
	    // default values. If there aren't any, then the next argument is
	    // required but missing
	    if (Lexer.is(AsmToken::EndOfStatement)) {
	      bool Failure = false;
	      for (unsigned FAI = 0; FAI < NParameters; ++FAI) {
	        if (A[FAI].empty()) {
	          if (M->Parameters[FAI].Required) {
	            Error(FALocs[FAI].isValid() ? FALocs[FAI] : Lexer.getLoc(),
	                  "missing value for required parameter "
	                  "'" + M->Parameters[FAI].Name + "' in macro '" + M->Name + "'");
	            Failure = true;
	          }

	          if (!M->Parameters[FAI].Value.empty())
	            A[FAI] = M->Parameters[FAI].Value;
	        }
	      }
	      return Failure;
	    }

	    if (Lexer.is(AsmToken::Comma))
	      Lex();
	  }

	  return TokError("too many positional arguments");
	}

	bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
	// Arbitrarily limit macro nesting depth (default matches 'as'). We can
	// eliminate this, although we should protect against infinite loops.
	unsigned MaxNestingDepth = AsmMacroMaxNestingDepth;
	if (ActiveMacros.size() == MaxNestingDepth) {
	std::ostringstream MaxNestingDepthError;
	MaxNestingDepthError << "macros cannot be nested more than "
	<< MaxNestingDepth << " levels deep."
	<< " Use -asm-macro-max-nesting-depth to increase "
	"this limit.";
	return TokError(MaxNestingDepthError.str());
	}

	MCAsmMacroArguments A;
	if (parseMacroArguments(M, A))
	return true;

	// Macro instantiation is lexical, unfortunately. We construct a new buffer
	// to hold the macro body with substitutions.
	SmallString<256> Buf;
	StringRef Body = M->Body;
	raw_svector_ostream OS(Buf);

	if (expandMacro(OS, Body, M->Parameters, A, true, getTok().getLoc()))
	return true;

	// We include the .endmacro in the buffer as our cue to exit the macro
	// instantiation.
	OS << ".endmacro\n";

	std::unique_ptr<MemoryBuffer> Instantiation =
	MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");

	// Create the macro instantiation object and add to the current macro
	// instantiation stack.
	MacroInstantiation *MI = new MacroInstantiation(
	NameLoc, CurBuffer, getTok().getLoc(), TheCondStack.size());
	ActiveMacros.push_back(MI);

	++NumOfMacroInstantiations;

	// Jump to the macro instantiation and prime the lexer.
	CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
	Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	Lex();

	return false;
	}

	void AsmParser::handleMacroExit() {
	// Jump to the EndOfStatement we should return to, and consume it.
	jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
	Lex();

	// Pop the instantiation entry.
	delete ActiveMacros.back();
	ActiveMacros.pop_back();
	}

	/// Parse the right-hand side of an assignment to \p Name and emit it.
	///
	/// \param Name        symbol name being assigned.
	/// \param allow_redef forwarded to MCParserUtils::parseAssignmentExpression.
	/// \param NoDeadStrip when true, the symbol is additionally given the
	///                    MCSA_NoDeadStrip attribute.
	/// \returns true if parsing the assignment expression failed, false
	/// otherwise.
	bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
	                                bool NoDeadStrip) {
	  MCSymbol *Target;
	  const MCExpr *Expr;
	  const bool Failed = MCParserUtils::parseAssignmentExpression(
	      Name, allow_redef, *this, Target, Expr);
	  if (Failed)
	    return true;

	  // For an expression starting with '.', no error is produced and no symbol
	  // is created; there is nothing to emit in that case.
	  if (Target) {
	    Out.EmitAssignment(Target, Expr);
	    if (NoDeadStrip)
	      Out.EmitSymbolAttribute(Target, MCSA_NoDeadStrip);
	  }

	  return false;
	}

	/// parseIdentifier:
	///   ::= identifier
	///   ::= string
	///
	/// \param Res receives the identifier text on success.
	/// \returns true if the current token(s) do not form an identifier (nothing
	/// is consumed in that case), false on success.
	bool AsmParser::parseIdentifier(StringRef &Res) {
	  // The assembler has relaxed rules for accepting identifiers, in particular we
	  // allow things like '.globl $foo' and '.def @feat.00', which would normally be
	  // separate tokens. At this level, we have already lexed so we cannot (currently)
	  // handle this as a context dependent token, instead we detect adjacent tokens
	  // and return the combined identifier.
	  if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
	    SMLoc PrefixLoc = getLexer().getLoc();

	    // Peek at the token after the prefix character, without consuming
	    // anything yet, and check that it is an identifier.

	    AsmToken Buf[1];
	    Lexer.peekTokens(Buf, false);

	    if (Buf[0].isNot(AsmToken::Identifier))
	      return true;

	    // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
	    if (PrefixLoc.getPointer() + 1 != Buf[0].getLoc().getPointer())
	      return true;

	    // eat $ or @
	    Lexer.Lex(); // Lexer's Lex guarantees consecutive token.
	    // Construct the joined identifier and consume the token. The result
	    // starts at the prefix character, hence the extra byte of length.
	    Res =
	        StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
	    Lex(); // Parser Lex to maintain invariants.
	    return false;
	  }

	  if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
	    return true;

	  Res = getTok().getIdentifier();

	  Lex(); // Consume the identifier token.

	  return false;
	}

	/// parseDirectiveSet:
	///   ::= .equ identifier ',' expression
	///   ::= .equiv identifier ',' expression
	///   ::= .set identifier ',' expression
	///
	/// \p IDVal is the directive spelling and is only used to suffix error
	/// messages. The resulting symbol is always marked no-dead-strip.
	bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
	  StringRef Name;
	  // The steps run strictly left to right and stop at the first failure.
	  const bool Failed = check(parseIdentifier(Name), "expected identifier") ||
	                      parseToken(AsmToken::Comma) ||
	                      parseAssignment(Name, allow_redef, true);
	  if (!Failed)
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// Decode the current string token into \p Data, expanding backslash escapes,
	/// and consume the token.
	///
	/// Supported escapes are one to three octal digits (value at most 255) and
	/// \b \f \n \r \t \" \\. Returns true after reporting an error if the current
	/// token is not a string or an escape is malformed; in that case \p Data
	/// holds whatever was decoded before the error.
	bool AsmParser::parseEscapedString(std::string &Data) {
	  if (check(getTok().isNot(AsmToken::String), "expected string"))
	    return true;

	  const auto IsOctalDigit = [](char C) { return (unsigned)(C - '0') <= 7; };

	  Data = "";
	  StringRef Str = getTok().getStringContents();
	  const unsigned End = Str.size();
	  for (unsigned Pos = 0; Pos != End; ++Pos) {
	    // Anything that is not a backslash is copied through verbatim.
	    if (Str[Pos] != '\\') {
	      Data += Str[Pos];
	      continue;
	    }

	    // Recognize escaped characters. Note that this escape semantics currently
	    // loosely follows Darwin 'as'. Notably, it doesn't support hex escapes.
	    ++Pos;
	    if (Pos == End)
	      return TokError("unexpected backslash at end of string");

	    // Octal sequence: the digit just seen plus at most two more.
	    if (IsOctalDigit(Str[Pos])) {
	      unsigned Value = Str[Pos] - '0';
	      for (unsigned Extra = 0;
	           Extra != 2 && Pos + 1 != End && IsOctalDigit(Str[Pos + 1]);
	           ++Extra) {
	        ++Pos;
	        Value = Value * 8 + (Str[Pos] - '0');
	      }

	      if (Value > 255)
	        return TokError("invalid octal escape sequence (out of range)");

	      Data += (unsigned char)Value;
	      continue;
	    }

	    // Single-character escapes; anything unrecognized is rejected for now.
	    char Decoded;
	    switch (Str[Pos]) {
	    case 'b':
	      Decoded = '\b';
	      break;
	    case 'f':
	      Decoded = '\f';
	      break;
	    case 'n':
	      Decoded = '\n';
	      break;
	    case 'r':
	      Decoded = '\r';
	      break;
	    case 't':
	      Decoded = '\t';
	      break;
	    case '"':
	      Decoded = '"';
	      break;
	    case '\\':
	      Decoded = '\\';
	      break;
	    default:
	      return TokError("invalid escape sequence (unrecognized character)");
	    }
	    Data += Decoded;
	  }

	  Lex();
	  return false;
	}

	/// parseDirectiveAscii:
	///   ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
	///
	/// Emits the decoded bytes of each string operand; when \p ZeroTerminated is
	/// set, a single NUL byte follows every string. \p IDVal is only used to
	/// suffix error messages.
	bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
	  auto emitOneString = [&]() -> bool {
	    std::string Data;
	    if (checkForValidSection())
	      return true;
	    if (parseEscapedString(Data))
	      return true;
	    getStreamer().EmitBytes(Data);
	    if (ZeroTerminated)
	      getStreamer().EmitBytes(StringRef("\0", 1));
	    return false;
	  };

	  if (!parseMany(emitOneString))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// parseDirectiveReloc
	/// ::= .reloc expression , identifier [ , expression ]
	///
	/// Parses the offset, the relocation name and the optional relocated
	/// expression, then forwards them to the streamer. \p DirectiveLoc is passed
	/// through to the streamer. Returns true after reporting an error.
	bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
	const MCExpr *Offset;
	const MCExpr *Expr = nullptr;
	int64_t OffsetValue;
	SMLoc OffsetLoc = Lexer.getTok().getLoc();

	if (parseExpression(Offset))
	return true;

	// Three checks, evaluated left to right and stopping at the first failure:
	//  1. if the offset folds to an absolute value, it must not be negative;
	//  2. the offset expression must be a constant or a symbol reference;
	//  3. a comma followed by an identifier (the relocation name) must follow.
	if ((Offset->evaluateAsAbsolute(OffsetValue,
	getStreamer().getAssemblerPtr()) &&
	check(OffsetValue < 0, OffsetLoc, "expression is negative")) \|\|
	(check(Offset->getKind() != llvm::MCExpr::Constant &&
	Offset->getKind() != llvm::MCExpr::SymbolRef,
	OffsetLoc, "expected non-negative number or a label")) \|\|
	(parseToken(AsmToken::Comma, "expected comma") \|\|
	check(getTok().isNot(AsmToken::Identifier), "expected relocation name")))
	return true;

	// Remember where the name is so that an unknown name can be reported there.
	SMLoc NameLoc = Lexer.getTok().getLoc();
	StringRef Name = Lexer.getTok().getIdentifier();
	Lex();

	// Optional third operand: the expression the relocation refers to.
	if (Lexer.is(AsmToken::Comma)) {
	Lex();
	SMLoc ExprLoc = Lexer.getLoc();
	if (parseExpression(Expr))
	return true;

	// Value only receives the evaluation result; it is not used afterwards.
	MCValue Value;
	if (!Expr->evaluateAsRelocatable(Value, nullptr, nullptr))
	return Error(ExprLoc, "expression must be relocatable");
	}

	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in .reloc directive"))
	return true;

	// A true result from the streamer is reported as an unknown relocation name.
	const MCTargetAsmParser &MCT = getTargetParser();
	const MCSubtargetInfo &STI = MCT.getSTI();
	if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc, STI))
	return Error(NameLoc, "unknown relocation name");

	return false;
	}

	/// parseDirectiveValue
	/// ::= (.byte \| .short \| ... ) [ expression (, expression)* ]
	///
	/// Emits each operand as a value of \p Size bytes. \p IDVal is the directive
	/// spelling, used only to suffix error messages.
	bool AsmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
	auto parseOp = [&]() -> bool {
	const MCExpr *Value;
	SMLoc ExprLoc = getLexer().getLoc();
	if (checkForValidSection() \|\| parseExpression(Value))
	return true;
	// Special case constant expressions to match code generator.
	if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	assert(Size <= 8 && "Invalid size");
	uint64_t IntValue = MCE->getValue();
	// Accept the literal if it fits in 8 * Size bits under either the
	// unsigned or the signed interpretation.
	if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
	return Error(ExprLoc, "out of range literal value");
	getStreamer().EmitIntValue(IntValue, Size);
	} else
	getStreamer().EmitValue(Value, Size, ExprLoc);
	return false;
	};

	if (parseMany(parseOp))
	return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	return false;
	}

	/// Parse one integer literal of at most 128 bits and split it into its upper
	/// (\p hi) and lower (\p lo) 64-bit halves. The token is consumed before the
	/// range check. Returns true after reporting an error.
	static bool parseHexOcta(AsmParser &Asm, uint64_t &hi, uint64_t &lo) {
	  const bool IsNumber = Asm.getTok().is(AsmToken::Integer) ||
	                        Asm.getTok().is(AsmToken::BigNum);
	  if (!IsNumber)
	    return Asm.TokError("unknown token in expression");

	  // Capture location and value before advancing past the token.
	  SMLoc LiteralLoc = Asm.getTok().getLoc();
	  APInt Literal = Asm.getTok().getAPIntVal();
	  Asm.Lex();

	  if (!Literal.isIntN(128))
	    return Asm.Error(LiteralLoc, "out of range literal value");

	  // Values that fit in 64 bits have an all-zero upper half.
	  if (Literal.isIntN(64)) {
	    hi = 0;
	    lo = Literal.getZExtValue();
	    return false;
	  }

	  hi = Literal.getHiBits(Literal.getBitWidth() - 64).getZExtValue();
	  lo = Literal.getLoBits(64).getZExtValue();
	  return false;
	}

	/// parseDirectiveOctaValue
	///   ::= .octa [ hexconstant (, hexconstant)* ]
	///
	/// Each operand is emitted as two 8-byte integers; which half goes first
	/// depends on MAI.isLittleEndian(). \p IDVal is only used to suffix error
	/// messages.
	bool AsmParser::parseDirectiveOctaValue(StringRef IDVal) {
	  auto emitOneOcta = [&]() -> bool {
	    uint64_t hi, lo;
	    if (checkForValidSection() || parseHexOcta(*this, hi, lo))
	      return true;

	    // Little-endian targets get the low half first, others the high half.
	    const bool LowHalfFirst = MAI.isLittleEndian();
	    getStreamer().EmitIntValue(LowHalfFirst ? lo : hi, 8);
	    getStreamer().EmitIntValue(LowHalfFirst ? hi : lo, 8);
	    return false;
	  };

	  if (!parseMany(emitOneOcta))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// Parse an optionally signed floating-point literal in the format described
	/// by \p Semantics and store its bit pattern in \p Res.
	///
	/// Accepted tokens are integers, reals and the identifiers "infinity", "inf"
	/// and "nan" (compared case-insensitively). Returns true after reporting an
	/// error.
	bool AsmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) {
	// We don't truly support arithmetic on floating point expressions, so we
	// have to manually parse unary prefixes.
	bool IsNeg = false;
	if (getLexer().is(AsmToken::Minus)) {
	Lexer.Lex();
	IsNeg = true;
	} else if (getLexer().is(AsmToken::Plus))
	Lexer.Lex();

	// Surface a lexer error for the token following the sign, if any.
	if (Lexer.is(AsmToken::Error))
	return TokError(Lexer.getErr());
	if (Lexer.isNot(AsmToken::Integer) && Lexer.isNot(AsmToken::Real) &&
	Lexer.isNot(AsmToken::Identifier))
	return TokError("unexpected token in directive");

	// Convert to an APFloat.
	APFloat Value(Semantics);
	StringRef IDVal = getTok().getString();
	if (getLexer().is(AsmToken::Identifier)) {
	// compare_lower() returns 0 on a match, hence the negations.
	if (!IDVal.compare_lower("infinity") \|\| !IDVal.compare_lower("inf"))
	Value = APFloat::getInf(Semantics);
	else if (!IDVal.compare_lower("nan"))
	Value = APFloat::getNaN(Semantics, false, ~0);
	else
	return TokError("invalid floating point literal");
	} else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
	APFloat::opInvalidOp)
	return TokError("invalid floating point literal");
	// The sign is applied after conversion, so it also affects inf and nan.
	if (IsNeg)
	Value.changeSign();

	// Consume the numeric token.
	Lex();

	Res = Value.bitcastToAPInt();

	return false;
	}

	/// parseDirectiveRealValue
	/// ::= (.single \| .double) [ expression (, expression)* ]
	///
	/// Each operand is parsed with parseRealValue() using \p Semantics and its
	/// bit pattern is emitted as an integer. \p IDVal is only used to suffix
	/// error messages.
	bool AsmParser::parseDirectiveRealValue(StringRef IDVal,
	const fltSemantics &Semantics) {
	auto parseOp = [&]() -> bool {
	APInt AsInt;
	if (checkForValidSection() \|\| parseRealValue(Semantics, AsInt))
	return true;
	// The emitted size in bytes is derived from the bit width of the pattern.
	getStreamer().EmitIntValue(AsInt.getLimitedValue(),
	AsInt.getBitWidth() / 8);
	return false;
	};

	if (parseMany(parseOp))
	return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	return false;
	}

	/// parseDirectiveZero
	///   ::= .zero expression [ , expression ]
	///
	/// The first expression is the byte count; the optional second expression
	/// must be absolute and is the fill value, which defaults to 0.
	bool AsmParser::parseDirectiveZero() {
	  SMLoc NumBytesLoc = Lexer.getLoc();
	  const MCExpr *NumBytes;
	  if (checkForValidSection())
	    return true;
	  if (parseExpression(NumBytes))
	    return true;

	  int64_t FillValue = 0;
	  if (getLexer().is(AsmToken::Comma)) {
	    Lex();
	    if (parseAbsoluteExpression(FillValue))
	      return true;
	  }

	  const bool TrailingJunk = parseToken(
	      AsmToken::EndOfStatement, "unexpected token in '.zero' directive");
	  if (TrailingJunk)
	    return true;

	  getStreamer().emitFill(*NumBytes, FillValue, NumBytesLoc);
	  return false;
	}

	/// parseDirectiveFill
	/// ::= .fill expression [ , expression [ , expression ] ]
	///
	/// Operands are, in order: the repeat count, the size of each item (default
	/// 1) and the fill value (default 0). Size and value must be absolute
	/// expressions. Returns true after reporting an error.
	bool AsmParser::parseDirectiveFill() {
	SMLoc NumValuesLoc = Lexer.getLoc();
	const MCExpr *NumValues;
	if (checkForValidSection() \|\| parseExpression(NumValues))
	return true;

	int64_t FillSize = 1;
	int64_t FillExpr = 0;

	// Locations of the optional operands, used for the warnings below.
	SMLoc SizeLoc, ExprLoc;

	if (parseOptionalToken(AsmToken::Comma)) {
	SizeLoc = getTok().getLoc();
	if (parseAbsoluteExpression(FillSize))
	return true;
	if (parseOptionalToken(AsmToken::Comma)) {
	ExprLoc = getTok().getLoc();
	if (parseAbsoluteExpression(FillExpr))
	return true;
	}
	}
	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.fill' directive"))
	return true;

	// A negative size only warns; the directive then emits nothing and is not
	// treated as an error.
	if (FillSize < 0) {
	Warning(SizeLoc, "'.fill' directive with negative size has no effect");
	return false;
	}
	// Sizes above 8 are clamped to 8 with a warning.
	if (FillSize > 8) {
	Warning(SizeLoc, "'.fill' directive with size greater than 8 has been truncated to 8");
	FillSize = 8;
	}

	// Warn when the value does not fit in 32 unsigned bits and the item size
	// is larger than 4 bytes.
	if (!isUInt<32>(FillExpr) && FillSize > 4)
	Warning(ExprLoc, "'.fill' directive pattern has been truncated to 32-bits");

	getStreamer().emitFill(*NumValues, FillSize, FillExpr, NumValuesLoc);

	return false;
	}

	/// parseDirectiveOrg
	/// ::= .org expression [ , expression ]
	///
	/// The first expression is the target offset; the optional second one must
	/// be absolute and is the fill value (default 0). Returns true after
	/// reporting an error.
	bool AsmParser::parseDirectiveOrg() {
	const MCExpr *Offset;
	SMLoc OffsetLoc = Lexer.getLoc();
	if (checkForValidSection() \|\| parseExpression(Offset))
	return true;

	// Parse optional fill expression.
	int64_t FillExpr = 0;
	if (parseOptionalToken(AsmToken::Comma))
	if (parseAbsoluteExpression(FillExpr))
	return addErrorSuffix(" in '.org' directive");
	if (parseToken(AsmToken::EndOfStatement))
	return addErrorSuffix(" in '.org' directive");

	getStreamer().emitValueToOffset(Offset, FillExpr, OffsetLoc);
	return false;
	}

	/// parseDirectiveAlign
	/// ::= {.align, ...} expression [ , expression [ , expression ]]
	bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
	SMLoc AlignmentLoc = getLexer().getLoc();
	int64_t Alignment;
	SMLoc MaxBytesLoc;
	bool HasFillExpr = false;
	int64_t FillExpr = 0;
	int64_t MaxBytesToFill = 0;

	auto parseAlign = [&]() -> bool {
	if (parseAbsoluteExpression(Alignment))
	return true;
	if (parseOptionalToken(AsmToken::Comma)) {
	// The fill expression can be omitted while specifying a maximum number of
	// alignment bytes, e.g:
	// .align 3,,4
	if (getTok().isNot(AsmToken::Comma)) {
	HasFillExpr = true;
	if (parseAbsoluteExpression(FillExpr))
	return true;
	}
	if (parseOptionalToken(AsmToken::Comma))
	if (parseTokenLoc(MaxBytesLoc) \|\|
	parseAbsoluteExpression(MaxBytesToFill))
	return true;
	}
	return parseToken(AsmToken::EndOfStatement);
	};

	if (checkForValidSection())
	return addErrorSuffix(" in directive");
	// Ignore empty '.p2align' directives for GNU-as compatibility
	if (IsPow2 && (ValueSize == 1) && getTok().is(AsmToken::EndOfStatement)) {
	Warning(AlignmentLoc, "p2align directive with no operand(s) is ignored");
	return parseToken(AsmToken::EndOfStatement);
	}
	if (parseAlign())
	return addErrorSuffix(" in directive");

	// Always emit an alignment here even if we thrown an error.
	bool ReturnVal = false;

	// Compute alignment in bytes.
	if (IsPow2) {
	// FIXME: Diagnose overflow.
	if (Alignment >= 32) {
	ReturnVal \|= Error(AlignmentLoc, "invalid alignment value");
	Alignment = 31;
	}

	Alignment = 1ULL << Alignment;
	} else {
	// Reject alignments that aren't either a power of two or zero,
	// for gas compatibility. Alignment of zero is silently rounded
	// up to one.
	if (Alignment == 0)
	Alignment = 1;
	if (!isPowerOf2_64(Alignment))
	ReturnVal \|= Error(AlignmentLoc, "alignment must be a power of 2");
	}

	// Diagnose non-sensical max bytes to align.
	if (MaxBytesLoc.isValid()) {
	if (MaxBytesToFill < 1) {
	ReturnVal \|= Error(MaxBytesLoc,
	"alignment directive can never be satisfied in this "
	"many bytes, ignoring maximum bytes expression");
	MaxBytesToFill = 0;
	}

	if (MaxBytesToFill >= Alignment) {
	Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
	"has no effect");
	MaxBytesToFill = 0;
	}
	}

	// Check whether we should use optimal code alignment for this .align
	// directive.
	const MCSection *Section = getStreamer().getCurrentSectionOnly();
	assert(Section && "must have section to emit alignment");
	bool UseCodeAlign = Section->UseCodeAlign();
	if ((!HasFillExpr \|\| Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
	ValueSize == 1 && UseCodeAlign) {
	getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
	} else {
	// FIXME: Target specific behavior about how the "extra" bytes are filled.
	getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
	MaxBytesToFill);
	}

	return ReturnVal;
	}

	/// parseDirectiveFile
	/// ::= .file filename
	/// ::= .file number [directory] filename [md5 checksum] [source source-text]
	///
	/// \p DirectiveLoc is used for the diagnostics issued after the operands
	/// have been parsed. Returns true after reporting an error; warnings alone
	/// are returned as whatever Warning() yields.
	bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
	// FIXME: I'm not sure what this is.
	// -1 means "no file number given" throughout this function.
	int64_t FileNumber = -1;
	if (getLexer().is(AsmToken::Integer)) {
	FileNumber = getTok().getIntVal();
	Lex();

	if (FileNumber < 0)
	return TokError("negative file number");
	}

	std::string Path;

	// Usually the directory and filename together, otherwise just the directory.
	// Allow the strings to have escaped octal character sequence.
	if (check(getTok().isNot(AsmToken::String),
	"unexpected token in '.file' directive") \|\|
	parseEscapedString(Path))
	return true;

	// With two strings the first is the directory and the second the file
	// name; with one string it is the file name and Directory stays empty.
	StringRef Directory;
	StringRef Filename;
	std::string FilenameData;
	if (getLexer().is(AsmToken::String)) {
	if (check(FileNumber == -1,
	"explicit path specified, but no file number") \|\|
	parseEscapedString(FilenameData))
	return true;
	Filename = FilenameData;
	Directory = Path;
	} else {
	Filename = Path;
	}

	// MD5Hi/MD5Lo are only read further down when HasMD5 is set.
	uint64_t MD5Hi, MD5Lo;
	bool HasMD5 = false;

	Optional<StringRef> Source;
	bool HasSource = false;
	std::string SourceString;

	// Trailing "md5 <value>" and "source <string>" clauses, in any order, up
	// to the end of the statement. Both require an explicit file number.
	while (!parseOptionalToken(AsmToken::EndOfStatement)) {
	StringRef Keyword;
	if (check(getTok().isNot(AsmToken::Identifier),
	"unexpected token in '.file' directive") \|\|
	parseIdentifier(Keyword))
	return true;
	if (Keyword == "md5") {
	HasMD5 = true;
	if (check(FileNumber == -1,
	"MD5 checksum specified, but no file number") \|\|
	parseHexOcta(*this, MD5Hi, MD5Lo))
	return true;
	} else if (Keyword == "source") {
	HasSource = true;
	if (check(FileNumber == -1,
	"source specified, but no file number") \|\|
	check(getTok().isNot(AsmToken::String),
	"unexpected token in '.file' directive") \|\|
	parseEscapedString(SourceString))
	return true;
	} else {
	return TokError("unexpected token in '.file' directive");
	}
	}

	if (FileNumber == -1) {
	// Ignore the directive if there is no number and the target doesn't support
	// numberless .file directives. This allows some portability of assembler
	// between different object file formats.
	if (getContext().getAsmInfo()->hasSingleParameterDotFile())
	getStreamer().EmitFileDirective(Filename);
	} else {
	// In case there is a -g option as well as debug info from directive .file,
	// we turn off the -g option, directly use the existing debug info instead.
	// Throw away any implicit file table for the assembler source.
	if (Ctx.getGenDwarfForAssembly()) {
	Ctx.getMCDwarfLineTable(0).resetFileTable();
	Ctx.setGenDwarfForAssembly(false);
	}

	Optional<MD5::MD5Result> CKMem;
	if (HasMD5) {
	// Serialize the two 64-bit halves most-significant byte first: bytes
	// 0-7 come from MD5Hi and bytes 8-15 from MD5Lo.
	MD5::MD5Result Sum;
	for (unsigned i = 0; i != 8; ++i) {
	Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
	Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
	}
	CKMem = Sum;
	}
	if (HasSource) {
	// Copy the source text into memory obtained from Ctx so that Source
	// does not refer to the local SourceString.
	// NOTE(review): the pointer declarators ('*') appear to be missing from
	// the next line in this copy of the file - confirm against upstream.
	char SourceBuf = static_cast<char >(Ctx.allocate(SourceString.size()));
	memcpy(SourceBuf, SourceString.data(), SourceString.size());
	Source = StringRef(SourceBuf, SourceString.size());
	}
	if (FileNumber == 0) {
	if (Ctx.getDwarfVersion() < 5)
	return Warning(DirectiveLoc, "file 0 not supported prior to DWARF-5");
	getStreamer().emitDwarfFile0Directive(Directory, Filename, CKMem, Source);
	} else {
	Expected<unsigned> FileNumOrErr = getStreamer().tryEmitDwarfFileDirective(
	FileNumber, Directory, Filename, CKMem, Source);
	if (!FileNumOrErr)
	return Error(DirectiveLoc, toString(FileNumOrErr.takeError()));
	}
	// Alert the user if there are some .file directives with MD5 and some not.
	// But only do that once.
	if (!ReportedInconsistentMD5 && !Ctx.isDwarfMD5UsageConsistent(0)) {
	ReportedInconsistentMD5 = true;
	return Warning(DirectiveLoc, "inconsistent use of MD5 checksums");
	}
	}

	return false;
	}

	/// parseDirectiveLine
	///   ::= .line [number]
	///
	/// The optional line number is parsed and then discarded; the directive
	/// currently has no other effect.
	bool AsmParser::parseDirectiveLine() {
	  int64_t LineNumber;
	  if (getLexer().is(AsmToken::Integer)) {
	    const bool BadNumber =
	        parseIntToken(LineNumber, "unexpected token in '.line' directive");
	    if (BadNumber)
	      return true;
	    (void)LineNumber;
	    // FIXME: Do something with the .line.
	  }

	  return parseToken(AsmToken::EndOfStatement,
	                    "unexpected token in '.line' directive");
	}

	/// parseDirectiveLoc
	/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
	/// [epilogue_begin] [is_stmt VALUE] [isa VALUE]
	/// The first number is a file number, must have been previously assigned with
	/// a .file directive, the second number is the line number and optionally the
	/// third number is a column position (zero if not specified). The remaining
	/// optional items are .loc sub-directives.
	bool AsmParser::parseDirectiveLoc() {
	int64_t FileNumber = 0, LineNumber = 0;
	SMLoc Loc = getTok().getLoc();
	if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") \|\|
	check(FileNumber < 1 && Ctx.getDwarfVersion() < 5, Loc,
	"file number less than one in '.loc' directive") \|\|
	check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
	"unassigned file number in '.loc' directive"))
	return true;

	// optional
	if (getLexer().is(AsmToken::Integer)) {
	LineNumber = getTok().getIntVal();
	if (LineNumber < 0)
	return TokError("line number less than zero in '.loc' directive");
	Lex();
	}

	int64_t ColumnPos = 0;
	if (getLexer().is(AsmToken::Integer)) {
	ColumnPos = getTok().getIntVal();
	if (ColumnPos < 0)
	return TokError("column position less than zero in '.loc' directive");
	Lex();
	}

	unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
	unsigned Isa = 0;
	int64_t Discriminator = 0;

	auto parseLocOp = [&]() -> bool {
	StringRef Name;
	SMLoc Loc = getTok().getLoc();
	if (parseIdentifier(Name))
	return TokError("unexpected token in '.loc' directive");

	if (Name == "basic_block")
	Flags \|= DWARF2_FLAG_BASIC_BLOCK;
	else if (Name == "prologue_end")
	Flags \|= DWARF2_FLAG_PROLOGUE_END;
	else if (Name == "epilogue_begin")
	Flags \|= DWARF2_FLAG_EPILOGUE_BEGIN;
	else if (Name == "is_stmt") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be the constant 0 or 1.
	if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	int Value = MCE->getValue();
	if (Value == 0)
	Flags &= ~DWARF2_FLAG_IS_STMT;
	else if (Value == 1)
	Flags \|= DWARF2_FLAG_IS_STMT;
	else
	return Error(Loc, "is_stmt value not 0 or 1");
	} else {
	return Error(Loc, "is_stmt value not the constant value of 0 or 1");
	}
	} else if (Name == "isa") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be a constant greater or equal to 0.
	if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	int Value = MCE->getValue();
	if (Value < 0)
	return Error(Loc, "isa number less than zero");
	Isa = Value;
	} else {
	return Error(Loc, "isa number not a constant value");
	}
	} else if (Name == "discriminator") {
	if (parseAbsoluteExpression(Discriminator))
	return true;
	} else {
	return Error(Loc, "unknown sub-directive in '.loc' directive");
	}
	return false;
	};

	if (parseMany(parseLocOp, false /hasComma/))
	return true;

	getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
	Isa, Discriminator, StringRef());

	return false;
	}

	/// parseDirectiveStabs
	/// ::= .stabs string, number, number, number
	///
	/// The directive is not supported: no operands are parsed and an error is
	/// always reported at the current token.
	bool AsmParser::parseDirectiveStabs() {
	return TokError("unsupported directive '.stabs'");
	}

	/// parseDirectiveCVFile
	/// ::= .cv_file number filename [checksum] [checksumkind]
	///
	/// The file number must be at least 1. When a checksum string is present it
	/// must be followed by an integer checksum kind. Returns true after
	/// reporting an error.
	bool AsmParser::parseDirectiveCVFile() {
	SMLoc FileNumberLoc = getTok().getLoc();
	int64_t FileNumber;
	std::string Filename;
	std::string Checksum;
	int64_t ChecksumKind = 0;

	if (parseIntToken(FileNumber,
	"expected file number in '.cv_file' directive") \|\|
	check(FileNumber < 1, FileNumberLoc, "file number less than one") \|\|
	check(getTok().isNot(AsmToken::String),
	"unexpected token in '.cv_file' directive") \|\|
	parseEscapedString(Filename))
	return true;
	// Anything after the file name must be the checksum string followed by
	// the checksum kind and the end of the statement.
	if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	if (check(getTok().isNot(AsmToken::String),
	"unexpected token in '.cv_file' directive") \|\|
	parseEscapedString(Checksum) \|\|
	parseIntToken(ChecksumKind,
	"expected checksum kind in '.cv_file' directive") \|\|
	parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.cv_file' directive"))
	return true;
	}

	// Presumably fromHex() decodes the hex text into raw bytes - confirm. The
	// bytes are then copied into memory obtained from Ctx so that the array
	// passed to the streamer does not point into the local std::string.
	Checksum = fromHex(Checksum);
	void *CKMem = Ctx.allocate(Checksum.size(), 1);
	memcpy(CKMem, Checksum.data(), Checksum.size());
	ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
	Checksum.size());

	// A false result from the streamer is reported as a duplicate file number.
	if (!getStreamer().EmitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
	static_cast<uint8_t>(ChecksumKind)))
	return Error(FileNumberLoc, "file number already allocated");

	return false;
	}

	/// Parse an integer function id into \p FunctionId and check that it lies in
	/// [0, UINT_MAX). \p DirectiveName is only used in the error message.
	/// Returns true after reporting an error.
	bool AsmParser::parseCVFunctionId(int64_t &FunctionId,
	StringRef DirectiveName) {
	// Loc is filled in by parseTokenLoc() and used for the range diagnostic.
	SMLoc Loc;
	return parseTokenLoc(Loc) \|\|
	parseIntToken(FunctionId, "expected function id in '" + DirectiveName +
	"' directive") \|\|
	check(FunctionId < 0 \|\| FunctionId >= UINT_MAX, Loc,
	"expected function id within range [0, UINT_MAX)");
	}

	/// Parse an integer file number into \p FileNumber and check that it is at
	/// least 1 and accepted by getCVContext().isValidFileNumber().
	/// \p DirectiveName is only used in the error messages. Returns true after
	/// reporting an error.
	bool AsmParser::parseCVFileId(int64_t &FileNumber, StringRef DirectiveName) {
	// Loc is filled in by parseTokenLoc() and used for the diagnostics below.
	SMLoc Loc;
	return parseTokenLoc(Loc) \|\|
	parseIntToken(FileNumber, "expected integer in '" + DirectiveName +
	"' directive") \|\|
	check(FileNumber < 1, Loc, "file number less than one in '" +
	DirectiveName + "' directive") \|\|
	check(!getCVContext().isValidFileNumber(FileNumber), Loc,
	"unassigned file number in '" + DirectiveName + "' directive");
	}

	/// parseDirectiveCVFuncId
	///   ::= .cv_func_id FunctionId
	///
	/// Introduces a function ID that can be used with .cv_loc. An error is
	/// reported at the id if the streamer rejects it.
	bool AsmParser::parseDirectiveCVFuncId() {
	  SMLoc FunctionIdLoc = getTok().getLoc();
	  int64_t FunctionId;

	  if (parseCVFunctionId(FunctionId, ".cv_func_id"))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.cv_func_id' directive"))
	    return true;

	  const bool Registered = getStreamer().EmitCVFuncIdDirective(FunctionId);
	  if (Registered)
	    return false;
	  return Error(FunctionIdLoc, "function id already allocated");
	}

	/// parseDirectiveCVInlineSiteId
	/// ::= .cv_inline_site_id FunctionId
	/// "within" IAFunc
	/// "inlined_at" IAFile IALine [IACol]
	///
	/// Introduces a function ID that can be used with .cv_loc. Includes "inlined
	/// at" source location information for use in the line table of the caller,
	/// whether the caller is a real function or another inlined call site.
	///
	/// Returns true after reporting an error.
	bool AsmParser::parseDirectiveCVInlineSiteId() {
	SMLoc FunctionIdLoc = getTok().getLoc();
	int64_t FunctionId;
	int64_t IAFunc;
	int64_t IAFile;
	int64_t IALine;
	// The column is optional and defaults to 0.
	int64_t IACol = 0;

	// FunctionId
	if (parseCVFunctionId(FunctionId, ".cv_inline_site_id"))
	return true;

	// "within"
	if (check((getLexer().isNot(AsmToken::Identifier) \|\|
	getTok().getIdentifier() != "within"),
	"expected 'within' identifier in '.cv_inline_site_id' directive"))
	return true;
	Lex();

	// IAFunc
	if (parseCVFunctionId(IAFunc, ".cv_inline_site_id"))
	return true;

	// "inlined_at"
	if (check((getLexer().isNot(AsmToken::Identifier) \|\|
	getTok().getIdentifier() != "inlined_at"),
	"expected 'inlined_at' identifier in '.cv_inline_site_id' "
	"directive") )
	return true;
	Lex();

	// IAFile IALine
	if (parseCVFileId(IAFile, ".cv_inline_site_id") \|\|
	parseIntToken(IALine, "expected line number after 'inlined_at'"))
	return true;

	// [IACol]
	if (getLexer().is(AsmToken::Integer)) {
	IACol = getTok().getIntVal();
	Lex();
	}

	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.cv_inline_site_id' directive"))
	return true;

	// A false result from the streamer is reported as a duplicate function id.
	if (!getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
	IALine, IACol, FunctionIdLoc))
	return Error(FunctionIdLoc, "function id already allocated");

	return false;
	}

	/// parseDirectiveCVLoc
	/// ::= .cv_loc FunctionId FileNumber [LineNumber] [ColumnPos] [prologue_end]
	/// [is_stmt VALUE]
	/// The first number is a function id and the second a file number; the file
	/// number must pass parseCVFileId() (presumably it was assigned earlier by a
	/// .cv_file directive - confirm). The third number is the line number and
	/// optionally the fourth number is a column position (zero if not
	/// specified). The remaining optional items are .cv_loc sub-directives.
	///
	/// Returns true after reporting an error.
	bool AsmParser::parseDirectiveCVLoc() {
	SMLoc DirectiveLoc = getTok().getLoc();
	int64_t FunctionId, FileNumber;
	if (parseCVFunctionId(FunctionId, ".cv_loc") \|\|
	parseCVFileId(FileNumber, ".cv_loc"))
	return true;

	int64_t LineNumber = 0;
	if (getLexer().is(AsmToken::Integer)) {
	LineNumber = getTok().getIntVal();
	if (LineNumber < 0)
	return TokError("line number less than zero in '.cv_loc' directive");
	Lex();
	}

	int64_t ColumnPos = 0;
	if (getLexer().is(AsmToken::Integer)) {
	ColumnPos = getTok().getIntVal();
	if (ColumnPos < 0)
	return TokError("column position less than zero in '.cv_loc' directive");
	Lex();
	}

	bool PrologueEnd = false;
	uint64_t IsStmt = 0;

	// Parses a single sub-directive and updates PrologueEnd or IsStmt.
	auto parseOp = [&]() -> bool {
	StringRef Name;
	SMLoc Loc = getTok().getLoc();
	if (parseIdentifier(Name))
	return TokError("unexpected token in '.cv_loc' directive");
	if (Name == "prologue_end")
	PrologueEnd = true;
	else if (Name == "is_stmt") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be the constant 0 or 1.
	// IsStmt starts at an out-of-range sentinel, so a non-constant
	// expression fails the "> 1" test below just like a bad constant.
	IsStmt = ~0ULL;
	if (const auto *MCE = dyn_cast<MCConstantExpr>(Value))
	IsStmt = MCE->getValue();

	if (IsStmt > 1)
	return Error(Loc, "is_stmt value not 0 or 1");
	} else {
	return Error(Loc, "unknown sub-directive in '.cv_loc' directive");
	}
	return false;
	};

	if (parseMany(parseOp, false /hasComma/))
	return true;

	getStreamer().EmitCVLocDirective(FunctionId, FileNumber, LineNumber,
	ColumnPos, PrologueEnd, IsStmt, StringRef(),
	DirectiveLoc);
	return false;
	}

	/// parseDirectiveCVLinetable
	/// ::= .cv_linetable FunctionId, FnStart, FnEnd
	///
	/// FnStart and FnEnd are symbol names; the symbols are looked up or created
	/// in the context and passed to the streamer. Returns true after reporting
	/// an error.
	bool AsmParser::parseDirectiveCVLinetable() {
	int64_t FunctionId;
	StringRef FnStartName, FnEndName;
	SMLoc Loc = getTok().getLoc();
	// Loc is refreshed by parseTokenLoc() before each identifier so that the
	// "expected identifier" diagnostic points at the offending operand.
	if (parseCVFunctionId(FunctionId, ".cv_linetable") \|\|
	parseToken(AsmToken::Comma,
	"unexpected token in '.cv_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnStartName), Loc,
	"expected identifier in directive") \|\|
	parseToken(AsmToken::Comma,
	"unexpected token in '.cv_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnEndName), Loc,
	"expected identifier in directive"))
	return true;

	MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
	MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);

	getStreamer().EmitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
	return false;
	}

	/// parseDirectiveCVInlineLinetable
	/// ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd
	///
	/// Operands are separated by whitespace only. FnStart and FnEnd are symbol
	/// names that are looked up or created in the context. Returns true after
	/// reporting an error.
	bool AsmParser::parseDirectiveCVInlineLinetable() {
	int64_t PrimaryFunctionId, SourceFileId, SourceLineNum;
	StringRef FnStartName, FnEndName;
	SMLoc Loc = getTok().getLoc();
	// Loc is refreshed by parseTokenLoc() before each operand so that every
	// diagnostic points at the operand it refers to.
	// NOTE(review): the file id check rejects 0 as well (<= 0) although its
	// message says "less than zero" - confirm which one is intended.
	if (parseCVFunctionId(PrimaryFunctionId, ".cv_inline_linetable") \|\|
	parseTokenLoc(Loc) \|\|
	parseIntToken(
	SourceFileId,
	"expected SourceField in '.cv_inline_linetable' directive") \|\|
	check(SourceFileId <= 0, Loc,
	"File id less than zero in '.cv_inline_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\|
	parseIntToken(
	SourceLineNum,
	"expected SourceLineNum in '.cv_inline_linetable' directive") \|\|
	check(SourceLineNum < 0, Loc,
	"Line number less than zero in '.cv_inline_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnStartName), Loc,
	"expected identifier in directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnEndName), Loc,
	"expected identifier in directive"))
	return true;

	if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
	return true;

	MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
	MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
	getStreamer().EmitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
	SourceLineNum, FnStartSym,
	FnEndSym);
	return false;
	}

	/// parseDirectiveCVDefRange
	/// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd), bytes
	///
	/// Reads symbol names two at a time for as long as the current token is an
	/// identifier, then a comma and a string. Each pair of symbols is appended
	/// to the range list passed to the streamer together with the decoded
	/// string. Returns true after reporting an error.
	bool AsmParser::parseDirectiveCVDefRange() {
	SMLoc Loc;
	// NOTE(review): the pointer declarators ('*') appear to be missing from
	// the pair element types in this copy of the file - confirm against
	// upstream.
	std::vector<std::pair<const MCSymbol , const MCSymbol >> Ranges;
	while (getLexer().is(AsmToken::Identifier)) {
	Loc = getLexer().getLoc();
	StringRef GapStartName;
	if (parseIdentifier(GapStartName))
	return Error(Loc, "expected identifier in directive");
	MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName);

	Loc = getLexer().getLoc();
	StringRef GapEndName;
	if (parseIdentifier(GapEndName))
	return Error(Loc, "expected identifier in directive");
	MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName);

	Ranges.push_back({GapStartSym, GapEndSym});
	}

	std::string FixedSizePortion;
	if (parseToken(AsmToken::Comma, "unexpected token in directive") \|\|
	parseEscapedString(FixedSizePortion))
	return true;

	getStreamer().EmitCVDefRangeDirective(Ranges, FixedSizePortion);
	return false;
	}

	/// parseDirectiveCVString
	///   ::= .cv_string "string"
	///
	/// Adds the string to the CodeView string table and emits the second member
	/// of the returned pair as a 4-byte integer.
	bool AsmParser::parseDirectiveCVString() {
	  std::string Data;
	  const bool Failed = checkForValidSection() || parseEscapedString(Data);
	  if (Failed)
	    return addErrorSuffix(" in '.cv_string' directive");

	  // Put the string in the table and emit the offset.
	  const unsigned Offset = getCVContext().addToStringTable(Data).second;
	  getStreamer().EmitIntValue(Offset, 4);
	  return false;
	}

	/// parseDirectiveCVStringTable
	/// ::= .cv_stringtable
	///
	/// Takes no operands; simply forwards to the streamer and always succeeds.
	/// Note that the end of the statement is not consumed here.
	bool AsmParser::parseDirectiveCVStringTable() {
	getStreamer().EmitCVStringTableDirective();
	return false;
	}

	/// parseDirectiveCVFileChecksums
	/// ::= .cv_filechecksums
	///
	/// Takes no operands; simply forwards to the streamer and always succeeds.
	/// Note that the end of the statement is not consumed here.
	bool AsmParser::parseDirectiveCVFileChecksums() {
	getStreamer().EmitCVFileChecksumsDirective();
	return false;
	}

	/// parseDirectiveCVFileChecksumOffset
	///   ::= .cv_filechecksumoffset fileno
	///
	/// Reads an integer token followed by the end of the statement and passes the
	/// integer to the streamer. Returns true if either parse step fails.
	bool AsmParser::parseDirectiveCVFileChecksumOffset() {
	  int64_t FileNumber;
	  // Short-circuiting keeps the two checks in their original order.
	  if (parseIntToken(FileNumber, "expected identifier in directive") ||
	      parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
	    return true;

	  getStreamer().EmitCVFileChecksumOffsetDirective(FileNumber);
	  return false;
	}

	/// parseDirectiveCVFPOData
	/// ::= .cv_fpo_data procsym
	bool AsmParser::parseDirectiveCVFPOData() {
	SMLoc DirLoc = getLexer().getLoc();
	StringRef ProcName;
	if (parseIdentifier(ProcName))
	return TokError("expected symbol name");
	if (parseEOL("unexpected tokens"))
	return addErrorSuffix(" in '.cv_fpo_data' directive");
	MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
	getStreamer().EmitCVFPOData(ProcSym, DirLoc);
	return false;
	}

	/// parseDirectiveCFISections
	/// ::= .cfi_sections section [, section]
	bool AsmParser::parseDirectiveCFISections() {
	StringRef Name;
	bool EH = false;
	bool Debug = false;

	if (parseIdentifier(Name))
	return TokError("Expected an identifier");

	if (Name == ".eh_frame")
	EH = true;
	else if (Name == ".debug_frame")
	Debug = true;

	if (getLexer().is(AsmToken::Comma)) {
	Lex();

	if (parseIdentifier(Name))
	return TokError("Expected an identifier");

	if (Name == ".eh_frame")
	EH = true;
	else if (Name == ".debug_frame")
	Debug = true;
	}

	getStreamer().EmitCFISections(EH, Debug);
	return false;
	}

	/// parseDirectiveCFIStartProc
	///   ::= .cfi_startproc [simple]
	///
	/// If the statement does not end immediately, the only accepted operand is
	/// the identifier "simple", which must itself be followed by the end of the
	/// statement. The streamer is told whether "simple" was given, along with the
	/// lexer's current location.
	bool AsmParser::parseDirectiveCFIStartProc() {
	  StringRef Simple;
	  if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	    if (check(parseIdentifier(Simple) || Simple != "simple",
	              "unexpected token") ||
	        parseToken(AsmToken::EndOfStatement))
	      return addErrorSuffix(" in '.cfi_startproc' directive");
	  }

	  // TODO(kristina): Deal with a corner case of incorrect diagnostic context
	  // being produced if this directive is emitted as part of preprocessor macro
	  // expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
	  // Tools like llvm-mc on the other hand are not affected by it, and report
	  // correct context information.
	  getStreamer().EmitCFIStartProc(!Simple.empty(), Lexer.getLoc());
	  return false;
	}

	/// parseDirectiveCFIEndProc
	///   ::= .cfi_endproc
	///
	/// Consumes no tokens; forwards to the streamer and always returns false.
	bool AsmParser::parseDirectiveCFIEndProc() {
	  auto &&Out = getStreamer();
	  Out.EmitCFIEndProc();
	  return false;
	}

	/// Parse a register operand given either as an integer expression or as a
	/// target register name.
	///
	/// When the next token is an integer, \p Register receives the value of the
	/// absolute expression. Otherwise the target parser reads a register, and
	/// \p Register receives the number the register info maps it to via
	/// getDwarfRegNum(RegNo, true).
	///
	/// \param DirectiveLoc passed to the target parser as both location arguments.
	/// \returns true if the chosen parse step fails, false otherwise.
	bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
	                                              SMLoc DirectiveLoc) {
	  // Numeric form: the expression result is used as-is.
	  if (getLexer().is(AsmToken::Integer))
	    return parseAbsoluteExpression(Register);

	  // Named form: let the target resolve the name, then translate it.
	  unsigned TargetReg;
	  if (getTargetParser().ParseRegister(TargetReg, DirectiveLoc, DirectiveLoc))
	    return true;

	  Register = getContext().getRegisterInfo()->getDwarfRegNum(TargetReg, true);
	  return false;
	}

	/// parseDirectiveCFIDefCfa
	///   ::= .cfi_def_cfa register, offset
	///
	/// Reads a register (name or number), a comma, and an absolute expression,
	/// then forwards both values to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
	  int64_t Register = 0, Offset = 0;
	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
	      parseToken(AsmToken::Comma, "unexpected token in directive") ||
	      parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIDefCfa(Register, Offset);
	  return false;
	}

	/// parseDirectiveCFIDefCfaOffset
	///   ::= .cfi_def_cfa_offset offset
	///
	/// Evaluates one absolute expression and passes its value to the streamer.
	/// Returns true if the expression cannot be parsed.
	bool AsmParser::parseDirectiveCFIDefCfaOffset() {
	  int64_t NewOffset = 0;
	  const bool Failed = parseAbsoluteExpression(NewOffset);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIDefCfaOffset(NewOffset);
	  return false;
	}

	/// parseDirectiveCFIRegister
	///   ::= .cfi_register register, register
	///
	/// Reads two registers (each a name or a number) separated by a comma and
	/// forwards them to the streamer in the order written.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
	  int64_t Register1 = 0, Register2 = 0;
	  if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc) ||
	      parseToken(AsmToken::Comma, "unexpected token in directive") ||
	      parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
	    return true;

	  getStreamer().EmitCFIRegister(Register1, Register2);
	  return false;
	}

	/// parseDirectiveCFIWindowSave
	///   ::= .cfi_window_save
	///
	/// Consumes no tokens; forwards to the streamer and always returns false.
	bool AsmParser::parseDirectiveCFIWindowSave() {
	  auto &&Out = getStreamer();
	  Out.EmitCFIWindowSave();
	  return false;
	}

	/// parseDirectiveCFIAdjustCfaOffset
	///   ::= .cfi_adjust_cfa_offset adjustment
	///
	/// Evaluates one absolute expression and passes its value to the streamer.
	/// Returns true if the expression cannot be parsed.
	bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
	  int64_t Delta = 0;
	  const bool Failed = parseAbsoluteExpression(Delta);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIAdjustCfaOffset(Delta);
	  return false;
	}

	/// parseDirectiveCFIDefCfaRegister
	///   ::= .cfi_def_cfa_register register
	///
	/// Reads one register (name or number) and passes it to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
	  int64_t Reg = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Reg, DirectiveLoc);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIDefCfaRegister(Reg);
	  return false;
	}

	/// parseDirectiveCFIOffset
	///   ::= .cfi_offset register, offset
	///
	/// Reads a register (name or number), a comma, and an absolute expression,
	/// then forwards both values to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  int64_t Offset = 0;

	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
	      parseToken(AsmToken::Comma, "unexpected token in directive") ||
	      parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIOffset(Register, Offset);
	  return false;
	}

	/// parseDirectiveCFIRelOffset
	///   ::= .cfi_rel_offset register, offset
	///
	/// Reads a register (name or number), a comma, and an absolute expression,
	/// then forwards both values to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
	  int64_t Register = 0, Offset = 0;

	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
	      parseToken(AsmToken::Comma, "unexpected token in directive") ||
	      parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIRelOffset(Register, Offset);
	  return false;
	}

	/// Check whether \p Encoding is an encoding value this parser accepts.
	///
	/// A value is rejected if any bit above the low byte is set. DW_EH_PE_omit is
	/// accepted outright. Otherwise the low nibble must be one of the listed
	/// format constants and the bits selected by 0x70 must equal either
	/// DW_EH_PE_absptr or DW_EH_PE_pcrel.
	static bool isValidEncoding(int64_t Encoding) {
	  if ((Encoding & ~0xff) != 0)
	    return false;

	  if (Encoding == dwarf::DW_EH_PE_omit)
	    return true;

	  const unsigned Format = Encoding & 0xf;
	  const bool FormatOk =
	      Format == dwarf::DW_EH_PE_absptr || Format == dwarf::DW_EH_PE_udata2 ||
	      Format == dwarf::DW_EH_PE_udata4 || Format == dwarf::DW_EH_PE_udata8 ||
	      Format == dwarf::DW_EH_PE_sdata2 || Format == dwarf::DW_EH_PE_sdata4 ||
	      Format == dwarf::DW_EH_PE_sdata8 || Format == dwarf::DW_EH_PE_signed;
	  if (!FormatOk)
	    return false;

	  const unsigned Application = Encoding & 0x70;
	  return Application == dwarf::DW_EH_PE_absptr ||
	         Application == dwarf::DW_EH_PE_pcrel;
	}

	/// parseDirectiveCFIPersonalityOrLsda
	/// IsPersonality true for cfi_personality, false for cfi_lsda
	///   ::= .cfi_personality encoding, [symbol_name]
	///   ::= .cfi_lsda encoding, [symbol_name]
	///
	/// Parses the encoding as an absolute expression. If it equals
	/// DW_EH_PE_omit the function returns false without emitting anything.
	/// Otherwise the encoding must pass isValidEncoding and be followed by a
	/// comma and an identifier; the symbol for that identifier is emitted through
	/// the personality or LSDA streamer hook according to \p IsPersonality.
	bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
	  int64_t Encoding = 0;
	  if (parseAbsoluteExpression(Encoding))
	    return true;
	  if (Encoding == dwarf::DW_EH_PE_omit)
	    return false;

	  StringRef Name;
	  if (check(!isValidEncoding(Encoding), "unsupported encoding.") ||
	      parseToken(AsmToken::Comma, "unexpected token in directive") ||
	      check(parseIdentifier(Name), "expected identifier in directive"))
	    return true;

	  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

	  if (IsPersonality)
	    getStreamer().EmitCFIPersonality(Sym, Encoding);
	  else
	    getStreamer().EmitCFILsda(Sym, Encoding);
	  return false;
	}

	/// parseDirectiveCFIRememberState
	///   ::= .cfi_remember_state
	///
	/// Consumes no tokens; forwards to the streamer and always returns false.
	bool AsmParser::parseDirectiveCFIRememberState() {
	  auto &&Out = getStreamer();
	  Out.EmitCFIRememberState();
	  return false;
	}

	/// parseDirectiveCFIRestoreState
	///   ::= .cfi_restore_state
	///
	/// Consumes no tokens; forwards to the streamer's restore-state hook and
	/// always returns false.
	bool AsmParser::parseDirectiveCFIRestoreState() {
	  getStreamer().EmitCFIRestoreState();
	  return false;
	}

	/// parseDirectiveCFISameValue
	///   ::= .cfi_same_value register
	///
	/// Reads one register (name or number) and passes it to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
	  int64_t Reg = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Reg, DirectiveLoc);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFISameValue(Reg);
	  return false;
	}

	/// parseDirectiveCFIRestore
	///   ::= .cfi_restore register
	///
	/// Reads one register (name or number) and passes it to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
	  int64_t Reg = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Reg, DirectiveLoc);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIRestore(Reg);
	  return false;
	}

	/// parseDirectiveCFIEscape
	///   ::= .cfi_escape expression[,...]
	///
	/// Evaluates one or more comma-separated absolute expressions, appends each
	/// value truncated to uint8_t to a byte string, and passes the string to the
	/// streamer. Returns true as soon as an expression fails to parse.
	bool AsmParser::parseDirectiveCFIEscape() {
	  std::string Bytes;
	  int64_t Value;

	  // At least one expression is required; further ones follow commas.
	  while (true) {
	    if (parseAbsoluteExpression(Value))
	      return true;
	    Bytes.push_back(static_cast<uint8_t>(Value));

	    if (getLexer().isNot(AsmToken::Comma))
	      break;
	    Lex();
	  }

	  getStreamer().EmitCFIEscape(Bytes);
	  return false;
	}

	/// parseDirectiveCFIReturnColumn
	///   ::= .cfi_return_column register
	///
	/// Reads one register (name or number) and passes it to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
	  int64_t Reg = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Reg, DirectiveLoc);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIReturnColumn(Reg);
	  return false;
	}

	/// parseDirectiveCFISignalFrame
	///   ::= .cfi_signal_frame
	///
	/// Requires the statement to end right after the directive, then forwards to
	/// the streamer.
	bool AsmParser::parseDirectiveCFISignalFrame() {
	  const bool Failed = parseToken(AsmToken::EndOfStatement,
	                                 "unexpected token in '.cfi_signal_frame'");
	  if (Failed)
	    return true;

	  getStreamer().EmitCFISignalFrame();
	  return false;
	}

	/// parseDirectiveCFIUndefined
	///   ::= .cfi_undefined register
	///
	/// Reads one register (name or number) and passes it to the streamer.
	///
	/// \param DirectiveLoc forwarded to parseRegisterOrRegisterNumber.
	bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
	  int64_t Reg = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Reg, DirectiveLoc);
	  if (Failed)
	    return true;

	  getStreamer().EmitCFIUndefined(Reg);
	  return false;
	}

	/// parseDirectiveAltmacro
	///   ::= .altmacro
	///   ::= .noaltmacro
	///
	/// Rejects any token before the end of the statement, then sets AltMacroMode
	/// to true when \p Directive is ".altmacro" and to false otherwise.
	bool AsmParser::parseDirectiveAltmacro(StringRef Directive) {
	  const bool AtEnd = getLexer().is(AsmToken::EndOfStatement);
	  if (!AtEnd)
	    return TokError("unexpected token in '" + Directive + "' directive");

	  AltMacroMode = (Directive == ".altmacro");
	  return false;
	}

	/// parseDirectiveMacrosOnOff
	///   ::= .macros_on
	///   ::= .macros_off
	///
	/// Requires the statement to end, then enables macros when \p Directive is
	/// ".macros_on" and disables them otherwise.
	bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Directive + "' directive"))
	    return true;

	  const bool Enable = (Directive == ".macros_on");
	  setMacrosEnabled(Enable);
	  return false;
	}

	/// parseDirectiveMacro
	///   ::= .macro name[,] [parameters]
	///
	/// Parses the macro name and parameter list, then scans forward to the
	/// matching '.endm' / '.endmacro' while counting nested '.macro' directives.
	/// Each parameter may carry a ':req' or ':vararg' qualifier and an '=value'
	/// default. The text between the header and the terminator becomes the body,
	/// which is checked by checkForBadMacro and registered with the context.
	///
	/// \param DirectiveLoc location used for the "no matching '.endmacro'" and
	///        "already defined" diagnostics and passed to checkForBadMacro.
	/// \returns false once the macro has been defined; failure paths return true
	///          or the value of the diagnostic helper.
	bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
	  StringRef Name;
	  if (parseIdentifier(Name))
	    return TokError("expected identifier in '.macro' directive");

	  // A comma between the name and the parameters is optional.
	  if (getLexer().is(AsmToken::Comma))
	    Lex();

	  MCAsmMacroParameters Parameters;
	  while (getLexer().isNot(AsmToken::EndOfStatement)) {

	    // Anything following a vararg parameter is an error.
	    if (!Parameters.empty() && Parameters.back().Vararg)
	      return Error(Lexer.getLoc(),
	                   "Vararg parameter '" + Parameters.back().Name +
	                       "' should be last one in the list of parameters.");

	    MCAsmMacroParameter Parameter;
	    if (parseIdentifier(Parameter.Name))
	      return TokError("expected identifier in '.macro' directive");

	    // Emit an error if two (or more) named parameters share the same name
	    for (const MCAsmMacroParameter &CurrParam : Parameters)
	      if (CurrParam.Name.equals(Parameter.Name))
	        return TokError("macro '" + Name + "' has multiple parameters"
	                        " named '" + Parameter.Name + "'");

	    if (Lexer.is(AsmToken::Colon)) {
	      Lex(); // consume ':'

	      SMLoc QualLoc;
	      StringRef Qualifier;

	      QualLoc = Lexer.getLoc();
	      if (parseIdentifier(Qualifier))
	        return Error(QualLoc, "missing parameter qualifier for "
	                              "'" + Parameter.Name + "' in macro '" + Name + "'");

	      if (Qualifier == "req")
	        Parameter.Required = true;
	      else if (Qualifier == "vararg")
	        Parameter.Vararg = true;
	      else
	        return Error(QualLoc, Qualifier + " is not a valid parameter qualifier "
	                              "for '" + Parameter.Name + "' in macro '" + Name + "'");
	    }

	    if (getLexer().is(AsmToken::Equal)) {
	      Lex();

	      SMLoc ParamLoc;

	      ParamLoc = Lexer.getLoc();
	      if (parseMacroArgument(Parameter.Value, /*Vararg=*/false))
	        return true;

	      // A default on a required parameter is accepted with a warning.
	      if (Parameter.Required)
	        Warning(ParamLoc, "pointless default value for required parameter "
	                          "'" + Parameter.Name + "' in macro '" + Name + "'");
	    }

	    Parameters.push_back(std::move(Parameter));

	    // Commas between parameters are optional.
	    if (getLexer().is(AsmToken::Comma))
	      Lex();
	  }

	  // Eat just the end of statement.
	  Lexer.Lex();

	  // Consuming deferred text, so use Lexer.Lex to ignore Lexing Errors
	  AsmToken EndToken, StartToken = getTok();
	  unsigned MacroDepth = 0;
	  // Lex the macro definition.
	  while (true) {
	    // Ignore Lexing errors in macros.
	    while (Lexer.is(AsmToken::Error)) {
	      Lexer.Lex();
	    }

	    // Check whether we have reached the end of the file.
	    if (getLexer().is(AsmToken::Eof))
	      return Error(DirectiveLoc, "no matching '.endmacro' in definition");

	    // Otherwise, check whether we have reached the .endmacro.
	    if (getLexer().is(AsmToken::Identifier)) {
	      if (getTok().getIdentifier() == ".endm" ||
	          getTok().getIdentifier() == ".endmacro") {
	        if (MacroDepth == 0) { // Outermost macro.
	          EndToken = getTok();
	          Lexer.Lex();
	          if (getLexer().isNot(AsmToken::EndOfStatement))
	            return TokError("unexpected token in '" + EndToken.getIdentifier() +
	                            "' directive");
	          break;
	        } else {
	          // Otherwise we just found the end of an inner macro.
	          --MacroDepth;
	        }
	      } else if (getTok().getIdentifier() == ".macro") {
	        // We allow nested macros. Those aren't instantiated until the outermost
	        // macro is expanded so just ignore them for now.
	        ++MacroDepth;
	      }
	    }

	    // Otherwise, scan til the end of the statement.
	    eatToEndOfStatement();
	  }

	  if (getContext().lookupMacro(Name)) {
	    return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
	  }

	  // The body spans from the first token after the header up to (but not
	  // including) the terminating directive token.
	  const char *BodyStart = StartToken.getLoc().getPointer();
	  const char *BodyEnd = EndToken.getLoc().getPointer();
	  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
	  checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
	  MCAsmMacro Macro(Name, Body, std::move(Parameters));
	  DEBUG_WITH_TYPE("asm-macros", dbgs() << "Defining new macro:\n";
	                  Macro.dump());
	  getContext().defineMacro(Name, std::move(Macro));
	  return false;
	}

	/// checkForBadMacro
	///
	/// With the support added for named parameters there may be code out there that
	/// is transitioning from positional parameters. In versions of gas that did
	/// not support named parameters they would be ignored on the macro definition.
	/// But to support both styles of parameters this is not possible so if a macro
	/// definition has named parameters but does not use them and has what appears
	/// to be positional parameters, strings like $1, $2, ... and $n, then issue a
	/// warning that the positional parameters found in the body have no effect.
	/// Hoping the developer will either remove the named parameters from the macro
	/// definition so the positional parameters get used if that was what was
	/// intended or change the macro to use the named parameters. It is possible
	/// this warning will trigger when none of the named parameters are used
	/// and the strings like $1 are in fact simply to be passed through unchanged.
	///
	/// \param DirectiveLoc location the warning is attached to.
	/// \param Name macro name (not read by this function).
	/// \param Body macro body text to scan.
	/// \param Parameters named parameters declared by the macro.
	void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
	                                 StringRef Body,
	                                 ArrayRef<MCAsmMacroParameter> Parameters) {
	  // If this macro is not defined with named parameters the warning we are
	  // checking for here doesn't apply.
	  unsigned NParameters = Parameters.size();
	  if (NParameters == 0)
	    return;

	  bool NamedParametersFound = false;
	  bool PositionalParametersFound = false;

	  // Look at the body of the macro for use of both the named parameters and what
	  // are likely to be positional parameters. This is what expandMacro() is
	  // doing when it finds the parameters in the body.
	  while (!Body.empty()) {
	    // Scan for the next possible parameter.
	    std::size_t End = Body.size(), Pos = 0;
	    for (; Pos != End; ++Pos) {
	      // Check for a substitution or escape.
	      // This macro is defined with parameters, look for \foo, \bar, etc.
	      if (Body[Pos] == '\\' && Pos + 1 != End)
	        break;

	      // This macro should have parameters, but look for $0, $1, ..., $n too.
	      if (Body[Pos] != '$' || Pos + 1 == End)
	        continue;
	      char Next = Body[Pos + 1];
	      if (Next == '$' || Next == 'n' ||
	          isdigit(static_cast<unsigned char>(Next)))
	        break;
	    }

	    // Check if we reached the end.
	    if (Pos == End)
	      break;

	    if (Body[Pos] == '$') {
	      // "$$" is an escaped '$'. The scan above only stops on '$', 'n' or a
	      // digit after the '$', so anything else here ($n or $[0-9]) is a
	      // positional reference.
	      if (Body[Pos + 1] != '$')
	        PositionalParametersFound = true;
	      Pos += 2;
	    } else {
	      // Collect the identifier that follows the backslash.
	      unsigned I = Pos + 1;
	      while (isIdentifierChar(Body[I]) && I + 1 != End)
	        ++I;

	      const char *Begin = Body.data() + Pos + 1;
	      StringRef Argument(Begin, I - (Pos + 1));
	      unsigned Index = 0;
	      for (; Index < NParameters; ++Index)
	        if (Parameters[Index].Name == Argument)
	          break;

	      if (Index == NParameters) {
	        // Not a parameter name: skip over "\()" or the scanned text.
	        if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
	          Pos += 3;
	        else {
	          Pos = I;
	        }
	      } else {
	        NamedParametersFound = true;
	        Pos += 1 + Argument.size();
	      }
	    }
	    // Update the scan point.
	    Body = Body.substr(Pos);
	  }

	  if (!NamedParametersFound && PositionalParametersFound)
	    Warning(DirectiveLoc, "macro defined with named parameters which are not "
	                          "used in macro body, possible positional parameter "
	                          "found in body which will have no effect");
	}

	/// parseDirectiveExitMacro
	///   ::= .exitm
	///
	/// Requires the statement to end and a macro instantiation to be active.
	/// Conditional states are popped until the stack is back at the depth
	/// recorded by the innermost active macro, after which the macro is exited.
	bool AsmParser::parseDirectiveExitMacro(StringRef Directive) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Directive + "' directive"))
	    return true;

	  const bool InMacro = isInsideMacroInstantiation();
	  if (!InMacro)
	    return TokError("unexpected '" + Directive + "' in file, "
	                    "no current macro definition");

	  // Exit all conditionals that are active in the current macro.
	  const auto SavedDepth = ActiveMacros.back()->CondStackDepth;
	  while (TheCondStack.size() != SavedDepth) {
	    TheCondState = TheCondStack.back();
	    TheCondStack.pop_back();
	  }

	  handleMacroExit();
	  return false;
	}

	/// parseDirectiveEndMacro
	///   ::= .endm
	///   ::= .endmacro
	///
	/// Rejects trailing tokens. Inside a macro instantiation the current
	/// instantiation is terminated; outside of one the directive is reported as
	/// unexpected.
	bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
	  if (getLexer().isNot(AsmToken::EndOfStatement))
	    return TokError("unexpected token in '" + Directive + "' directive");

	  // A stray terminator in the file: well formed .endmacro directives are
	  // consumed while the macro definition is being parsed.
	  if (!isInsideMacroInstantiation())
	    return TokError("unexpected '" + Directive + "' in file, "
	                    "no current macro definition");

	  // Inside an instantiation, this ends it.
	  handleMacroExit();
	  return false;
	}

	/// parseDirectivePurgeMacro
	///   ::= .purgem
	///
	/// Reads a macro name followed by the end of the statement. Reports an error
	/// at \p DirectiveLoc if the context has no macro with that name; otherwise
	/// removes the definition from the context.
	bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
	  StringRef Name;
	  SMLoc Loc;
	  if (parseTokenLoc(Loc) ||
	      check(parseIdentifier(Name), Loc,
	            "expected identifier in '.purgem' directive") ||
	      parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.purgem' directive"))
	    return true;

	  if (!getContext().lookupMacro(Name))
	    return Error(DirectiveLoc, "macro '" + Name + "' is not defined");

	  getContext().undefineMacro(Name);
	  DEBUG_WITH_TYPE("asm-macros", dbgs()
	                                    << "Un-defining macro: " << Name << "\n");
	  return false;
	}

	/// parseDirectiveBundleAlignMode
	///   ::= {.bundle_align_mode} expression
	///
	/// Parses one absolute expression, requires the statement to end, and
	/// rejects values outside 0..30 before passing the value to the streamer.
	bool AsmParser::parseDirectiveBundleAlignMode() {
	  // Expect a single argument: an expression that evaluates to a constant
	  // in the inclusive range 0-30.
	  SMLoc ExprLoc = getLexer().getLoc();
	  int64_t AlignSizePow2;
	  if (checkForValidSection() || parseAbsoluteExpression(AlignSizePow2) ||
	      parseToken(AsmToken::EndOfStatement, "unexpected token after expression "
	                                           "in '.bundle_align_mode' "
	                                           "directive") ||
	      check(AlignSizePow2 < 0 || AlignSizePow2 > 30, ExprLoc,
	            "invalid bundle alignment size (expected between 0 and 30)"))
	    return true;

	  // Because of AlignSizePow2's verified range we can safely truncate it to
	  // unsigned.
	  getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
	  return false;
	}

	/// parseDirectiveBundleLock
	///   ::= {.bundle_lock} [align_to_end]
	///
	/// If the statement does not end immediately, the only accepted operand is
	/// the identifier "align_to_end", which must be followed by the end of the
	/// statement. The streamer is told whether that option was present.
	bool AsmParser::parseDirectiveBundleLock() {
	  if (checkForValidSection())
	    return true;
	  bool AlignToEnd = false;

	  StringRef Option;
	  SMLoc Loc = getTok().getLoc();
	  const char *kInvalidOptionError =
	      "invalid option for '.bundle_lock' directive";

	  if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	    // Both a non-identifier and an unknown identifier get the same message.
	    if (check(parseIdentifier(Option), Loc, kInvalidOptionError) ||
	        check(Option != "align_to_end", Loc, kInvalidOptionError) ||
	        parseToken(AsmToken::EndOfStatement,
	                   "unexpected token after '.bundle_lock' directive option"))
	      return true;
	    AlignToEnd = true;
	  }

	  getStreamer().EmitBundleLock(AlignToEnd);
	  return false;
	}

	/// parseDirectiveBundleUnlock
	///   ::= {.bundle_unlock}
	///
	/// Takes no operands: requires a valid section and the end of the statement,
	/// then forwards to the streamer.
	bool AsmParser::parseDirectiveBundleUnlock() {
	  if (checkForValidSection() ||
	      parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.bundle_unlock' directive"))
	    return true;

	  getStreamer().EmitBundleUnlock();
	  return false;
	}

	/// parseDirectiveSpace
	///   ::= (.skip | .space) expression [ , expression ]
	///
	/// Parses the byte-count expression and, after an optional comma, an
	/// absolute fill value (0 when omitted), then emits the fill.
	///
	/// \param IDVal directive spelling, used only in diagnostics.
	bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
	  SMLoc NumBytesLoc = Lexer.getLoc();
	  const MCExpr *NumBytes;
	  if (checkForValidSection() || parseExpression(NumBytes))
	    return true;

	  int64_t FillExpr = 0;
	  // The suffix starts with a space because addErrorSuffix is given text to
	  // attach to an already-produced message, as the other callers in this
	  // file do (" in '.cv_string' directive", " in directive", ...).
	  if (parseOptionalToken(AsmToken::Comma))
	    if (parseAbsoluteExpression(FillExpr))
	      return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	  if (parseToken(AsmToken::EndOfStatement))
	    return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");

	  // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
	  getStreamer().emitFill(*NumBytes, FillExpr, NumBytesLoc);

	  return false;
	}

	/// parseDirectiveDCB
	///   ::= .dcb.{b, l, w} expression, expression
	///
	/// Parses a repeat count and a value, then emits the value \p Size bytes wide
	/// that many times. A negative count produces a warning and emits nothing.
	/// Constant values are range-checked against 8 * Size bits before emission.
	///
	/// \param IDVal directive spelling, used only in diagnostics.
	/// \param Size width in bytes of each emitted value; asserted to be <= 8 on
	///        the constant path.
	bool AsmParser::parseDirectiveDCB(StringRef IDVal, unsigned Size) {
	  SMLoc NumValuesLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection() || parseAbsoluteExpression(NumValues))
	    return true;

	  if (NumValues < 0) {
	    Warning(NumValuesLoc,
	            "'" + Twine(IDVal) +
	                "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  const MCExpr *Value;
	  SMLoc ExprLoc = getLexer().getLoc();
	  if (parseExpression(Value))
	    return true;

	  // Special case constant expressions to match code generator.
	  if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	    assert(Size <= 8 && "Invalid size");
	    uint64_t IntValue = MCE->getValue();
	    // Accept the value if it fits as either an unsigned or a signed integer.
	    if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
	      return Error(ExprLoc, "literal value out of range for directive");
	    for (uint64_t i = 0, e = NumValues; i != e; ++i)
	      getStreamer().EmitIntValue(IntValue, Size);
	  } else {
	    for (uint64_t i = 0, e = NumValues; i != e; ++i)
	      getStreamer().EmitValue(Value, Size, ExprLoc);
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  return false;
	}

	/// parseDirectiveRealDCB
	///   ::= .dcb.{d, s} expression, expression
	///
	/// Parses a repeat count and a real value, requires the statement to end,
	/// and then emits the value's integer representation that many times. A
	/// negative count produces a warning and emits nothing.
	///
	/// \param IDVal directive spelling, used only in diagnostics.
	/// \param Semantics floating-point semantics handed to parseRealValue.
	bool AsmParser::parseDirectiveRealDCB(StringRef IDVal,
	                                      const fltSemantics &Semantics) {
	  SMLoc NumValuesLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection() || parseAbsoluteExpression(NumValues))
	    return true;

	  if (NumValues < 0) {
	    Warning(NumValuesLoc,
	            "'" + Twine(IDVal) +
	                "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  APInt AsInt;
	  if (parseRealValue(Semantics, AsInt))
	    return true;

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  // The emitted width is the bit width of the parsed value, in bytes.
	  for (uint64_t i = 0, e = NumValues; i != e; ++i)
	    getStreamer().EmitIntValue(AsInt.getLimitedValue(),
	                               AsInt.getBitWidth() / 8);

	  return false;
	}

	/// parseDirectiveDS
	///   ::= .ds.{b, d, l, p, s, w, x} expression
	///
	/// Parses a repeat count, requires the statement to end, and calls
	/// emitFill(Size, 0) once per repetition. A negative count produces a
	/// warning and emits nothing.
	///
	/// \param IDVal directive spelling, used only in diagnostics.
	/// \param Size first argument of each emitFill call.
	bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
	  SMLoc NumValuesLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection() || parseAbsoluteExpression(NumValues))
	    return true;

	  if (NumValues < 0) {
	    Warning(NumValuesLoc,
	            "'" + Twine(IDVal) +
	                "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  for (uint64_t i = 0, e = NumValues; i != e; ++i)
	    getStreamer().emitFill(Size, 0);

	  return false;
	}

	/// parseDirectiveLEB128
	/// ::= (.sleb128 \| .uleb128) [ expression (, expression)* ]
	bool AsmParser::parseDirectiveLEB128(bool Signed) {
	if (checkForValidSection())
	return true;

	auto parseOp = [&]() -> bool {
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	if (Signed)
	getStreamer().EmitSLEB128Value(Value);
	else
	getStreamer().EmitULEB128Value(Value);
	return false;
	};

	if (parseMany(parseOp))
	return addErrorSuffix(" in directive");

	return false;
	}

	/// parseDirectiveSymbolAttribute
	///   ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
	///
	/// Applies \p Attr to every symbol named in the operand list. Each name must
	/// be an identifier whose symbol is not temporary, and the streamer must
	/// accept the attribute; otherwise an error is reported at that operand.
	bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
	  // Handles a single operand of the list.
	  auto ApplyToOne = [&]() -> bool {
	    const SMLoc OperandLoc = getTok().getLoc();

	    StringRef SymName;
	    if (parseIdentifier(SymName))
	      return Error(OperandLoc, "expected identifier");

	    MCSymbol *Target = getContext().getOrCreateSymbol(SymName);

	    // Assembler local symbols don't make any sense here. Complain loudly.
	    if (Target->isTemporary())
	      return Error(OperandLoc, "non-local symbol required");

	    const bool Emitted = getStreamer().EmitSymbolAttribute(Target, Attr);
	    if (!Emitted)
	      return Error(OperandLoc, "unable to emit symbol attribute");
	    return false;
	  };

	  const bool Failed = parseMany(ApplyToOne);
	  if (Failed)
	    return addErrorSuffix(" in directive");
	  return false;
	}

	/// parseDirectiveComm
	///   ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
	///
	/// Parses the symbol name, its size, and an optional alignment, validates
	/// them, and emits the symbol as a common (\p IsLocal false) or local common
	/// (\p IsLocal true) symbol. When the target takes the alignment in bytes it
	/// must be a power of two and is converted to its log2 before use.
	bool AsmParser::parseDirectiveComm(bool IsLocal) {
	  if (checkForValidSection())
	    return true;

	  SMLoc IDLoc = getLexer().getLoc();
	  StringRef Name;
	  if (parseIdentifier(Name))
	    return TokError("expected identifier in directive");

	  // Handle the identifier as the key symbol.
	  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

	  if (getLexer().isNot(AsmToken::Comma))
	    return TokError("unexpected token in directive");
	  Lex();

	  int64_t Size;
	  SMLoc SizeLoc = getLexer().getLoc();
	  if (parseAbsoluteExpression(Size))
	    return true;

	  int64_t Pow2Alignment = 0;
	  SMLoc Pow2AlignmentLoc;
	  if (getLexer().is(AsmToken::Comma)) {
	    Lex();
	    Pow2AlignmentLoc = getLexer().getLoc();
	    if (parseAbsoluteExpression(Pow2Alignment))
	      return true;

	    LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
	    if (IsLocal && LCOMM == LCOMM::NoAlignment)
	      return Error(Pow2AlignmentLoc, "alignment not supported on this target");

	    // If this target takes alignments in bytes (not log) validate and convert.
	    if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
	        (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
	      if (!isPowerOf2_64(Pow2Alignment))
	        return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
	      Pow2Alignment = Log2_64(Pow2Alignment);
	    }
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.comm' or '.lcomm' directive"))
	    return true;

	  // NOTE: a size of zero for a .comm should create a undefined symbol
	  // but a size of .lcomm creates a bss symbol of size zero.
	  if (Size < 0)
	    return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
	                          "be less than zero");

	  // NOTE: The alignment in the directive is a power of 2 value, the assembler
	  // may internally end up wanting an alignment in bytes.
	  // FIXME: Diagnose overflow.
	  if (Pow2Alignment < 0)
	    return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
	                                   "alignment, can't be less than zero");

	  Sym->redefineIfPossible();
	  if (!Sym->isUndefined())
	    return Error(IDLoc, "invalid symbol redefinition");

	  // Create the Symbol as a common or local common with Size and Pow2Alignment
	  if (IsLocal) {
	    getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
	    return false;
	  }

	  getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
	  return false;
	}

	/// parseDirectiveAbort
	/// ::= .abort [... message ...]
	bool AsmParser::parseDirectiveAbort() {
	// FIXME: Use loc from directive.
	SMLoc Loc = getLexer().getLoc();

	StringRef Str = parseStringToEndOfStatement();
	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.abort' directive"))
	return true;

	if (Str.empty())
	return Error(Loc, ".abort detected. Assembly stopping.");
	else
	return Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
	// FIXME: Actually abort assembly here.

	return false;
	}

	/// parseDirectiveInclude
	///   ::= .include "filename"
	///
	/// Requires a string token naming the file and nothing else before the end
	/// of the statement, then switches lexing to that file. A failure to enter
	/// the file is reported at the location of the filename token.
	bool AsmParser::parseDirectiveInclude() {
	  // Allow the strings to have escaped octal character sequence.
	  std::string Filename;
	  SMLoc IncludeLoc = getTok().getLoc();

	  if (check(getTok().isNot(AsmToken::String),
	            "expected string in '.include' directive") ||
	      parseEscapedString(Filename) ||
	      check(getTok().isNot(AsmToken::EndOfStatement),
	            "unexpected token in '.include' directive") ||
	      // Attempt to switch the lexer to the included file before consuming the
	      // end of statement to avoid losing it when we switch.
	      check(enterIncludeFile(Filename), IncludeLoc,
	            "Could not find include file '" + Filename + "'"))
	    return true;

	  return false;
	}

	/// parseDirectiveIncbin
	///   ::= .incbin "filename" [ , skip [ , count ] ]
	///
	/// Parses the filename and the optional skip and count operands, then hands
	/// them to processIncbinFile. Skip defaults to 0 and must not be negative;
	/// count is left as a null expression when omitted.
	bool AsmParser::parseDirectiveIncbin() {
	  // Allow the strings to have escaped octal character sequence.
	  std::string Filename;
	  SMLoc IncbinLoc = getTok().getLoc();
	  if (check(getTok().isNot(AsmToken::String),
	            "expected string in '.incbin' directive") ||
	      parseEscapedString(Filename))
	    return true;

	  int64_t Skip = 0;
	  const MCExpr *Count = nullptr;
	  SMLoc SkipLoc, CountLoc;
	  if (parseOptionalToken(AsmToken::Comma)) {
	    // The skip expression can be omitted while specifying the count, e.g:
	    //  .incbin "filename",,4
	    if (getTok().isNot(AsmToken::Comma)) {
	      if (parseTokenLoc(SkipLoc) || parseAbsoluteExpression(Skip))
	        return true;
	    }
	    if (parseOptionalToken(AsmToken::Comma)) {
	      CountLoc = getTok().getLoc();
	      if (parseExpression(Count))
	        return true;
	    }
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.incbin' directive"))
	    return true;

	  if (check(Skip < 0, SkipLoc, "skip is negative"))
	    return true;

	  // Attempt to process the included file.
	  if (processIncbinFile(Filename, Skip, Count, CountLoc))
	    return Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
	  return false;
	}

	/// parseDirectiveIf
	///   ::= .if{,eq,ge,gt,le,lt,ne} expression
	///
	/// Pushes the current conditional state and opens a new IfCond. If the
	/// enclosing state is already being ignored, the rest of the statement is
	/// skipped. Otherwise the expression is evaluated and compared against zero
	/// as selected by \p DirKind (DK_IF and DK_IFNE use the value unchanged),
	/// and the result decides whether the following statements are ignored.
	///
	/// \param DirectiveLoc not read by this function.
	bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	  } else {
	    int64_t ExprValue;
	    if (parseAbsoluteExpression(ExprValue) ||
	        parseToken(AsmToken::EndOfStatement,
	                   "unexpected token in '.if' directive"))
	      return true;

	    // Reduce the value to the truth value of the requested comparison.
	    switch (DirKind) {
	    default:
	      llvm_unreachable("unsupported directive");
	    case DK_IF:
	    case DK_IFNE:
	      break;
	    case DK_IFEQ:
	      ExprValue = ExprValue == 0;
	      break;
	    case DK_IFGE:
	      ExprValue = ExprValue >= 0;
	      break;
	    case DK_IFGT:
	      ExprValue = ExprValue > 0;
	      break;
	    case DK_IFLE:
	      ExprValue = ExprValue <= 0;
	      break;
	    case DK_IFLT:
	      ExprValue = ExprValue < 0;
	      break;
	    }

	    TheCondState.CondMet = ExprValue;
	    TheCondState.Ignore = !TheCondState.CondMet;
	  }

	  return false;
	}

	/// parseDirectiveIfb
	///   ::= .ifb string
	///
	/// Pushes the current conditional state and opens a new IfCond. Unless the
	/// enclosing state is being ignored, the rest of the statement is captured
	/// and the condition is met when its emptiness matches \p ExpectBlank.
	///
	/// \param DirectiveLoc not read by this function.
	bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the operand without evaluating it.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  StringRef Text = parseStringToEndOfStatement();

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.ifb' directive"))
	    return true;

	  const bool IsBlank = Text.empty();
	  TheCondState.CondMet = ExpectBlank == IsBlank;
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfc
	///   ::= .ifc string1, string2
	///   ::= .ifnc string1, string2
	///
	/// Pushes the current conditional state and opens a new IfCond. Unless the
	/// enclosing state is being ignored, the text before the comma and the text
	/// after it are compared after trimming; the condition is met when the
	/// comparison result matches \p ExpectEqual.
	///
	/// \param DirectiveLoc not read by this function.
	bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the operands without evaluating them.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  StringRef Left = parseStringToComma();

	  if (parseToken(AsmToken::Comma, "unexpected token in '.ifc' directive"))
	    return true;

	  StringRef Right = parseStringToEndOfStatement();

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.ifc' directive"))
	    return true;

	  const bool Same = (Left.trim() == Right.trim());
	  TheCondState.CondMet = ExpectEqual == Same;
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfeqs
	///   ::= .ifeqs string1, string2
	///
	/// Expects two string tokens separated by a comma. On success a new IfCond
	/// level is pushed whose condition is met when the string contents compare
	/// equal (or unequal when \p ExpectEqual is false). Nothing is pushed when
	/// a diagnostic is emitted.
	///
	/// \returns true (via TokError) if either string or the comma is missing.
	bool AsmParser::parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual) {
	  // The diagnostics name the directive spelling that was actually used.
	  const char *const MissingString =
	      ExpectEqual ? "expected string parameter for '.ifeqs' directive"
	                  : "expected string parameter for '.ifnes' directive";
	  const char *const MissingComma =
	      ExpectEqual
	          ? "expected comma after first string for '.ifeqs' directive"
	          : "expected comma after first string for '.ifnes' directive";

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(MissingString);
	  StringRef First = getTok().getStringContents();
	  Lex();

	  if (Lexer.isNot(AsmToken::Comma))
	    return TokError(MissingComma);
	  Lex();

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(MissingString);
	  StringRef Second = getTok().getStringContents();
	  Lex();

	  // Only now that the operands are known to be well formed is the new
	  // conditional level opened.
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;
	  const bool StringsMatch = (First == Second);
	  TheCondState.CondMet = (ExpectEqual == StringsMatch);
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfdef
	///   ::= .ifdef symbol
	///
	/// Pushes a new IfCond level. The condition is met when the named symbol
	/// is found in the context and is not undefined, or the opposite when
	/// \p expect_defined is false.
	///
	/// \returns true if the identifier or the end of statement fails to parse.
	bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
	  StringRef Name;
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Inside an ignored region the operand is discarded unevaluated.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  if (check(parseIdentifier(Name), "expected identifier after '.ifdef'") ||
	      parseToken(AsmToken::EndOfStatement, "unexpected token in '.ifdef'"))
	    return true;

	  MCSymbol *Sym = getContext().lookupSymbol(Name);

	  // "Defined" means the symbol exists and is not undefined; the negated
	  // form of the directive is the exact complement of that.
	  const bool IsDefined = Sym && !Sym->isUndefined(false);
	  TheCondState.CondMet = (IsDefined == expect_defined);
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveElseIf
	///   ::= .elseif expression
	///
	/// Only valid directly after an .if or another .elseif. The expression is
	/// evaluated only when the enclosing level is not ignored and no earlier
	/// branch of this conditional has been taken; otherwise the branch is
	/// ignored and the operand is skipped.
	///
	/// \returns true on a misplaced .elseif or a parse error.
	bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
	  if (TheCondState.TheCond != AsmCond::IfCond &&
	      TheCondState.TheCond != AsmCond::ElseIfCond)
	    return Error(DirectiveLoc, "Encountered a .elseif that doesn't follow an"
	                               " .if or an .elseif");
	  TheCondState.TheCond = AsmCond::ElseIfCond;

	  // The top of the stack holds the state that was active when the matching
	  // .if was seen, i.e. the enclosing level.
	  const bool EnclosingIgnored =
	      !TheCondStack.empty() && TheCondStack.back().Ignore;

	  if (EnclosingIgnored || TheCondState.CondMet) {
	    TheCondState.Ignore = true;
	    eatToEndOfStatement();
	    return false;
	  }

	  int64_t ExprValue;
	  if (parseAbsoluteExpression(ExprValue))
	    return true;

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.elseif' directive"))
	    return true;

	  TheCondState.CondMet = ExprValue;
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveElse
	///   ::= .else
	///
	/// Only valid directly after an .if or an .elseif. The .else branch is
	/// ignored when the enclosing level is ignored or when an earlier branch of
	/// this conditional was already taken; otherwise it becomes active.
	///
	/// \returns true on trailing tokens or a misplaced .else.
	bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.else' directive"))
	    return true;

	  if (TheCondState.TheCond != AsmCond::IfCond &&
	      TheCondState.TheCond != AsmCond::ElseIfCond)
	    return Error(DirectiveLoc, "Encountered a .else that doesn't follow "
	                               "an .if or an .elseif");
	  TheCondState.TheCond = AsmCond::ElseCond;

	  // The top of the stack holds the state that was active when the matching
	  // .if was seen, i.e. the enclosing level.
	  const bool EnclosingIgnored =
	      !TheCondStack.empty() && TheCondStack.back().Ignore;
	  TheCondState.Ignore = EnclosingIgnored || TheCondState.CondMet;

	  return false;
	}

	/// parseDirectiveEnd
	///   ::= .end
	///
	/// After the directive's own end of statement, every remaining token is
	/// consumed so the lexer is left positioned at end of file.
	///
	/// \returns true if anything other than an end of statement follows .end.
	bool AsmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
	  const bool HasTrailingTokens = parseToken(
	      AsmToken::EndOfStatement, "unexpected token in '.end' directive");
	  if (HasTrailingTokens)
	    return true;

	  // Drain the lexer; nothing after .end is parsed.
	  for (; Lexer.isNot(AsmToken::Eof);)
	    Lexer.Lex();

	  return false;
	}

	/// parseDirectiveError
	///   ::= .err
	///   ::= .error [string]
	///
	/// Emits an error at \p L. With \p WithMessage set, an optional string
	/// operand supplies the text; without an operand a default message is used.
	/// The directive is skipped silently when the state on top of TheCondStack
	/// is being ignored.
	///
	/// \returns false when skipped, otherwise the result of Error/TokError.
	bool AsmParser::parseDirectiveError(SMLoc L, bool WithMessage) {
	  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  if (!WithMessage)
	    return Error(L, ".err encountered");

	  // No operand: fall back to the generic text.
	  if (!Lexer.isNot(AsmToken::EndOfStatement))
	    return Error(L, ".error directive invoked in source file");

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(".error argument must be a string");

	  StringRef UserText = getTok().getStringContents();
	  Lex();
	  return Error(L, UserText);
	}

	/// parseDirectiveWarning
	///   ::= .warning [string]
	///
	/// Emits a warning at \p L, using the optional string operand as its text
	/// or a default message when the statement ends immediately. The directive
	/// is skipped silently when the state on top of TheCondStack is being
	/// ignored.
	///
	/// \returns false when skipped, true on a malformed operand, otherwise the
	/// result of Warning.
	bool AsmParser::parseDirectiveWarning(SMLoc L) {
	  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  StringRef Text = ".warning directive invoked in source file";

	  // parseOptionalToken consumes the end of statement when it is present,
	  // in which case the default text stands.
	  const bool EndedImmediately = parseOptionalToken(AsmToken::EndOfStatement);
	  if (!EndedImmediately) {
	    if (Lexer.isNot(AsmToken::String))
	      return TokError(".warning argument must be a string");

	    Text = getTok().getStringContents();
	    Lex();
	    if (parseToken(AsmToken::EndOfStatement,
	                   "expected end of statement in '.warning' directive"))
	      return true;
	  }

	  return Warning(L, Text);
	}

	/// parseDirectiveEndIf
	///   ::= .endif
	///
	/// Closes the innermost conditional by restoring the state that was saved
	/// on TheCondStack when it was opened.
	///
	/// \returns true on trailing tokens or when no conditional is open.
	bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.endif' directive"))
	    return true;

	  if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
	    return Error(DirectiveLoc, "Encountered a .endif that doesn't follow "
	                               "an .if or .else");

	  // The stack is known to be non-empty here, so the pop is unconditional.
	  TheCondState = TheCondStack.back();
	  TheCondStack.pop_back();

	  return false;
	}

	void AsmParser::initializeDirectiveKindMap() {
	DirectiveKindMap[".set"] = DK_SET;
	DirectiveKindMap[".equ"] = DK_EQU;
	DirectiveKindMap[".equiv"] = DK_EQUIV;
	DirectiveKindMap[".ascii"] = DK_ASCII;
	DirectiveKindMap[".asciz"] = DK_ASCIZ;
	DirectiveKindMap[".string"] = DK_STRING;
	DirectiveKindMap[".byte"] = DK_BYTE;
	DirectiveKindMap[".short"] = DK_SHORT;
	DirectiveKindMap[".value"] = DK_VALUE;
	DirectiveKindMap[".2byte"] = DK_2BYTE;
	DirectiveKindMap[".long"] = DK_LONG;
	DirectiveKindMap[".int"] = DK_INT;
	DirectiveKindMap[".4byte"] = DK_4BYTE;
	DirectiveKindMap[".quad"] = DK_QUAD;
	DirectiveKindMap[".8byte"] = DK_8BYTE;
	DirectiveKindMap[".octa"] = DK_OCTA;
	DirectiveKindMap[".single"] = DK_SINGLE;
	DirectiveKindMap[".float"] = DK_FLOAT;
	DirectiveKindMap[".double"] = DK_DOUBLE;
	DirectiveKindMap[".align"] = DK_ALIGN;
	DirectiveKindMap[".align32"] = DK_ALIGN32;
	DirectiveKindMap[".balign"] = DK_BALIGN;
	DirectiveKindMap[".balignw"] = DK_BALIGNW;
	DirectiveKindMap[".balignl"] = DK_BALIGNL;
	DirectiveKindMap[".p2align"] = DK_P2ALIGN;
	DirectiveKindMap[".p2alignw"] = DK_P2ALIGNW;
	DirectiveKindMap[".p2alignl"] = DK_P2ALIGNL;
	DirectiveKindMap[".org"] = DK_ORG;
	DirectiveKindMap[".fill"] = DK_FILL;
	DirectiveKindMap[".zero"] = DK_ZERO;
	DirectiveKindMap[".extern"] = DK_EXTERN;
	DirectiveKindMap[".globl"] = DK_GLOBL;
	DirectiveKindMap[".global"] = DK_GLOBAL;
	DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
	DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
	DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
	DirectiveKindMap[".private_extern"] = DK_PRIVATE_EXTERN;
	DirectiveKindMap[".reference"] = DK_REFERENCE;
	DirectiveKindMap[".weak_definition"] = DK_WEAK_DEFINITION;
	DirectiveKindMap[".weak_reference"] = DK_WEAK_REFERENCE;
	DirectiveKindMap[".weak_def_can_be_hidden"] = DK_WEAK_DEF_CAN_BE_HIDDEN;
	DirectiveKindMap[".cold"] = DK_COLD;
	DirectiveKindMap[".comm"] = DK_COMM;
	DirectiveKindMap[".common"] = DK_COMMON;
	DirectiveKindMap[".lcomm"] = DK_LCOMM;
	DirectiveKindMap[".abort"] = DK_ABORT;
	DirectiveKindMap[".include"] = DK_INCLUDE;
	DirectiveKindMap[".incbin"] = DK_INCBIN;
	DirectiveKindMap[".code16"] = DK_CODE16;
	DirectiveKindMap[".code16gcc"] = DK_CODE16GCC;
	DirectiveKindMap[".rept"] = DK_REPT;
	DirectiveKindMap[".rep"] = DK_REPT;
	DirectiveKindMap[".irp"] = DK_IRP;
	DirectiveKindMap[".irpc"] = DK_IRPC;
	DirectiveKindMap[".endr"] = DK_ENDR;
	DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE;
	DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK;
	DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK;
	DirectiveKindMap[".if"] = DK_IF;
	DirectiveKindMap[".ifeq"] = DK_IFEQ;
	DirectiveKindMap[".ifge"] = DK_IFGE;
	DirectiveKindMap[".ifgt"] = DK_IFGT;
	DirectiveKindMap[".ifle"] = DK_IFLE;
	DirectiveKindMap[".iflt"] = DK_IFLT;
	DirectiveKindMap[".ifne"] = DK_IFNE;
	DirectiveKindMap[".ifb"] = DK_IFB;
	DirectiveKindMap[".ifnb"] = DK_IFNB;
	DirectiveKindMap[".ifc"] = DK_IFC;
	DirectiveKindMap[".ifeqs"] = DK_IFEQS;
	DirectiveKindMap[".ifnc"] = DK_IFNC;
	DirectiveKindMap[".ifnes"] = DK_IFNES;
	DirectiveKindMap[".ifdef"] = DK_IFDEF;
	DirectiveKindMap[".ifndef"] = DK_IFNDEF;
	DirectiveKindMap[".ifnotdef"] = DK_IFNOTDEF;
	DirectiveKindMap[".elseif"] = DK_ELSEIF;
	DirectiveKindMap[".else"] = DK_ELSE;
	DirectiveKindMap[".end"] = DK_END;
	DirectiveKindMap[".endif"] = DK_ENDIF;
	DirectiveKindMap[".skip"] = DK_SKIP;
	DirectiveKindMap[".space"] = DK_SPACE;
	DirectiveKindMap[".file"] = DK_FILE;
	DirectiveKindMap[".line"] = DK_LINE;
	DirectiveKindMap[".loc"] = DK_LOC;
	DirectiveKindMap[".stabs"] = DK_STABS;
	DirectiveKindMap[".cv_file"] = DK_CV_FILE;
	DirectiveKindMap[".cv_func_id"] = DK_CV_FUNC_ID;
	DirectiveKindMap[".cv_loc"] = DK_CV_LOC;
	DirectiveKindMap[".cv_linetable"] = DK_CV_LINETABLE;
	DirectiveKindMap[".cv_inline_linetable"] = DK_CV_INLINE_LINETABLE;
	DirectiveKindMap[".cv_inline_site_id"] = DK_CV_INLINE_SITE_ID;
	DirectiveKindMap[".cv_def_range"] = DK_CV_DEF_RANGE;
	DirectiveKindMap[".cv_string"] = DK_CV_STRING;
	DirectiveKindMap[".cv_stringtable"] = DK_CV_STRINGTABLE;
	DirectiveKindMap[".cv_filechecksums"] = DK_CV_FILECHECKSUMS;
	DirectiveKindMap[".cv_filechecksumoffset"] = DK_CV_FILECHECKSUM_OFFSET;
	DirectiveKindMap[".cv_fpo_data"] = DK_CV_FPO_DATA;
	DirectiveKindMap[".sleb128"] = DK_SLEB128;
	DirectiveKindMap[".uleb128"] = DK_ULEB128;
	DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
	DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC;
	DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC;
	DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA;
	DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET;
	DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET;
	DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER;
	DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET;
	DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET;
	DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY;
	DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA;
	DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE;
	DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE;
	DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE;
	DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE;
	DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE;
	DirectiveKindMap[".cfi_return_column"] = DK_CFI_RETURN_COLUMN;
	DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
	DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
	DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
	DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
	DirectiveKindMap[".cfi_b_key_frame"] = DK_CFI_B_KEY_FRAME;
	DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
	DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
	DirectiveKindMap[".macro"] = DK_MACRO;
	DirectiveKindMap[".exitm"] = DK_EXITM;
	DirectiveKindMap[".endm"] = DK_ENDM;
	DirectiveKindMap[".endmacro"] = DK_ENDMACRO;
	DirectiveKindMap[".purgem"] = DK_PURGEM;
	DirectiveKindMap[".err"] = DK_ERR;
	DirectiveKindMap[".error"] = DK_ERROR;
	DirectiveKindMap[".warning"] = DK_WARNING;
	DirectiveKindMap[".altmacro"] = DK_ALTMACRO;
	DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO;
	DirectiveKindMap[".reloc"] = DK_RELOC;
	DirectiveKindMap[".dc"] = DK_DC;
	DirectiveKindMap[".dc.a"] = DK_DC_A;
	DirectiveKindMap[".dc.b"] = DK_DC_B;
	DirectiveKindMap[".dc.d"] = DK_DC_D;
	DirectiveKindMap[".dc.l"] = DK_DC_L;
	DirectiveKindMap[".dc.s"] = DK_DC_S;
	DirectiveKindMap[".dc.w"] = DK_DC_W;
	DirectiveKindMap[".dc.x"] = DK_DC_X;
	DirectiveKindMap[".dcb"] = DK_DCB;
	DirectiveKindMap[".dcb.b"] = DK_DCB_B;
	DirectiveKindMap[".dcb.d"] = DK_DCB_D;
	DirectiveKindMap[".dcb.l"] = DK_DCB_L;
	DirectiveKindMap[".dcb.s"] = DK_DCB_S;
	DirectiveKindMap[".dcb.w"] = DK_DCB_W;
	DirectiveKindMap[".dcb.x"] = DK_DCB_X;
	DirectiveKindMap[".ds"] = DK_DS;
	DirectiveKindMap[".ds.b"] = DK_DS_B;
	DirectiveKindMap[".ds.d"] = DK_DS_D;
	DirectiveKindMap[".ds.l"] = DK_DS_L;
	DirectiveKindMap[".ds.p"] = DK_DS_P;
	DirectiveKindMap[".ds.s"] = DK_DS_S;
	DirectiveKindMap[".ds.w"] = DK_DS_W;
	DirectiveKindMap[".ds.x"] = DK_DS_X;
	DirectiveKindMap[".print"] = DK_PRINT;
	DirectiveKindMap[".addrsig"] = DK_ADDRSIG;
	DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM;
	}

	/// Scan forward from the current token to the '.endr' that closes the
	/// current .rep/.rept/.irp/.irpc body, honouring nested occurrences of
	/// those directives, and record the text in between as an anonymous macro.
	///
	/// On return the lexer sits on the end of statement that follows the
	/// matching '.endr'.
	///
	/// \returns a pointer to the new entry in MacroLikeBodies, or nullptr after
	/// printing an error (end of file reached, or tokens after '.endr').
	MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
	  AsmToken EndToken, StartToken = getTok();

	  unsigned NestLevel = 0;
	  while (true) {
	    // Check whether we have reached the end of the file.
	    if (getLexer().is(AsmToken::Eof)) {
	      printError(DirectiveLoc, "no matching '.endr' in definition");
	      return nullptr;
	    }

	    if (Lexer.is(AsmToken::Identifier)) {
	      StringRef Ident = getTok().getIdentifier();

	      if (Ident == ".rep" || Ident == ".rept" || Ident == ".irp" ||
	          Ident == ".irpc") {
	        // A nested repeat block needs its own '.endr'.
	        ++NestLevel;
	      } else if (Ident == ".endr") {
	        if (NestLevel == 0) {
	          // This is the '.endr' that closes the body being collected.
	          EndToken = getTok();
	          Lex();
	          if (Lexer.isNot(AsmToken::EndOfStatement)) {
	            printError(getTok().getLoc(),
	                       "unexpected token in '.endr' directive");
	            return nullptr;
	          }
	          break;
	        }
	        --NestLevel;
	      }
	    }

	    // Otherwise, scan till the end of the statement.
	    eatToEndOfStatement();
	  }

	  // The body is the raw source text from the first token up to, but not
	  // including, the closing '.endr'.
	  const char *BodyStart = StartToken.getLoc().getPointer();
	  const char *BodyEnd = EndToken.getLoc().getPointer();
	  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);

	  // We Are Anonymous.
	  MacroLikeBodies.emplace_back(StringRef(), Body, MCAsmMacroParameters());
	  return &MacroLikeBodies.back();
	}

	/// Switch lexing over to the expanded text accumulated in \p OS.
	///
	/// A terminating ".endr" line is appended to the text, the result is copied
	/// into a new "<instantiation>" source buffer, a MacroInstantiation that
	/// remembers the current buffer, location and conditional-stack depth is
	/// pushed onto ActiveMacros, and the lexer is pointed at the new buffer and
	/// primed with its first token.
	void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
	                                         raw_svector_ostream &OS) {
	  // Terminate the expansion so the instantiation can be left again.
	  OS << ".endr\n";

	  std::unique_ptr<MemoryBuffer> ExpandedText =
	      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");

	  // Record where to resume before CurBuffer is redirected below.
	  ActiveMacros.push_back(new MacroInstantiation(
	      DirectiveLoc, CurBuffer, getTok().getLoc(), TheCondStack.size()));

	  // Jump to the macro instantiation and prime the lexer.
	  CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(ExpandedText), SMLoc());
	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	  Lex();
	}

	/// parseDirectiveRept
	///   ::= .rep | .rept count
	///
	/// Evaluates the repeat count, collects the body up to the matching
	/// '.endr', expands it \c count times into a buffer and switches the lexer
	/// to that buffer. \p Dir is the directive spelling used in diagnostics.
	///
	/// \returns true on a non-absolute or negative count, a parse error, a
	/// missing '.endr', or a failed expansion.
	bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
	  const MCExpr *CountExpr;
	  SMLoc CountLoc = getTok().getLoc();
	  if (parseExpression(CountExpr))
	    return true;

	  int64_t Count;
	  if (!CountExpr->evaluateAsAbsolute(Count, getStreamer().getAssemblerPtr()))
	    return Error(CountLoc, "unexpected token in '" + Dir + "' directive");

	  if (check(Count < 0, CountLoc, "Count is negative") ||
	      parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Dir + "' directive"))
	    return true;

	  // Lex the rept definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);
	  // Count is known to be non-negative here.
	  for (int64_t Remaining = Count; Remaining != 0; --Remaining) {
	    // Note that the AtPseudoVariable is disabled for instantiations of
	    // .rep(t).
	    if (expandMacro(OS, M->Body, None, None, false, getTok().getLoc()))
	      return true;
	  }
	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// parseDirectiveIrp
	///   ::= .irp symbol,values
	///
	/// Parses the parameter name and the list of values, collects the body up
	/// to the matching '.endr', expands the body once per value with the
	/// parameter bound to that value, and switches the lexer to the result.
	///
	/// \returns true on a parse error, a missing '.endr', or a failed
	/// expansion.
	bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
	  MCAsmMacroParameter Parameter;
	  MCAsmMacroArguments A;
	  if (check(parseIdentifier(Parameter.Name),
	            "expected identifier in '.irp' directive") ||
	      parseToken(AsmToken::Comma, "expected comma in '.irp' directive") ||
	      parseMacroArguments(nullptr, A) ||
	      parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
	    return true;

	  // Lex the irp definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);

	  for (const MCAsmMacroArgument &Arg : A) {
	    // Note that the AtPseudoVariable is enabled for instantiations of .irp.
	    // This is undocumented, but GAS seems to support it.
	    if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc()))
	      return true;
	  }

	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// parseDirectiveIrpc
	///   ::= .irpc symbol,values
	///
	/// Like .irp, but the value operand must be a single argument made of a
	/// single token; the body is expanded once per character of that token's
	/// text, with the parameter bound to the one-character string.
	///
	/// \returns true on a parse error, a malformed value operand, a missing
	/// '.endr', or a failed expansion.
	bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
	  MCAsmMacroParameter Parameter;
	  MCAsmMacroArguments A;

	  if (check(parseIdentifier(Parameter.Name),
	            "expected identifier in '.irpc' directive") ||
	      parseToken(AsmToken::Comma, "expected comma in '.irpc' directive") ||
	      parseMacroArguments(nullptr, A))
	    return true;

	  // Exactly one argument consisting of exactly one token is accepted.
	  if (A.size() != 1 || A.front().size() != 1)
	    return TokError("unexpected token in '.irpc' directive");

	  // Eat the end of statement.
	  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
	    return true;

	  // Lex the irpc definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);

	  StringRef Values = A.front().front().getString();
	  for (std::size_t I = 0, End = Values.size(); I != End; ++I) {
	    // Bind the parameter to the I-th character as an identifier token.
	    MCAsmMacroArgument Arg;
	    Arg.emplace_back(AsmToken::Identifier, Values.slice(I, I + 1));

	    // Note that the AtPseudoVariable is enabled for instantiations of .irpc.
	    // This is undocumented, but GAS seems to support it.
	    if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc()))
	      return true;
	  }

	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// Handle a '.endr' reached during normal statement parsing.
	///
	/// With no active macro instantiation the directive is unmatched and an
	/// error is reported; otherwise the current instantiation is left via
	/// handleMacroExit().
	///
	/// \returns true (via TokError) for an unmatched '.endr', false otherwise.
	bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
	  const bool InsideInstantiation = !ActiveMacros.empty();
	  if (!InsideInstantiation)
	    return TokError("unmatched '.endr' directive");

	  // The only .endr that should get here are the ones appended by
	  // instantiateMacroLikeBody, which are followed directly by a newline.
	  assert(getLexer().is(AsmToken::EndOfStatement));

	  handleMacroExit();
	  return false;
	}

	/// Parse the operand of an MS-style `_emit` and record an AOK_Emit rewrite
	/// of length \p Len at \p IDLoc in \p Info.
	///
	/// The operand must be a constant expression whose value fits in either an
	/// unsigned or a signed 8-bit integer.
	///
	/// \returns true on a parse error or an unacceptable operand.
	bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
	                                     size_t Len) {
	  SMLoc OperandLoc = getLexer().getLoc();
	  const MCExpr *Operand;
	  if (parseExpression(Operand))
	    return true;

	  const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Operand);
	  if (!Constant)
	    return Error(OperandLoc, "unexpected expression in _emit");

	  uint64_t ByteValue = Constant->getValue();
	  const bool FitsInByte = isUInt<8>(ByteValue) || isInt<8>(ByteValue);
	  if (!FitsInByte)
	    return Error(OperandLoc, "literal value out of range for directive");

	  Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len);
	  return false;
	}

	/// Parse the operand of an MS-style `align` and record an AOK_Align rewrite
	/// at \p IDLoc carrying the base-2 logarithm of the alignment.
	///
	/// The operand must be a constant expression whose value is a power of two.
	///
	/// \returns true on a parse error or an unacceptable operand.
	bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
	  SMLoc OperandLoc = getLexer().getLoc();
	  const MCExpr *Operand;
	  if (parseExpression(Operand))
	    return true;

	  const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Operand);
	  if (!Constant)
	    return Error(OperandLoc, "unexpected expression in align");

	  uint64_t Alignment = Constant->getValue();
	  if (!isPowerOf2_64(Alignment))
	    return Error(OperandLoc,
	                 "literal value not a power of two greater then zero");

	  // The rewrite covers 5 characters starting at IDLoc.
	  Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(Alignment));
	  return false;
	}

	/// Handle '.print "string"': write the string's contents followed by a
	/// newline to llvm::outs().
	///
	/// The operand must be a string token whose spelling starts with a double
	/// quote, and nothing else may follow it on the statement.
	///
	/// \returns true on a missing or wrongly quoted string or on trailing
	/// tokens.
	bool AsmParser::parseDirectivePrint(SMLoc DirectiveLoc) {
	  // Take a copy of the operand token before advancing past it.
	  const AsmToken StrTok = getTok();
	  Lex();
	  if (StrTok.isNot(AsmToken::String) || StrTok.getString().front() != '"')
	    return Error(DirectiveLoc, "expected double quoted string after .print");
	  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
	    return true;
	  llvm::outs() << StrTok.getStringContents() << '\n';
	  return false;
	}

	/// Forward to the streamer's EmitAddrsig(). No operands are parsed here and
	/// the function always returns false (no error).
	/// NOTE(review): presumably the handler for '.addrsig' (DK_ADDRSIG); the
	/// dispatch site is not visible in this part of the file.
	bool AsmParser::parseDirectiveAddrsig() {
	getStreamer().EmitAddrsig();
	return false;
	}

	bool AsmParser::parseDirectiveAddrsigSym() {
	StringRef Name;
	if (check(parseIdentifier(Name),
	"expected identifier in '.addrsig_sym' directive"))
	return true;
	MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
	getStreamer().EmitAddrsigSym(Sym);
	return false;
	}

	// Three-way comparator for array_pod_sort over AsmRewrite entries.
	//
	// Rewrites are ordered by source position first. The positions are pointers
	// into one and the same string, so comparing them is deterministic.
	static int rewritesSort(const AsmRewrite *AsmRewriteA,
	                        const AsmRewrite *AsmRewriteB) {
	  const char *const PosA = AsmRewriteA->Loc.getPointer();
	  const char *const PosB = AsmRewriteB->Loc.getPointer();
	  if (PosA < PosB)
	    return -1;
	  if (PosB < PosA)
	    return 1;

	  // Several rewrites may target the same location (a SizeDirective, an
	  // Imm/ImmPrefix and an Input/Output). Break the tie with the precedence
	  // table so the SizeDirective is applied first, then the Imm/ImmPrefix and
	  // finally the Input/Output: the higher precedence sorts earlier.
	  const auto RankA = AsmRewritePrecedence[AsmRewriteA->Kind];
	  const auto RankB = AsmRewritePrecedence[AsmRewriteB->Kind];
	  if (RankA > RankB)
	    return -1;
	  if (RankA < RankB)
	    return 1;

	  // Same location and same precedence: the order would be arbitrary.
	  llvm_unreachable("Unstable rewrite sort.");
	}

	/// Parse a block of MS-style inline assembly and rewrite it into the string
	/// form stored in \p AsmString.
	///
	/// Every statement of the main source buffer is parsed; operands that refer
	/// to declarations are collected as inputs or outputs, registers written by
	/// the instructions are collected as clobbers, and the recorded AsmRewrite
	/// entries are then applied to the original text in source order.
	///
	/// \param AsmLoc       not referenced in this function body.
	/// \param AsmString    receives the rewritten assembly text.
	/// \param NumOutputs   receives the number of output operands found.
	/// \param NumInputs    receives the number of input operands found.
	/// \param OpDecls      receives (declaration, needs-address-of) pairs,
	///                     outputs first, then inputs.
	/// \param Constraints  receives one constraint string per OpDecls entry;
	///                     output constraints are prefixed with "=".
	/// \param Clobbers     receives the printed names of the unique clobbered
	///                     registers.
	/// \param MII          used to look up the descriptor of each parsed opcode.
	/// \param IP           used to print clobber register names.
	/// \param SI           passed through to parseStatement.
	/// \returns true if a statement failed to parse (pending errors are printed
	/// first), false on success.
	bool AsmParser::parseMSInlineAsm(
	void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
	unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
	SmallVectorImpl<std::string> &Constraints,
	SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
	const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
	// Inputs and outputs are gathered separately and merged after parsing.
	SmallVector<void *, 4> InputDecls;
	SmallVector<void *, 4> OutputDecls;
	SmallVector<bool, 4> InputDeclsAddressOf;
	SmallVector<bool, 4> OutputDeclsAddressOf;
	SmallVector<std::string, 4> InputConstraints;
	SmallVector<std::string, 4> OutputConstraints;
	SmallVector<unsigned, 4> ClobberRegs;

	// Rewrites recorded by the statement parser and by the operand loop below.
	SmallVector<AsmRewrite, 4> AsmStrRewrites;

	// Prime the lexer.
	Lex();

	// While we have input, parse each statement.
	// InputIdx/OutputIdx become the N of the "$N" operand references emitted
	// in the rewrite loop further down.
	unsigned InputIdx = 0;
	unsigned OutputIdx = 0;
	while (getLexer().isNot(AsmToken::Eof)) {
	// Parse curly braces marking block start/end
	if (parseCurlyBlockScope(AsmStrRewrites))
	continue;

	ParseStatementInfo Info(&AsmStrRewrites);
	bool StatementErr = parseStatement(Info, &SI);

	if (StatementErr \|\| Info.ParseError) {
	// Emit pending errors if any exist.
	printPendingErrors();
	return true;
	}

	// No pending error should exist here.
	assert(!hasPendingError() && "unexpected error from parseStatement");

	// NOTE(review): ~0U presumably marks a statement that produced no
	// instruction; such statements have no descriptor to inspect.
	if (Info.Opcode == ~0U)
	continue;

	const MCInstrDesc &Desc = MII->get(Info.Opcode);

	// Build the list of clobbers, outputs and inputs.
	// The loop starts at index 1; entry 0 of ParsedOperands is not examined.
	for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
	MCParsedAsmOperand &Operand = *Info.ParsedOperands[i];

	// Immediate.
	if (Operand.isImm())
	continue;

	// Register operand.
	if (Operand.isReg() && !Operand.needAddressOf() &&
	!getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) {
	unsigned NumDefs = Desc.getNumDefs();
	// Clobber.
	// Only registers in a def position of the instruction are recorded.
	if (NumDefs && Operand.getMCOperandNum() < NumDefs)
	ClobberRegs.push_back(Operand.getReg());
	continue;
	}

	// Expr/Input or Output.
	StringRef SymName = Operand.getSymName();
	if (SymName.empty())
	continue;

	void *OpDecl = Operand.getOpDecl();
	if (!OpDecl)
	continue;

	// An operand is an output only when it is the first operand of an
	// instruction whose descriptor says it may store.
	bool isOutput = (i == 1) && Desc.mayStore();
	SMLoc Start = SMLoc::getFromPointer(SymName.data());
	if (isOutput) {
	// Each output shifts the first input number up by one, so inputs are
	// numbered after all outputs when "$N" references are written below.
	++InputIdx;
	OutputDecls.push_back(OpDecl);
	OutputDeclsAddressOf.push_back(Operand.needAddressOf());
	OutputConstraints.push_back(("=" + Operand.getConstraint()).str());
	AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size());
	} else {
	InputDecls.push_back(OpDecl);
	InputDeclsAddressOf.push_back(Operand.needAddressOf());
	InputConstraints.push_back(Operand.getConstraint().str());
	AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size());
	}
	}

	// Consider implicit defs to be clobbers. Think of cpuid and push.
	ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(),
	Desc.getNumImplicitDefs());
	ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end());
	}

	// Set the number of Outputs and Inputs.
	NumOutputs = OutputDecls.size();
	NumInputs = InputDecls.size();

	// Set the unique clobbers.
	// Sorting first makes duplicates adjacent so std::unique can drop them.
	array_pod_sort(ClobberRegs.begin(), ClobberRegs.end());
	ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()),
	ClobberRegs.end());
	Clobbers.assign(ClobberRegs.size(), std::string());
	for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) {
	raw_string_ostream OS(Clobbers[I]);
	IP->printRegName(OS, ClobberRegs[I]);
	}

	// Merge the various outputs and inputs. Output are expected first.
	if (NumOutputs \|\| NumInputs) {
	unsigned NumExprs = NumOutputs + NumInputs;
	OpDecls.resize(NumExprs);
	Constraints.resize(NumExprs);
	for (unsigned i = 0; i < NumOutputs; ++i) {
	OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]);
	Constraints[i] = OutputConstraints[i];
	}
	for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
	OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]);
	Constraints[j] = InputConstraints[i];
	}
	}

	// Build the IR assembly string.
	std::string AsmStringIR;
	raw_string_ostream OS(AsmStringIR);
	StringRef ASMString =
	SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer();
	// AsmStart is a cursor into the original text: everything before it has
	// already been copied or replaced.
	const char *AsmStart = ASMString.begin();
	const char *AsmEnd = ASMString.end();
	// Apply the rewrites in source order (see rewritesSort for tie-breaking).
	array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
	for (const AsmRewrite &AR : AsmStrRewrites) {
	AsmRewriteKind Kind = AR.Kind;

	const char *Loc = AR.Loc.getPointer();
	assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");

	// Emit everything up to the immediate/expression.
	if (unsigned Len = Loc - AsmStart)
	OS << StringRef(AsmStart, Len);

	// Skip the original expression.
	if (Kind == AOK_Skip) {
	AsmStart = Loc + AR.Len;
	continue;
	}

	// Extra characters of original text to drop beyond AR.Len; only the
	// AOK_Align case sets it.
	unsigned AdditionalSkip = 0;
	// Rewrite expressions in $N notation.
	switch (Kind) {
	default:
	break;
	case AOK_IntelExpr:
	// Written as: [base + index * $$scale + $$imm], with each part emitted
	// only when present and the brackets only when NeedBracs is set.
	assert(AR.IntelExp.isValid() && "cannot write invalid intel expression");
	if (AR.IntelExp.NeedBracs)
	OS << "[";
	if (AR.IntelExp.hasBaseReg())
	OS << AR.IntelExp.BaseReg;
	if (AR.IntelExp.hasIndexReg())
	OS << (AR.IntelExp.hasBaseReg() ? " + " : "")
	<< AR.IntelExp.IndexReg;
	if (AR.IntelExp.Scale > 1)
	OS << " * $$" << AR.IntelExp.Scale;
	if (AR.IntelExp.Imm \|\| !AR.IntelExp.hasRegs())
	OS << (AR.IntelExp.hasRegs() ? " + $$" : "$$") << AR.IntelExp.Imm;
	if (AR.IntelExp.NeedBracs)
	OS << "]";
	break;
	case AOK_Label:
	OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
	break;
	case AOK_Input:
	OS << '$' << InputIdx++;
	break;
	case AOK_Output:
	OS << '$' << OutputIdx++;
	break;
	case AOK_SizeDirective:
	// AR.Val is the operand size; sizes not listed emit nothing.
	switch (AR.Val) {
	default: break;
	case 8: OS << "byte ptr "; break;
	case 16: OS << "word ptr "; break;
	case 32: OS << "dword ptr "; break;
	case 64: OS << "qword ptr "; break;
	case 80: OS << "xword ptr "; break;
	case 128: OS << "xmmword ptr "; break;
	case 256: OS << "ymmword ptr "; break;
	}
	break;
	case AOK_Emit:
	OS << ".byte";
	break;
	case AOK_Align: {
	// MS alignment directives are measured in bytes. If the native assembler
	// measures alignment in bytes, we can pass it straight through.
	OS << ".align";
	if (getContext().getAsmInfo()->getAlignmentIsInBytes())
	break;

	// Alignment is in log2 form, so print that instead and skip the original
	// immediate.
	unsigned Val = AR.Val;
	OS << ' ' << Val;
	assert(Val < 10 && "Expected alignment less then 2^10.");
	// NOTE(review): presumably one separator plus the decimal digits of the
	// original value 2^Val (1 digit for Val < 4, 2 for Val < 7, else 3).
	AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4;
	break;
	}
	case AOK_EVEN:
	OS << ".even";
	break;
	case AOK_EndOfStatement:
	OS << "\n\t";
	break;
	}

	// Skip the original expression.
	AsmStart = Loc + AR.Len + AdditionalSkip;
	}

	// Emit the remainder of the asm string.
	if (AsmStart != AsmEnd)
	OS << StringRef(AsmStart, AsmEnd - AsmStart);

	AsmString = OS.str();
	return false;
	}

	namespace llvm {
	namespace MCParserUtils {

	/// Returns whether the given symbol is used anywhere in the given expression,
	/// or subexpressions.
	static bool isSymbolUsedInExpression(const MCSymbol Sym, const MCExpr Value) {
	switch (Value->getKind()) {
	case MCExpr::Binary: {
	const MCBinaryExpr BE = static_cast<const MCBinaryExpr >(Value);
	return isSymbolUsedInExpression(Sym, BE->getLHS()) \|\|
	isSymbolUsedInExpression(Sym, BE->getRHS());
	}
	case MCExpr::Target:
	case MCExpr::Constant:
	return false;
	case MCExpr::SymbolRef: {
	const MCSymbol &S =
	static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
	if (S.isVariable())
	return isSymbolUsedInExpression(Sym, S.getVariableValue());
	return &S == Sym;
	}
	case MCExpr::Unary:
	return isSymbolUsedInExpression(
	Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
	}

	llvm_unreachable("Unknown expr kind!");
	}

	/// Parse the right-hand side of an assignment to \p Name and check that
	/// \p Name may be assigned.
	///
	/// \param Name         the symbol name on the left-hand side.
	/// \param allow_redef  whether redefinition is permitted; also stored on
	///                     the symbol via setRedefinable().
	/// \param Parser       the parser to read the expression from.
	/// \param Sym          receives the existing or newly created symbol. It is
	///                     left null when \p Name is "." and no symbol of that
	///                     name exists; that case is emitted directly as an
	///                     offset adjustment.
	/// \param Value        receives the parsed expression.
	/// \returns true on a parse error or a rejected assignment.
	bool parseAssignmentExpression(StringRef Name, bool allow_redef,
	                               MCAsmParser &Parser, MCSymbol *&Sym,
	                               const MCExpr *&Value) {

	  // FIXME: Use better location, we should use proper tokens.
	  SMLoc EqualLoc = Parser.getTok().getLoc();
	  if (Parser.parseExpression(Value))
	    return Parser.TokError("missing expression");

	  // Note: we don't count b as used in "a = b". This is to allow
	  // a = b
	  // b = c

	  if (Parser.parseToken(AsmToken::EndOfStatement))
	    return true;

	  // Validate that the LHS is allowed to be a variable (either it has not been
	  // used as a symbol, or it is an absolute symbol).
	  Sym = Parser.getContext().lookupSymbol(Name);
	  if (Sym) {
	    // Diagnose assignment to a label.
	    //
	    // FIXME: Diagnostics. Note the location of the definition as a label.
	    // FIXME: Diagnose assignment to protected identifier (e.g., register name).
	    if (isSymbolUsedInExpression(Sym, Value))
	      return Parser.Error(EqualLoc, "Recursive use of '" + Name + "'");
	    else if (Sym->isUndefined(/*SetUsed*/ false) && !Sym->isUsed() &&
	             !Sym->isVariable())
	      ; // Allow redefinitions of undefined symbols only used in directives.
	    else if (Sym->isVariable() && !Sym->isUsed() && allow_redef)
	      ; // Allow redefinitions of variables that haven't yet been used.
	    else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef))
	      return Parser.Error(EqualLoc, "redefinition of '" + Name + "'");
	    else if (!Sym->isVariable())
	      return Parser.Error(EqualLoc, "invalid assignment to '" + Name + "'");
	    else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
	      return Parser.Error(EqualLoc,
	                          "invalid reassignment of non-absolute variable '" +
	                              Name + "'");
	  } else if (Name == ".") {
	    // Assigning to "." moves the current location; no symbol is created.
	    Parser.getStreamer().emitValueToOffset(Value, 0, EqualLoc);
	    return false;
	  } else
	    Sym = Parser.getContext().getOrCreateSymbol(Name);

	  Sym->setRedefinable(allow_redef);

	  return false;
	}

	} // end namespace MCParserUtils
	} // end namespace llvm

	/// Create an MCAsmParser instance.
	///
	/// Allocates an AsmParser with `new`, forwarding all five arguments to its
	/// constructor unchanged, and returns the raw pointer.
	/// NOTE(review): nothing here frees the object, so the caller presumably
	/// takes ownership — confirm against the callers.
	MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
	MCStreamer &Out, const MCAsmInfo &MAI,
	unsigned CB) {
	return new AsmParser(SM, C, Out, MAI, CB);
	}
	Index: vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Object/RelocationResolver.cpp (revision 351303)
	@@ -1,550 +1,550 @@
	//===- RelocationResolver.cpp ------------------------------------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines utilities to resolve relocations in object files.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Object/RelocationResolver.h"

	namespace llvm {
	namespace object {

	/// Reads the explicit addend of an ELF relocation. If the addend cannot be
	/// obtained, the error message is passed to report_fatal_error.
	static int64_t getELFAddend(RelocationRef R) {
	  Expected<int64_t> MaybeAddend = ELFRelocationRef(R).getAddend();
	  handleAllErrors(MaybeAddend.takeError(), [](const ErrorInfoBase &Info) {
	    report_fatal_error(Info.message());
	  });
	  return *MaybeAddend;
	}

	/// Returns true when \p Type is one of the x86-64 ELF relocation types that
	/// resolveX86_64 knows how to evaluate.
	static bool supportsX86_64(uint64_t Type) {
	  return Type == ELF::R_X86_64_NONE || Type == ELF::R_X86_64_64 ||
	         Type == ELF::R_X86_64_DTPOFF32 || Type == ELF::R_X86_64_DTPOFF64 ||
	         Type == ELF::R_X86_64_PC32 || Type == ELF::R_X86_64_32 ||
	         Type == ELF::R_X86_64_32S;
	}

	/// Evaluates an x86-64 ELF relocation from \p S, \p A and the relocation's own
	/// addend/offset. NONE yields \p A unchanged; the 32-bit forms are truncated
	/// to 32 bits; unsupported types are unreachable.
	static uint64_t resolveX86_64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_X86_64_NONE)
	    return A;
	  if (Kind == ELF::R_X86_64_64 || Kind == ELF::R_X86_64_DTPOFF32 ||
	      Kind == ELF::R_X86_64_DTPOFF64)
	    return S + getELFAddend(R);
	  if (Kind == ELF::R_X86_64_PC32)
	    return S + getELFAddend(R) - R.getOffset();
	  if (Kind == ELF::R_X86_64_32 || Kind == ELF::R_X86_64_32S)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the AArch64 ELF relocation types handled by
	/// resolveAArch64 (ABS32 and ABS64).
	static bool supportsAArch64(uint64_t Type) {
	  return Type == ELF::R_AARCH64_ABS32 || Type == ELF::R_AARCH64_ABS64;
	}

	/// Evaluates an AArch64 ELF relocation as S plus the relocation's addend;
	/// ABS32 is truncated to 32 bits. Unsupported types are unreachable.
	static uint64_t resolveAArch64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_AARCH64_ABS32)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  if (Kind == ELF::R_AARCH64_ABS64)
	    return S + getELFAddend(R);
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the BPF ELF relocation types handled by resolveBPF
	/// (R_BPF_64_32 and R_BPF_64_64).
	static bool supportsBPF(uint64_t Type) {
	  return Type == ELF::R_BPF_64_32 || Type == ELF::R_BPF_64_64;
	}

	// Evaluates a BPF ELF relocation. This block is a diff hunk: the lines marked
	// '-' (old revision) returned S alone, while the lines marked '+' (new
	// revision) add A to S. R_BPF_64_32 is truncated to 32 bits in both revisions.
	// Unsupported relocation types are unreachable.
	static uint64_t resolveBPF(RelocationRef R, uint64_t S, uint64_t A) {
	switch (R.getType()) {
	case ELF::R_BPF_64_32:
	- return S & 0xFFFFFFFF;
	+ return (S + A) & 0xFFFFFFFF;
	case ELF::R_BPF_64_64:
	- return S;
	+ return S + A;
	default:
	llvm_unreachable("Invalid relocation type");
	}
	}

	/// Returns true for the MIPS64 ELF relocation types handled by resolveMips64.
	static bool supportsMips64(uint64_t Type) {
	  return Type == ELF::R_MIPS_32 || Type == ELF::R_MIPS_64 ||
	         Type == ELF::R_MIPS_TLS_DTPREL64;
	}

	/// Evaluates a MIPS64 ELF relocation as S plus the relocation's addend.
	/// R_MIPS_32 is truncated to 32 bits and R_MIPS_TLS_DTPREL64 additionally
	/// subtracts 0x8000. Unsupported types are unreachable.
	static uint64_t resolveMips64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_MIPS_32)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  if (Kind == ELF::R_MIPS_64)
	    return S + getELFAddend(R);
	  if (Kind == ELF::R_MIPS_TLS_DTPREL64)
	    return S + getELFAddend(R) - 0x8000;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the PPC64 ELF relocation types handled by resolvePPC64
	/// (ADDR32 and ADDR64).
	static bool supportsPPC64(uint64_t Type) {
	  return Type == ELF::R_PPC64_ADDR32 || Type == ELF::R_PPC64_ADDR64;
	}

	/// Evaluates a PPC64 ELF relocation as S plus the relocation's addend; ADDR32
	/// is truncated to 32 bits. Unsupported types are unreachable.
	static uint64_t resolvePPC64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_PPC64_ADDR32)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  if (Kind == ELF::R_PPC64_ADDR64)
	    return S + getELFAddend(R);
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the SystemZ ELF relocation types handled by
	/// resolveSystemZ (R_390_32 and R_390_64).
	static bool supportsSystemZ(uint64_t Type) {
	  return Type == ELF::R_390_32 || Type == ELF::R_390_64;
	}

	/// Evaluates a SystemZ ELF relocation as S plus the relocation's addend;
	/// R_390_32 is truncated to 32 bits. Unsupported types are unreachable.
	static uint64_t resolveSystemZ(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_390_32)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  if (Kind == ELF::R_390_64)
	    return S + getELFAddend(R);
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the SPARC ELF relocation types handled by resolveSparc64.
	static bool supportsSparc64(uint64_t Type) {
	  return Type == ELF::R_SPARC_32 || Type == ELF::R_SPARC_64 ||
	         Type == ELF::R_SPARC_UA32 || Type == ELF::R_SPARC_UA64;
	}

	/// Evaluates a 64-bit SPARC ELF relocation: every supported type yields S plus
	/// the relocation's addend (no truncation is applied). Unsupported types are
	/// unreachable.
	static uint64_t resolveSparc64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_SPARC_32 || Kind == ELF::R_SPARC_64 ||
	      Kind == ELF::R_SPARC_UA32 || Kind == ELF::R_SPARC_UA64)
	    return S + getELFAddend(R);
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the AMDGPU ELF relocation types handled by resolveAmdgpu
	/// (ABS32 and ABS64).
	static bool supportsAmdgpu(uint64_t Type) {
	  return Type == ELF::R_AMDGPU_ABS32 || Type == ELF::R_AMDGPU_ABS64;
	}

	/// Evaluates an AMDGPU ELF relocation: both supported types yield S plus the
	/// relocation's addend. Unsupported types are unreachable.
	static uint64_t resolveAmdgpu(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_AMDGPU_ABS32 || Kind == ELF::R_AMDGPU_ABS64)
	    return S + getELFAddend(R);
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the i386 ELF relocation types handled by resolveX86.
	static bool supportsX86(uint64_t Type) {
	  return Type == ELF::R_386_NONE || Type == ELF::R_386_32 ||
	         Type == ELF::R_386_PC32;
	}

	/// Evaluates an i386 ELF relocation using the caller-supplied \p A rather than
	/// an addend read from the relocation: NONE yields A, R_386_32 yields S + A and
	/// R_386_PC32 yields S - offset + A. Unsupported types are unreachable.
	static uint64_t resolveX86(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_386_NONE)
	    return A;
	  if (Kind == ELF::R_386_32)
	    return S + A;
	  if (Kind == ELF::R_386_PC32)
	    return S - R.getOffset() + A;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true only for R_PPC_ADDR32, the single relocation type that
	/// resolvePPC32 handles.
	static bool supportsPPC32(uint64_t Type) {
	  switch (Type) {
	  case ELF::R_PPC_ADDR32:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Evaluates R_PPC_ADDR32 as S plus the relocation's addend, truncated to 32
	/// bits. Any other type is unreachable.
	static uint64_t resolvePPC32(RelocationRef R, uint64_t S, uint64_t A) {
	  switch (R.getType()) {
	  case ELF::R_PPC_ADDR32:
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true only for R_ARM_ABS32, the single relocation type that
	/// resolveARM handles.
	static bool supportsARM(uint64_t Type) {
	  switch (Type) {
	  case ELF::R_ARM_ABS32:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Evaluates R_ARM_ABS32 as (S + A) truncated to 32 bits, using the
	/// caller-supplied \p A. Any other type is unreachable.
	static uint64_t resolveARM(RelocationRef R, uint64_t S, uint64_t A) {
	  switch (R.getType()) {
	  case ELF::R_ARM_ABS32:
	    return (S + A) & 0xFFFFFFFF;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true for the AVR ELF relocation types handled by resolveAVR
	/// (R_AVR_16 and R_AVR_32).
	static bool supportsAVR(uint64_t Type) {
	  return Type == ELF::R_AVR_16 || Type == ELF::R_AVR_32;
	}

	/// Evaluates an AVR ELF relocation as S plus the relocation's addend,
	/// truncated to 16 bits for R_AVR_16 and 32 bits for R_AVR_32. Unsupported
	/// types are unreachable.
	static uint64_t resolveAVR(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == ELF::R_AVR_16)
	    return (S + getELFAddend(R)) & 0xFFFF;
	  if (Kind == ELF::R_AVR_32)
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true only for R_LANAI_32, the single relocation type that
	/// resolveLanai handles.
	static bool supportsLanai(uint64_t Type) {
	  switch (Type) {
	  case ELF::R_LANAI_32:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Evaluates R_LANAI_32 as S plus the relocation's addend, truncated to 32
	/// bits. Any other type is unreachable.
	static uint64_t resolveLanai(RelocationRef R, uint64_t S, uint64_t A) {
	  switch (R.getType()) {
	  case ELF::R_LANAI_32:
	    return (S + getELFAddend(R)) & 0xFFFFFFFF;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true for the 32-bit MIPS ELF relocation types handled by
	/// resolveMips32 (R_MIPS_32 and R_MIPS_TLS_DTPREL32).
	static bool supportsMips32(uint64_t Type) {
	  return Type == ELF::R_MIPS_32 || Type == ELF::R_MIPS_TLS_DTPREL32;
	}

	/// Evaluates a 32-bit MIPS ELF relocation as (S + A) truncated to 32 bits,
	/// using the caller-supplied \p A. Both supported types share the same
	/// formula; any other type is unreachable.
	static uint64_t resolveMips32(RelocationRef R, uint64_t S, uint64_t A) {
	  // FIXME: Take in account implicit addends to get correct results.
	  // The type is deliberately narrowed to 32 bits before being compared.
	  switch (static_cast<uint32_t>(R.getType())) {
	  case ELF::R_MIPS_32:
	  case ELF::R_MIPS_TLS_DTPREL32:
	    return (S + A) & 0xFFFFFFFF;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true for the 32-bit SPARC ELF relocation types handled by
	/// resolveSparc32 (R_SPARC_32 and R_SPARC_UA32).
	static bool supportsSparc32(uint64_t Type) {
	  return Type == ELF::R_SPARC_32 || Type == ELF::R_SPARC_UA32;
	}

	/// Evaluates a 32-bit SPARC ELF relocation. R_SPARC_32 and R_SPARC_UA32 yield
	/// S plus the relocation's addend. Unlike the sibling resolvers, any other
	/// type is not treated as unreachable: \p A is returned unchanged.
	static uint64_t resolveSparc32(RelocationRef R, uint64_t S, uint64_t A) {
	  // The type is narrowed to 32 bits before being compared.
	  uint32_t Rel = R.getType();
	  if (Rel == ELF::R_SPARC_32 || Rel == ELF::R_SPARC_UA32)
	    return S + getELFAddend(R);
	  return A;
	}

	/// Returns true only for R_HEX_32, the single relocation type that
	/// resolveHexagon handles.
	static bool supportsHexagon(uint64_t Type) {
	  switch (Type) {
	  case ELF::R_HEX_32:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Evaluates R_HEX_32 as S plus the relocation's addend (no truncation). Any
	/// other type is unreachable.
	static uint64_t resolveHexagon(RelocationRef R, uint64_t S, uint64_t A) {
	  switch (R.getType()) {
	  case ELF::R_HEX_32:
	    return S + getELFAddend(R);
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true when \p Type is one of the RISC-V ELF relocation types that
	/// resolveRISCV knows how to evaluate.
	static bool supportsRISCV(uint64_t Type) {
	  static const uint64_t Known[] = {
	      ELF::R_RISCV_NONE,  ELF::R_RISCV_32,    ELF::R_RISCV_64,
	      ELF::R_RISCV_ADD8,  ELF::R_RISCV_SUB8,  ELF::R_RISCV_ADD16,
	      ELF::R_RISCV_SUB16, ELF::R_RISCV_ADD32, ELF::R_RISCV_SUB32,
	      ELF::R_RISCV_ADD64, ELF::R_RISCV_SUB64};
	  for (const uint64_t Candidate : Known)
	    if (Candidate == Type)
	      return true;
	  return false;
	}

	/// Evaluates a RISC-V ELF relocation. The relocation's addend is read up
	/// front for every type (including NONE). The 32/64 forms yield S + addend;
	/// the ADD*/SUB* forms add or subtract that value to/from \p A and truncate
	/// to the width named by the relocation. Unsupported types are unreachable.
	static uint64_t resolveRISCV(RelocationRef R, uint64_t S, uint64_t A) {
	  const int64_t Addend = getELFAddend(R);
	  const uint64_t Target = S + Addend;
	  const uint64_t Sum = A + Target;
	  const uint64_t Diff = A - Target;
	  switch (R.getType()) {
	  case ELF::R_RISCV_NONE:
	    return A;
	  case ELF::R_RISCV_32:
	    return Target & 0xFFFFFFFF;
	  case ELF::R_RISCV_64:
	    return Target;
	  case ELF::R_RISCV_ADD8:
	    return Sum & 0xFF;
	  case ELF::R_RISCV_SUB8:
	    return Diff & 0xFF;
	  case ELF::R_RISCV_ADD16:
	    return Sum & 0xFFFF;
	  case ELF::R_RISCV_SUB16:
	    return Diff & 0xFFFF;
	  case ELF::R_RISCV_ADD32:
	    return Sum & 0xFFFFFFFF;
	  case ELF::R_RISCV_SUB32:
	    return Diff & 0xFFFFFFFF;
	  case ELF::R_RISCV_ADD64:
	    return Sum;
	  case ELF::R_RISCV_SUB64:
	    return Diff;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true for the i386 COFF relocation types handled by resolveCOFFX86
	/// (SECREL and DIR32).
	static bool supportsCOFFX86(uint64_t Type) {
	  return Type == COFF::IMAGE_REL_I386_SECREL ||
	         Type == COFF::IMAGE_REL_I386_DIR32;
	}

	/// Evaluates an i386 COFF relocation: both supported types yield (S + A)
	/// truncated to 32 bits. Unsupported types are unreachable.
	static uint64_t resolveCOFFX86(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == COFF::IMAGE_REL_I386_SECREL ||
	      Kind == COFF::IMAGE_REL_I386_DIR32)
	    return (S + A) & 0xFFFFFFFF;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true for the AMD64 COFF relocation types handled by
	/// resolveCOFFX86_64 (SECREL and ADDR64).
	static bool supportsCOFFX86_64(uint64_t Type) {
	  return Type == COFF::IMAGE_REL_AMD64_SECREL ||
	         Type == COFF::IMAGE_REL_AMD64_ADDR64;
	}

	/// Evaluates an AMD64 COFF relocation: SECREL yields (S + A) truncated to 32
	/// bits, ADDR64 yields S + A. Unsupported types are unreachable.
	static uint64_t resolveCOFFX86_64(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  if (Kind == COFF::IMAGE_REL_AMD64_SECREL)
	    return (S + A) & 0xFFFFFFFF;
	  if (Kind == COFF::IMAGE_REL_AMD64_ADDR64)
	    return S + A;
	  llvm_unreachable("Invalid relocation type");
	}

	/// Returns true only for X86_64_RELOC_UNSIGNED, the single Mach-O relocation
	/// type that resolveMachOX86_64 handles.
	static bool supportsMachOX86_64(uint64_t Type) {
	  switch (Type) {
	  case MachO::X86_64_RELOC_UNSIGNED:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Evaluates X86_64_RELOC_UNSIGNED as \p S alone (\p A is not used). Any
	/// other type is unreachable.
	static uint64_t resolveMachOX86_64(RelocationRef R, uint64_t S, uint64_t A) {
	  switch (R.getType()) {
	  case MachO::X86_64_RELOC_UNSIGNED:
	    return S;
	  default:
	    llvm_unreachable("Invalid relocation type");
	  }
	}

	/// Returns true when \p Type is one of the wasm relocation types that
	/// resolveWasm32 knows how to evaluate.
	static bool supportsWasm32(uint64_t Type) {
	  static const uint64_t Known[] = {
	      wasm::R_WASM_FUNCTION_INDEX_LEB,  wasm::R_WASM_TABLE_INDEX_SLEB,
	      wasm::R_WASM_TABLE_INDEX_I32,     wasm::R_WASM_MEMORY_ADDR_LEB,
	      wasm::R_WASM_MEMORY_ADDR_SLEB,    wasm::R_WASM_MEMORY_ADDR_I32,
	      wasm::R_WASM_TYPE_INDEX_LEB,      wasm::R_WASM_GLOBAL_INDEX_LEB,
	      wasm::R_WASM_FUNCTION_OFFSET_I32, wasm::R_WASM_SECTION_OFFSET_I32,
	      wasm::R_WASM_EVENT_INDEX_LEB};
	  for (const uint64_t Candidate : Known)
	    if (Candidate == Type)
	      return true;
	  return false;
	}

	/// Evaluates a wasm relocation: every supported type yields \p A and ignores
	/// \p S. Unsupported types are unreachable.
	static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) {
	  const auto Kind = R.getType();
	  const bool Known = Kind == wasm::R_WASM_FUNCTION_INDEX_LEB ||
	                     Kind == wasm::R_WASM_TABLE_INDEX_SLEB ||
	                     Kind == wasm::R_WASM_TABLE_INDEX_I32 ||
	                     Kind == wasm::R_WASM_MEMORY_ADDR_LEB ||
	                     Kind == wasm::R_WASM_MEMORY_ADDR_SLEB ||
	                     Kind == wasm::R_WASM_MEMORY_ADDR_I32 ||
	                     Kind == wasm::R_WASM_TYPE_INDEX_LEB ||
	                     Kind == wasm::R_WASM_GLOBAL_INDEX_LEB ||
	                     Kind == wasm::R_WASM_FUNCTION_OFFSET_I32 ||
	                     Kind == wasm::R_WASM_SECTION_OFFSET_I32 ||
	                     Kind == wasm::R_WASM_EVENT_INDEX_LEB;
	  if (Known) {
	    // For wasm section, its offset at 0 -- ignoring Value
	    return A;
	  }
	  llvm_unreachable("Invalid relocation type");
	}

	std::pair<bool (*)(uint64_t), RelocationResolver>
	getRelocationResolver(const ObjectFile &Obj) {
	if (Obj.isCOFF()) {
	if (Obj.getBytesInAddress() == 8)
	return {supportsCOFFX86_64, resolveCOFFX86_64};
	return {supportsCOFFX86, resolveCOFFX86};
	} else if (Obj.isELF()) {
	if (Obj.getBytesInAddress() == 8) {
	switch (Obj.getArch()) {
	case Triple::x86_64:
	return {supportsX86_64, resolveX86_64};
	case Triple::aarch64:
	case Triple::aarch64_be:
	return {supportsAArch64, resolveAArch64};
	case Triple::bpfel:
	case Triple::bpfeb:
	return {supportsBPF, resolveBPF};
	case Triple::mips64el:
	case Triple::mips64:
	return {supportsMips64, resolveMips64};
	case Triple::ppc64le:
	case Triple::ppc64:
	return {supportsPPC64, resolvePPC64};
	case Triple::systemz:
	return {supportsSystemZ, resolveSystemZ};
	case Triple::sparcv9:
	return {supportsSparc64, resolveSparc64};
	case Triple::amdgcn:
	return {supportsAmdgpu, resolveAmdgpu};
	case Triple::riscv64:
	return {supportsRISCV, resolveRISCV};
	default:
	return {nullptr, nullptr};
	}
	}

	// 32-bit object file
	assert(Obj.getBytesInAddress() == 4 &&
	"Invalid word size in object file");

	switch (Obj.getArch()) {
	case Triple::x86:
	return {supportsX86, resolveX86};
	case Triple::ppc:
	return {supportsPPC32, resolvePPC32};
	case Triple::arm:
	case Triple::armeb:
	return {supportsARM, resolveARM};
	case Triple::avr:
	return {supportsAVR, resolveAVR};
	case Triple::lanai:
	return {supportsLanai, resolveLanai};
	case Triple::mipsel:
	case Triple::mips:
	return {supportsMips32, resolveMips32};
	case Triple::sparc:
	return {supportsSparc32, resolveSparc32};
	case Triple::hexagon:
	return {supportsHexagon, resolveHexagon};
	case Triple::riscv32:
	return {supportsRISCV, resolveRISCV};
	default:
	return {nullptr, nullptr};
	}
	} else if (Obj.isMachO()) {
	if (Obj.getArch() == Triple::x86_64)
	return {supportsMachOX86_64, resolveMachOX86_64};
	return {nullptr, nullptr};
	} else if (Obj.isWasm()) {
	if (Obj.getArch() == Triple::wasm32)
	return {supportsWasm32, resolveWasm32};
	return {nullptr, nullptr};
	}

	llvm_unreachable("Invalid object file");
	}

	} // namespace object
	} // namespace llvm
	Index: vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Support/AArch64TargetParser.cpp (revision 351303)
	@@ -1,215 +1,215 @@
	//===-- AArch64TargetParser - Parser for AArch64 features -------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a target parser to recognise AArch64 hardware features
	// such as FPU/CPU/ARCH and extension names.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Support/AArch64TargetParser.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include <cctype>

	using namespace llvm;

	/// Extracts the single-digit version from an architecture string of the form
	/// "v<digit>...". Returns that digit as a number, or 0 when the string is
	/// shorter than two characters, does not start with 'v', or the second
	/// character is not a decimal digit.
	static unsigned checkArchVersion(llvm::StringRef Arch) {
	  // std::isdigit has undefined behaviour for negative arguments other than
	  // EOF, so the (possibly signed) char is converted to unsigned char first.
	  if (Arch.size() >= 2 && Arch[0] == 'v' &&
	      std::isdigit(static_cast<unsigned char>(Arch[1])))
	    return (Arch[1] - '0');
	  return 0;
	}

	/// Looks up the default FPU for \p CPU. The name "generic" takes the default
	/// from the architecture table entry for \p AK; any other name is matched
	/// against the CPU list in AArch64TargetParser.def, with ARM::FK_INVALID
	/// returned for unknown names.
	unsigned AArch64::getDefaultFPU(StringRef CPU, AArch64::ArchKind AK) {
	  if (CPU != "generic")
	    return StringSwitch<unsigned>(CPU)
	#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
	        .Case(NAME, ARM::DEFAULT_FPU)
	#include "../../include/llvm/Support/AArch64TargetParser.def"
	        .Default(ARM::FK_INVALID);

	  const auto &ArchEntry = AArch64ARCHNames[static_cast<unsigned>(AK)];
	  return ArchEntry.DefaultFPU;
	}

	/// Looks up the default extension bitmask for \p CPU. The name "generic"
	/// yields the base extensions of the architecture table entry for \p AK. Any
	/// other name is matched against the CPU list in AArch64TargetParser.def and
	/// yields that CPU's architecture base extensions OR-ed with the CPU's own
	/// default extensions; unknown names yield AArch64::AEK_INVALID.
	unsigned AArch64::getDefaultExtensions(StringRef CPU, AArch64::ArchKind AK) {
	  if (CPU == "generic")
	    return AArch64ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;

	  return StringSwitch<unsigned>(CPU)
	#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
	      .Case(NAME, AArch64ARCHNames[static_cast<unsigned>(ArchKind::ID)] \
	                          .ArchBaseExtensions | \
	                      DEFAULT_EXT)
	#include "../../include/llvm/Support/AArch64TargetParser.def"
	      .Default(AArch64::AEK_INVALID);
	}

	/// Maps a CPU name to its architecture kind. "generic" maps to ARMV8A; other
	/// names are matched against the CPU list in AArch64TargetParser.def, and
	/// unknown names map to ArchKind::INVALID.
	AArch64::ArchKind AArch64::getCPUArchKind(StringRef CPU) {
	  if (CPU != "generic")
	    return StringSwitch<AArch64::ArchKind>(CPU)
	#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
	        .Case(NAME, ArchKind::ID)
	#include "../../include/llvm/Support/AArch64TargetParser.def"
	        .Default(ArchKind::INVALID);

	  return ArchKind::ARMV8A;
	}

	// Translates an extension bitmask into "+feature" strings appended to
	// Features, one per AEK_* bit that is set. Returns false (appending nothing)
	// when Extensions equals AArch64::AEK_INVALID, and true otherwise.
	//
	// This block is a diff hunk: the lines marked '-' (old revision) tested
	// AEK_BITPERM and emitted "+bitperm"; the lines marked '+' (new revision)
	// test AEK_SVE2BITPERM and emit "+sve2-bitperm".
	bool AArch64::getExtensionFeatures(unsigned Extensions,
	std::vector<StringRef> &Features) {
	if (Extensions == AArch64::AEK_INVALID)
	return false;

	if (Extensions & AEK_FP)
	Features.push_back("+fp-armv8");
	if (Extensions & AEK_SIMD)
	Features.push_back("+neon");
	if (Extensions & AEK_CRC)
	Features.push_back("+crc");
	if (Extensions & AEK_CRYPTO)
	Features.push_back("+crypto");
	if (Extensions & AEK_DOTPROD)
	Features.push_back("+dotprod");
	if (Extensions & AEK_FP16FML)
	Features.push_back("+fp16fml");
	if (Extensions & AEK_FP16)
	Features.push_back("+fullfp16");
	if (Extensions & AEK_PROFILE)
	Features.push_back("+spe");
	if (Extensions & AEK_RAS)
	Features.push_back("+ras");
	if (Extensions & AEK_LSE)
	Features.push_back("+lse");
	if (Extensions & AEK_RDM)
	Features.push_back("+rdm");
	if (Extensions & AEK_SVE)
	Features.push_back("+sve");
	if (Extensions & AEK_SVE2)
	Features.push_back("+sve2");
	if (Extensions & AEK_SVE2AES)
	Features.push_back("+sve2-aes");
	if (Extensions & AEK_SVE2SM4)
	Features.push_back("+sve2-sm4");
	if (Extensions & AEK_SVE2SHA3)
	Features.push_back("+sve2-sha3");
	- if (Extensions & AEK_BITPERM)
	- Features.push_back("+bitperm");
	+ if (Extensions & AEK_SVE2BITPERM)
	+ Features.push_back("+sve2-bitperm");
	if (Extensions & AEK_RCPC)
	Features.push_back("+rcpc");

	return true;
	}

	/// Appends the "+v8.Na" feature string for architecture kinds 8.1-A through
	/// 8.5-A; other kinds append nothing. Returns false only when \p AK is
	/// ArchKind::INVALID.
	bool AArch64::getArchFeatures(AArch64::ArchKind AK,
	                              std::vector<StringRef> &Features) {
	  switch (AK) {
	  case ArchKind::ARMV8_1A:
	    Features.push_back("+v8.1a");
	    break;
	  case ArchKind::ARMV8_2A:
	    Features.push_back("+v8.2a");
	    break;
	  case ArchKind::ARMV8_3A:
	    Features.push_back("+v8.3a");
	    break;
	  case ArchKind::ARMV8_4A:
	    Features.push_back("+v8.4a");
	    break;
	  case ArchKind::ARMV8_5A:
	    Features.push_back("+v8.5a");
	    break;
	  default:
	    break;
	  }

	  const bool IsValid = AK != ArchKind::INVALID;
	  return IsValid;
	}

	/// Returns the name stored in the architecture table entry indexed by \p AK.
	StringRef AArch64::getArchName(AArch64::ArchKind AK) {
	  const auto &ArchEntry = AArch64ARCHNames[static_cast<unsigned>(AK)];
	  return ArchEntry.getName();
	}

	/// Returns the CPU attribute string stored in the architecture table entry
	/// indexed by \p AK.
	StringRef AArch64::getCPUAttr(AArch64::ArchKind AK) {
	  const auto &ArchEntry = AArch64ARCHNames[static_cast<unsigned>(AK)];
	  return ArchEntry.getCPUAttr();
	}

	/// Returns the sub-architecture string stored in the architecture table entry
	/// indexed by \p AK.
	StringRef AArch64::getSubArch(AArch64::ArchKind AK) {
	  const auto &ArchEntry = AArch64ARCHNames[static_cast<unsigned>(AK)];
	  return ArchEntry.getSubArch();
	}

	/// Returns the ArchAttr value stored in the architecture table entry indexed
	/// by \p AK.
	unsigned AArch64::getArchAttr(AArch64::ArchKind AK) {
	  const auto &ArchEntry = AArch64ARCHNames[static_cast<unsigned>(AK)];
	  return ArchEntry.ArchAttr;
	}

	/// Returns the name of the first extension table entry whose ID equals
	/// \p ArchExtKind, or an empty StringRef when no entry matches.
	StringRef AArch64::getArchExtName(unsigned ArchExtKind) {
	  for (const auto &Ext : AArch64ARCHExtNames) {
	    if (ArchExtKind != Ext.ID)
	      continue;
	    return Ext.getName();
	  }
	  return StringRef();
	}

	/// Maps an extension name to its feature string. A name starting with "no" is
	/// first tried as the negated form: the remainder is matched against entries
	/// that have a NegFeature. If that finds nothing (or the name has no "no"
	/// prefix), the full name is matched against entries that have a Feature.
	/// Returns an empty StringRef when nothing matches.
	StringRef AArch64::getArchExtFeature(StringRef ArchExt) {
	  if (ArchExt.startswith("no")) {
	    const StringRef Stripped = ArchExt.substr(2);
	    for (const auto &Ext : AArch64ARCHExtNames)
	      if (Ext.NegFeature && Stripped == Ext.getName())
	        return StringRef(Ext.NegFeature);
	  }

	  for (const auto &Ext : AArch64ARCHExtNames) {
	    if (!Ext.Feature)
	      continue;
	    if (ArchExt == Ext.getName())
	      return StringRef(Ext.Feature);
	  }
	  return StringRef();
	}

	/// Returns the name of the default CPU for architecture string \p Arch: the
	/// first CPU table entry that belongs to the parsed architecture and is
	/// flagged as default. Yields an empty StringRef when \p Arch does not parse,
	/// and "generic" when the architecture has no default CPU.
	StringRef AArch64::getDefaultCPU(StringRef Arch) {
	  const ArchKind Kind = parseArch(Arch);
	  if (Kind == ArchKind::INVALID)
	    return StringRef();

	  // Look for multiple AKs to find the default for pair AK+Name.
	  for (const auto &Entry : AArch64CPUNames) {
	    const bool IsDefaultForKind = Entry.ArchID == Kind && Entry.Default;
	    if (IsDefaultForKind)
	      return Entry.getName();
	  }

	  // If we can't find a default then target the architecture instead
	  return "generic";
	}

	/// Appends to \p Values the name of every CPU table entry whose architecture
	/// is not ArchKind::INVALID.
	void AArch64::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values) {
	  for (const auto &Entry : AArch64CPUNames) {
	    if (Entry.ArchID == ArchKind::INVALID)
	      continue;
	    Values.push_back(Entry.getName());
	  }
	}

	/// Returns true when the target triple is Android, Darwin, Fuchsia or
	/// Windows, i.e. the platforms on which this function reports x18 as reserved
	/// by default.
	bool AArch64::isX18ReservedByDefault(const Triple &TT) {
	  return TT.isAndroid() || TT.isOSDarwin() || TT.isOSFuchsia() ||
	         TT.isOSWindows();
	}

	/// Parses an architecture string into an ArchKind. The string is first
	/// canonicalised; anything whose version digit is below 8 is rejected. The
	/// synonym of the canonical name is then matched as a suffix of each
	/// architecture table name, so partial names are accepted, e.g. "v8a" matches
	/// "armv8a". Returns ArchKind::INVALID when nothing matches.
	AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
	  Arch = ARM::getCanonicalArchName(Arch);
	  if (checkArchVersion(Arch) < 8)
	    return ArchKind::INVALID;

	  StringRef Syn = ARM::getArchSynonym(Arch);
	  // Bind by reference: copying every table entry per iteration is wasted work.
	  for (const auto &A : AArch64ARCHNames) {
	    if (A.getName().endswith(Syn))
	      return A.ID;
	  }
	  return ArchKind::INVALID;
	}

	/// Returns the extension kind of the first extension table entry whose name
	/// equals \p ArchExt exactly, or AArch64::AEK_INVALID when none matches.
	AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) {
	  // Bind by reference: copying every table entry per iteration is wasted work.
	  for (const auto &A : AArch64ARCHExtNames) {
	    if (ArchExt == A.getName())
	      return static_cast<ArchExtKind>(A.ID);
	  }
	  return AArch64::AEK_INVALID;
	}

	/// Returns the architecture kind of the first CPU table entry whose name
	/// equals \p CPU exactly, or ArchKind::INVALID when none matches.
	AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) {
	  // Bind by reference: copying every table entry per iteration is wasted work.
	  for (const auto &C : AArch64CPUNames) {
	    if (CPU == C.getName())
	      return C.ArchID;
	  }
	  return ArchKind::INVALID;
	}
	Index: vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Support/Unix/Path.inc (revision 351303)
	@@ -1,1226 +1,1226 @@
	//===- llvm/Support/Unix/Path.inc - Unix Path Implementation ----- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the Unix specific implementation of the Path API.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	//=== WARNING: Implementation here must contain only generic UNIX code that
	//=== is guaranteed to work on all UNIX variants.
	//===----------------------------------------------------------------------===//

	#include "Unix.h"
	#include <limits.h>
	#include <stdio.h>
	#if HAVE_SYS_STAT_H
	#include <sys/stat.h>
	#endif
	#if HAVE_FCNTL_H
	#include <fcntl.h>
	#endif
	#ifdef HAVE_UNISTD_H
	#include <unistd.h>
	#endif
	#ifdef HAVE_SYS_MMAN_H
	#include <sys/mman.h>
	#endif

	#include <dirent.h>
	#include <pwd.h>

	#ifdef __APPLE__
	#include <mach-o/dyld.h>
	#include <sys/attr.h>
	#include <copyfile.h>
	#elif defined(__DragonFly__)
	#include <sys/mount.h>
	#endif

	// Both stdio.h and cstdio are included via different paths and
	// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
	// either.
	#undef ferror
	#undef feof

	// For GNU Hurd
	#if defined(__GNU__) && !defined(PATH_MAX)
	# define PATH_MAX 4096
	# define MAXPATHLEN 4096
	#endif

	#include <sys/types.h>
	#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && \
	!defined(__linux__) && !defined(__FreeBSD_kernel__) && !defined(_AIX)
	#include <sys/statvfs.h>
	#define STATVFS statvfs
	#define FSTATVFS fstatvfs
	#define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
	#else
	#if defined(__OpenBSD__) \|\| defined(__FreeBSD__)
	#include <sys/mount.h>
	#include <sys/param.h>
	#elif defined(__linux__)
	#if defined(HAVE_LINUX_MAGIC_H)
	#include <linux/magic.h>
	#else
	#if defined(HAVE_LINUX_NFS_FS_H)
	#include <linux/nfs_fs.h>
	#endif
	#if defined(HAVE_LINUX_SMB_H)
	#include <linux/smb.h>
	#endif
	#endif
	#include <sys/vfs.h>
	#elif defined(_AIX)
	#include <sys/statfs.h>

	// <sys/vmount.h> depends on `uint` to be a typedef from <sys/types.h> to
	// `uint_t`; however, <sys/types.h> does not always declare `uint`. We provide
	// the typedef prior to including <sys/vmount.h> to work around this issue.
	typedef uint_t uint;
	#include <sys/vmount.h>
	#else
	#include <sys/mount.h>
	#endif
	#define STATVFS statfs
	#define FSTATVFS fstatfs
	#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
	#endif

	#if defined(__NetBSD__) \|\| defined(__DragonFly__) \|\| defined(__GNU__)
	#define STATVFS_F_FLAG(vfs) (vfs).f_flag
	#else
	#define STATVFS_F_FLAG(vfs) (vfs).f_flags
	#endif

	using namespace llvm;

	namespace llvm {
	namespace sys {
	namespace fs {

	// Named constant holding the file_t value -1.
	// NOTE(review): presumably the "no file" sentinel for the Unix implementation
	// -- confirm against the users of kInvalidFile.
	const file_t kInvalidFile = -1;

	#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||   \
	    defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) || \
	    defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) ||        \
	    defined(__GNU__)
	/// Joins \p dir and \p bin as "dir/bin", resolves the result with realpath()
	/// into \p ret and checks that the joined path can be stat()ed.
	///
	/// \param ret  output buffer of PATH_MAX bytes receiving the resolved path.
	/// \param dir  directory component (NUL-terminated string).
	/// \param bin  file component (NUL-terminated string).
	/// \returns 0 on success; 1 if formatting failed or truncated, or if
	///          realpath() or stat() failed.
	static int
	test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
	{
	  struct stat sb;
	  char fullpath[PATH_MAX];

	  int chars = snprintf(fullpath, PATH_MAX, "%s/%s", dir, bin);
	  // We cannot write PATH_MAX characters because the string will be terminated
	  // with a null character. Fail if truncation happened, or if snprintf
	  // reported an encoding error (negative result).
	  if (chars < 0 || chars >= PATH_MAX)
	    return 1;
	  if (!realpath(fullpath, ret))
	    return 1;
	  if (stat(fullpath, &sb) != 0)
	    return 1;

	  return 0;
	}

	/// Locates the program named \p bin and writes its resolved path to \p ret.
	///
	/// Three strategies are tried, chosen by the shape of \p bin: a name starting
	/// with '/' is tested relative to "/", a name containing '/' is tested
	/// relative to the current working directory, and a bare name is looked up in
	/// each ':'-separated entry of $PATH.
	///
	/// \returns \p ret on success, nullptr when the program cannot be found or an
	///          intermediate call (getcwd, getenv, strdup) fails.
	static char *
	getprogpath(char ret[PATH_MAX], const char *bin)
	{
	  /* First approach: absolute path. */
	  if (bin[0] == '/') {
	    if (test_dir(ret, "/", bin) == 0)
	      return ret;
	    return nullptr;
	  }

	  /* Second approach: relative path. */
	  if (strchr(bin, '/')) {
	    char cwd[PATH_MAX];
	    if (!getcwd(cwd, PATH_MAX))
	      return nullptr;
	    if (test_dir(ret, cwd, bin) == 0)
	      return ret;
	    return nullptr;
	  }

	  /* Third approach: $PATH */
	  char *pv;
	  if ((pv = getenv("PATH")) == nullptr)
	    return nullptr;
	  // strtok_r modifies its input, so tokenise a private copy of $PATH.
	  char *s = strdup(pv);
	  if (!s)
	    return nullptr;
	  char *state;
	  for (char *t = strtok_r(s, ":", &state); t != nullptr;
	       t = strtok_r(nullptr, ":", &state)) {
	    if (test_dir(ret, t, bin) == 0) {
	      free(s);
	      return ret;
	    }
	  }
	  free(s);
	  return nullptr;
	}
	#endif // platforms that locate the executable via getprogpath()

	/// GetMainExecutable - Return the path to the main executable, given the
	/// value of argv[0] from program startup.
	std::string getMainExecutable(const char argv0, void MainAddr) {
	#if defined(__APPLE__)
	// On OS X the executable path is saved to the stack by dyld. Reading it
	// from there is much faster than calling dladdr, especially for large
	// binaries with symbols.
	char exe_path[MAXPATHLEN];
	uint32_t size = sizeof(exe_path);
	if (_NSGetExecutablePath(exe_path, &size) == 0) {
	char link_path[MAXPATHLEN];
	if (realpath(exe_path, link_path))
	return link_path;
	}
	#elif defined(__FreeBSD__) \|\| defined(__NetBSD__) \|\| defined(__OpenBSD__) \|\| \
	defined(__minix) \|\| defined(__DragonFly__) \|\| \
	defined(__FreeBSD_kernel__) \|\| defined(_AIX)
	StringRef curproc("/proc/curproc/file");
	char exe_path[PATH_MAX];
	// /proc is not mounted by default under FreeBSD, but gives more accurate
	// information than argv[0] when it is.
	if (sys::fs::exists(curproc)) {
	ssize_t len = readlink(curproc.str().c_str(), exe_path, sizeof(exe_path));
	if (len > 0) {
	// Null terminate the string for realpath. readlink never null
	// terminates its output.
	len = std::min(len, ssize_t(sizeof(exe_path) - 1));
	exe_path[len] = '\0';
	return exe_path;
	}
	}
	// If we don't have procfs mounted, fall back to argv[0]
	if (getprogpath(exe_path, argv0) != NULL)
	return exe_path;
	#elif defined(__linux__) \|\| defined(__CYGWIN__)
	char exe_path[MAXPATHLEN];
	StringRef aPath("/proc/self/exe");
	if (sys::fs::exists(aPath)) {
	// /proc is not always mounted under Linux (chroot for example).
	ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
	if (len < 0)
	return "";

	// Null terminate the string for realpath. readlink never null
	// terminates its output.
	len = std::min(len, ssize_t(sizeof(exe_path) - 1));
	exe_path[len] = '\0';

	// On Linux, /proc/self/exe always looks through symlinks. However, on
	// GNU/Hurd, /proc/self/exe is a symlink to the path that was used to start
	// the program, and not the eventual binary file. Therefore, call realpath
	// so this behaves the same on all platforms.
	#if _POSIX_VERSION >= 200112 \|\| defined(__GLIBC__)
	if (char *real_path = realpath(exe_path, NULL)) {
	std::string ret = std::string(real_path);
	free(real_path);
	return ret;
	}
	#else
	char real_path[MAXPATHLEN];
	if (realpath(exe_path, real_path))
	return std::string(real_path);
	#endif
	}
	// Fall back to the classical detection.
	if (getprogpath(exe_path, argv0))
	return exe_path;
	#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
	// Use dladdr to get executable path if available.
	Dl_info DLInfo;
	int err = dladdr(MainAddr, &DLInfo);
	if (err == 0)
	return "";

	// If the filename is a symlink, we need to resolve and return the location of
	// the actual executable.
	char link_path[MAXPATHLEN];
	if (realpath(DLInfo.dli_fname, link_path))
	return link_path;
	#else
	#error GetMainExecutable is not implemented on this host yet.
	#endif
	return "";
	}

	/// Converts the stored access time (seconds plus nanoseconds fields) into a
	/// TimePoint via toTimePoint.
	TimePoint<> basic_file_status::getLastAccessedTime() const {
	  TimePoint<> Accessed = toTimePoint(fs_st_atime, fs_st_atime_nsec);
	  return Accessed;
	}

	/// Converts the stored modification time (seconds plus nanoseconds fields)
	/// into a TimePoint via toTimePoint.
	TimePoint<> basic_file_status::getLastModificationTime() const {
	  TimePoint<> Modified = toTimePoint(fs_st_mtime, fs_st_mtime_nsec);
	  return Modified;
	}

	/// Builds the file's UniqueID from its stored device and inode numbers.
	UniqueID file_status::getUniqueID() const {
	  UniqueID Identity(fs_st_dev, fs_st_ino);
	  return Identity;
	}

	/// Returns the stored link count (fs_st_nlinks), converted to uint32_t.
	uint32_t file_status::getLinkCount() const {
	  const uint32_t Links = fs_st_nlinks;
	  return Links;
	}

	/// Queries the filesystem containing \p Path through the platform's
	/// STATVFS call and reports capacity, free and available space, each computed
	/// as a block count multiplied by the fragment/block size. On failure the
	/// errno value is returned as a generic-category error code.
	ErrorOr<space_info> disk_space(const Twine &Path) {
	  const std::string PathStr = Path.str();
	  struct STATVFS Stats;
	  // The underlying call takes a non-const char * on some platforms.
	  if (::STATVFS(const_cast<char *>(PathStr.c_str()), &Stats))
	    return std::error_code(errno, std::generic_category());

	  auto BlockSize = STATVFS_F_FRSIZE(Stats);
	  space_info Result;
	  Result.capacity = static_cast<uint64_t>(Stats.f_blocks) * BlockSize;
	  Result.free = static_cast<uint64_t>(Stats.f_bfree) * BlockSize;
	  Result.available = static_cast<uint64_t>(Stats.f_bavail) * BlockSize;
	  return Result;
	}

	/// Stores the current working directory in \p result.
	///
	/// $PWD is preferred when it is an absolute path that refers to the same
	/// file (by UniqueID) as ".". Otherwise getcwd() is called with a buffer that
	/// is doubled until the path fits.
	///
	/// \returns a default-constructed error_code on success, or the errno value
	///          from getcwd() as a generic-category error code.
	std::error_code current_path(SmallVectorImpl<char> &result) {
	  result.clear();

	  const char *pwd = ::getenv("PWD");
	  llvm::sys::fs::file_status PWDStatus, DotStatus;
	  if (pwd && llvm::sys::path::is_absolute(pwd) &&
	      !llvm::sys::fs::status(pwd, PWDStatus) &&
	      !llvm::sys::fs::status(".", DotStatus) &&
	      PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
	    result.append(pwd, pwd + strlen(pwd));
	    return std::error_code();
	  }

	#ifdef MAXPATHLEN
	  result.reserve(MAXPATHLEN);
	#else
	  // For GNU Hurd
	  result.reserve(1024);
	#endif

	  while (true) {
	    if (::getcwd(result.data(), result.capacity()) != nullptr)
	      break;
	    // See if there was a real error. POSIX reports a too-small buffer as
	    // ERANGE; ENOMEM is kept as a retry condition for backward compatibility.
	    if (errno != ENOMEM && errno != ERANGE)
	      return std::error_code(errno, std::generic_category());
	    // Otherwise there just wasn't enough space.
	    result.reserve(result.capacity() * 2);
	  }

	  result.set_size(strlen(result.data()));
	  return std::error_code();
	}

	std::error_code set_current_path(const Twine &path) {
	SmallString<128> path_storage;
	StringRef p = path.toNullTerminatedStringRef(path_storage);

	if (::chdir(p.begin()) == -1)
	return std::error_code(errno, std::generic_category());

	return std::error_code();
	}

	/// Creates the directory \p path with mode \p Perms via mkdir().
	///
	/// \param IgnoreExisting  when true, an EEXIST failure is treated as success.
	/// \returns a default-constructed error_code on success, or errno as a
	///          generic-category error code on failure.
	std::error_code create_directory(const Twine &path, bool IgnoreExisting,
	                                 perms Perms) {
	  SmallString<128> path_storage;
	  StringRef p = path.toNullTerminatedStringRef(path_storage);

	  if (::mkdir(p.begin(), Perms) == -1) {
	    if (errno != EEXIST || !IgnoreExisting)
	      return std::error_code(errno, std::generic_category());
	  }

	  return std::error_code();
	}

	// Note that we are using symbolic link because hard links are not supported by
	// all filesystems (SMB doesn't).
	std::error_code create_link(const Twine &to, const Twine &from) {
	// Get arguments.
	SmallString<128> from_storage;
	SmallString<128> to_storage;
	StringRef f = from.toNullTerminatedStringRef(from_storage);
	StringRef t = to.toNullTerminatedStringRef(to_storage);

	if (::symlink(t.begin(), f.begin()) == -1)
	return std::error_code(errno, std::generic_category());

	return std::error_code();
	}

	std::error_code create_hard_link(const Twine &to, const Twine &from) {
	// Get arguments.
	SmallString<128> from_storage;
	SmallString<128> to_storage;
	StringRef f = from.toNullTerminatedStringRef(from_storage);
	StringRef t = to.toNullTerminatedStringRef(to_storage);

	if (::link(t.begin(), f.begin()) == -1)
	return std::error_code(errno, std::generic_category());

	return std::error_code();
	}

	/// Removes the file system entry at \p path.
	///
	/// \param IgnoreNonExisting  when true, a missing entry (ENOENT from lstat or
	///                           from remove) is treated as success.
	/// \returns a default-constructed error_code on success;
	///          errc::operation_not_permitted when the entry is not a regular
	///          file, directory or symbolic link; otherwise errno as a
	///          generic-category error code.
	std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
	  SmallString<128> path_storage;
	  StringRef p = path.toNullTerminatedStringRef(path_storage);

	  struct stat buf;
	  if (lstat(p.begin(), &buf) != 0) {
	    if (errno != ENOENT || !IgnoreNonExisting)
	      return std::error_code(errno, std::generic_category());
	    return std::error_code();
	  }

	  // Note: this check catches strange situations. In all cases, LLVM should
	  // only be involved in the creation and deletion of regular files,
	  // directories and symbolic links. This check ensures that what we're
	  // trying to erase is one of those. It effectively prevents LLVM from
	  // erasing things like /dev/null, any block special file, or other things
	  // that aren't "regular" files.
	  if (!S_ISREG(buf.st_mode) && !S_ISDIR(buf.st_mode) && !S_ISLNK(buf.st_mode))
	    return make_error_code(errc::operation_not_permitted);

	  if (::remove(p.begin()) == -1) {
	    if (errno != ENOENT || !IgnoreNonExisting)
	      return std::error_code(errno, std::generic_category());
	  }

	  return std::error_code();
	}

	static bool is_local_impl(struct STATVFS &Vfs) {
	#if defined(__linux__) \|\| defined(__GNU__)
	#ifndef NFS_SUPER_MAGIC
	#define NFS_SUPER_MAGIC 0x6969
	#endif
	#ifndef SMB_SUPER_MAGIC
	#define SMB_SUPER_MAGIC 0x517B
	#endif
	#ifndef CIFS_MAGIC_NUMBER
	#define CIFS_MAGIC_NUMBER 0xFF534D42
	#endif
	#ifdef __GNU__
	switch ((uint32_t)Vfs.__f_type) {
	#else
	switch ((uint32_t)Vfs.f_type) {
	#endif
	case NFS_SUPER_MAGIC:
	case SMB_SUPER_MAGIC:
	case CIFS_MAGIC_NUMBER:
	return false;
	default:
	return true;
	}
	#elif defined(__CYGWIN__)
	// Cygwin doesn't expose this information; would need to use Win32 API.
	return false;
	#elif defined(__Fuchsia__)
	// Fuchsia doesn't yet support remote filesystem mounts.
	return true;
	#elif defined(__EMSCRIPTEN__)
	// Emscripten doesn't currently support remote filesystem mounts.
	return true;
	#elif defined(__HAIKU__)
	// Haiku doesn't expose this information.
	return false;
	#elif defined(__sun)
	// statvfs::f_basetype contains a null-terminated FSType name of the mounted target
	StringRef fstype(Vfs.f_basetype);
	// NFS is the only non-local fstype??
	return !fstype.equals("nfs");
	#elif defined(_AIX)
	// Call mntctl; try more than twice in case of timing issues with a concurrent
	// mount.
	int Ret;
	size_t BufSize = 2048u;
	std::unique_ptr<char[]> Buf;
	int Tries = 3;
	while (Tries--) {
	Buf = llvm::make_unique<char[]>(BufSize);
	Ret = mntctl(MCTL_QUERY, BufSize, Buf.get());
	if (Ret != 0)
	break;
	BufSize = reinterpret_cast<unsigned int >(Buf.get());
	Buf.reset();
	}

	if (Ret == -1)
	// There was an error; "remote" is the conservative answer.
	return false;

	// Look for the correct vmount entry.
	char *CurObjPtr = Buf.get();
	while (Ret--) {
	struct vmount Vp = reinterpret_cast<struct vmount >(CurObjPtr);
	static_assert(sizeof(Vfs.f_fsid) == sizeof(Vp->vmt_fsid),
	"fsid length mismatch");
	if (memcmp(&Vfs.f_fsid, &Vp->vmt_fsid, sizeof Vfs.f_fsid) == 0)
	return (Vp->vmt_flags & MNT_REMOTE) == 0;

	CurObjPtr += Vp->vmt_length;
	}

	// vmount entry not found; "remote" is the conservative answer.
	return false;
	#else
	return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
	#endif
	}

	std::error_code is_local(const Twine &Path, bool &Result) {
	struct STATVFS Vfs;
	if (::STATVFS(const_cast<char *>(Path.str().c_str()), &Vfs))
	return std::error_code(errno, std::generic_category());

	Result = is_local_impl(Vfs);
	return std::error_code();
	}

	std::error_code is_local(int FD, bool &Result) {
	struct STATVFS Vfs;
	if (::FSTATVFS(FD, &Vfs))
	return std::error_code(errno, std::generic_category());

	Result = is_local_impl(Vfs);
	return std::error_code();
	}

	std::error_code rename(const Twine &from, const Twine &to) {
	// Get arguments.
	SmallString<128> from_storage;
	SmallString<128> to_storage;
	StringRef f = from.toNullTerminatedStringRef(from_storage);
	StringRef t = to.toNullTerminatedStringRef(to_storage);

	if (::rename(f.begin(), t.begin()) == -1)
	return std::error_code(errno, std::generic_category());

	return std::error_code();
	}

	/// Resize the file behind \p FD to \p Size bytes.
	/// \returns success, or the error from posix_fallocate()/ftruncate().
	std::error_code resize_file(int FD, uint64_t Size) {
	#if defined(HAVE_POSIX_FALLOCATE)
	#ifdef _AIX
	  constexpr int NotSupportedError = ENOTSUP;
	#else
	  constexpr int NotSupportedError = EOPNOTSUPP;
	#endif
	  // Prefer posix_fallocate when available: unlike ftruncate it always
	  // allocates space, so a full disk is reported as an error. "Invalid" and
	  // "not supported" results are tolerated and fall through to ftruncate.
	  const int Err = ::posix_fallocate(FD, 0, Size);
	  const bool Tolerated =
	      Err == 0 || Err == EINVAL || Err == NotSupportedError;
	  if (!Tolerated)
	    return std::error_code(Err, std::generic_category());
	#endif
	  // Use ftruncate as a fallback. It may or may not allocate space. At least
	  // on OS X with HFS+ it does.
	  const int Rc = ::ftruncate(FD, Size);
	  if (Rc != -1)
	    return std::error_code();
	  return std::error_code(errno, std::generic_category());
	}

	/// Translate an AccessMode into the mode argument expected by ::access().
	static int convertAccessMode(AccessMode Mode) {
	  switch (Mode) {
	  case AccessMode::Exist:
	    return F_OK;
	  case AccessMode::Write:
	    return W_OK;
	  case AccessMode::Execute:
	    return R_OK | X_OK; // scripts also need R_OK.
	  }
	  llvm_unreachable("invalid enum");
	}

	std::error_code access(const Twine &Path, AccessMode Mode) {
	SmallString<128> PathStorage;
	StringRef P = Path.toNullTerminatedStringRef(PathStorage);

	if (::access(P.begin(), convertAccessMode(Mode)) == -1)
	return std::error_code(errno, std::generic_category());

	if (Mode == AccessMode::Execute) {
	// Don't say that directories are executable.
	struct stat buf;
	if (0 != stat(P.begin(), &buf))
	return errc::permission_denied;
	if (!S_ISREG(buf.st_mode))
	return errc::permission_denied;
	}

	return std::error_code();
	}

	/// \returns true when access() reports no error for \p Path in Execute mode.
	bool can_execute(const Twine &Path) {
	  const std::error_code EC = access(Path, AccessMode::Execute);
	  return !EC;
	}

	/// Two statuses are equivalent when they share both device and inode.
	/// Both statuses must be known.
	bool equivalent(file_status A, file_status B) {
	  assert(status_known(A) && status_known(B));
	  if (A.fs_st_dev != B.fs_st_dev)
	    return false;
	  return A.fs_st_ino == B.fs_st_ino;
	}

	std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
	file_status fsA, fsB;
	if (std::error_code ec = status(A, fsA))
	return ec;
	if (std::error_code ec = status(B, fsB))
	return ec;
	result = equivalent(fsA, fsB);
	return std::error_code();
	}

	/// Expand a leading "~" or "~user" component of \p Path in place.
	///
	/// A path that does not start with '~' is left untouched, as is a path whose
	/// home directory or user entry cannot be resolved.
	static void expandTildeExpr(SmallVectorImpl<char> &Path) {
	  StringRef PathStr(Path.begin(), Path.size());
	  if (PathStr.empty() || !PathStr.startswith("~"))
	    return;

	  PathStr = PathStr.drop_front();
	  // Expr is the (possibly empty) user name between '~' and the first
	  // separator; Remainder is whatever follows that separator.
	  StringRef Expr =
	      PathStr.take_until([](char c) { return path::is_separator(c); });
	  StringRef Remainder = PathStr.substr(Expr.size() + 1);
	  SmallString<128> Storage;
	  if (Expr.empty()) {
	    // This is just ~/..., resolve it to the current user's home dir.
	    // An empty home directory is treated like a failed lookup: there is no
	    // first character to overwrite the '~' with.
	    if (!path::home_directory(Storage) || Storage.empty()) {
	      // For some reason we couldn't get the home directory. Just exit.
	      return;
	    }

	    // Overwrite the first character and insert the rest.
	    Path[0] = Storage[0];
	    Path.insert(Path.begin() + 1, Storage.begin() + 1, Storage.end());
	    return;
	  }

	  // This is a string of the form ~username/, look up this user's entry in the
	  // password database.
	  struct passwd *Entry = nullptr;
	  std::string User = Expr.str();
	  Entry = ::getpwnam(User.c_str());

	  if (!Entry) {
	    // Unable to look up the entry, just return back the original path.
	    return;
	  }

	  // Remainder points into Path's storage, so copy it before clearing Path.
	  Storage = Remainder;
	  Path.clear();
	  Path.append(Entry->pw_dir, Entry->pw_dir + strlen(Entry->pw_dir));
	  llvm::sys::path::append(Path, Storage);
	}


	/// Copy \p path into \p dest with any leading tilde expression expanded.
	/// A trivially empty \p path leaves \p dest empty.
	void expand_tilde(const Twine &path, SmallVectorImpl<char> &dest) {
	  dest.clear();
	  if (!path.isTriviallyEmpty()) {
	    path.toVector(dest);
	    expandTildeExpr(dest);
	  }
	}

	/// Map the file-type bits of a stat mode to the matching file_type value;
	/// unrecognised modes yield file_type::type_unknown.
	static file_type typeForMode(mode_t Mode) {
	  file_type Kind = file_type::type_unknown;
	  if (S_ISDIR(Mode))
	    Kind = file_type::directory_file;
	  else if (S_ISREG(Mode))
	    Kind = file_type::regular_file;
	  else if (S_ISBLK(Mode))
	    Kind = file_type::block_file;
	  else if (S_ISCHR(Mode))
	    Kind = file_type::character_file;
	  else if (S_ISFIFO(Mode))
	    Kind = file_type::fifo_file;
	  else if (S_ISSOCK(Mode))
	    Kind = file_type::socket_file;
	  else if (S_ISLNK(Mode))
	    Kind = file_type::symlink_file;
	  return Kind;
	}

	static std::error_code fillStatus(int StatRet, const struct stat &Status,
	file_status &Result) {
	if (StatRet != 0) {
	std::error_code EC(errno, std::generic_category());
	if (EC == errc::no_such_file_or_directory)
	Result = file_status(file_type::file_not_found);
	else
	Result = file_status(file_type::status_error);
	return EC;
	}

	uint32_t atime_nsec, mtime_nsec;
	#if defined(HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC)
	atime_nsec = Status.st_atimespec.tv_nsec;
	mtime_nsec = Status.st_mtimespec.tv_nsec;
	#elif defined(HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
	atime_nsec = Status.st_atim.tv_nsec;
	mtime_nsec = Status.st_mtim.tv_nsec;
	#else
	atime_nsec = mtime_nsec = 0;
	#endif

	perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
	Result = file_status(typeForMode(Status.st_mode), Perms, Status.st_dev,
	Status.st_nlink, Status.st_ino,
	Status.st_atime, atime_nsec, Status.st_mtime, mtime_nsec,
	Status.st_uid, Status.st_gid, Status.st_size);

	return std::error_code();
	}

	std::error_code status(const Twine &Path, file_status &Result, bool Follow) {
	SmallString<128> PathStorage;
	StringRef P = Path.toNullTerminatedStringRef(PathStorage);

	struct stat Status;
	int StatRet = (Follow ? ::stat : ::lstat)(P.begin(), &Status);
	return fillStatus(StatRet, Status, Result);
	}

	std::error_code status(int FD, file_status &Result) {
	struct stat Status;
	int StatRet = ::fstat(FD, &Status);
	return fillStatus(StatRet, Status, Result);
	}

	/// \returns the process's current file-mode creation mask.
	unsigned getUmask() {
	  // umask() can only be read by setting it: install an arbitrary mask to
	  // learn the old one, then put the old one straight back. The result of
	  // the second call is of no interest.
	  const mode_t Previous = ::umask(0);
	  (void)::umask(Previous);
	  return Previous;
	}

	std::error_code setPermissions(const Twine &Path, perms Permissions) {
	SmallString<128> PathStorage;
	StringRef P = Path.toNullTerminatedStringRef(PathStorage);

	if (::chmod(P.begin(), Permissions))
	return std::error_code(errno, std::generic_category());
	return std::error_code();
	}

	/// Apply \p Permissions to the open descriptor \p FD with fchmod().
	/// \returns success, or the errno reported by fchmod().
	std::error_code setPermissions(int FD, perms Permissions) {
	  const int Rc = ::fchmod(FD, Permissions);
	  if (Rc == 0)
	    return std::error_code();
	  return std::error_code(errno, std::generic_category());
	}

	/// Set the access and modification timestamps of the open descriptor \p FD.
	/// Uses futimens() when available, otherwise futimes() (microsecond
	/// precision); with neither it reports errc::function_not_supported.
	std::error_code setLastAccessAndModificationTime(int FD, TimePoint<> AccessTime,
	                                                 TimePoint<> ModificationTime) {
	#if defined(HAVE_FUTIMENS)
	  timespec Stamps[2] = {sys::toTimeSpec(AccessTime),
	                        sys::toTimeSpec(ModificationTime)};
	  if (::futimens(FD, Stamps) != 0)
	    return std::error_code(errno, std::generic_category());
	  return std::error_code();
	#elif defined(HAVE_FUTIMES)
	  const auto AccessMicros =
	      std::chrono::time_point_cast<std::chrono::microseconds>(AccessTime);
	  const auto ModifyMicros =
	      std::chrono::time_point_cast<std::chrono::microseconds>(
	          ModificationTime);
	  timeval Stamps[2] = {sys::toTimeVal(AccessMicros),
	                       sys::toTimeVal(ModifyMicros)};
	  if (::futimes(FD, Stamps) != 0)
	    return std::error_code(errno, std::generic_category());
	  return std::error_code();
	#else
	#warning Missing futimes() and futimens()
	  return make_error_code(errc::function_not_supported);
	#endif
	}

	/// Map Size bytes of \p FD starting at \p Offset into memory.
	///
	/// readwrite mode maps MAP_SHARED; every other mode maps MAP_PRIVATE.
	/// readonly mode maps PROT_READ; every other mode adds PROT_WRITE.
	/// \returns success, or the errno reported by mmap(). On failure Mapping
	///          holds MAP_FAILED.
	std::error_code mapped_file_region::init(int FD, uint64_t Offset,
	                                         mapmode Mode) {
	  assert(Size != 0);

	  int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
	  int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
	#if defined(__APPLE__)
	  //----------------------------------------------------------------------
	  // Newer versions of MacOSX have a flag that will allow us to read from
	  // binaries whose code signature is invalid without crashing by using
	  // the MAP_RESILIENT_CODESIGN flag. Also if a file from removable media
	  // is mapped we can avoid crashing and return zeroes to any pages we try
	  // to read if the media becomes unavailable by using the
	  // MAP_RESILIENT_MEDIA flag. These flags are only usable when mapping
	  // with PROT_READ, so take care not to specify them otherwise.
	  //----------------------------------------------------------------------
	  if (Mode == readonly) {
	#if defined(MAP_RESILIENT_CODESIGN)
	    flags |= MAP_RESILIENT_CODESIGN;
	#endif
	#if defined(MAP_RESILIENT_MEDIA)
	    flags |= MAP_RESILIENT_MEDIA;
	#endif
	  }
	#endif // #if defined (__APPLE__)

	  Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
	  if (Mapping == MAP_FAILED)
	    return std::error_code(errno, std::generic_category());
	  return std::error_code();
	}

	/// Map \p length bytes of \p fd at \p offset. Any failure is reported
	/// through \p ec, and the region is then left with a null Mapping.
	mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
	                                       uint64_t offset, std::error_code &ec)
	    : Size(length), Mapping(), Mode(mode) {
	  (void)Mode;
	  ec = init(fd, offset, mode);
	  if (!ec)
	    return;
	  // Replace whatever init() left behind so the destructor skips munmap().
	  Mapping = nullptr;
	}

	/// Unmap the region, if one was successfully mapped.
	mapped_file_region::~mapped_file_region() {
	  if (!Mapping)
	    return;
	  ::munmap(Mapping, Size);
	}

	/// \returns the length of the mapping in bytes; the mapping must be valid.
	size_t mapped_file_region::size() const {
	  assert(Mapping && "Mapping failed but used anyway!");
	  const size_t Length = Size;
	  return Length;
	}

	/// \returns the mapped bytes as a mutable pointer; the mapping must be valid.
	char *mapped_file_region::data() const {
	  assert(Mapping && "Mapping failed but used anyway!");
	  char *Bytes = reinterpret_cast<char*>(Mapping);
	  return Bytes;
	}

	/// \returns the mapped bytes as a read-only pointer; the mapping must be
	/// valid.
	const char *mapped_file_region::const_data() const {
	  assert(Mapping && "Mapping failed but used anyway!");
	  const char *Bytes = reinterpret_cast<const char*>(Mapping);
	  return Bytes;
	}

	/// \returns the alignment to use for mapping offsets, taken from the
	/// process's page size estimate.
	int mapped_file_region::alignment() {
	  const int PageSize = Process::getPageSizeEstimate();
	  return PageSize;
	}

	/// Open \p path for iteration and advance \p it to its first real entry.
	/// \returns the errno from opendir(), or whatever the first increment
	///          reports.
	std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
	                                                     StringRef path,
	                                                     bool follow_symlinks) {
	  SmallString<128> DirPath(path);
	  DIR *Handle = ::opendir(DirPath.c_str());
	  if (Handle == nullptr)
	    return std::error_code(errno, std::generic_category());

	  it.IterationHandle = reinterpret_cast<intptr_t>(Handle);
	  // Seed the entry with a placeholder filename that replace_filename() can
	  // overwrite on the first increment.
	  path::append(DirPath, ".");
	  it.CurrentEntry = directory_entry(DirPath.str(), follow_symlinks);
	  return directory_iterator_increment(it);
	}

	/// Close the directory handle held by \p it (if any) and reset the state.
	/// Always reports success.
	std::error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
	  if (it.IterationHandle != 0) {
	    DIR *Handle = reinterpret_cast<DIR *>(it.IterationHandle);
	    ::closedir(Handle);
	  }
	  it.IterationHandle = 0;
	  it.CurrentEntry = directory_entry();
	  return std::error_code();
	}

	/// Derive the file type of \p Entry without an extra stat() where possible.
	static file_type direntType(dirent* Entry) {
	#if defined(_DIRENT_HAVE_D_TYPE) && defined(DTTOIF)
	  // Most platforms provide the file type in the dirent: Linux/BSD/Mac.
	  // DTTOIF turns it into stat-style mode bits so typeForMode can be reused.
	  const mode_t Mode = DTTOIF(Entry->d_type);
	  return typeForMode(Mode);
	#else
	  // Other platforms such as Solaris require a stat() to get the type.
	  return file_type::type_unknown;
	#endif
	}

	/// Advance \p It to the next directory entry, skipping "." and "..".
	///
	/// \returns the errno from readdir() on failure; at end of directory the
	///          iterator is destructed and that result is returned; otherwise
	///          success with It.CurrentEntry updated.
	std::error_code detail::directory_iterator_increment(detail::DirIterState &It) {
	  // readdir() returns null both at end-of-directory and on error; clearing
	  // errno first is what lets the two cases be told apart.
	  errno = 0;
	  dirent *CurDir = ::readdir(reinterpret_cast<DIR *>(It.IterationHandle));
	  if (CurDir == nullptr && errno != 0) {
	    return std::error_code(errno, std::generic_category());
	  } else if (CurDir != nullptr) {
	    StringRef Name(CurDir->d_name);
	    if ((Name.size() == 1 && Name[0] == '.') ||
	        (Name.size() == 2 && Name[0] == '.' && Name[1] == '.'))
	      return directory_iterator_increment(It);
	    It.CurrentEntry.replace_filename(Name, direntType(CurDir));
	  } else
	    return directory_iterator_destruct(It);

	  return std::error_code();
	}

	/// Stat this entry's path, honouring its FollowSymlinks setting.
	/// \returns the status, or the error reported by fs::status().
	ErrorOr<basic_file_status> directory_entry::status() const {
	  file_status Info;
	  const std::error_code EC = fs::status(Path, Info, FollowSymlinks);
	  if (EC)
	    return EC;
	  return Info;
	}

	#if !defined(F_GETPATH)
	/// \returns true when /proc/self/fd is readable, in which case the real name
	/// of an open file can be obtained quickly with readlink. The probe runs
	/// once and the answer is cached.
	static bool hasProcSelfFD() {
	  static const bool Available = [] {
	    return ::access("/proc/self/fd", R_OK) == 0;
	  }();
	  return Available;
	}
	#endif

	/// Build the flag word for ::open() from LLVM's portable open options.
	///
	/// \param Disp how to treat an existing / missing file.
	/// \param Flags extra open flags (F_Append, OF_ChildInherit are consulted).
	/// \param Access requested read/write access.
	/// \returns the combined O_* flags.
	static int nativeOpenFlags(CreationDisposition Disp, OpenFlags Flags,
	                           FileAccess Access) {
	  int Result = 0;
	  if (Access == FA_Read)
	    Result |= O_RDONLY;
	  else if (Access == FA_Write)
	    Result |= O_WRONLY;
	  else if (Access == (FA_Read | FA_Write))
	    Result |= O_RDWR;

	  // This is for compatibility with old code that assumed F_Append implied
	  // opening an existing file. See Windows/Path.inc for a longer comment.
	  if (Flags & F_Append)
	    Disp = CD_OpenAlways;

	  if (Disp == CD_CreateNew) {
	    Result |= O_CREAT; // Create if it doesn't exist.
	    Result |= O_EXCL;  // Fail if it does.
	  } else if (Disp == CD_CreateAlways) {
	    Result |= O_CREAT; // Create if it doesn't exist.
	    Result |= O_TRUNC; // Truncate if it does.
	  } else if (Disp == CD_OpenAlways) {
	    Result |= O_CREAT; // Create if it doesn't exist.
	  } else if (Disp == CD_OpenExisting) {
	    // Nothing special, just don't add O_CREAT and we get these semantics.
	  }

	  if (Flags & F_Append)
	    Result |= O_APPEND;

	#ifdef O_CLOEXEC
	  if (!(Flags & OF_ChildInherit))
	    Result |= O_CLOEXEC;
	#endif

	  return Result;
	}

	std::error_code openFile(const Twine &Name, int &ResultFD,
	CreationDisposition Disp, FileAccess Access,
	OpenFlags Flags, unsigned Mode) {
	int OpenFlags = nativeOpenFlags(Disp, Flags, Access);

	SmallString<128> Storage;
	StringRef P = Name.toNullTerminatedStringRef(Storage);
	// Call ::open in a lambda to avoid overload resolution in RetryAfterSignal
	// when open is overloaded, such as in Bionic.
	auto Open = [&]() { return ::open(P.begin(), OpenFlags, Mode); };
	if ((ResultFD = sys::RetryAfterSignal(-1, Open)) < 0)
	return std::error_code(errno, std::generic_category());
	#ifndef O_CLOEXEC
	if (!(Flags & OF_ChildInherit)) {
	int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
	(void)r;
	assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
	}
	#endif
	return std::error_code();
	}

	/// Expected-returning wrapper around openFile().
	/// \returns the new descriptor, or the open error converted to an Error.
	Expected<int> openNativeFile(const Twine &Name, CreationDisposition Disp,
	                             FileAccess Access, OpenFlags Flags,
	                             unsigned Mode) {
	  int Descriptor;
	  if (std::error_code EC =
	          openFile(Name, Descriptor, Disp, Access, Flags, Mode))
	    return errorCodeToError(EC);
	  return Descriptor;
	}

	std::error_code openFileForRead(const Twine &Name, int &ResultFD,
	OpenFlags Flags,
	SmallVectorImpl<char> *RealPath) {
	std::error_code EC =
	openFile(Name, ResultFD, CD_OpenExisting, FA_Read, Flags, 0666);
	if (EC)
	return EC;

	// Attempt to get the real name of the file, if the user asked
	if(!RealPath)
	return std::error_code();
	RealPath->clear();
	#if defined(F_GETPATH)
	// When F_GETPATH is availble, it is the quickest way to get
	// the real path name.
	char Buffer[MAXPATHLEN];
	if (::fcntl(ResultFD, F_GETPATH, Buffer) != -1)
	RealPath->append(Buffer, Buffer + strlen(Buffer));
	#else
	char Buffer[PATH_MAX];
	if (hasProcSelfFD()) {
	char ProcPath[64];
	snprintf(ProcPath, sizeof(ProcPath), "/proc/self/fd/%d", ResultFD);
	ssize_t CharCount = ::readlink(ProcPath, Buffer, sizeof(Buffer));
	if (CharCount > 0)
	RealPath->append(Buffer, Buffer + CharCount);
	} else {
	SmallString<128> Storage;
	StringRef P = Name.toNullTerminatedStringRef(Storage);

	// Use ::realpath to get the real path name
	if (::realpath(P.begin(), Buffer) != nullptr)
	RealPath->append(Buffer, Buffer + strlen(Buffer));
	}
	#endif
	return std::error_code();
	}

	/// Expected-returning wrapper around openFileForRead().
	/// \returns the opened handle, or the open error converted to an Error.
	Expected<file_t> openNativeFileForRead(const Twine &Name, OpenFlags Flags,
	                                       SmallVectorImpl<char> *RealPath) {
	  file_t Handle;
	  if (std::error_code EC = openFileForRead(Name, Handle, Flags, RealPath))
	    return errorCodeToError(EC);
	  return Handle;
	}

	/// \returns the native handle for standard input (descriptor 0).
	file_t getStdinHandle() {
	  return 0;
	}

	/// \returns the native handle for standard output (descriptor 1).
	file_t getStdoutHandle() {
	  return 1;
	}

	/// \returns the native handle for standard error (descriptor 2).
	file_t getStderrHandle() {
	  return 2;
	}

	/// Perform one read() from \p FD into \p Buf, retrying when interrupted by
	/// a signal.
	///
	/// \param BytesRead receives the result of read(); on failure it holds -1
	///        converted to size_t.
	/// \returns success, or the errno reported by read().
	std::error_code readNativeFile(file_t FD, MutableArrayRef<char> Buf,
	                               size_t *BytesRead) {
	  const ssize_t NumRead =
	      sys::RetryAfterSignal(-1, ::read, FD, Buf.data(), Buf.size());
	  *BytesRead = NumRead;
	  if (NumRead == -1)
	    return std::error_code(errno, std::generic_category());
	  return std::error_code();
	}

	/// Fill \p Buf with the bytes of \p FD starting at \p Offset.
	///
	/// Reads repeatedly until the buffer is full. If end-of-file is reached
	/// first, the unread tail of the buffer is zero-filled.
	/// \returns success, or the errno of the failing lseek()/read()/pread().
	std::error_code readNativeFileSlice(file_t FD, MutableArrayRef<char> Buf,
	                                    size_t Offset) {
	  char *Cursor = Buf.data();
	  size_t Remaining = Buf.size();

	#ifndef HAVE_PREAD
	  // If we don't have pread, seek to Offset.
	  if (lseek(FD, Offset, SEEK_SET) == -1)
	    return std::error_code(errno, std::generic_category());
	#endif

	  while (Remaining != 0) {
	#ifdef HAVE_PREAD
	    // File position = Offset + bytes consumed so far.
	    ssize_t NumRead = sys::RetryAfterSignal(-1, ::pread, FD, Cursor, Remaining,
	                                            Buf.size() - Remaining + Offset);
	#else
	    ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, Cursor, Remaining);
	#endif
	    // Error while reading.
	    if (NumRead == -1)
	      return std::error_code(errno, std::generic_category());

	    if (NumRead == 0) {
	      // End of file: zero-initialize the rest of the buffer.
	      memset(Cursor, 0, Remaining);
	      break;
	    }
	    Remaining -= NumRead;
	    Cursor += NumRead;
	  }
	  return std::error_code();
	}

	/// Close \p F. The caller's handle is reset to kInvalidFile before the
	/// descriptor is closed.
	std::error_code closeFile(file_t &F) {
	  const file_t Handle = F;
	  F = kInvalidFile;
	  return Process::SafelyCloseFileDescriptor(Handle);
	}

	/// Recursively delete everything inside the directory \p Entry (but not
	/// \p Entry itself).
	///
	/// \param Entry directory to empty; anything directory_iterator can be
	///        constructed from.
	/// \param IgnoreErrors when true, failures are skipped and the walk
	///        continues; when false, the first failure is returned.
	/// \returns success, or the first error when \p IgnoreErrors is false.
	template <typename T>
	static std::error_code remove_directories_impl(const T &Entry,
	                                               bool IgnoreErrors) {
	  std::error_code EC;
	  directory_iterator Begin(Entry, EC, false);
	  directory_iterator End;
	  while (Begin != End) {
	    auto &Item = *Begin;
	    ErrorOr<basic_file_status> st = Item.status();
	    if (st) {
	      if (is_directory(*st)) {
	        EC = remove_directories_impl(Item, IgnoreErrors);
	        if (EC && !IgnoreErrors)
	          return EC;
	      }

	      EC = fs::remove(Item.path(), true);
	      if (EC && !IgnoreErrors)
	        return EC;
	    } else if (!IgnoreErrors) {
	      return st.getError();
	    }
	    // With IgnoreErrors set, an entry whose status is unavailable is simply
	    // skipped; *st must never be dereferenced while it holds an error.

	    Begin.increment(EC);
	    if (EC && !IgnoreErrors)
	      return EC;
	  }
	  return std::error_code();
	}

	/// Delete \p path and everything beneath it.
	/// \returns success when \p IgnoreErrors is true; otherwise the first error
	///          encountered.
	std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
	  // Empty the tree first, then remove the root itself.
	  const std::error_code TreeEC = remove_directories_impl(path, IgnoreErrors);
	  if (!IgnoreErrors && TreeEC)
	    return TreeEC;

	  const std::error_code RootEC = fs::remove(path, true);
	  if (!IgnoreErrors && RootEC)
	    return RootEC;

	  return std::error_code();
	}

	std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
	bool expand_tilde) {
	dest.clear();
	if (path.isTriviallyEmpty())
	return std::error_code();

	if (expand_tilde) {
	SmallString<128> Storage;
	path.toVector(Storage);
	expandTildeExpr(Storage);
	return real_path(Storage, dest, false);
	}

	SmallString<128> Storage;
	StringRef P = path.toNullTerminatedStringRef(Storage);
	char Buffer[PATH_MAX];
	if (::realpath(P.begin(), Buffer) == nullptr)
	return std::error_code(errno, std::generic_category());
	dest.append(Buffer, Buffer + strlen(Buffer));
	return std::error_code();
	}

	} // end namespace fs

	namespace path {

	/// Store the current user's home directory in \p result.
	///
	/// $HOME wins when set; otherwise the password database entry for the
	/// current uid is consulted.
	/// \returns false (leaving \p result untouched) when neither source yields
	///          a directory.
	bool home_directory(SmallVectorImpl<char> &result) {
	  const char *Home = getenv("HOME");
	  if (Home == nullptr) {
	    if (struct passwd *Entry = getpwuid(getuid()))
	      Home = Entry->pw_dir;
	  }
	  if (Home == nullptr)
	    return false;

	  result.clear();
	  result.append(Home, Home + strlen(Home));
	  return true;
	}

	/// Query the Darwin per-user temp (\p TempDir true) or cache directory via
	/// confstr().
	/// \returns true with the directory in \p Result; false when the
	///          configuration names are unavailable or the query fails.
	static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
	#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR)
	  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
	  // macros defined in <unistd.h> on darwin >= 9
	  const int ConfName =
	      TempDir ? _CS_DARWIN_USER_TEMP_DIR : _CS_DARWIN_USER_CACHE_DIR;
	  size_t ConfLen = confstr(ConfName, nullptr, 0);
	  if (ConfLen > 0) {
	    // Resize to the reported length and re-query until the length confstr
	    // reports matches the buffer, or the query fails.
	    for (;;) {
	      Result.resize(ConfLen);
	      ConfLen = confstr(ConfName, Result.data(), Result.size());
	      if (ConfLen == 0 || ConfLen == Result.size())
	        break;
	    }

	    if (ConfLen > 0) {
	      // Drop the trailing NUL that confstr wrote.
	      assert(Result.back() == 0);
	      Result.pop_back();
	      return true;
	    }

	    Result.clear();
	  }
	#endif
	  return false;
	}

	/// \returns the value of the first set variable among TMPDIR, TMP, TEMP and
	/// TEMPDIR (checked in that order), or nullptr when none is set.
	static const char *getEnvTempDir() {
	  const char *const Names[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
	  const unsigned Count = sizeof(Names) / sizeof(Names[0]);
	  for (unsigned I = 0; I != Count; ++I) {
	    const char *Dir = std::getenv(Names[I]);
	    if (Dir)
	      return Dir;
	  }
	  return nullptr;
	}

	/// \returns the built-in temporary directory: P_tmpdir when the C library
	/// defines a non-null one, otherwise "/tmp" for \p ErasedOnReboot and
	/// "/var/tmp" for persistent data.
	static const char *getDefaultTempDir(bool ErasedOnReboot) {
	#ifdef P_tmpdir
	  if ((bool)P_tmpdir)
	    return P_tmpdir;
	#endif

	  return ErasedOnReboot ? "/tmp" : "/var/tmp";
	}

	/// Store the system temporary directory in \p Result.
	///
	/// Lookup order: environment variables (only when \p ErasedOnReboot is
	/// true), then the Darwin per-user directory, then the built-in default.
	void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
	  Result.clear();

	  // There is no env variable for the cache directory, so the environment is
	  // only consulted for the erased-on-reboot case.
	  const char *Chosen = ErasedOnReboot ? getEnvTempDir() : nullptr;
	  if (!Chosen) {
	    if (getDarwinConfDir(ErasedOnReboot, Result))
	      return;
	    Chosen = getDefaultTempDir(ErasedOnReboot);
	  }

	  Result.append(Chosen, Chosen + strlen(Chosen));
	}

	} // end namespace path

	namespace fs {

	#ifdef __APPLE__
	/// Copy the file \p From to \p To using copyfile(3).
	///
	/// This implementation tries to perform an APFS CoW clone of the file,
	/// which can be much faster and uses less space.
	/// Unfortunately fcopyfile(3) does not support COPYFILE_CLONE, so the
	/// file descriptor variant of this function still uses the default
	/// implementation.
	///
	/// \returns success, the error from is_symlink_file(), or the errno left by
	///          copyfile().
	std::error_code copy_file(const Twine &From, const Twine &To) {
	// Default to a plain data copy; upgraded to a clone below when possible.
	uint32_t Flag = COPYFILE_DATA;
	-#if __has_builtin(__builtin_available)
	+#if __has_builtin(__builtin_available) && defined(COPYFILE_CLONE)
	if (__builtin_available(macos 10.12, *)) {
	bool IsSymlink;
	if (std::error_code Error = is_symlink_file(From, IsSymlink))
	return Error;
	// COPYFILE_CLONE clones the symlink instead of following it
	// and returns EEXIST if the target file already exists.
	if (!IsSymlink && !exists(To))
	Flag = COPYFILE_CLONE;
	}
	#endif
	int Status =
	copyfile(From.str().c_str(), To.str().c_str(), /* State */ NULL, Flag);

	if (Status == 0)
	return std::error_code();
	return std::error_code(errno, std::generic_category());
	}
	#endif // __APPLE__

	} // end namespace fs

	} // end namespace sys
	} // end namespace llvm
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64.td (revision 351303)
	@@ -1,822 +1,822 @@
	//=- AArch64.td - Describe the AArch64 Target Machine --------- tablegen --=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Target-independent interfaces which we are implementing.
	//===----------------------------------------------------------------------===//

	include "llvm/Target/Target.td"

	//===----------------------------------------------------------------------===//
	// AArch64 Subtarget features.
	//

	// Floating-point, Advanced SIMD and the individual crypto features. Where a
	// definition ends with a bracketed list, that list names the other features
	// it pulls in.
	def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
	"Enable ARMv8 FP">;

	def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
	"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;

	def FeatureSM4 : SubtargetFeature<
	"sm4", "HasSM4", "true",
	"Enable SM3 and SM4 support", [FeatureNEON]>;

	def FeatureSHA2 : SubtargetFeature<
	"sha2", "HasSHA2", "true",
	"Enable SHA1 and SHA256 support", [FeatureNEON]>;

	def FeatureSHA3 : SubtargetFeature<
	"sha3", "HasSHA3", "true",
	"Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2]>;

	def FeatureAES : SubtargetFeature<
	"aes", "HasAES", "true",
	"Enable AES support", [FeatureNEON]>;

	// Crypto has been split up and any combination is now valid (see the
	// crypto definitions above). Also, crypto is now context sensitive:
	// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
	// Therefore, we rely on Clang, the user-facing tool, to pass on the
	// appropriate crypto options. But here in the backend, crypto has very little
	// meaning anymore. We kept the Crypto definition here for backward
	// compatibility, and now imply features SHA2 and AES, which was the
	// "traditional" meaning of Crypto.
	def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
	"Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>;

	// Architecture extension features (CRC, RAS, the v8.1 and v8.2 additions,
	// FP16, SPE). Each description string states what the feature enables.
	def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
	"Enable ARMv8 CRC-32 checksum instructions">;

	def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
	"Enable ARMv8 Reliability, Availability and Serviceability Extensions">;

	def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
	"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;

	def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
	"Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;

	def FeaturePAN : SubtargetFeature<
	"pan", "HasPAN", "true",
	"Enables ARM v8.1 Privileged Access-Never extension">;

	def FeatureLOR : SubtargetFeature<
	"lor", "HasLOR", "true",
	"Enables ARM v8.1 Limited Ordering Regions extension">;

	def FeatureVH : SubtargetFeature<
	"vh", "HasVH", "true",
	"Enables ARM v8.1 Virtual Host extension">;

	def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
	"Enable ARMv8 PMUv3 Performance Monitors extension">;

	def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
	"Full FP16", [FeatureFPARMv8]>;

	def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true",
	"Enable FP16 FML instructions", [FeatureFullFP16]>;

	def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
	"Enable Statistical Profiling extension">;

	def FeaturePAN_RWV : SubtargetFeature<
	"pan-rwv", "HasPAN_RWV", "true",
	"Enable v8.2 PAN s1e1R and s1e1W Variants",
	[FeaturePAN]>;

	// UAO PState
	def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true",
	"Enable v8.2 UAO PState">;

	def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
	"true", "Enable v8.2 data Cache Clean to Point of Persistence" >;

	// Scalable Vector Extension features. Every SVE2 sub-feature lists
	// FeatureSVE2, and the crypto variants also list the matching crypto feature.
	def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
	"Enable Scalable Vector Extension (SVE) instructions">;

	def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
	"Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;

	def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true",
	"Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>;

	def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true",
	"Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>;

	def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true",
	"Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>;

	-def FeatureSVE2BitPerm : SubtargetFeature<"bitperm", "HasSVE2BitPerm", "true",
	+def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true",
	"Enable bit permutation SVE2 instructions", [FeatureSVE2]>;

	// Zero-cycle move and zeroing features. FeatureZCZeroing combines the GP and
	// FP zeroing variants.
	def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
	"Has zero-cycle register moves">;
	def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
	"Has zero-cycle zeroing instructions for generic registers">;

	def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
	"Has zero-cycle zeroing instructions for FP registers">;

	def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
	"Has zero-cycle zeroing instructions",
	[FeatureZCZeroingGP, FeatureZCZeroingFP]>;

	/// ... but the floating-point version doesn't quite work in rare cases on older
	/// CPUs.
	def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround",
	"HasZeroCycleZeroingFPWorkaround", "true",
	"The zero-cycle floating-point zeroing instruction has a bug">;

	// Alignment and register-usage features.
	def FeatureStrictAlign : SubtargetFeature<"strict-align",
	"StrictAlign", "true",
	"Disallow all unaligned memory "
	"access">;

	// One "reserve-xN" feature per listed X register; each sets the matching
	// ReserveXRegister[N] entry.
	foreach i = {1-7,9-15,18,20-28} in
	def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
	"Reserve X"#i#", making it unavailable "
	"as a GPR">;

	// One "call-saved-xN" feature per listed X register; each sets the matching
	// CustomCallSavedXRegs[N] entry.
	foreach i = {8-15,18} in
	def FeatureCallSavedX#i : SubtargetFeature<"call-saved-x"#i,
	"CustomCallSavedXRegs["#i#"]", "true", "Make X"#i#" callee saved.">;

	// Code generation and scheduling tuning features; see each description
	// string for the behaviour being selected.
	def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
	"Use alias analysis during codegen">;

	def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
	"true",
	"balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;

	def FeaturePredictableSelectIsExpensive : SubtargetFeature<
	"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
	"Prefer likely predicted branches over selects">;

	def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
	"CustomAsCheapAsMove", "true",
	"Use custom handling of cheap instructions">;

	def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
	"ExynosAsCheapAsMove", "true",
	"Use Exynos specific handling of cheap instructions",
	[FeatureCustomCheapAsMoveHandling]>;

	def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
	"UsePostRAScheduler", "true", "Schedule again after register allocation">;

	def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
	"Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;

	def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
	"Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">;

	def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow",
	"true", "STR of Q register with register offset is slow">;

	def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
	"alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
	"true", "Use alternative pattern for sextload convert to f32">;

	// Feature "arith-bcc-fusion": sets subtarget field HasArithmeticBccFusion
	// to "true".
	def FeatureArithmeticBccFusion
	    : SubtargetFeature<"arith-bcc-fusion", "HasArithmeticBccFusion", "true",
	                       "CPU fuses arithmetic+bcc operations">;

	// Feature "arith-cbz-fusion": sets subtarget field HasArithmeticCbzFusion
	// to "true".
	def FeatureArithmeticCbzFusion
	    : SubtargetFeature<"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
	                       "CPU fuses arithmetic + cbz/cbnz operations">;

	// Feature "fuse-address": sets subtarget field HasFuseAddress to "true".
	def FeatureFuseAddress
	    : SubtargetFeature<"fuse-address", "HasFuseAddress", "true",
	                       "CPU fuses address generation and memory operations">;

	// Feature "fuse-aes": sets subtarget field HasFuseAES to "true".
	def FeatureFuseAES
	    : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
	                       "CPU fuses AES crypto operations">;

	// Feature "fuse-arith-logic": sets subtarget field HasFuseArithmeticLogic
	// to "true".
	def FeatureFuseArithmeticLogic
	    : SubtargetFeature<"fuse-arith-logic", "HasFuseArithmeticLogic", "true",
	                       "CPU fuses arithmetic and logic operations">;

	// Feature "fuse-csel": sets subtarget field HasFuseCCSelect to "true".
	def FeatureFuseCCSelect
	    : SubtargetFeature<"fuse-csel", "HasFuseCCSelect", "true",
	                       "CPU fuses conditional select operations">;

	// Feature "fuse-crypto-eor": sets subtarget field HasFuseCryptoEOR to "true".
	def FeatureFuseCryptoEOR
	    : SubtargetFeature<"fuse-crypto-eor", "HasFuseCryptoEOR", "true",
	                       "CPU fuses AES/PMULL and EOR operations">;

	// Feature "fuse-literals": sets subtarget field HasFuseLiterals to "true".
	def FeatureFuseLiterals
	    : SubtargetFeature<"fuse-literals", "HasFuseLiterals", "true",
	                       "CPU fuses literal generation operations">;

	// Feature "disable-latency-sched-heuristic": sets subtarget field
	// DisableLatencySchedHeuristic to "true".
	def FeatureDisableLatencySchedHeuristic
	    : SubtargetFeature<"disable-latency-sched-heuristic",
	                       "DisableLatencySchedHeuristic", "true",
	                       "Disable latency scheduling heuristic">;

	// Feature "force-32bit-jump-tables": sets subtarget field
	// Force32BitJumpTables to "true".
	def FeatureForce32BitJumpTables : SubtargetFeature<
	    "force-32bit-jump-tables", "Force32BitJumpTables", "true",
	    "Force jump table entries to be 32-bits wide except at MinSize">;

	// Feature "rcpc": sets subtarget field HasRCPC to "true".
	def FeatureRCPC
	    : SubtargetFeature<"rcpc", "HasRCPC", "true",
	                       "Enable support for RCPC extension">;

	// Feature "use-reciprocal-square-root": sets subtarget field UseRSqrt to
	// "true".
	def FeatureUseRSqrt
	    : SubtargetFeature<"use-reciprocal-square-root", "UseRSqrt", "true",
	                       "Use the reciprocal square root approximation">;

	// Feature "dotprod": sets subtarget field HasDotProd to "true".
	def FeatureDotProd
	    : SubtargetFeature<"dotprod", "HasDotProd", "true",
	                       "Enable dot product support">;

	// Feature "pa": sets subtarget field HasPA to "true".
	// The description is user-visible help text; its spelling of "enhancement"
	// is corrected here.
	def FeaturePA : SubtargetFeature<
	    "pa", "HasPA", "true",
	    "Enable v8.3-A Pointer Authentication enhancement">;

	// Feature "jsconv": sets subtarget field HasJS to "true" and implies
	// FeatureFPARMv8.
	// The description is user-visible help text; its spelling of "enhancement"
	// is corrected here.
	def FeatureJS : SubtargetFeature<
	    "jsconv", "HasJS", "true",
	    "Enable v8.3-A JavaScript FP conversion enhancement",
	    [FeatureFPARMv8]>;

	// Feature "ccidx": sets subtarget field HasCCIDX to "true".
	def FeatureCCIDX
	    : SubtargetFeature<"ccidx", "HasCCIDX", "true",
	                       "Enable v8.3-A Extend of the CCSIDR number of sets">;

	// Feature "complxnum": sets subtarget field HasComplxNum to "true" and
	// implies FeatureNEON.
	def FeatureComplxNum
	    : SubtargetFeature<"complxnum", "HasComplxNum", "true",
	                       "Enable v8.3-A Floating-point complex number support",
	                       [FeatureNEON]>;

	// Feature "nv": sets subtarget field HasNV to "true".
	// The description is user-visible help text; its spelling of "Enhancement"
	// is corrected here.
	def FeatureNV : SubtargetFeature<
	    "nv", "HasNV", "true",
	    "Enable v8.4-A Nested Virtualization Enhancement">;

	// Feature "rasv8_4": sets subtarget field HasRASv8_4 to "true" and implies
	// FeatureRAS.
	def FeatureRASv8_4
	    : SubtargetFeature<
	          "rasv8_4", "HasRASv8_4", "true",
	          "Enable v8.4-A Reliability, Availability and Serviceability extension",
	          [FeatureRAS]>;

	// Feature "mpam": sets subtarget field HasMPAM to "true".
	def FeatureMPAM
	    : SubtargetFeature<
	          "mpam", "HasMPAM", "true",
	          "Enable v8.4-A Memory system Partitioning and Monitoring extension">;

	// Feature "dit": sets subtarget field HasDIT to "true".
	def FeatureDIT
	    : SubtargetFeature<"dit", "HasDIT", "true",
	                       "Enable v8.4-A Data Independent Timing instructions">;

	// Feature "tracev8.4": sets subtarget field HasTRACEV8_4 to "true".
	def FeatureTRACEV8_4
	    : SubtargetFeature<"tracev8.4", "HasTRACEV8_4", "true",
	                       "Enable v8.4-A Trace extension">;

	// Feature "am": sets subtarget field HasAM to "true".
	def FeatureAM
	    : SubtargetFeature<"am", "HasAM", "true",
	                       "Enable v8.4-A Activity Monitors extension">;

	// Feature "sel2": sets subtarget field HasSEL2 to "true".
	def FeatureSEL2
	    : SubtargetFeature<"sel2", "HasSEL2", "true",
	                       "Enable v8.4-A Secure Exception Level 2 extension">;

	// Feature "tlb-rmi": sets subtarget field HasTLB_RMI to "true".
	def FeatureTLB_RMI
	    : SubtargetFeature<"tlb-rmi", "HasTLB_RMI", "true",
	                       "Enable v8.4-A TLB Range and Maintenance Instructions">;

	// Feature "fmi": sets subtarget field HasFMI to "true".
	def FeatureFMI
	    : SubtargetFeature<"fmi", "HasFMI", "true",
	                       "Enable v8.4-A Flag Manipulation Instructions">;

	// 8.4 RCPC enhancements: LDAPR & STLR instructions with Immediate Offset.
	// Feature "rcpc-immo": sets subtarget field HasRCPC_IMMO to "true" and
	// implies FeatureRCPC.
	def FeatureRCPC_IMMO
	    : SubtargetFeature<
	          "rcpc-immo", "HasRCPC_IMMO", "true",
	          "Enable v8.4-A RCPC instructions with Immediate Offsets",
	          [FeatureRCPC]>;

	// Feature "no-neg-immediates": sets subtarget field NegativeImmediates to
	// "false" (the only feature in this group whose value is not "true").
	// NOTE(review): the description text reads like the behaviour being turned
	// off rather than on - confirm against the consumer of NegativeImmediates.
	def FeatureNoNegativeImmediates
	    : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false",
	                       "Convert immediates and instructions "
	                       "to their negated or complemented "
	                       "equivalent when the immediate does "
	                       "not fit in the encoding.">;

	// Feature "lsl-fast": sets subtarget field HasLSLFast to "true".
	def FeatureLSLFast
	    : SubtargetFeature<"lsl-fast", "HasLSLFast", "true",
	                       "CPU has a fastpath logical shift of up to 3 places">;

	// Feature "aggressive-fma": sets subtarget field HasAggressiveFMA to "true".
	def FeatureAggressiveFMA
	    : SubtargetFeature<"aggressive-fma", "HasAggressiveFMA", "true",
	                       "Enable Aggressive FMA for floating-point.">;

	// Feature "altnzcv": sets subtarget field HasAlternativeNZCV to "true".
	def FeatureAltFPCmp
	    : SubtargetFeature<
	          "altnzcv", "HasAlternativeNZCV", "true",
	          "Enable alternative NZCV format for floating point comparisons">;

	// Feature "fptoint": sets subtarget field HasFRInt3264 to "true".
	// The description uses plain '|' inside the brackets; the "\|" form seen in
	// this dump is a table-escaping artifact and is not a TableGen string escape.
	def FeatureFRInt3264
	    : SubtargetFeature<
	          "fptoint", "HasFRInt3264", "true",
	          "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to "
	          "an integer (in FP format) forcing it to fit into a 32- or 64-bit int">;

	// Feature "specrestrict": sets subtarget field HasSpecRestrict to "true".
	def FeatureSpecRestrict
	    : SubtargetFeature<"specrestrict", "HasSpecRestrict", "true",
	                       "Enable architectural speculation restriction">;

	// Feature "sb": sets subtarget field HasSB to "true".
	def FeatureSB
	    : SubtargetFeature<"sb", "HasSB", "true",
	                       "Enable v8.5 Speculation Barrier">;

	// Feature "ssbs": sets subtarget field HasSSBS to "true".
	def FeatureSSBS
	    : SubtargetFeature<"ssbs", "HasSSBS", "true",
	                       "Enable Speculative Store Bypass Safe bit">;

	// Feature "predres": sets subtarget field HasPredRes to "true".
	def FeaturePredRes
	    : SubtargetFeature<
	          "predres", "HasPredRes", "true",
	          "Enable v8.5a execution and data prediction invalidation instructions">;

	// Feature "ccdp": sets subtarget field HasCCDP to "true".
	def FeatureCacheDeepPersist
	    : SubtargetFeature<"ccdp", "HasCCDP", "true",
	                       "Enable v8.5 Cache Clean to Point of Deep Persistence">;

	// Feature "bti": sets subtarget field HasBTI to "true".
	def FeatureBranchTargetId
	    : SubtargetFeature<"bti", "HasBTI", "true",
	                       "Enable Branch Target Identification">;

	// Feature "rand": sets subtarget field HasRandGen to "true".
	def FeatureRandGen
	    : SubtargetFeature<"rand", "HasRandGen", "true",
	                       "Enable Random Number generation instructions">;

	// Feature "mte": sets subtarget field HasMTE to "true".
	def FeatureMTE
	    : SubtargetFeature<"mte", "HasMTE", "true",
	                       "Enable Memory Tagging Extension">;

	//===----------------------------------------------------------------------===//
	// Architectures.
	//

	// Architecture feature "v8.1a": sets HasV8_1aOps to "true" and implies the
	// six features listed.
	def HasV8_1aOps
	    : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
	                       "Support ARM v8.1a instructions",
	                       [
	                         FeatureCRC,
	                         FeatureLSE,
	                         FeatureRDM,
	                         FeaturePAN,
	                         FeatureLOR,
	                         FeatureVH
	                       ]>;

	// Architecture feature "v8.2a": sets HasV8_2aOps to "true"; implies
	// HasV8_1aOps plus the additional features listed.
	def HasV8_2aOps
	    : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
	                       "Support ARM v8.2a instructions",
	                       [
	                         HasV8_1aOps,
	                         FeaturePsUAO,
	                         FeaturePAN_RWV,
	                         FeatureRAS,
	                         FeatureCCPP
	                       ]>;

	// Architecture feature "v8.3a": sets HasV8_3aOps to "true"; implies
	// HasV8_2aOps plus the additional features listed.
	def HasV8_3aOps
	    : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true",
	                       "Support ARM v8.3a instructions",
	                       [
	                         HasV8_2aOps,
	                         FeatureRCPC,
	                         FeaturePA,
	                         FeatureJS,
	                         FeatureCCIDX,
	                         FeatureComplxNum
	                       ]>;

	// Architecture feature "v8.4a": sets HasV8_4aOps to "true"; implies
	// HasV8_3aOps plus the additional features listed.
	def HasV8_4aOps
	    : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
	                       "Support ARM v8.4a instructions",
	                       [
	                         HasV8_3aOps,
	                         FeatureDotProd,
	                         FeatureNV,
	                         FeatureRASv8_4,
	                         FeatureMPAM,
	                         FeatureDIT,
	                         FeatureTRACEV8_4,
	                         FeatureAM,
	                         FeatureSEL2,
	                         FeatureTLB_RMI,
	                         FeatureFMI,
	                         FeatureRCPC_IMMO
	                       ]>;

	// Architecture feature "v8.5a": sets HasV8_5aOps to "true"; implies
	// HasV8_4aOps plus the additional features listed.
	def HasV8_5aOps
	    : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
	                       "Support ARM v8.5a instructions",
	                       [
	                         HasV8_4aOps,
	                         FeatureAltFPCmp,
	                         FeatureFRInt3264,
	                         FeatureSpecRestrict,
	                         FeatureSSBS,
	                         FeatureSB,
	                         FeaturePredRes,
	                         FeatureCacheDeepPersist,
	                         FeatureBranchTargetId
	                       ]>;

	//===----------------------------------------------------------------------===//
	// Register File Description
	//===----------------------------------------------------------------------===//

	include "AArch64RegisterInfo.td"
	include "AArch64RegisterBanks.td"
	include "AArch64CallingConvention.td"

	//===----------------------------------------------------------------------===//
	// Instruction Descriptions
	//===----------------------------------------------------------------------===//

	include "AArch64Schedule.td"
	include "AArch64InstrInfo.td"
	include "AArch64SchedPredicates.td"
	include "AArch64SchedPredExynos.td"

	// Instruction-info record for this target; the AArch64 Target definition
	// later in this file assigns it to InstructionSet.
	def AArch64InstrInfo : InstrInfo;

	//===----------------------------------------------------------------------===//
	// Named operands for MRS/MSR/TLBI/...
	//===----------------------------------------------------------------------===//

	include "AArch64SystemOperands.td"

	//===----------------------------------------------------------------------===//
	// Access to privileged registers
	//===----------------------------------------------------------------------===//

	// For each i in 1..3: feature "tpidr-el<i>" sets UseEL<i>ForTP to "true".
	foreach i = 1-3 in
	  def FeatureUseEL#i#ForTP
	      : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP", "true",
	                         "Permit use of TPIDR_EL"#i#" for the TLS base">;

	//===----------------------------------------------------------------------===//
	// AArch64 Processors supported.
	//

	//===----------------------------------------------------------------------===//
	// Unsupported features to disable for scheduling models
	//===----------------------------------------------------------------------===//

	// Holder for a list of predicates (field F); see SVEUnsupported for a use.
	class AArch64Unsupported {
	  list<Predicate> F;
	}

	// The SVE / SVE2 predicate set, collected in F.
	// NOTE(review): presumably consumed by the scheduling models included below
	// as their unsupported-feature list - confirm in the AArch64Sched*.td files.
	def SVEUnsupported : AArch64Unsupported {
	  let F = [
	    HasSVE,
	    HasSVE2,
	    HasSVE2AES,
	    HasSVE2SM4,
	    HasSVE2SHA3,
	    HasSVE2BitPerm
	  ];
	}

	include "AArch64SchedA53.td"
	include "AArch64SchedA57.td"
	include "AArch64SchedCyclone.td"
	include "AArch64SchedFalkor.td"
	include "AArch64SchedKryo.td"
	include "AArch64SchedExynosM1.td"
	include "AArch64SchedExynosM3.td"
	include "AArch64SchedExynosM4.td"
	include "AArch64SchedThunderX.td"
	include "AArch64SchedThunderX2T99.td"

	// Cortex-A35: sets ARMProcFamily to CortexA35 and implies the listed features.
	def ProcA35
	    : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
	                       "Cortex-A35 ARM processors",
	                       [FeatureCRC, FeatureCrypto, FeatureFPARMv8,
	                        FeatureNEON, FeaturePerfMon]>;

	// Cortex-A53: sets ARMProcFamily to CortexA53 and implies the listed features.
	def ProcA53
	    : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
	                       "Cortex-A53 ARM processors",
	                       [FeatureBalanceFPOps, FeatureCRC, FeatureCrypto,
	                        FeatureCustomCheapAsMoveHandling, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureNEON, FeaturePerfMon,
	                        FeaturePostRAScheduler, FeatureUseAA]>;

	// Cortex-A55: sets ARMProcFamily to CortexA55 and implies the listed features.
	def ProcA55
	    : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
	                       "Cortex-A55 ARM processors",
	                       [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureNEON, FeatureFullFP16,
	                        FeatureDotProd, FeatureRCPC, FeaturePerfMon]>;

	// Cortex-A57: sets ARMProcFamily to CortexA57 and implies the listed features.
	def ProcA57
	    : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
	                       "Cortex-A57 ARM processors",
	                       [FeatureBalanceFPOps, FeatureCRC, FeatureCrypto,
	                        FeatureCustomCheapAsMoveHandling, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureFuseLiterals, FeatureNEON,
	                        FeaturePerfMon, FeaturePostRAScheduler,
	                        FeaturePredictableSelectIsExpensive]>;

	// Cortex-A72: sets ARMProcFamily to CortexA72 and implies the listed features.
	def ProcA72
	    : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
	                       "Cortex-A72 ARM processors",
	                       [FeatureCRC, FeatureCrypto, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureNEON, FeaturePerfMon]>;

	// Cortex-A73: sets ARMProcFamily to CortexA73 and implies the listed features.
	def ProcA73
	    : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
	                       "Cortex-A73 ARM processors",
	                       [FeatureCRC, FeatureCrypto, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureNEON, FeaturePerfMon]>;

	// Cortex-A75: sets ARMProcFamily to CortexA75 and implies the listed features.
	def ProcA75
	    : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
	                       "Cortex-A75 ARM processors",
	                       [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureNEON, FeatureFullFP16,
	                        FeatureDotProd, FeatureRCPC, FeaturePerfMon]>;

	// Cortex-A76: sets ARMProcFamily to CortexA76 and implies the listed features.
	def ProcA76
	    : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
	                       "Cortex-A76 ARM processors",
	                       [HasV8_2aOps, FeatureFPARMv8, FeatureNEON,
	                        FeatureRCPC, FeatureCrypto, FeatureFullFP16,
	                        FeatureDotProd, FeatureSSBS]>;

	// Cyclone itself does not fuse AES instructions. FeatureFuseAES is listed
	// anyway because newer Apple chips do perform the fusion, and cyclone is the
	// default CPU when targeting Apple OSes.
	def ProcCyclone
	    : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone",
	                       [FeatureAlternateSExtLoadCVTF32Pattern,
	                        FeatureArithmeticBccFusion,
	                        FeatureArithmeticCbzFusion, FeatureCrypto,
	                        FeatureDisableLatencySchedHeuristic, FeatureFPARMv8,
	                        FeatureFuseAES, FeatureFuseCryptoEOR, FeatureNEON,
	                        FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing,
	                        FeatureZCZeroingFPWorkaround]>;

	// Exynos-M1: sets ARMProcFamily to ExynosM1 and implies the listed features.
	def ProcExynosM1
	    : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
	                       "Samsung Exynos-M1 processors",
	                       [
	                         FeatureSlowPaired128,
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureExynosCheapAsMoveHandling,
	                         FeatureForce32BitJumpTables,
	                         FeatureFuseAES,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeatureSlowMisaligned128Store,
	                         FeatureUseRSqrt,
	                         FeatureZCZeroingFP
	                       ]>;

	// Exynos-M2: note that it sets ARMProcFamily to ExynosM1 (not a separate M2
	// family value) and implies the listed features.
	def ProcExynosM2
	    : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
	                       "Samsung Exynos-M2 processors",
	                       [
	                         FeatureSlowPaired128,
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureExynosCheapAsMoveHandling,
	                         FeatureForce32BitJumpTables,
	                         FeatureFuseAES,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeatureSlowMisaligned128Store,
	                         FeatureZCZeroingFP
	                       ]>;

	// Exynos-M3: sets ARMProcFamily to ExynosM3 and implies the listed features.
	def ProcExynosM3
	    : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
	                       "Samsung Exynos-M3 processors",
	                       [
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureExynosCheapAsMoveHandling,
	                         FeatureForce32BitJumpTables,
	                         FeatureFuseAddress,
	                         FeatureFuseAES,
	                         FeatureFuseCCSelect,
	                         FeatureFuseLiterals,
	                         FeatureLSLFast,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureZCZeroingFP
	                       ]>;

	// Exynos-M4: note that it sets ARMProcFamily to ExynosM3 (not a separate M4
	// family value) and implies the listed features.
	def ProcExynosM4
	    : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
	                       "Samsung Exynos-M4 processors",
	                       [
	                         HasV8_2aOps,
	                         FeatureArithmeticBccFusion,
	                         FeatureArithmeticCbzFusion,
	                         FeatureCrypto,
	                         FeatureDotProd,
	                         FeatureExynosCheapAsMoveHandling,
	                         FeatureForce32BitJumpTables,
	                         FeatureFullFP16,
	                         FeatureFuseAddress,
	                         FeatureFuseAES,
	                         FeatureFuseArithmeticLogic,
	                         FeatureFuseCCSelect,
	                         FeatureFuseLiterals,
	                         FeatureLSLFast,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeatureZCZeroing
	                       ]>;

	// Kryo: sets ARMProcFamily to Kryo and implies the listed features.
	def ProcKryo
	    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
	                       "Qualcomm Kryo processors",
	                       [FeatureCRC, FeatureCrypto,
	                        FeatureCustomCheapAsMoveHandling, FeatureFPARMv8,
	                        FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler,
	                        FeaturePredictableSelectIsExpensive,
	                        FeatureZCZeroing, FeatureLSLFast]>;

	// Falkor: sets ARMProcFamily to Falkor and implies the listed features.
	def ProcFalkor
	    : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
	                       "Qualcomm Falkor processors",
	                       [FeatureCRC, FeatureCrypto,
	                        FeatureCustomCheapAsMoveHandling, FeatureFPARMv8,
	                        FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler,
	                        FeaturePredictableSelectIsExpensive, FeatureRDM,
	                        FeatureZCZeroing, FeatureLSLFast,
	                        FeatureSlowSTRQro]>;

	// Saphira: sets ARMProcFamily to Saphira and implies the listed features.
	def ProcSaphira
	    : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
	                       "Qualcomm Saphira processors",
	                       [
	                         FeatureCrypto,
	                         FeatureCustomCheapAsMoveHandling,
	                         FeatureFPARMv8,
	                         FeatureNEON,
	                         FeatureSPE,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureZCZeroing,
	                         FeatureLSLFast,
	                         HasV8_4aOps
	                       ]>;

	// ThunderX2 T99: sets ARMProcFamily to ThunderX2T99 and implies the listed
	// features.
	def ProcThunderX2T99
	    : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
	                       "Cavium ThunderX2 processors",
	                       [
	                         FeatureAggressiveFMA,
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureFPARMv8,
	                         FeatureArithmeticBccFusion,
	                         FeatureNEON,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureLSE,
	                         HasV8_1aOps
	                       ]>;

	// ThunderX: sets ARMProcFamily to ThunderX and implies the listed features.
	def ProcThunderX
	    : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
	                       "Cavium ThunderX processors",
	                       [
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureFPARMv8,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureNEON
	                       ]>;

	// ThunderX T88: sets ARMProcFamily to ThunderXT88 and implies the listed
	// features.
	def ProcThunderXT88
	    : SubtargetFeature<"thunderxt88", "ARMProcFamily", "ThunderXT88",
	                       "Cavium ThunderX processors",
	                       [
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureFPARMv8,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureNEON
	                       ]>;

	// ThunderX T81: sets ARMProcFamily to ThunderXT81 and implies the listed
	// features.
	def ProcThunderXT81
	    : SubtargetFeature<"thunderxt81", "ARMProcFamily", "ThunderXT81",
	                       "Cavium ThunderX processors",
	                       [
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureFPARMv8,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureNEON
	                       ]>;

	// ThunderX T83: sets ARMProcFamily to ThunderXT83 and implies the listed
	// features.
	def ProcThunderXT83
	    : SubtargetFeature<"thunderxt83", "ARMProcFamily", "ThunderXT83",
	                       "Cavium ThunderX processors",
	                       [
	                         FeatureCRC,
	                         FeatureCrypto,
	                         FeatureFPARMv8,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeaturePredictableSelectIsExpensive,
	                         FeatureNEON
	                       ]>;

	// TS-V110: sets ARMProcFamily to TSV110 and implies the listed features.
	def ProcTSV110
	    : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
	                       "HiSilicon TS-V110 processors",
	                       [
	                         HasV8_2aOps,
	                         FeatureCrypto,
	                         FeatureCustomCheapAsMoveHandling,
	                         FeatureFPARMv8,
	                         FeatureFuseAES,
	                         FeatureNEON,
	                         FeaturePerfMon,
	                         FeaturePostRAScheduler,
	                         FeatureSPE,
	                         FeatureFullFP16,
	                         FeatureFP16FML,
	                         FeatureDotProd
	                       ]>;

	// The "generic" CPU: no scheduling model (NoSchedModel) and a direct feature
	// list rather than a Proc* record.
	def : ProcessorModel<"generic", NoSchedModel,
	                     [FeatureFPARMv8, FeatureFuseAES, FeatureNEON,
	                      FeaturePerfMon, FeaturePostRAScheduler]>;

	// Table of supported CPU names. Each entry gives the CPU name, the scheduling
	// model used for it, and the Proc* feature record(s) defined above.
	// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53.
	def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
	def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
	def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
	// Cortex-A57 through Cortex-A76 all use the Cortex-A57 scheduling model.
	def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
	def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
	def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
	def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
	def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
	// cortex-a76ae reuses the Cortex-A76 feature record.
	def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
	def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
	// exynos-m2 shares the M1 scheduling model; exynos-m5 shares the M4
	// scheduling model and feature record.
	def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
	def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
	def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
	def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
	def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>;
	// saphira shares the Falkor scheduling model.
	def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
	def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
	def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
	// Cavium ThunderX/ThunderX T8X Processors
	def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
	def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
	def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
	def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
	// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
	def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
	// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
	def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;

	// Alias for the latest Apple processor model supported by LLVM.
	// It currently resolves to the same model and features as "cyclone".
	def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>;

	//===----------------------------------------------------------------------===//
	// Assembly parser
	//===----------------------------------------------------------------------===//

	// Assembly parser variant 0, named "generic".
	def GenericAsmParserVariant : AsmParserVariant {
	  int Variant = 0;
	  string Name = "generic";
	  string BreakCharacters = ".";
	  string TokenizingCharacters = "[]*!/";
	}

	// Assembly parser variant 1, named "apple-neon"; it uses the same break and
	// tokenizing characters as the generic variant.
	def AppleAsmParserVariant : AsmParserVariant {
	  int Variant = 1;
	  string Name = "apple-neon";
	  string BreakCharacters = ".";
	  string TokenizingCharacters = "[]*!/";
	}

	//===----------------------------------------------------------------------===//
	// Assembly printer
	//===----------------------------------------------------------------------===//
	// AArch64 prints assembly through the MC printer, so the TableGen AsmWriter
	// settings must name the matching printer class. This is writer variant 0.
	def GenericAsmWriter : AsmWriter {
	  string AsmWriterClassName = "InstPrinter";
	  int PassSubtarget = 1;
	  int Variant = 0;
	  bit isMCAsmWriter = 1;
	}

	// Assembly writer variant 1, using the "AppleInstPrinter" class.
	// isMCAsmWriter is declared as `bit` so that it has the same type as the
	// identically named field in GenericAsmWriter (it was `int` here).
	def AppleAsmWriter : AsmWriter {
	  let AsmWriterClassName = "AppleInstPrinter";
	  int PassSubtarget = 1;
	  int Variant = 1;
	  bit isMCAsmWriter = 1;
	}

	//===----------------------------------------------------------------------===//
	// Target Declaration
	//===----------------------------------------------------------------------===//

	// Top-level target record: ties together the instruction set, both assembly
	// parser variants, both assembly writers, and enables register renaming.
	def AArch64 : Target {
	  let InstructionSet = AArch64InstrInfo;
	  let AssemblyParserVariants = [
	    GenericAsmParserVariant,
	    AppleAsmParserVariant
	  ];
	  let AssemblyWriters = [
	    GenericAsmWriter,
	    AppleAsmWriter
	  ];
	  let AllowRegisterRenaming = 1;
	}

	//===----------------------------------------------------------------------===//
	// Pfm Counters
	//===----------------------------------------------------------------------===//

	include "AArch64PfmCounters.td"
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 351303)
	@@ -1,12071 +1,12083 @@
	//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the AArch64TargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64ExpandImm.h"
	#include "AArch64ISelLowering.h"
	#include "AArch64CallingConvention.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64PerfectShuffle.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetCallingConv.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/OperandTraits.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "aarch64-lower"

	// Statistics counters for this file; each pairs a counter name with the
	// description string reported for it.
	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumShiftInserts, "Number of vector shift inserts");
	STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

	/// Hidden option "aarch64-shift-insert-generation"; defaults to false.
	static cl::opt<bool> EnableAArch64SlrGeneration(
	    "aarch64-shift-insert-generation", cl::Hidden,
	    cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false));

	// FIXME: GNU bfd and gold do not currently seem to handle the required
	// dtprel relocations well, so Local Dynamic TLS is off by default and code
	// generation falls back to GeneralDynamic for now.
	// Hidden option "aarch64-elf-ldtls-generation". Not declared static, so it
	// has external linkage.
	cl::opt<bool>
	    EnableAArch64ELFLocalDynamicTLSGeneration(
	        "aarch64-elf-ldtls-generation", cl::Hidden,
	        cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
	        cl::init(false));

	/// Hidden option "aarch64-enable-logical-imm"; defaults to true.
	static cl::opt<bool> EnableOptimizeLogicalImm(
	    "aarch64-enable-logical-imm", cl::Hidden,
	    cl::desc("Enable AArch64 logical imm instruction "
	             "optimization"),
	    cl::init(true));

	/// Value type used for condition codes: a file-local constant fixed to
	/// MVT::i32.
	static const MVT MVT_CC = MVT::i32;

	AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
	const AArch64Subtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
	// we have to make something up. Arbitrarily, choose ZeroOrOne.
	setBooleanContents(ZeroOrOneBooleanContent);
	// When comparing vectors the result sets the different elements in the
	// vector to all-one or all-zero.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
	addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

	if (Subtarget->hasFPARMv8()) {
	addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
	addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
	addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
	addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
	}

	if (Subtarget->hasNEON()) {
	addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
	addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
	// Someone set us up the NEON.
	addDRTypeForNEON(MVT::v2f32);
	addDRTypeForNEON(MVT::v8i8);
	addDRTypeForNEON(MVT::v4i16);
	addDRTypeForNEON(MVT::v2i32);
	addDRTypeForNEON(MVT::v1i64);
	addDRTypeForNEON(MVT::v1f64);
	addDRTypeForNEON(MVT::v4f16);

	addQRTypeForNEON(MVT::v4f32);
	addQRTypeForNEON(MVT::v2f64);
	addQRTypeForNEON(MVT::v16i8);
	addQRTypeForNEON(MVT::v8i16);
	addQRTypeForNEON(MVT::v4i32);
	addQRTypeForNEON(MVT::v2i64);
	addQRTypeForNEON(MVT::v8f16);
	}

	// Compute derived properties from the register classes
	computeRegisterProperties(Subtarget->getRegisterInfo());

	// Provide all sorts of operation actions
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::i32, Custom);
	setOperationAction(ISD::SETCC, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::f16, Custom);
	setOperationAction(ISD::SETCC, MVT::f32, Custom);
	setOperationAction(ISD::SETCC, MVT::f64, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f16, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::SELECT, MVT::i32, Custom);
	setOperationAction(ISD::SELECT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::f16, Custom);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_JT, MVT::Other, Custom);
	setOperationAction(ISD::JumpTable, MVT::i64, Custom);

	setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

	setOperationAction(ISD::FREM, MVT::f32, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f80, Expand);

	setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

	// Custom lowering hooks are needed for XOR
	// to fold it into CSINC/CSINV.
	setOperationAction(ISD::XOR, MVT::i32, Custom);
	setOperationAction(ISD::XOR, MVT::i64, Custom);

	// Virtually no operation on f128 is legal, but LLVM can't expand them when
	// there's a valid register class, so we need custom operations in most cases.
	setOperationAction(ISD::FABS, MVT::f128, Expand);
	setOperationAction(ISD::FADD, MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
	setOperationAction(ISD::FCOS, MVT::f128, Expand);
	setOperationAction(ISD::FDIV, MVT::f128, Custom);
	setOperationAction(ISD::FMA, MVT::f128, Expand);
	setOperationAction(ISD::FMUL, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Expand);
	setOperationAction(ISD::FPOW, MVT::f128, Expand);
	setOperationAction(ISD::FREM, MVT::f128, Expand);
	setOperationAction(ISD::FRINT, MVT::f128, Expand);
	setOperationAction(ISD::FSIN, MVT::f128, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
	setOperationAction(ISD::FSQRT, MVT::f128, Expand);
	setOperationAction(ISD::FSUB, MVT::f128, Custom);
	setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
	setOperationAction(ISD::SETCC, MVT::f128, Custom);
	setOperationAction(ISD::BR_CC, MVT::f128, Custom);
	setOperationAction(ISD::SELECT, MVT::f128, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

	// Lowering for many of the conversions is actually specified by the non-f128
	// type. The LowerXXX function will be trivial when f128 isn't involved.
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

	// Variable arguments.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Custom);
	setOperationAction(ISD::VACOPY, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	// Variable-sized objects.
	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	if (Subtarget->isTargetWindows())
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
	else
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

	// Constant pool entries
	setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

	// BlockAddress
	setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

	// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
	setOperationAction(ISD::ADDC, MVT::i32, Custom);
	setOperationAction(ISD::ADDE, MVT::i32, Custom);
	setOperationAction(ISD::SUBC, MVT::i32, Custom);
	setOperationAction(ISD::SUBE, MVT::i32, Custom);
	setOperationAction(ISD::ADDC, MVT::i64, Custom);
	setOperationAction(ISD::ADDE, MVT::i64, Custom);
	setOperationAction(ISD::SUBC, MVT::i64, Custom);
	setOperationAction(ISD::SUBE, MVT::i64, Custom);

	// AArch64 lacks both left-rotate and popcount instructions.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	setOperationAction(ISD::ROTL, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	}

	// AArch64 doesn't have {U\|S}MUL_LOHI.
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

	setOperationAction(ISD::CTPOP, MVT::i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::i64, Custom);

	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	}
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i64, Expand);

	// Custom lower Add/Sub/Mul with overflow.
	setOperationAction(ISD::SADDO, MVT::i32, Custom);
	setOperationAction(ISD::SADDO, MVT::i64, Custom);
	setOperationAction(ISD::UADDO, MVT::i32, Custom);
	setOperationAction(ISD::UADDO, MVT::i64, Custom);
	setOperationAction(ISD::SSUBO, MVT::i32, Custom);
	setOperationAction(ISD::SSUBO, MVT::i64, Custom);
	setOperationAction(ISD::USUBO, MVT::i32, Custom);
	setOperationAction(ISD::USUBO, MVT::i64, Custom);
	setOperationAction(ISD::SMULO, MVT::i32, Custom);
	setOperationAction(ISD::SMULO, MVT::i64, Custom);
	setOperationAction(ISD::UMULO, MVT::i32, Custom);
	setOperationAction(ISD::UMULO, MVT::i64, Custom);

	setOperationAction(ISD::FSIN, MVT::f32, Expand);
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f32, Expand);
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
	if (Subtarget->hasFullFP16())
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
	else
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

	setOperationAction(ISD::FREM, MVT::f16, Promote);
	setOperationAction(ISD::FREM, MVT::v4f16, Expand);
	setOperationAction(ISD::FREM, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOW, MVT::f16, Promote);
	setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::f16, Promote);
	setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP, MVT::f16, Promote);
	setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::f16, Promote);
	setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG, MVT::f16, Promote);
	setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::f16, Promote);
	setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

	if (!Subtarget->hasFullFP16()) {
	setOperationAction(ISD::SELECT, MVT::f16, Promote);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
	setOperationAction(ISD::SETCC, MVT::f16, Promote);
	setOperationAction(ISD::BR_CC, MVT::f16, Promote);
	setOperationAction(ISD::FADD, MVT::f16, Promote);
	setOperationAction(ISD::FSUB, MVT::f16, Promote);
	setOperationAction(ISD::FMUL, MVT::f16, Promote);
	setOperationAction(ISD::FDIV, MVT::f16, Promote);
	setOperationAction(ISD::FMA, MVT::f16, Promote);
	setOperationAction(ISD::FNEG, MVT::f16, Promote);
	setOperationAction(ISD::FABS, MVT::f16, Promote);
	setOperationAction(ISD::FCEIL, MVT::f16, Promote);
	setOperationAction(ISD::FSQRT, MVT::f16, Promote);
	setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
	setOperationAction(ISD::FRINT, MVT::f16, Promote);
	setOperationAction(ISD::FROUND, MVT::f16, Promote);
	setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
	setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

	// promote v4f16 to v4f32 when that is known to be safe.
	setOperationAction(ISD::FADD, MVT::v4f16, Promote);
	setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
	setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
	setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
	AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

	setOperationAction(ISD::FABS, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
	setOperationAction(ISD::FMA, MVT::v4f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

	setOperationAction(ISD::FABS, MVT::v8f16, Expand);
	setOperationAction(ISD::FADD, MVT::v8f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
	setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
	setOperationAction(ISD::FMA, MVT::v8f16, Expand);
	setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::f32, MVT::f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	setOperationAction(ISD::FMINNUM, Ty, Legal);
	setOperationAction(ISD::FMAXNUM, Ty, Legal);
	setOperationAction(ISD::FMINIMUM, Ty, Legal);
	setOperationAction(ISD::FMAXIMUM, Ty, Legal);
	setOperationAction(ISD::LROUND, Ty, Legal);
	setOperationAction(ISD::LLROUND, Ty, Legal);
	setOperationAction(ISD::LRINT, Ty, Legal);
	setOperationAction(ISD::LLRINT, Ty, Legal);
	}

	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
	setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
	setOperationAction(ISD::FCEIL, MVT::f16, Legal);
	setOperationAction(ISD::FRINT, MVT::f16, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
	setOperationAction(ISD::FROUND, MVT::f16, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
	}

	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

	// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
	// This requires the Performance Monitors extension.
	if (Subtarget->hasPerfMon())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	// Issue __sincos_stret if available.
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	} else {
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	}

	// Make floating-point constants legal for the large code model, so they don't
	// become loads from the constant pool.
	if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
	setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
	}

	// AArch64 does not have floating-point extending loads, i1 sign-extending
	// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
	}
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f128, MVT::f80, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f16, Expand);

	setOperationAction(ISD::BITCAST, MVT::i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::f16, Custom);

	// Indexed loads and stores are supported.
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, MVT::i8, Legal);
	setIndexedLoadAction(im, MVT::i16, Legal);
	setIndexedLoadAction(im, MVT::i32, Legal);
	setIndexedLoadAction(im, MVT::i64, Legal);
	setIndexedLoadAction(im, MVT::f64, Legal);
	setIndexedLoadAction(im, MVT::f32, Legal);
	setIndexedLoadAction(im, MVT::f16, Legal);
	setIndexedStoreAction(im, MVT::i8, Legal);
	setIndexedStoreAction(im, MVT::i16, Legal);
	setIndexedStoreAction(im, MVT::i32, Legal);
	setIndexedStoreAction(im, MVT::i64, Legal);
	setIndexedStoreAction(im, MVT::f64, Legal);
	setIndexedStoreAction(im, MVT::f32, Legal);
	setIndexedStoreAction(im, MVT::f16, Legal);
	}

	// Trap.
	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	if (Subtarget->isTargetWindows())
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// We combine OR nodes for bitfield operations.
	setTargetDAGCombine(ISD::OR);
	// Try to create BICs for vector ANDs.
	setTargetDAGCombine(ISD::AND);

	// Vector add and sub nodes may conceal a high-half opportunity.
	// Also, try to fold ADD into CSINC/CSINV..
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);

	setTargetDAGCombine(ISD::FP_TO_SINT);
	setTargetDAGCombine(ISD::FP_TO_UINT);
	setTargetDAGCombine(ISD::FDIV);

	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::STORE);
	if (Subtarget->supportsAddressTopByteIgnored())
	setTargetDAGCombine(ISD::LOAD);

	setTargetDAGCombine(ISD::MUL);

	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::VSELECT);

	setTargetDAGCombine(ISD::INTRINSIC_VOID);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

	setTargetDAGCombine(ISD::GlobalAddress);

	// In case of strict alignment, avoid an excessive number of byte wide stores.
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemset = Subtarget->requiresStrictAlign()
	? MaxStoresPerMemsetOptSize : 32;

	MaxGluedStoresPerMemcpy = 4;
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
	? MaxStoresPerMemcpyOptSize : 16;

	MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;

	+ MaxLoadsPerMemcmpOptSize = 4;
	+ MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
	+ ? MaxLoadsPerMemcmpOptSize : 8;
	+
	setStackPointerRegisterToSaveRestore(AArch64::SP);

	setSchedulingPreference(Sched::Hybrid);

	EnableExtLdPromotion = true;

	// Set required alignment.
	setMinFunctionAlignment(2);
	// Set preferred alignments.
	setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
	setPrefLoopAlignment(STI.getPrefLoopAlignment());

	// Only change the limit for entries in a jump table if specified by
	// the sub target, but not at the command line.
	unsigned MaxJT = STI.getMaximumJumpTableSize();
	if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
	setMaximumJumpTableSize(MaxJT);

	setHasExtractBitsInsn(true);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	if (Subtarget->hasNEON()) {
	// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
	// silliness like this:
	setOperationAction(ISD::FABS, MVT::v1f64, Expand);
	setOperationAction(ISD::FADD, MVT::v1f64, Expand);
	setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
	setOperationAction(ISD::FMA, MVT::v1f64, Expand);
	setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
	setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
	setOperationAction(ISD::FREM, MVT::v1f64, Expand);
	setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
	setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
	setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

	setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

	setOperationAction(ISD::MUL, MVT::v1i64, Expand);

	// AArch64 doesn't have a direct vector ->f32 conversion instructions for
	// elements smaller than i32, so promote the input to i32 first.
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
	// i8 vector elements also need promotion to i32 for v8i8
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
	// Similarly, there is no direct i32 -> f64 vector conversion instruction.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
	// Or, direct i32 -> f16 vector conversion. Set it so custom, so the
	// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
	} else {
	// when AArch64 doesn't have fullfp16 support, promote the input
	// to i32 first.
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
	}

	setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

	// AArch64 doesn't have MUL.2d:
	setOperationAction(ISD::MUL, MVT::v2i64, Expand);
	// Custom handling for some quad-vector types to detect MULL.
	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);

	// Vector reductions
	for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
	MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
	}
	for (MVT VT : { MVT::v4f16, MVT::v2f32,
	MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
	}

	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
	// Likewise, narrowing and extending vector loads/stores aren't handled
	// directly.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

	if (VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\| VT == MVT::v4i32) {
	setOperationAction(ISD::MULHS, VT, Legal);
	setOperationAction(ISD::MULHU, VT, Legal);
	} else {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	}
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);

	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}

	if (Subtarget->hasFullFP16()) {
	for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}
	}

	setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
	}

	PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
	}

	/// Install the operation actions that are common to every NEON vector type
	/// \p VT. \p PromotedBitwiseVT is part of the interface but is not read here.
	void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
	  assert(VT.isVector() && "VT should be a vector type");

	  const bool IsFP = VT.isFloatingPoint();

	  // Floating-point vectors are loaded and stored as the integer vector type
	  // with the same element count and element width.
	  if (IsFP) {
	    MVT IntVT = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
	    setOperationPromotedToType(ISD::LOAD, VT, IntVT);
	    setOperationPromotedToType(ISD::STORE, VT, IntVT);
	  }

	  // Vector float math intrinsics are expanded; FCOPYSIGN is the exception and
	  // gets custom lowering.
	  const bool IsF32OrF64Vec =
	      VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64;
	  if (IsF32OrF64Vec) {
	    for (unsigned Opc : {ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FLOG,
	                         ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2})
	      setOperationAction(Opc, VT, Expand);

	    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	  }

	  // Operations that are custom lowered for every NEON type.
	  for (unsigned Opc :
	       {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT, ISD::BUILD_VECTOR,
	        ISD::VECTOR_SHUFFLE, ISD::EXTRACT_SUBVECTOR, ISD::SRA, ISD::SRL,
	        ISD::SHL, ISD::OR, ISD::SETCC, ISD::FP_TO_SINT, ISD::FP_TO_UINT})
	    setOperationAction(Opc, VT, Custom);

	  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

	  // Selects, integer division/remainder and FREM are expanded.
	  for (unsigned Opc : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT, ISD::UDIV,
	                       ISD::SDIV, ISD::UREM, ISD::SREM, ISD::FREM})
	    setOperationAction(Opc, VT, Expand);

	  // No extending load produces a result from memory type VT directly.
	  for (MVT ResultVT : MVT::all_valuetypes())
	    setLoadExtAction(ISD::EXTLOAD, ResultVT, VT, Expand);

	  // CNT only handles byte elements; every other element size is custom
	  // lowered (CNT followed by UADDLP to widen).
	  if (!(VT == MVT::v8i8 || VT == MVT::v16i8))
	    setOperationAction(ISD::CTPOP, VT, Custom);

	  if (IsFP) {
	    // F[MIN|MAX][NUM|NAN] are available for all FP NEON types; f16 elements
	    // additionally require full FP16 support.
	    if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())
	      for (unsigned Opc :
	           {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
	        setOperationAction(Opc, VT, Legal);
	  } else {
	    setOperationAction(ISD::ABS, VT, Legal);

	    // [SU][MIN|MAX] are available for all NEON types apart from i64.
	    if (VT != MVT::v2i64 && VT != MVT::v1i64)
	      for (unsigned Opc : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
	        setOperationAction(Opc, VT, Legal);
	  }

	  // Indexed loads and stores are only marked legal on little-endian targets.
	  if (!Subtarget->isLittleEndian())
	    return;

	  for (unsigned Mode = (unsigned)ISD::PRE_INC;
	       Mode != (unsigned)ISD::LAST_INDEXED_MODE; ++Mode) {
	    setIndexedLoadAction(Mode, VT, Legal);
	    setIndexedStoreAction(Mode, VT, Legal);
	  }
	}

	/// Register \p VT in the FPR64 register class, then apply the shared NEON
	/// operation actions via addTypeForNEON with v2i32 as the promoted bitwise
	/// type argument.
	void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR64RegClass);
	addTypeForNEON(VT, MVT::v2i32);
	}

	/// Register \p VT in the FPR128 register class, then apply the shared NEON
	/// operation actions via addTypeForNEON with v4i32 as the promoted bitwise
	/// type argument.
	void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR128RegClass);
	addTypeForNEON(VT, MVT::v4i32);
	}

	/// Result type of a SETCC on operands of type \p VT: a vector comparison
	/// yields the same-shaped vector with integer elements, anything else yields
	/// i32. The DataLayout and LLVMContext arguments are not consulted.
	EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
	                                              EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();

	  // Scalar comparisons always produce an i32.
	  return MVT::i32;
	}

	/// Try to rewrite the immediate operand of the logical operation \p Op into a
	/// value that is a logical immediate (or all-zeros / all-ones), using the
	/// freedom given by the bits that are not set in \p Demanded.
	///
	/// \param Op       node being rewritten; its operand 0 is reused as the other
	///                 input of the replacement node.
	/// \param Size     bit width used to build the value mask and to test/encode
	///                 the immediate.
	/// \param Imm      current immediate value.
	/// \param Demanded bits whose value must be preserved in the new immediate.
	/// \param TLO      provides the DAG and performs the replacement (CombineTo).
	/// \param NewOpc   machine opcode used when the new immediate is emitted in
	///                 encoded logical-immediate form.
	/// \returns false if the immediate is left untouched; otherwise the result of
	///          TLO.CombineTo(Op, New).
	static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
	const APInt &Demanded,
	TargetLowering::TargetLoweringOpt &TLO,
	unsigned NewOpc) {
	uint64_t OldImm = Imm, NewImm, Enc;
	// Mask selects the low Size bits; OrigMask keeps that full-width mask because
	// Mask itself is narrowed inside the search loop.
	uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

	// Return if the immediate is already all zeros, all ones, a bimm32 or a
	// bimm64.
	if (Imm == 0 \|\| Imm == Mask \|\|
	AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
	return false;

	unsigned EltSize = Size;
	uint64_t DemandedBits = Demanded.getZExtValue();

	// Clear bits that are not demanded.
	Imm &= DemandedBits;

	while (true) {
	// The goal here is to set the non-demanded bits in a way that minimizes
	// the number of switching between 0 and 1. In order to achieve this goal,
	// we set the non-demanded bits to the value of the preceding demanded bits.
	// For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
	// non-demanded bit), we copy bit0 (1) to the least significant 'x',
	// bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
	// The final result is 0b11000011.
	uint64_t NonDemandedBits = ~DemandedBits;
	uint64_t InvertedImm = ~Imm & DemandedBits;
	uint64_t RotatedImm =
	((InvertedImm << 1) \| (InvertedImm >> (EltSize - 1) & 1)) &
	NonDemandedBits;
	uint64_t Sum = RotatedImm + NonDemandedBits;
	bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
	uint64_t Ones = (Sum + Carry) & NonDemandedBits;
	NewImm = (Imm \| Ones) & Mask;

	// If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
	// or all-ones or all-zeros, in which case we can stop searching. Otherwise,
	// we halve the element size and continue the search.
	if (isShiftedMask_64(NewImm) \|\| isShiftedMask_64(~(NewImm \| ~Mask)))
	break;

	// We cannot shrink the element size any further if it is 2-bits.
	if (EltSize == 2)
	return false;

	EltSize /= 2;
	// After this shift Mask covers exactly the low EltSize bits.
	Mask >>= EltSize;
	uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

	// Return if there is mismatch in any of the demanded bits of Imm and Hi.
	if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
	return false;

	// Merge the upper and lower halves of Imm and DemandedBits.
	Imm \|= Hi;
	DemandedBits \|= DemandedBitsHi;
	}

	++NumOptimizedImms;

	// Replicate the element across the register width.
	while (EltSize < Size) {
	NewImm \|= NewImm << EltSize;
	EltSize *= 2;
	}

	// OldImm is otherwise only read inside the asserts below.
	(void)OldImm;
	assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
	"demanded bits should never be altered");
	assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

	// Create the new constant immediate node.
	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	SDValue New;

	// If the new constant immediate is all-zeros or all-ones, let the target
	// independent DAG combine optimize this node.
	if (NewImm == 0 \|\| NewImm == OrigMask) {
	New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
	TLO.DAG.getConstant(NewImm, DL, VT));
	// Otherwise, create a machine node so that target independent DAG combine
	// doesn't undo this optimization.
	} else {
	Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
	SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
	New = SDValue(
	TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
	}

	return TLO.CombineTo(Op, New);
	}

	/// Attempt to replace the constant operand of a scalar AND/OR/XOR \p Op with
	/// a cheaper immediate, given that only the bits in \p Demanded matter.
	///
	/// \returns the result of optimizeLogicalImm when the node qualifies, and
	///          false otherwise.
	bool AArch64TargetLowering::targetShrinkDemandedConstant(
	    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
	  // Delay this optimization to as late as possible, and honour the
	  // EnableOptimizeLogicalImm switch.
	  if (!TLO.LegalOps || !EnableOptimizeLogicalImm)
	    return false;

	  const EVT OpVT = Op.getValueType();
	  if (OpVT.isVector())
	    return false;

	  const unsigned Size = OpVT.getSizeInBits();
	  assert((Size == 32 || Size == 64) &&
	         "i32 or i64 is expected after legalization.");

	  // Nothing to gain when every bit is demanded.
	  if (Demanded.countPopulation() == Size)
	    return false;

	  // Select the immediate form of the instruction that matches the width.
	  const bool Is32Bit = Size == 32;
	  unsigned ImmOpc;
	  switch (Op.getOpcode()) {
	  case ISD::AND:
	    ImmOpc = Is32Bit ? AArch64::ANDWri : AArch64::ANDXri;
	    break;
	  case ISD::OR:
	    ImmOpc = Is32Bit ? AArch64::ORRWri : AArch64::ORRXri;
	    break;
	  case ISD::XOR:
	    ImmOpc = Is32Bit ? AArch64::EORWri : AArch64::EORXri;
	    break;
	  default:
	    return false;
	  }

	  // Only a constant right-hand operand can be rewritten.
	  if (auto *ImmNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
	    return optimizeLogicalImm(Op, Size, ImmNode->getZExtValue(), Demanded, TLO,
	                              ImmOpc);
	  return false;
	}

	/// computeKnownBitsForTargetNode - Determine which of the bits specified in
	/// Mask are known to be either zero or one and return them Known.
	void AArch64TargetLowering::computeKnownBitsForTargetNode(
	const SDValue Op, KnownBits &Known,
	const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
	switch (Op.getOpcode()) {
	default:
	break;
	case AArch64ISD::CSEL: {
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
	Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	break;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
	Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	switch (IntID) {
	default: return;
	case Intrinsic::aarch64_ldaxr:
	case Intrinsic::aarch64_ldxr: {
	unsigned BitWidth = Known.getBitWidth();
	EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero \|= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
	return;
	}
	}
	break;
	}
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID: {
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	switch (IntNo) {
	default:
	break;
	case Intrinsic::aarch64_neon_umaxv:
	case Intrinsic::aarch64_neon_uminv: {
	// Figure out the datatype of the vector operand. The UMINV instruction
	// will zero extend the result, so we can mark as known zero all the
	// bits larger than the element datatype. 32-bit or larget doesn't need
	// this as those are legal types and will be handled by isel directly.
	MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
	unsigned BitWidth = Known.getBitWidth();
	if (VT == MVT::v8i8 \|\| VT == MVT::v16i8) {
	assert(BitWidth >= 8 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
	Known.Zero \|= Mask;
	} else if (VT == MVT::v4i16 \|\| VT == MVT::v8i16) {
	assert(BitWidth >= 16 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
	Known.Zero \|= Mask;
	}
	break;
	} break;
	}
	}
	}
	}

	/// Type used for scalar shift amounts. Returns MVT::i64 unconditionally;
	/// neither parameter is consulted.
	MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
	EVT) const {
	return MVT::i64;
	}

	/// Report whether a misaligned access of type \p VT is permitted.
	///
	/// Returns false when the subtarget requires strict alignment and true
	/// otherwise. When \p Fast is non-null and the access is permitted, *Fast is
	/// set to whether the access is also considered fast. \p AddrSpace and
	/// \p Flags are not consulted.
	bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
	    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
	    bool *Fast) const {
	  if (Subtarget->requiresStrictAlign())
	    return false;

	  // The caller did not ask for speed information.
	  if (!Fast)
	    return true;

	  bool IsFast = true;

	  // Some CPUs are fine with unaligned stores except for 128-bit ones.
	  if (Subtarget->isMisaligned128StoreSlow() && VT.getStoreSize() == 16) {
	    // See comments in performSTORECombine() for more details about
	    // these conditions.
	    //
	    // Code that uses clang vector extensions can mark that it
	    // wants unaligned accesses to be treated as fast by
	    // underspecifying alignment to be 1 or 2.
	    //
	    // Disregard v2i64. Memcpy lowering produces those and splitting
	    // them regresses performance on micro-benchmarks and olden/bh.
	    IsFast = Align <= 2 || VT == MVT::v2i64;
	  }

	  *Fast = IsFast;
	  return true;
	}

	/// Forward to AArch64::createFastISel with the same \p funcInfo and
	/// \p libInfo arguments and return its result.
	FastISel *
	AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return AArch64::createFastISel(funcInfo, libInfo);
	}

	/// Return the printable name of the AArch64-specific DAG node \p Opcode.
	///
	/// The name is the fully qualified enumerator spelling (for example
	/// "AArch64ISD::CALL"). nullptr is returned for AArch64ISD::FIRST_NUMBER and
	/// for any value not listed in the switch.
	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
	// Each entry expands to `case X: return "X";`, so the string can never drift
	// from the enumerator it describes.
	#define AARCH64_NODE_NAME(Node) case Node: return #Node;
	  switch ((AArch64ISD::NodeType)Opcode) {
	  case AArch64ISD::FIRST_NUMBER:
	    break;
	  AARCH64_NODE_NAME(AArch64ISD::CALL)
	  AARCH64_NODE_NAME(AArch64ISD::ADRP)
	  AARCH64_NODE_NAME(AArch64ISD::ADR)
	  AARCH64_NODE_NAME(AArch64ISD::ADDlow)
	  AARCH64_NODE_NAME(AArch64ISD::LOADgot)
	  AARCH64_NODE_NAME(AArch64ISD::RET_FLAG)
	  AARCH64_NODE_NAME(AArch64ISD::BRCOND)
	  AARCH64_NODE_NAME(AArch64ISD::CSEL)
	  AARCH64_NODE_NAME(AArch64ISD::FCSEL)
	  AARCH64_NODE_NAME(AArch64ISD::CSINV)
	  AARCH64_NODE_NAME(AArch64ISD::CSNEG)
	  AARCH64_NODE_NAME(AArch64ISD::CSINC)
	  AARCH64_NODE_NAME(AArch64ISD::THREAD_POINTER)
	  AARCH64_NODE_NAME(AArch64ISD::TLSDESC_CALLSEQ)
	  AARCH64_NODE_NAME(AArch64ISD::ADC)
	  AARCH64_NODE_NAME(AArch64ISD::SBC)
	  AARCH64_NODE_NAME(AArch64ISD::ADDS)
	  AARCH64_NODE_NAME(AArch64ISD::SUBS)
	  AARCH64_NODE_NAME(AArch64ISD::ADCS)
	  AARCH64_NODE_NAME(AArch64ISD::SBCS)
	  AARCH64_NODE_NAME(AArch64ISD::ANDS)
	  AARCH64_NODE_NAME(AArch64ISD::CCMP)
	  AARCH64_NODE_NAME(AArch64ISD::CCMN)
	  AARCH64_NODE_NAME(AArch64ISD::FCCMP)
	  AARCH64_NODE_NAME(AArch64ISD::FCMP)
	  AARCH64_NODE_NAME(AArch64ISD::DUP)
	  AARCH64_NODE_NAME(AArch64ISD::DUPLANE8)
	  AARCH64_NODE_NAME(AArch64ISD::DUPLANE16)
	  AARCH64_NODE_NAME(AArch64ISD::DUPLANE32)
	  AARCH64_NODE_NAME(AArch64ISD::DUPLANE64)
	  AARCH64_NODE_NAME(AArch64ISD::MOVI)
	  AARCH64_NODE_NAME(AArch64ISD::MOVIshift)
	  AARCH64_NODE_NAME(AArch64ISD::MOVIedit)
	  AARCH64_NODE_NAME(AArch64ISD::MOVImsl)
	  AARCH64_NODE_NAME(AArch64ISD::FMOV)
	  AARCH64_NODE_NAME(AArch64ISD::MVNIshift)
	  AARCH64_NODE_NAME(AArch64ISD::MVNImsl)
	  AARCH64_NODE_NAME(AArch64ISD::BICi)
	  AARCH64_NODE_NAME(AArch64ISD::ORRi)
	  AARCH64_NODE_NAME(AArch64ISD::BSL)
	  AARCH64_NODE_NAME(AArch64ISD::NEG)
	  AARCH64_NODE_NAME(AArch64ISD::EXTR)
	  AARCH64_NODE_NAME(AArch64ISD::ZIP1)
	  AARCH64_NODE_NAME(AArch64ISD::ZIP2)
	  AARCH64_NODE_NAME(AArch64ISD::UZP1)
	  AARCH64_NODE_NAME(AArch64ISD::UZP2)
	  AARCH64_NODE_NAME(AArch64ISD::TRN1)
	  AARCH64_NODE_NAME(AArch64ISD::TRN2)
	  AARCH64_NODE_NAME(AArch64ISD::REV16)
	  AARCH64_NODE_NAME(AArch64ISD::REV32)
	  AARCH64_NODE_NAME(AArch64ISD::REV64)
	  AARCH64_NODE_NAME(AArch64ISD::EXT)
	  AARCH64_NODE_NAME(AArch64ISD::VSHL)
	  AARCH64_NODE_NAME(AArch64ISD::VLSHR)
	  AARCH64_NODE_NAME(AArch64ISD::VASHR)
	  AARCH64_NODE_NAME(AArch64ISD::CMEQ)
	  AARCH64_NODE_NAME(AArch64ISD::CMGE)
	  AARCH64_NODE_NAME(AArch64ISD::CMGT)
	  AARCH64_NODE_NAME(AArch64ISD::CMHI)
	  AARCH64_NODE_NAME(AArch64ISD::CMHS)
	  AARCH64_NODE_NAME(AArch64ISD::FCMEQ)
	  AARCH64_NODE_NAME(AArch64ISD::FCMGE)
	  AARCH64_NODE_NAME(AArch64ISD::FCMGT)
	  AARCH64_NODE_NAME(AArch64ISD::CMEQz)
	  AARCH64_NODE_NAME(AArch64ISD::CMGEz)
	  AARCH64_NODE_NAME(AArch64ISD::CMGTz)
	  AARCH64_NODE_NAME(AArch64ISD::CMLEz)
	  AARCH64_NODE_NAME(AArch64ISD::CMLTz)
	  AARCH64_NODE_NAME(AArch64ISD::FCMEQz)
	  AARCH64_NODE_NAME(AArch64ISD::FCMGEz)
	  AARCH64_NODE_NAME(AArch64ISD::FCMGTz)
	  AARCH64_NODE_NAME(AArch64ISD::FCMLEz)
	  AARCH64_NODE_NAME(AArch64ISD::FCMLTz)
	  AARCH64_NODE_NAME(AArch64ISD::SADDV)
	  AARCH64_NODE_NAME(AArch64ISD::UADDV)
	  AARCH64_NODE_NAME(AArch64ISD::SMINV)
	  AARCH64_NODE_NAME(AArch64ISD::UMINV)
	  AARCH64_NODE_NAME(AArch64ISD::SMAXV)
	  AARCH64_NODE_NAME(AArch64ISD::UMAXV)
	  AARCH64_NODE_NAME(AArch64ISD::NOT)
	  AARCH64_NODE_NAME(AArch64ISD::BIT)
	  AARCH64_NODE_NAME(AArch64ISD::CBZ)
	  AARCH64_NODE_NAME(AArch64ISD::CBNZ)
	  AARCH64_NODE_NAME(AArch64ISD::TBZ)
	  AARCH64_NODE_NAME(AArch64ISD::TBNZ)
	  AARCH64_NODE_NAME(AArch64ISD::TC_RETURN)
	  AARCH64_NODE_NAME(AArch64ISD::PREFETCH)
	  AARCH64_NODE_NAME(AArch64ISD::SITOF)
	  AARCH64_NODE_NAME(AArch64ISD::UITOF)
	  AARCH64_NODE_NAME(AArch64ISD::NVCAST)
	  AARCH64_NODE_NAME(AArch64ISD::SQSHL_I)
	  AARCH64_NODE_NAME(AArch64ISD::UQSHL_I)
	  AARCH64_NODE_NAME(AArch64ISD::SRSHR_I)
	  AARCH64_NODE_NAME(AArch64ISD::URSHR_I)
	  AARCH64_NODE_NAME(AArch64ISD::SQSHLU_I)
	  AARCH64_NODE_NAME(AArch64ISD::WrapperLarge)
	  AARCH64_NODE_NAME(AArch64ISD::LD2post)
	  AARCH64_NODE_NAME(AArch64ISD::LD3post)
	  AARCH64_NODE_NAME(AArch64ISD::LD4post)
	  AARCH64_NODE_NAME(AArch64ISD::ST2post)
	  AARCH64_NODE_NAME(AArch64ISD::ST3post)
	  AARCH64_NODE_NAME(AArch64ISD::ST4post)
	  AARCH64_NODE_NAME(AArch64ISD::LD1x2post)
	  AARCH64_NODE_NAME(AArch64ISD::LD1x3post)
	  AARCH64_NODE_NAME(AArch64ISD::LD1x4post)
	  AARCH64_NODE_NAME(AArch64ISD::ST1x2post)
	  AARCH64_NODE_NAME(AArch64ISD::ST1x3post)
	  AARCH64_NODE_NAME(AArch64ISD::ST1x4post)
	  AARCH64_NODE_NAME(AArch64ISD::LD1DUPpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD2DUPpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD3DUPpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD4DUPpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD1LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD2LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD3LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::LD4LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::ST2LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::ST3LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::ST4LANEpost)
	  AARCH64_NODE_NAME(AArch64ISD::SMULL)
	  AARCH64_NODE_NAME(AArch64ISD::UMULL)
	  AARCH64_NODE_NAME(AArch64ISD::FRECPE)
	  AARCH64_NODE_NAME(AArch64ISD::FRECPS)
	  AARCH64_NODE_NAME(AArch64ISD::FRSQRTE)
	  AARCH64_NODE_NAME(AArch64ISD::FRSQRTS)
	  AARCH64_NODE_NAME(AArch64ISD::STG)
	  AARCH64_NODE_NAME(AArch64ISD::STZG)
	  AARCH64_NODE_NAME(AArch64ISD::ST2G)
	  AARCH64_NODE_NAME(AArch64ISD::STZ2G)
	  }
	#undef AARCH64_NODE_NAME
	  return nullptr;
	}

	/// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into
	/// explicit control flow plus a PHI.
	///
	/// Two new blocks (TrueBB and EndBB) are created and inserted right after
	/// \p MBB. Everything following \p MI is moved into EndBB, \p MBB is ended
	/// with a conditional branch to TrueBB and an unconditional branch to EndBB,
	/// and a PHI at the top of EndBB picks operand 1 when arriving from TrueBB or
	/// operand 2 when arriving from \p MBB. \p MI is erased.
	///
	/// \returns EndBB, the block that now holds the instructions that followed
	/// \p MI.
	MachineBasicBlock *
	AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// We materialise the F128CSEL pseudo-instruction as some control flow and a
	// phi node:

	// OrigBB:
	// [... previous instrs leading to comparison ...]
	// b.ne TrueBB
	// b EndBB
	// TrueBB:
	// ; Fallthrough
	// EndBB:
	// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

	MachineFunction *MF = MBB->getParent();
	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	DebugLoc DL = MI.getDebugLoc();
	// Insertion point for the new blocks: the position just after MBB.
	MachineFunction::iterator It = ++MBB->getIterator();

	// Operand layout of the pseudo as consumed here: 0 = destination,
	// 1 = value if true, 2 = value if false, 3 = condition code immediate,
	// 4 = operand whose kill flag is inspected (presumably the NZCV use; the
	// live-in handling below treats it that way).
	unsigned DestReg = MI.getOperand(0).getReg();
	unsigned IfTrueReg = MI.getOperand(1).getReg();
	unsigned IfFalseReg = MI.getOperand(2).getReg();
	unsigned CondCode = MI.getOperand(3).getImm();
	bool NZCVKilled = MI.getOperand(4).isKill();

	// Both new blocks are associated with the same IR basic block as MBB.
	MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MF->insert(It, TrueBB);
	MF->insert(It, EndBB);

	// Transfer rest of current basic-block to EndBB
	EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
	MBB->end());
	EndBB->transferSuccessorsAndUpdatePHIs(MBB);

	// Terminate MBB: conditional branch to TrueBB, otherwise branch to EndBB.
	BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
	BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
	MBB->addSuccessor(TrueBB);
	MBB->addSuccessor(EndBB);

	// TrueBB falls through to the end.
	TrueBB->addSuccessor(EndBB);

	// If the pseudo did not kill NZCV, record NZCV as live into both new blocks.
	if (!NZCVKilled) {
	TrueBB->addLiveIn(AArch64::NZCV);
	EndBB->addLiveIn(AArch64::NZCV);
	}

	// Select the result at the top of EndBB depending on the predecessor taken.
	BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
	.addReg(IfTrueReg)
	.addMBB(TrueBB)
	.addReg(IfFalseReg)
	.addMBB(MBB);

	MI.eraseFromParent();
	return EndBB;
	}

	/// Custom-inserter handler for the CATCHRET pseudo. Neither \p MI nor \p BB
	/// is modified and \p BB is returned as-is; the only work is an assertion
	/// that the function's personality is not an asynchronous one.
	MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
	MachineInstr &MI, MachineBasicBlock *BB) const {
	// Sanity check only; the whole expression is compiled out when assertions
	// are disabled.
	assert(!isAsynchronousEHPersonality(classifyEHPersonality(
	BB->getParent()->getFunction().getPersonalityFn())) &&
	"SEH does not use catchret!");
	return BB;
	}

	/// Custom-inserter handler for the CATCHPAD pseudo: the pseudo is deleted and
	/// the containing block \p BB is returned unchanged.
	MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	  MachineBasicBlock *const ContinueBB = BB;
	  MI.eraseFromParent();
	  return ContinueBB;
	}

	/// Dispatch a pseudo-instruction that requested custom insertion to its
	/// dedicated expansion routine and return the block that routine reports.
	///
	/// Any opcode without a handler below is a programming error: the
	/// instruction is dumped (in builds without NDEBUG) and llvm_unreachable is
	/// hit.
	MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	  switch (MI.getOpcode()) {
	  case AArch64::F128CSEL:
	    return EmitF128CSEL(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case AArch64::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);

	  case AArch64::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);

	  default:
	    break;
	  }

	  // No handler matched.
	#ifndef NDEBUG
	  MI.dump();
	#endif
	  llvm_unreachable("Unexpected instruction for custom inserter!");
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Lowering private implementation.
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Lowering Code
	//===----------------------------------------------------------------------===//

	/// changeIntCCToAArch64CC - Translate a DAG integer condition code into the
	/// AArch64 condition code that tests it. Only the ten codes listed below are
	/// accepted; anything else hits llvm_unreachable.
	static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:  return AArch64CC::EQ;
	  case ISD::SETNE:  return AArch64CC::NE;
	  // Signed orderings.
	  case ISD::SETLT:  return AArch64CC::LT;
	  case ISD::SETLE:  return AArch64CC::LE;
	  case ISD::SETGT:  return AArch64CC::GT;
	  case ISD::SETGE:  return AArch64CC::GE;
	  // Unsigned orderings.
	  case ISD::SETULT: return AArch64CC::LO;
	  case ISD::SETULE: return AArch64CC::LS;
	  case ISD::SETUGT: return AArch64CC::HI;
	  case ISD::SETUGE: return AArch64CC::HS;
	  default:
	    break;
	  }
	  llvm_unreachable("Unknown condition code!");
	}

	/// changeFPCCToAArch64CC - Translate a DAG floating-point condition code into
	/// AArch64 condition codes.
	///
	/// \p CondCode always receives a condition. \p CondCode2 is AArch64CC::AL
	/// when that single condition is enough; for SETONE and SETUEQ it receives a
	/// second condition (the variant changeFPCCToANDAArch64CC documents this
	/// pair as one to be OR'ed). Unlisted codes hit llvm_unreachable.
	static void changeFPCCToAArch64CC(ISD::CondCode CC,
	                                  AArch64CC::CondCode &CondCode,
	                                  AArch64CC::CondCode &CondCode2) {
	  // Assume one condition suffices until a case says otherwise.
	  CondCode2 = AArch64CC::AL;

	  switch (CC) {
	  // Ordered (and "don't care") predicates.
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    CondCode = AArch64CC::EQ;
	    return;
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    CondCode = AArch64CC::GT;
	    return;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    CondCode = AArch64CC::GE;
	    return;
	  case ISD::SETOLT:
	    CondCode = AArch64CC::MI;
	    return;
	  case ISD::SETOLE:
	    CondCode = AArch64CC::LS;
	    return;
	  case ISD::SETONE:
	    // Needs two tests.
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GT;
	    return;

	  // Orderedness tests.
	  case ISD::SETO:
	    CondCode = AArch64CC::VC;
	    return;
	  case ISD::SETUO:
	    CondCode = AArch64CC::VS;
	    return;

	  // Unordered (and remaining "don't care") predicates.
	  case ISD::SETUEQ:
	    // Needs two tests.
	    CondCode = AArch64CC::EQ;
	    CondCode2 = AArch64CC::VS;
	    return;
	  case ISD::SETUGT:
	    CondCode = AArch64CC::HI;
	    return;
	  case ISD::SETUGE:
	    CondCode = AArch64CC::PL;
	    return;
	  case ISD::SETLT:
	  case ISD::SETULT:
	    CondCode = AArch64CC::LT;
	    return;
	  case ISD::SETLE:
	  case ISD::SETULE:
	    CondCode = AArch64CC::LE;
	    return;
	  case ISD::SETNE:
	  case ISD::SETUNE:
	    CondCode = AArch64CC::NE;
	    return;

	  default:
	    break;
	  }
	  llvm_unreachable("Unknown FP condition!");
	}

	/// Translate a DAG floating-point condition code into AArch64 condition
	/// codes whose results are meant to be AND'ed together.
	///
	/// This is the counterpart of changeFPCCToAArch64CC, whose two-condition
	/// results are meant to be OR'ed. Only SETONE and SETUEQ need two conditions;
	/// every other code is delegated to changeFPCCToAArch64CC and leaves
	/// \p CondCode2 as AArch64CC::AL.
	static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
	                                     AArch64CC::CondCode &CondCode,
	                                     AArch64CC::CondCode &CondCode2) {
	  CondCode2 = AArch64CC::AL;

	  if (CC == ISD::SETONE) {
	    // (a one b)
	    //   == ((a olt b) or (a ogt b))
	    //   == ((a ord b) and (a une b))
	    CondCode = AArch64CC::VC;
	    CondCode2 = AArch64CC::NE;
	    return;
	  }

	  if (CC == ISD::SETUEQ) {
	    // (a ueq b)
	    //   == ((a uno b) or (a oeq b))
	    //   == ((a ule b) and (a uge b))
	    CondCode = AArch64CC::PL;
	    CondCode2 = AArch64CC::LE;
	    return;
	  }

	  // Everything else maps to a single condition, so the OR form is reusable.
	  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	  assert(CondCode2 == AArch64CC::AL);
	}

	/// changeVectorFPCCToAArch64CC - Translate a DAG floating-point condition
	/// code into AArch64 condition codes usable with the vector compare
	/// instructions. Fewer operations are available without a real NZCV
	/// register, so some predicates are expressed through less efficient
	/// combinations.
	///
	/// \p Invert is set to true when the conditions returned describe the
	/// opposite predicate and the caller has to invert the result.
	static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
	                                        AArch64CC::CondCode &CondCode,
	                                        AArch64CC::CondCode &CondCode2,
	                                        bool &Invert) {
	  switch (CC) {
	  case ISD::SETO:
	  case ISD::SETUO:
	    // "Ordered" is tested with the MI/GE pair; "unordered" is its inverse.
	    Invert = (CC == ISD::SETUO);
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GE;
	    return;

	  case ISD::SETUEQ:
	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    // All of the compare-mask comparisons are ordered, but we can switch
	    // between the two by a double inversion. E.g. ULE == !OGT.
	    Invert = true;
	    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
	    return;

	  default:
	    // Mostly the scalar mappings work fine.
	    Invert = false;
	    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	    return;
	  }
	}

	/// Return true if \p C is accepted as an arithmetic immediate: either it
	/// fits entirely in the low 12 bits, or its low 12 bits are zero and it fits
	/// in 24 bits. Matches AArch64DAGToDAGISel::SelectArithImmed(). The verdict
	/// is also written to the debug stream.
	static bool isLegalArithImmed(uint64_t C) {
	  const bool FitsIn12Bits = (C >> 12) == 0;
	  const bool FitsShiftedBy12 = (C & 0xFFFULL) == 0 && (C >> 24) == 0;
	  const bool IsLegal = FitsIn12Bits || FitsShiftedBy12;
	  LLVM_DEBUG(dbgs() << "Is imm " << C
	                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
	  return IsLegal;
	}

	// Decide whether a compare whose operand \p Op is (sub 0, op2) may be emitted
	// as a CMN, on the grounds that "op1 - (-op2) == op1 + op2".
	//
	// That rewrite is not always valid, because the C and V flags can be set
	// differently by the addition. It comes down to whether
	// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt); if so everything
	// is fine, otherwise the optimization is wrong. General comparisons are
	// therefore only valid when op2 != 0.
	//
	// The only LLVM-native comparisons that do not mention C and V are SETEQ and
	// SETNE, so in the absence of information about op2 those are the only ones
	// for which CMN is used.
	static bool isCMN(SDValue Op, ISD::CondCode CC) {
	  // Anything but an equality test depends on C or V.
	  if (CC != ISD::SETEQ && CC != ISD::SETNE)
	    return false;
	  // The operand has to be a negation, i.e. a subtraction from constant zero.
	  if (Op.getOpcode() != ISD::SUB)
	    return false;
	  return isNullConstant(Op.getOperand(0));
	}

	/// Build the node that compares \p LHS with \p RHS and return its flags
	/// value.
	///
	/// Floating-point operands produce an AArch64ISD::FCMP; f16 operands are
	/// first extended to f32 when the subtarget lacks full FP16 support. Integer
	/// operands produce SUBS, or ADDS / ANDS when the operands match the CMN /
	/// TST patterns below; result number 1 of that node (the flags) is returned.
	static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                              const SDLoc &dl, SelectionDAG &DAG) {
	  EVT VT = LHS.getValueType();
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  const bool HasFullFP16 = ST.hasFullFP16();

	  if (VT.isFloatingPoint()) {
	    assert(VT != MVT::f128);
	    const bool NeedsPromotion = VT == MVT::f16 && !HasFullFP16;
	    if (NeedsPromotion) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	      VT = MVT::f32;
	    }
	    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
	  }

	  // The CMP instruction is just an alias for SUBS, and representing it as
	  // SUBS means that it's possible to get CSE with subtract operations.
	  // A later phase can perform the optimization of setting the destination
	  // register to WZR/XZR if it ends up being unused.
	  unsigned Opcode = AArch64ISD::SUBS;
	  SDValue CmpLHS = LHS;
	  SDValue CmpRHS = RHS;

	  if (isCMN(RHS, CC)) {
	    // (CMP op1, (sub 0, op2)) becomes a CMN of op1 and op2.
	    Opcode = AArch64ISD::ADDS;
	    CmpRHS = RHS.getOperand(1);
	  } else if (isCMN(LHS, CC)) {
	    // isCMN only accepts EQ/NE compares, for which the operands commute, so
	    // (CMP (sub 0, op1), op2) becomes a CMN as well.
	    Opcode = AArch64ISD::ADDS;
	    CmpLHS = LHS.getOperand(1);
	  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
	             !isUnsignedIntSetCC(CC)) {
	    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST (a.k.a.
	    // ANDS); this is only done when CC is not an unsigned predicate.
	    Opcode = AArch64ISD::ANDS;
	    CmpLHS = LHS.getOperand(0);
	    CmpRHS = LHS.getOperand(1);
	  }

	  SDValue Cmp =
	      DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), CmpLHS, CmpRHS);
	  return Cmp.getValue(1);
	}

	/// \defgroup AArch64CCMP CMP;CCMP matching
	///
	/// These functions deal with the formation of CMP;CCMP;... sequences.
	/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
	/// a comparison. They set the NZCV flags to a predefined value if their
	/// predicate is false. This allows to express arbitrary conjunctions, for
	/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
	/// expressed as:
	/// cmp A
	/// ccmp B, inv(CB), CA
	/// check for CB flags
	///
	/// This naturally lets us implement chains of AND operations with SETCC
	/// operands. And we can even implement some other situations by transforming
	/// them:
	/// - We can implement (NEG SETCC), i.e. negating a single comparison, by
	/// negating the flags used in a CCMP/FCCMP operation.
	/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
	/// by negating the flags we test for afterwards, i.e.
	/// NEG (CMP CCMP CCMP ...) can be implemented.
	/// - Note that we can only ever negate all previously processed results.
	/// What we can not implement by flipping the flags to test is a negation
	/// of two sub-trees (because the negation affects all sub-trees emitted so
	/// far, so the 2nd sub-tree we emit would also affect the first).
	/// With those tools we can implement some OR operations:
	/// - (OR (SETCC A) (SETCC B)) can be implemented via:
	/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
	/// - After transforming OR to NEG/AND combinations we may be able to use NEG
	/// elimination rules from earlier to implement the whole thing as a
	/// CCMP/FCCMP chain.
	///
	/// As a complete example:
	/// or (or (setCA (cmp A)) (setCB (cmp B)))
	/// (and (setCC (cmp C)) (setCD (cmp D)))
	/// can be reassociated to:
	/// or (and (setCC (cmp C)) (setCD (cmp D)))
	/// (or (setCA (cmp A)) (setCB (cmp B)))
	/// can be transformed to:
	/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
	/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
	/// which can be implemented as:
	/// cmp C
	/// ccmp D, inv(CD), CC
	/// ccmp A, CA, inv(CD)
	/// ccmp B, CB, inv(CA)
	/// check for CB flags
	///
	/// A counterexample is "or (and A B) (and C D)" which translates to
	/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
	/// can only implement 1 of the inner (not) operations, but not both!
	/// @{

	/// Create a conditional comparison node; uses FCCMP, CCMN or CCMP as
	/// appropriate.
	///
	/// \param CCOp      Flags value the new node is chained onto.
	/// \param Predicate Condition, evaluated on \p CCOp, passed as the node's
	///                  condition operand.
	/// \param OutCC     Condition that will later be tested on the result; the
	///                  NZCV immediate operand is chosen to satisfy the inverse
	///                  of \p OutCC.
	static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
	                                         ISD::CondCode CC, SDValue CCOp,
	                                         AArch64CC::CondCode Predicate,
	                                         AArch64CC::CondCode OutCC,
	                                         const SDLoc &DL, SelectionDAG &DAG) {
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  const bool HasFullFP16 = ST.hasFullFP16();

	  // Plain CCMP unless one of the special forms below applies.
	  unsigned Opcode = AArch64ISD::CCMP;

	  if (LHS.getValueType().isFloatingPoint()) {
	    assert(LHS.getValueType() != MVT::f128);
	    if (LHS.getValueType() == MVT::f16 && !HasFullFP16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
	    }
	    Opcode = AArch64ISD::FCCMP;
	  } else if (isCMN(RHS, CC)) {
	    // RHS is (sub 0, X) and CC is SETEQ/SETNE: compare against X with CCMN.
	    // See isCMN() on why this is only done for SETEQ and SETNE.
	    Opcode = AArch64ISD::CCMN;
	    RHS = RHS.getOperand(1);
	  }

	  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
	  const unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(
	      AArch64CC::getInvertedCondCode(OutCC));
	  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
	  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
	}

	/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
	/// expressed as a conjunction. See \ref AArch64CCMP.
	/// \param CanNegate   Set to true if the whole sub-tree can be negated just
	///                    by changing the conditions on the SETCC tests (i.e.
	///                    emitConjunctionRec() may be called with Negate==true
	///                    on this sub-tree).
	/// \param MustBeFirst Set to true if this sub-tree needs to be negated and
	///                    the negation cannot be done naturally; such a sub-tree
	///                    has to be emitted first.
	/// \param WillNegate  True if the caller is going to negate the result of
	///                    this sub-expression, which happens when the enclosing
	///                    expression is an OR. This identifies a double negation
	///                    (or (or ...) ...) that can be implemented for free.
	/// \param Depth       Current recursion depth.
	/// The two output parameters are only written when true is returned.
	static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
	                               bool &MustBeFirst, bool WillNegate,
	                               unsigned Depth = 0) {
	  if (!Val.hasOneUse())
	    return false;

	  const unsigned Opcode = Val->getOpcode();

	  // Leaf: a single comparison (f128 comparisons are not accepted).
	  if (Opcode == ISD::SETCC) {
	    if (Val->getOperand(0).getValueType() == MVT::f128)
	      return false;
	    CanNegate = true;
	    MustBeFirst = false;
	    return true;
	  }

	  // Protect against exponential runtime and stack overflow.
	  if (Depth > 6)
	    return false;

	  const bool IsOR = Opcode == ISD::OR;
	  if (!IsOR && Opcode != ISD::AND)
	    return false;

	  // Both operands have to be expressible themselves. Operands of an OR are
	  // told that their result will be negated.
	  bool LeftCanNegate;
	  bool LeftMustBeFirst;
	  if (!canEmitConjunction(Val->getOperand(0), LeftCanNegate, LeftMustBeFirst,
	                          IsOR, Depth + 1))
	    return false;
	  bool RightCanNegate;
	  bool RightMustBeFirst;
	  if (!canEmitConjunction(Val->getOperand(1), RightCanNegate,
	                          RightMustBeFirst, IsOR, Depth + 1))
	    return false;

	  // Only one sub-tree can be emitted first.
	  if (LeftMustBeFirst && RightMustBeFirst)
	    return false;

	  if (!IsOR) {
	    assert(Opcode == ISD::AND && "Must be OR or AND");
	    // We cannot naturally negate an AND operation.
	    CanNegate = false;
	    MustBeFirst = LeftMustBeFirst || RightMustBeFirst;
	    return true;
	  }

	  // For an OR expression we need to be able to naturally negate at least one
	  // side or we cannot do the transformation at all.
	  if (!LeftCanNegate && !RightCanNegate)
	    return false;
	  // If the result of the OR will be negated and both operands negate
	  // naturally, then this sub-tree as a whole negates naturally.
	  CanNegate = WillNegate && LeftCanNegate && RightCanNegate;
	  // If we cannot naturally negate the whole sub-tree, then it must be
	  // emitted first.
	  MustBeFirst = !CanNegate;
	  return true;
	}

	/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
	/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
	/// Tries to transform the given i1 producing node @p Val to a series of
	/// compare and conditional compare operations. @returns an NZCV flags
	/// producing node and sets @p OutCC to the flags that should be tested.
	/// The tree is expected to have been accepted by canEmitConjunction(); this
	/// is asserted for the operands of every AND/OR node visited.
	/// \p Negate is true if we want this sub-tree being negated just by changing
	/// SETCC conditions.
	/// \p CCOp is the flags value produced by the part of the chain emitted so
	/// far (an empty SDValue when nothing has been emitted yet), and \p Predicate
	/// is the condition on \p CCOp under which the next comparison is performed.
	static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
	AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
	AArch64CC::CondCode Predicate) {
	// We're at a tree leaf, produce a conditional comparison operation.
	unsigned Opcode = Val->getOpcode();
	if (Opcode == ISD::SETCC) {
	SDValue LHS = Val->getOperand(0);
	SDValue RHS = Val->getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
	bool isInteger = LHS.getValueType().isInteger();
	// A requested negation is folded into the condition code itself.
	if (Negate)
	CC = getSetCCInverse(CC, isInteger);
	SDLoc DL(Val);
	// Determine OutCC and handle FP special case.
	if (isInteger) {
	OutCC = changeIntCCToAArch64CC(CC);
	} else {
	assert(LHS.getValueType().isFloatingPoint());
	AArch64CC::CondCode ExtraCC;
	changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
	// Some floating point conditions can't be tested with a single condition
	// code. Construct an additional comparison in this case.
	if (ExtraCC != AArch64CC::AL) {
	SDValue ExtraCmp;
	if (!CCOp.getNode())
	ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
	else
	ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
	ExtraCC, DL, DAG);
	// The comparison emitted below is chained onto the extra one and is
	// predicated on ExtraCC.
	CCOp = ExtraCmp;
	Predicate = ExtraCC;
	}
	}

	// Produce a normal comparison if we are first in the chain
	if (!CCOp)
	return emitComparison(LHS, RHS, CC, DL, DAG);
	// Otherwise produce a ccmp.
	return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
	DAG);
	}
	assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

	bool IsOR = Opcode == ISD::OR;

	// Re-query both operands to learn which of them can be negated naturally
	// and which one (if any) has to be emitted first.
	SDValue LHS = Val->getOperand(0);
	bool CanNegateL;
	bool MustBeFirstL;
	bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
	assert(ValidL && "Valid conjunction/disjunction tree");
	(void)ValidL;

	SDValue RHS = Val->getOperand(1);
	bool CanNegateR;
	bool MustBeFirstR;
	bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
	assert(ValidR && "Valid conjunction/disjunction tree");
	(void)ValidR;

	// Swap sub-tree that must come first to the right side.
	// (The right sub-tree is the one emitted first below.)
	if (MustBeFirstL) {
	assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
	std::swap(LHS, RHS);
	std::swap(CanNegateL, CanNegateR);
	std::swap(MustBeFirstL, MustBeFirstR);
	}

	// NegateR / NegateL: negate that sub-tree by flipping its SETCC conditions.
	// NegateAfterR: invert the condition code obtained from the right sub-tree.
	// NegateAfterAll: invert the final condition code reported in OutCC.
	bool NegateR;
	bool NegateAfterR;
	bool NegateL;
	bool NegateAfterAll;
	if (Opcode == ISD::OR) {
	// Swap the sub-tree that we can negate naturally to the left.
	if (!CanNegateL) {
	assert(CanNegateR && "at least one side must be negatable");
	assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
	assert(!Negate);
	std::swap(LHS, RHS);
	NegateR = false;
	NegateAfterR = true;
	} else {
	// Negate the right sub-tree if possible, otherwise negate its result.
	NegateR = CanNegateR;
	NegateAfterR = !CanNegateR;
	}
	NegateL = true;
	NegateAfterAll = !Negate;
	} else {
	assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
	assert(!Negate && "Valid conjunction/disjunction tree");

	NegateL = false;
	NegateR = false;
	NegateAfterR = false;
	NegateAfterAll = false;
	}

	// Emit sub-trees.
	// The right sub-tree goes first; the left one is then chained onto its flags
	// and predicated on the (possibly inverted) right-hand condition.
	AArch64CC::CondCode RHSCC;
	SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
	if (NegateAfterR)
	RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
	SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
	if (NegateAfterAll)
	OutCC = AArch64CC::getInvertedCondCode(OutCC);
	return CmpL;
	}

	/// Emit the expression \p Val as a conjunction (a series of CCMP/FCCMP ops).
	/// In some cases this is even possible with OR operations in the expression.
	/// See \ref AArch64CCMP.
	///
	/// \returns the flags-producing node, with \p OutCC set to the condition to
	/// test, or an empty SDValue when canEmitConjunction() rejects \p Val.
	/// \see emitConjunctionRec().
	static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
	                               AArch64CC::CondCode &OutCC) {
	  // The negation properties of the root are of no interest here.
	  bool IgnoredCanNegate;
	  bool IgnoredMustBeFirst;
	  const bool Representable = canEmitConjunction(
	      Val, IgnoredCanNegate, IgnoredMustBeFirst, /*WillNegate=*/false);
	  if (Representable)
	    return emitConjunctionRec(DAG, Val, OutCC, /*Negate=*/false,
	                              /*CCOp=*/SDValue(),
	                              /*Predicate=*/AArch64CC::AL);
	  return SDValue();
	}

	/// @}

	/// Returns how profitable it is to fold a comparison operand's shift and/or
	/// extension operations into the comparison.
	///
	/// The score is 0 when \p Op has more than one use or nothing is foldable,
	/// 1 for a lone supported extend or a constant shift, and 2 for a supported
	/// extend shifted by a constant of at most 4.
	static unsigned getCmpOperandFoldingProfit(SDValue Op) {
	  // A "supported extend" is a SIGN_EXTEND_INREG, or an AND with a constant
	  // mask of 0xFF, 0xFFFF or 0xFFFFFFFF.
	  auto IsFoldableExtend = [](SDValue V) -> bool {
	    switch (V.getOpcode()) {
	    case ISD::SIGN_EXTEND_INREG:
	      return true;
	    case ISD::AND: {
	      const auto *MaskNode = dyn_cast<ConstantSDNode>(V.getOperand(1));
	      if (!MaskNode)
	        return false;
	      switch (MaskNode->getZExtValue()) {
	      case 0xFF:
	      case 0xFFFF:
	      case 0xFFFFFFFF:
	        return true;
	      default:
	        return false;
	      }
	    }
	    default:
	      return false;
	    }
	  };

	  if (!Op.hasOneUse())
	    return 0;

	  if (IsFoldableExtend(Op))
	    return 1;

	  // Beyond this point only shifts by a constant amount are of interest.
	  switch (Op.getOpcode()) {
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SRA:
	    break;
	  default:
	    return 0;
	  }
	  const auto *ShiftNode = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!ShiftNode)
	    return 0;

	  const uint64_t ShiftAmt = ShiftNode->getZExtValue();
	  // A shifted extend scores higher when the shift amount is at most 4.
	  if (IsFoldableExtend(Op.getOperand(0)))
	    return ShiftAmt <= 4 ? 2 : 1;

	  // A plain shift counts when the amount is in range for the value type.
	  const EVT VT = Op.getValueType();
	  if (VT == MVT::i32)
	    return ShiftAmt <= 31 ? 1 : 0;
	  if (VT == MVT::i64)
	    return ShiftAmt <= 63 ? 1 : 0;
	  return 0;
	}

	/// Build the flag-setting comparison for "LHS CC RHS" and report through
	/// \p AArch64cc (as an MVT_CC constant) the AArch64 condition code that has
	/// to be tested on the returned flags value.
	///
	/// Before emitting the comparison the function may:
	///  - adjust a constant RHS by one (changing CC accordingly) so that it
	///    becomes a legal arithmetic immediate,
	///  - swap the operands when the LHS has foldable shift/extend operations,
	///  - for EQ/NE against a constant, compare a sign-extended 16-bit load
	///    against a negative constant, or turn a comparison of an AND/OR/SETCC
	///    tree against 0 or 1 into a conditional-compare chain via
	///    emitConjunction().
	static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	SDValue &AArch64cc, SelectionDAG &DAG,
	const SDLoc &dl) {
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
	EVT VT = RHS.getValueType();
	uint64_t C = RHSC->getZExtValue();
	if (!isLegalArithImmed(C)) {
	// Constant does not fit, try adjusting it by one?
	// Each case rewrites "x CC C" into the equivalent "x CC' (C -/+ 1)" and
	// excludes the value for which the -/+ 1 would wrap around.
	switch (CC) {
	default:
	break;
	case ISD::SETLT:
	case ISD::SETGE:
	// NOTE(review): the i64 arm guards against 0x80000000ULL, which is the
	// 32-bit rather than the 64-bit signed minimum. This looks harmless,
	// since C - 1 is not a legal arithmetic immediate for either value, but
	// the intent should be confirmed.
	if ((VT == MVT::i32 && C != 0x80000000 &&
	isLegalArithImmed((uint32_t)(C - 1))) \|\|
	(VT == MVT::i64 && C != 0x80000000ULL &&
	isLegalArithImmed(C - 1ULL))) {
	CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
	C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	RHS = DAG.getConstant(C, dl, VT);
	}
	break;
	case ISD::SETULT:
	case ISD::SETUGE:
	if ((VT == MVT::i32 && C != 0 &&
	isLegalArithImmed((uint32_t)(C - 1))) \|\|
	(VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
	CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
	C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	RHS = DAG.getConstant(C, dl, VT);
	}
	break;
	case ISD::SETLE:
	case ISD::SETGT:
	if ((VT == MVT::i32 && C != INT32_MAX &&
	isLegalArithImmed((uint32_t)(C + 1))) \|\|
	(VT == MVT::i64 && C != INT64_MAX &&
	isLegalArithImmed(C + 1ULL))) {
	CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
	C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	RHS = DAG.getConstant(C, dl, VT);
	}
	break;
	case ISD::SETULE:
	case ISD::SETUGT:
	if ((VT == MVT::i32 && C != UINT32_MAX &&
	isLegalArithImmed((uint32_t)(C + 1))) \|\|
	(VT == MVT::i64 && C != UINT64_MAX &&
	isLegalArithImmed(C + 1ULL))) {
	CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	RHS = DAG.getConstant(C, dl, VT);
	}
	break;
	}
	}
	}

	// Comparisons are canonicalized so that the RHS operand is simpler than the
	// LHS one, the extreme case being when RHS is an immediate. However, AArch64
	// can fold some shift+extend operations on the RHS operand, so swap the
	// operands if that can be done.
	//
	// For example:
	// lsl w13, w11, #1
	// cmp w13, w12
	// can be turned into:
	// cmp w12, w11, lsl #1
	if (!isa<ConstantSDNode>(RHS) \|\|
	!isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
	// When the LHS is a negation usable for CMN, score the negated operand.
	SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;

	if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
	std::swap(LHS, RHS);
	CC = ISD::getSetCCSwappedOperands(CC);
	}
	}

	// Cmp stays empty until one of the special cases below, or the generic
	// fallback at the end, produces the comparison.
	SDValue Cmp;
	AArch64CC::CondCode AArch64CC;
	if ((CC == ISD::SETEQ \|\| CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
	const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

	// The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
	// For the i8 operand, the largest immediate is 255, so this can be easily
	// encoded in the compare instruction. For the i16 operand, however, the
	// largest immediate cannot be encoded in the compare.
	// Therefore, use a sign extending load and cmn to avoid materializing the
	// -1 constant. For example,
	// movz w1, #65535
	// ldrh w0, [x0, #0]
	// cmp w0, w1
	// >
	// ldrsh w0, [x0, #0]
	// cmn w0, #1
	// Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
	// if and only if (sext LHS) == (sext RHS). The checks are in place to
	// ensure both the LHS and RHS are truly zero extended and to make sure the
	// transformation is profitable.
	if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
	cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
	cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
	LHS.getNode()->hasNUsesOfValue(1, 0)) {
	// Reinterpret the low 16 bits of the constant as a signed value.
	int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
	if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
	SDValue SExt =
	DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
	DAG.getValueType(MVT::i16));
	Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
	RHS.getValueType()),
	CC, dl, DAG);
	AArch64CC = changeIntCCToAArch64CC(CC);
	}
	}

	// Comparing against 0 or 1: try to emit the LHS as a conditional-compare
	// chain. The resulting condition is inverted for "!= 1" and "== 0".
	if (!Cmp && (RHSC->isNullValue() \|\| RHSC->isOne())) {
	if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
	if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
	AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
	}
	}
	}

	// Generic fallback: a plain comparison tested with the direct mapping of CC.
	if (!Cmp) {
	Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	AArch64CC = changeIntCCToAArch64CC(CC);
	}
	AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
	return Cmp;
	}

	/// Build the AArch64 node(s) implementing an overflow-checking arithmetic op.
	///
	/// \param CC  Set to the AArch64 condition code under which the operation
	///            overflowed (VS/HS/LO for add/sub, NE for multiplies).
	/// \param Op  An ISD::SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO node producing i32
	///            or i64.
	/// \param DAG The DAG the new nodes are created in.
	/// \returns   {Value, Overflow}: the arithmetic result and the flags result
	///            (result #1 of an ADDS/SUBS node) that \p CC is tested against.
	static std::pair<SDValue, SDValue>
	getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
	         "Unsupported value type");
	  SDValue Value, Overflow;
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // Stays 0 for the multiply cases, which build Value/Overflow themselves.
	  unsigned Opc = 0;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown overflow instruction!");
	  case ISD::SADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::UADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::HS;
	    break;
	  case ISD::SSUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::USUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::LO;
	    break;
	  // Multiply needs a little bit extra work.
	  case ISD::SMULO:
	  case ISD::UMULO: {
	    CC = AArch64CC::NE;
	    bool IsSigned = Op.getOpcode() == ISD::SMULO;
	    if (Op.getValueType() == MVT::i32) {
	      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	      // For a 32 bit multiply with overflow check we want the instruction
	      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
	      // need to generate the following pattern:
	      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
	      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
	      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
	                                DAG.getConstant(0, DL, MVT::i64));
	      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
	      // operation. We need to clear out the upper 32 bits, because we used a
	      // widening multiply that wrote all 64 bits. In the end this should be a
	      // noop.
	      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
	      if (IsSigned) {
	        // The signed overflow check requires more than just a simple check for
	        // any bit set in the upper 32 bits of the result. These bits could be
	        // just the sign bits of a negative number. To perform the overflow
	        // check we have to arithmetic shift right the 32nd bit of the result by
	        // 31 bits. Then we compare the result to the upper 32 bits.
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
	        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
	                                        DAG.getConstant(31, DL, MVT::i64));
	        // It is important that LowerBits is last, otherwise the arithmetic
	        // shift will not be folded into the compare (SUBS).
	        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
	        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                       .getValue(1);
	      } else {
	        // The overflow check for unsigned multiply is easy. We only need to
	        // check if any of the upper 32 bits are set. This can be done with a
	        // CMP (shifted register). For that we need to generate the following
	        // pattern:
	        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	        Overflow =
	            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                        DAG.getConstant(0, DL, MVT::i64),
	                        UpperBits).getValue(1);
	      }
	      break;
	    }
	    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
	    // For the 64 bit multiply
	    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	    if (IsSigned) {
	      // Compare the high half of the product (MULHS) with the sign bits of
	      // the low half; they differ exactly when the product overflowed.
	      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
	      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
	                                      DAG.getConstant(63, DL, MVT::i64));
	      // It is important that LowerBits is last, otherwise the arithmetic
	      // shift will not be folded into the compare (SUBS).
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                     .getValue(1);
	    } else {
	      // Unsigned: any non-zero bit in the high half (MULHU) means overflow.
	      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow =
	          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                      DAG.getConstant(0, DL, MVT::i64),
	                      UpperBits).getValue(1);
	    }
	    break;
	  }
	  } // switch (...)

	  if (Opc) {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

	    // Emit the AArch64 operation with overflow check.
	    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
	    Overflow = Value.getValue(1);
	  }
	  return std::make_pair(Value, Overflow);
	}

	/// Lower \p Op to a call of the runtime routine \p Call, forwarding all of the
	/// node's operands unchanged and requesting an f128 return value.
	SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
	                                             RTLIB::Libcall Call) const {
	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  const SDLoc DL(Op);
	  // Only the first member of the pair returned by makeLibCall is handed back.
	  return makeLibCall(DAG, Call, MVT::f128, CallArgs, false, DL).first;
	}

	// Returns true if the given Op is the overflow flag result (result number 1)
	// of one of the overflow intrinsic operations
	// SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO.
	static bool isOverflowIntrOpRes(SDValue Op) {
	  // Result 0 is the arithmetic value; only result 1 is the overflow flag.
	  if (Op.getResNo() != 1)
	    return false;

	  switch (Op.getOpcode()) {
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Custom-lower an XOR node.
	///
	/// Two patterns are recognised:
	///  - (xor overflow_flag, 1): folded into a CSEL on the inverted condition of
	///    the overflow operation.
	///  - (xor x, (select_cc a, b, cc, 0, -1)) and its commuted form: folded into
	///    a CSEL between x and (xor x, -1).
	///
	/// \returns the replacement node; \p Op itself when no pattern matches; or an
	///          empty SDValue when the overflow op's value type is not legal.
	static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
	  SDValue Sel = Op.getOperand(0);
	  SDValue Other = Op.getOperand(1);
	  SDLoc dl(Sel);

	  // If the operand is an overflow checking operation, invert the condition
	  // code and kill the Not operation. I.e., transform:
	  // (xor (overflow_op_bool, 1))
	  // -->
	  // (csel 1, 0, invert(cc), overflow_op_bool)
	  // ... which later gets transformed to just a cset instruction with an
	  // inverted condition code, rather than a cset + eor sequence.
	  if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
	      return SDValue();

	    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
	    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
	    AArch64CC::CondCode CC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
	    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
	                       CCVal, Overflow);
	  }
	  // If neither operand is a SELECT_CC, give up.
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    std::swap(Sel, Other);
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    return Op;

	  // The folding we want to perform is:
	  // (xor x, (select_cc a, b, cc, 0, -1) )
	  // -->
	  // (csel x, (xor x, -1), cc ...)
	  //
	  // The latter will get matched to a CSINV instruction.

	  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
	  SDValue LHS = Sel.getOperand(0);
	  SDValue RHS = Sel.getOperand(1);
	  SDValue TVal = Sel.getOperand(2);
	  SDValue FVal = Sel.getOperand(3);

	  // FIXME: This could be generalized to non-integer comparisons.
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return Op;

	  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	  // The values aren't constants, this isn't the pattern we're looking for.
	  if (!CFVal || !CTVal)
	    return Op;

	  // We can commute the SELECT_CC by inverting the condition. This
	  // might be needed to make this fit into a CSINV pattern.
	  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	    std::swap(TVal, FVal);
	    std::swap(CTVal, CFVal);
	    CC = ISD::getSetCCInverse(CC, true);
	  }

	  // If the constants line up, perform the transform!
	  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	    // From here on FVal/TVal hold the CSEL inputs: x and (xor x, -1).
	    FVal = Other;
	    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
	                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

	    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
	                       CCVal, Cmp);
	  }

	  return Op;
	}

	/// Map ADDC/SUBC/ADDE/SUBE onto the flag-setting AArch64 nodes
	/// ADDS/SUBS/ADCS/SBCS. The result list is {value type, i32}; the E-variants
	/// additionally forward their third operand. Returns an empty SDValue when
	/// the value type is not yet legal so that legalization expands the node.
	static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
	  EVT ResultVT = Op.getValueType();
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);

	  // Pick the target opcode and note whether a third operand is consumed.
	  unsigned TargetOpc;
	  bool HasThirdOperand;
	  switch (Op.getOpcode()) {
	  case ISD::ADDC:
	    TargetOpc = AArch64ISD::ADDS;
	    HasThirdOperand = false;
	    break;
	  case ISD::SUBC:
	    TargetOpc = AArch64ISD::SUBS;
	    HasThirdOperand = false;
	    break;
	  case ISD::ADDE:
	    TargetOpc = AArch64ISD::ADCS;
	    HasThirdOperand = true;
	    break;
	  case ISD::SUBE:
	    TargetOpc = AArch64ISD::SBCS;
	    HasThirdOperand = true;
	    break;
	  default:
	    llvm_unreachable("Invalid code");
	  }

	  if (HasThirdOperand)
	    return DAG.getNode(TargetOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
	                       Op.getOperand(1), Op.getOperand(2));
	  return DAG.getNode(TargetOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower an overflow-checking arithmetic node into the pair
	/// (arithmetic result, i32 overflow flag), merged with MERGE_VALUES.
	/// Returns an empty SDValue when the value type is not yet legal so that
	/// legalization expands the node.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
	    return SDValue();

	  SDLoc dl(Op);

	  // Build the operation that sets the overflow or carry flag; OverflowCC
	  // receives the condition under which it overflowed.
	  AArch64CC::CondCode OverflowCC;
	  std::pair<SDValue, SDValue> Lowered = getAArch64XALUOOp(OverflowCC, Op, DAG);
	  SDValue Result = Lowered.first;
	  SDValue Flags = Lowered.second;

	  // 1 and 0 stand for true and false.
	  SDValue One = DAG.getConstant(1, dl, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);

	  // The select operands are swapped (0 first, 1 second) and the condition is
	  // inverted to match, so that this can be selected to a single instruction:
	  // CSINC Wd, WZR, WZR, invert(cond).
	  SDValue InvertedCC =
	      DAG.getConstant(getInvertedCondCode(OverflowCC), dl, MVT::i32);
	  SDValue OverflowBit =
	      DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, Zero, One, InvertedCC, Flags);

	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::i32);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTys, Result, OverflowBit);
	}

	// Prefetch operands are:
	// 1: Address to prefetch
	// 2: bool isWrite
	// 3: int locality (0 = no locality ... 3 = extreme locality)
	// 4: bool isDataCache
	//
	// The three flag operands are packed into one i32 immediate and emitted as an
	// AArch64ISD::PREFETCH node. Operand 0 is forwarded unchanged as the first
	// operand of the new node (presumably the chain, given the MVT::Other result).
	static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

	  // Locality 0 selects the streaming variant.
	  bool IsStream = !Locality;
	  // When the locality number is set
	  if (Locality) {
	    // The front-end should have filtered out the out-of-range values
	    assert(Locality <= 3 && "Prefetch locality out-of-range");
	    // The locality degree is the opposite of the cache speed.
	    // Put the number the other way around.
	    // The encoding starts at 0 for level 1
	    Locality = 3 - Locality;
	  }

	  // Build the mask value encoding the expected behavior.
	  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
	                   (!IsData << 3) |     // IsDataCache bit
	                   (Locality << 1) |    // Cache level bits
	                   (unsigned)IsStream;  // Stream bit
	  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
	                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
	}

	/// Lower an FP_EXTEND whose result is f128 to the matching runtime call.
	SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");

	  // The libcall is chosen from the (source type, destination type) pair.
	  const RTLIB::Libcall ExtendCall =
	      RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
	  return LowerF128Call(Op, DAG, ExtendCall);
	}

	/// Lower FP_ROUND. Rounding from anything other than f128 is left untouched;
	/// rounding from f128 becomes a runtime call that receives only the source
	/// value.
	SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  if (Op.getOperand(0).getValueType() != MVT::f128) {
	    // It's legal except when f128 is involved
	    return Op;
	  }

	  RTLIB::Libcall LC;
	  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());

	  // FP_ROUND node has a second operand indicating whether it is known to be
	  // precise. That doesn't take part in the LibCall so we can't directly use
	  // LowerF128Call.
	  SDValue SrcVal = Op.getOperand(0);
	  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
	                     SDLoc(Op)).first;
	}

	/// Lower a vector FP_TO_SINT/FP_TO_UINT.
	///  - f16 elements without full fp16 support: extend the source to f32 first.
	///  - result narrower than source: convert at source width, then truncate.
	///  - result wider than source: fp-extend the source, then convert.
	///  - same total width: return the node unchanged.
	SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDValue Src = Op.getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  EVT DstVT = Op.getValueType();
	  unsigned SrcElts = SrcVT.getVectorNumElements();
	  SDLoc dl(Op);

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  if (SrcVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) {
	    MVT PromotedVT = MVT::getVectorVT(MVT::f32, SrcElts);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, PromotedVT, Src);
	    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
	  }

	  // Narrowing: convert to integers of the source element width, then shrink.
	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    SDValue WideInt = DAG.getNode(
	        Op.getOpcode(), dl, SrcVT.changeVectorElementTypeToInteger(), Src);
	    return DAG.getNode(ISD::TRUNCATE, dl, DstVT, WideInt);
	  }

	  // Widening: grow the floating-point source to the result element width.
	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    MVT WideFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(DstVT.getScalarSizeInBits()),
	                         DstVT.getVectorNumElements());
	    SDValue WideFP = DAG.getNode(ISD::FP_EXTEND, dl, WideFPVT, Src);
	    return DAG.getNode(Op.getOpcode(), dl, DstVT, WideFP);
	  }

	  // Type changing conversions are illegal.
	  return Op;
	}

	/// Lower FP_TO_SINT/FP_TO_UINT. Vector sources are delegated to
	/// LowerVectorFP_TO_INT; f16 sources are promoted to f32 when full fp16 is
	/// unavailable; f128 sources become a runtime call; every other case is
	/// returned unchanged.
	SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDValue Src = Op.getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  if (SrcVT.isVector())
	    return LowerVectorFP_TO_INT(Op, DAG);

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
	    SDLoc dl(Op);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src);
	    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
	  }

	  // It's legal except when f128 is involved
	  if (SrcVT != MVT::f128)
	    return Op;

	  EVT DstVT = Op.getValueType();
	  RTLIB::Libcall LC = Op.getOpcode() == ISD::FP_TO_SINT
	                          ? RTLIB::getFPTOSINT(SrcVT, DstVT)
	                          : RTLIB::getFPTOUINT(SrcVT, DstVT);

	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  return makeLibCall(DAG, LC, DstVT, CallArgs, false, SDLoc(Op)).first;
	}

	/// Lower a vector SINT_TO_FP/UINT_TO_FP.
	///  - result narrower than source: convert at source element width, then
	///    FP_ROUND to the result type.
	///  - result wider than source: sign/zero-extend the integers to the result
	///    element width, then convert.
	///  - same total width: return the node unchanged.
	static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  EVT DstVT = Op.getValueType();
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    MVT WideFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(SrcVT.getScalarSizeInBits()),
	                         SrcVT.getVectorNumElements());
	    SDValue WideFP = DAG.getNode(Op.getOpcode(), dl, WideFPVT, Src);
	    return DAG.getNode(ISD::FP_ROUND, dl, DstVT, WideFP,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    // Signed conversions need a sign extension, unsigned ones a zero extension.
	    unsigned ExtendOpc = ISD::ZERO_EXTEND;
	    if (Op.getOpcode() == ISD::SINT_TO_FP)
	      ExtendOpc = ISD::SIGN_EXTEND;
	    EVT WideIntVT = DstVT.changeVectorElementTypeToInteger();
	    SDValue WideInt = DAG.getNode(ExtendOpc, dl, WideIntVT, Src);
	    return DAG.getNode(Op.getOpcode(), dl, DstVT, WideInt);
	  }

	  return Op;
	}

	/// Lower SINT_TO_FP/UINT_TO_FP. Vector results are delegated to
	/// LowerVectorINT_TO_FP; f16 results go through f32 when full fp16 is
	/// unavailable; i128 sources return an empty SDValue; f128 results become a
	/// runtime call; every other case is returned unchanged.
	SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  EVT DstVT = Op.getValueType();
	  if (DstVT.isVector())
	    return LowerVectorINT_TO_FP(Op, DAG);

	  // f16 conversions are promoted to f32 when full fp16 is not supported.
	  if (DstVT == MVT::f16 && !Subtarget->hasFullFP16()) {
	    SDLoc dl(Op);
	    SDValue AsF32 =
	        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0));
	    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, AsF32,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  EVT SrcVT = Op.getOperand(0).getValueType();

	  // i128 conversions are libcalls.
	  if (SrcVT == MVT::i128)
	    return SDValue();

	  // Other conversions are legal, unless it's to the completely software-based
	  // fp128.
	  if (DstVT != MVT::f128)
	    return Op;

	  RTLIB::Libcall LC = Op.getOpcode() == ISD::SINT_TO_FP
	                          ? RTLIB::getSINTTOFP(SrcVT, DstVT)
	                          : RTLIB::getUINTTOFP(SrcVT, DstVT);
	  return LowerF128Call(Op, DAG, LC);
	}

	/// Lower FSINCOS to a call of the SINCOS_STRET_F32/F64 runtime routine,
	/// which takes the single argument and returns a two-element struct of the
	/// argument's type. Only the call's value result is returned.
	SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // For iOS, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values in two S / D registers.
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  ArgListTy Args;
	  ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  // Anything that is not f64 uses the f32 entry point.
	  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
	                                        : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

	  StructType *RetTy = StructType::get(ArgTy, ArgTy);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.first;
	}

	/// Lower a bitcast to f16 (from i16): any-extend the source to i32, bitcast
	/// that to f32 and extract the hsub subregister as f16. Bitcasts to any
	/// other type return an empty SDValue.
	static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
	  if (Op.getValueType() != MVT::f16)
	    return SDValue();

	  assert(Op.getOperand(0).getValueType() == MVT::i16);
	  SDLoc DL(Op);

	  SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
	  SDValue AsF32 = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Wide);
	  SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
	  return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16,
	                                    AsF32, SubRegIdx),
	                 0);
	}

	/// Return the 64-bit vector type a narrower vector type is widened to:
	/// v2i8/v2i16 -> v2i32 and v4i8 -> v4i16. Types of 64 bits or more are
	/// returned unchanged; any other narrow type is unreachable.
	static EVT getExtensionTo64Bits(const EVT &OrigVT) {
	  if (OrigVT.getSizeInBits() >= 64)
	    return OrigVT;

	  assert(OrigVT.isSimple() && "Expecting a simple value type");

	  switch (OrigVT.getSimpleVT().SimpleTy) {
	  case MVT::v4i8:
	    return MVT::v4i16;
	  case MVT::v2i8:
	  case MVT::v2i16:
	    return MVT::v2i32;
	  default:
	    break;
	  }
	  llvm_unreachable("Unexpected Vector Type");
	}

	/// \p N originally had type \p OrigTy and was extended to the 128-bit type
	/// \p ExtTy. If \p OrigTy is narrower than 64 bits, re-extend \p N with
	/// \p ExtOpcode to the 64-bit type given by getExtensionTo64Bits; otherwise
	/// return \p N unchanged.
	static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
	                                                 const EVT &OrigTy,
	                                                 const EVT &ExtTy,
	                                                 unsigned ExtOpcode) {
	  assert(ExtTy.is128BitVector() && "Unexpected extension size");

	  // Must extend size to at least 64 bits to be used as an operand for VMULL.
	  const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
	  if (!NeedsWidening)
	    return N;

	  EVT WidenedVT = getExtensionTo64Bits(OrigTy);
	  return DAG.getNode(ExtOpcode, SDLoc(N), WidenedVT, N);
	}

	/// Return true if \p N is a BUILD_VECTOR whose operands are all constants
	/// that fit in half of the vector's element width, as signed values when
	/// \p isSigned is set and as unsigned values otherwise.
	static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
	                                   bool isSigned) {
	  EVT VT = N->getValueType(0);

	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  const unsigned HalfBits = VT.getScalarSizeInBits() / 2;
	  for (const SDValue &Operand : N->op_values()) {
	    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Operand);
	    // A single non-constant operand disqualifies the vector.
	    if (!Const)
	      return false;

	    const bool Fits = isSigned ? isIntN(HalfBits, Const->getSExtValue())
	                               : isUIntN(HalfBits, Const->getZExtValue());
	    if (!Fits)
	      return false;
	  }

	  return true;
	}

	/// Produce the un-extended operand for a vector MULL.
	///  - SIGN_EXTEND/ZERO_EXTEND: return the extension's source, re-extended to
	///    64 bits by addRequiredExtensionForVectorMULL if it is narrower.
	///  - BUILD_VECTOR of constants: rebuild it with half-width elements.
	/// Any other opcode trips the assertion.
	static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
	    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
	                                             N->getOperand(0)->getValueType(0),
	                                             N->getValueType(0),
	                                             N->getOpcode());

	  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);
	  unsigned EltSize = VT.getScalarSizeInBits() / 2;
	  unsigned NumElts = VT.getVectorNumElements();
	  MVT TruncVT = MVT::getIntegerVT(EltSize);
	  SmallVector<SDValue, 8> Ops;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
	    const APInt &CInt = C->getAPIntValue();
	    // Element types smaller than 32 bits are not legal, so use i32 elements.
	    // The values are implicitly truncated so sext vs. zext doesn't matter.
	    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
	  }
	  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
	}

	/// Return true if \p N is a SIGN_EXTEND node or a constant BUILD_VECTOR whose
	/// elements fit in half the element width as signed values.
	static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND)
	    return true;
	  return isExtendedBUILD_VECTOR(N, DAG, true);
	}

	/// Return true if \p N is a ZERO_EXTEND node or a constant BUILD_VECTOR whose
	/// elements fit in half the element width as unsigned values.
	static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::ZERO_EXTEND)
	    return true;
	  return isExtendedBUILD_VECTOR(N, DAG, false);
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a single
	/// use and are both sign-extended (see isSignExtended).
	static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return false;

	  SDNode *N0 = N->getOperand(0).getNode();
	  SDNode *N1 = N->getOperand(1).getNode();
	  if (!N0->hasOneUse() || !N1->hasOneUse())
	    return false;
	  return isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a single
	/// use and are both zero-extended (see isZeroExtended).
	static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return false;

	  SDNode *N0 = N->getOperand(0).getNode();
	  SDNode *N1 = N->getOperand(1).getNode();
	  if (!N0->hasOneUse() || !N1->hasOneUse())
	    return false;
	  return isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
	}

	/// Lower FLT_ROUNDS_ by reading the FPCR through the aarch64_get_fpcr
	/// intrinsic and computing ((FPCR + (1 << 22)) >> 22) & 3 on its low 32 bits.
	SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  // The rounding mode is in bits 23:22 of the register.
	  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
	  // Adding 1 at bit 22 before extracting the two-bit field performs that
	  // rotation, and lets the shift + and fold into a bitfield extract.
	  SDLoc dl(Op);

	  SDValue IntrinsicID =
	      DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64);
	  SDValue FPCR64 =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, IntrinsicID);
	  SDValue FPCR32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR64);

	  SDValue Bump = DAG.getConstant(1U << 22, dl, MVT::i32);
	  SDValue Bumped = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR32, Bump);

	  SDValue ShiftAmt = DAG.getConstant(22, dl, MVT::i32);
	  SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Bumped, ShiftAmt);

	  SDValue TwoBitMask = DAG.getConstant(3, dl, MVT::i32);
	  return DAG.getNode(ISD::AND, dl, MVT::i32, Shifted, TwoBitMask);
	}

	/// Custom-lower a 128-bit integer vector MUL.
	///
	/// When both operands are sign- (or both zero-) extended the multiply becomes
	/// an SMULL (or UMULL) of the un-extended operands. When one operand is an
	/// extended value and the other an ADD/SUB of two extended values, the
	/// multiply is distributed over the ADD/SUB as two S/UMULLs.
	///
	/// \returns the new node; an empty SDValue for an unmatched v2i64 multiply;
	///          \p Op itself for any other unmatched multiply.
	static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
	  // Multiplications are only custom-lowered for 128-bit vectors so that
	  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
	  EVT VT = Op.getValueType();
	  assert(VT.is128BitVector() && VT.isInteger() &&
	         "unexpected type for custom-lowering ISD::MUL");
	  SDNode *N0 = Op.getOperand(0).getNode();
	  SDNode *N1 = Op.getOperand(1).getNode();
	  unsigned NewOpc = 0;
	  bool isMLA = false;
	  bool isN0SExt = isSignExtended(N0, DAG);
	  bool isN1SExt = isSignExtended(N1, DAG);
	  if (isN0SExt && isN1SExt)
	    NewOpc = AArch64ISD::SMULL;
	  else {
	    bool isN0ZExt = isZeroExtended(N0, DAG);
	    bool isN1ZExt = isZeroExtended(N1, DAG);
	    if (isN0ZExt && isN1ZExt)
	      NewOpc = AArch64ISD::UMULL;
	    else if (isN1SExt || isN1ZExt) {
	      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
	      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
	      if (isN1SExt && isAddSubSExt(N0, DAG)) {
	        NewOpc = AArch64ISD::SMULL;
	        isMLA = true;
	      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
	        // Commute so that N0 is always the ADD/SUB operand below.
	        std::swap(N0, N1);
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      }
	    }

	    if (!NewOpc) {
	      if (VT == MVT::v2i64)
	        // Fall through to expand this. It is not legal.
	        return SDValue();
	      else
	        // Other vector multiplications are legal.
	        return Op;
	    }
	  }

	  // Legalize to a S/UMULL instruction
	  SDLoc DL(Op);
	  SDValue Op0;
	  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
	  if (!isMLA) {
	    Op0 = skipExtensionForVectorMULL(N0, DAG);
	    assert(Op0.getValueType().is64BitVector() &&
	           Op1.getValueType().is64BitVector() &&
	           "unexpected types for extended operands to VMULL");
	    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
	  }
	  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
	  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
	  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
	  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
	  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
	  EVT Op1VT = Op1.getValueType();
	  return DAG.getNode(N0->getOpcode(), DL, VT,
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
	}

	/// Custom-lower the chainless intrinsics handled here: thread_pointer, the
	/// NEON abs/smax/umax/smin/umin intrinsics, localaddress and eh_recoverfp.
	/// Every other intrinsic yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);

	  // The NEON min/max intrinsics map one-to-one onto a generic binary node.
	  auto LowerAsBinary = [&](unsigned GenericOpc) {
	    return DAG.getNode(GenericOpc, dl, Op.getValueType(), Op.getOperand(1),
	                       Op.getOperand(2));
	  };

	  switch (IntNo) {
	  default:
	    return SDValue(); // Don't custom lower most intrinsics.

	  case Intrinsic::thread_pointer:
	    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl,
	                       getPointerTy(DAG.getDataLayout()));

	  case Intrinsic::aarch64_neon_abs: {
	    EVT Ty = Op.getValueType();
	    // Scalar i64: route through v1i64 so the vector ABS node can be used.
	    if (Ty == MVT::i64) {
	      SDValue AsVec =
	          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(1));
	      SDValue Abs = DAG.getNode(ISD::ABS, dl, MVT::v1i64, AsVec);
	      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Abs);
	    }
	    if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty))
	      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
	    report_fatal_error("Unexpected type for AArch64 NEON intrinic");
	  }

	  case Intrinsic::aarch64_neon_smax:
	    return LowerAsBinary(ISD::SMAX);
	  case Intrinsic::aarch64_neon_umax:
	    return LowerAsBinary(ISD::UMAX);
	  case Intrinsic::aarch64_neon_smin:
	    return LowerAsBinary(ISD::SMIN);
	  case Intrinsic::aarch64_neon_umin:
	    return LowerAsBinary(ISD::UMIN);

	  case Intrinsic::localaddress: {
	    const auto &MF = DAG.getMachineFunction();
	    const auto *RegInfo = Subtarget->getRegisterInfo();
	    unsigned LocalAddrReg = RegInfo->getLocalAddressRegister(MF);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, LocalAddrReg,
	                              Op.getSimpleValueType());
	  }

	  case Intrinsic::eh_recoverfp: {
	    // FIXME: This needs to be implemented to correctly handle highly aligned
	    // stack objects. For now we simply return the incoming FP. Refer D53541
	    // for more details.
	    SDValue FnOp = Op.getOperand(1);
	    SDValue IncomingFPOp = Op.getOperand(2);
	    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	    if (Fn)
	      return IncomingFPOp;
	    report_fatal_error(
	        "llvm.eh.recoverfp must take a function as the first argument");
	  }
	  }
	}

	// Custom lower a truncating store of a v4i16 value to v4i8 memory (v4i8 is
	// promoted to v4i16). The value is padded to v8i16, truncated to v8i8,
	// reinterpreted as v2i32, and lane 0 is stored as a single i32.
	static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
	                                        EVT VT, EVT MemVT,
	                                        SelectionDAG &DAG) {
	  assert(VT.isVector() && "VT should be a vector type");
	  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

	  SDValue Stored = ST->getValue();

	  // The sequence below is meant to end up as:
	  //
	  // xtn v0.8b, v0.8h
	  // str s0, [x0]

	  // Pad the upper half with undef lanes; only the low four lanes are stored.
	  SDValue UndefElt = DAG.getUNDEF(MVT::i16);
	  SDValue UndefHalf = DAG.getBuildVector(
	      MVT::v4i16, DL, {UndefElt, UndefElt, UndefElt, UndefElt});
	  SDValue Padded =
	      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Stored, UndefHalf);

	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Padded);
	  SDValue AsWords = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Narrowed);

	  // Word lane 0 holds the four truncated bytes.
	  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
	  SDValue LowWord =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AsWords, LaneZero);

	  return DAG.getStore(ST->getChain(), DL, LowWord, ST->getBasePtr(),
	                      ST->getMemOperand());
	}

	// Custom lowering for vector stores. An under-aligned store that the target
	// does not allow misaligned is scalarized; a truncating store is handed to
	// LowerTruncateVectorStore (v4i16 -> v4i8); anything else returns an empty
	// SDValue.
	SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDLoc Dl(Op);
	  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
	  assert (StoreNode && "Can only custom lower store nodes");

	  EVT VT = StoreNode->getValue().getValueType();
	  EVT MemVT = StoreNode->getMemoryVT();

	  assert (VT.isVector() && "Can only custom lower vector store types");

	  unsigned AS = StoreNode->getAddressSpace();
	  unsigned Align = StoreNode->getAlignment();
	  const bool UnderAligned = Align < MemVT.getStoreSize();
	  if (UnderAligned &&
	      !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
	                                      StoreNode->getMemOperand()->getFlags(),
	                                      nullptr))
	    return scalarizeVectorStore(StoreNode, DAG);

	  if (!StoreNode->isTruncatingStore())
	    return SDValue();

	  return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
	}

	/// Entry point for custom lowering: dispatch on the node's opcode to the
	/// matching Lower* routine. An opcode without an entry here is unreachable.
	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  LLVM_DEBUG(dbgs() << "Custom lowering: ");
	  LLVM_DEBUG(Op.dump());

	  switch (Op.getOpcode()) {
	  default:
	    break;

	  // Addresses, control flow and selects.
	  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
	  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
	  case ISD::SETCC:              return LowerSETCC(Op, DAG);
	  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
	  case ISD::SELECT:             return LowerSELECT(Op, DAG);
	  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
	  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
	  case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
	  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
	  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);

	  // Varargs.
	  case ISD::VASTART:            return LowerVASTART(Op, DAG);
	  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
	  case ISD::VAARG:              return LowerVAARG(Op, DAG);

	  // Carry and overflow arithmetic.
	  case ISD::ADDC:
	  case ISD::ADDE:
	  case ISD::SUBC:
	  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:              return LowerXALUO(Op, DAG);

	  // f128 arithmetic and floating-point width changes.
	  case ISD::FADD:               return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
	  case ISD::FSUB:               return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
	  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
	  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
	  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
	  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);

	  // Frame and return address queries.
	  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
	  case ISD::SPONENTRY:          return LowerSPONENTRY(Op, DAG);
	  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
	  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);

	  // Vector construction and shuffling.
	  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
	  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
	  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);

	  // Shifts.
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::SHL:                return LowerVectorSRA_SRL_SHL(Op, DAG);
	  case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
	  case ISD::SRL_PARTS:
	  case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG);

	  // Miscellaneous scalar and vector operations.
	  case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
	  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
	  case ISD::OR:                 return LowerVectorOR(Op, DAG);
	  case ISD::XOR:                return LowerXOR(Op, DAG);
	  case ISD::PREFETCH:           return LowerPREFETCH(Op, DAG);
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
	  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
	  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
	  case ISD::MUL:                return LowerMUL(Op, DAG);
	  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	  case ISD::STORE:              return LowerSTORE(Op, DAG);

	  // Vector reductions.
	  case ISD::VECREDUCE_ADD:
	  case ISD::VECREDUCE_SMAX:
	  case ISD::VECREDUCE_SMIN:
	  case ISD::VECREDUCE_UMAX:
	  case ISD::VECREDUCE_UMIN:
	  case ISD::VECREDUCE_FMAX:
	  case ISD::VECREDUCE_FMIN:     return LowerVECREDUCE(Op, DAG);

	  // Atomics and dynamic stack allocation.
	  case ISD::ATOMIC_LOAD_SUB:    return LowerATOMIC_LOAD_SUB(Op, DAG);
	  case ISD::ATOMIC_LOAD_AND:    return LowerATOMIC_LOAD_AND(Op, DAG);
	  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	  }

	  llvm_unreachable("unimplemented operand");
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Picks the argument-assignment function (CCAssignFn) that corresponds to the
	/// calling convention \p CC. \p IsVarArg selects the variadic variant for the
	/// conventions that have one. A convention that is not listed here is
	/// reported as a fatal error.
	CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                     bool IsVarArg) const {
	  switch (CC) {
	  case CallingConv::WebKit_JS:
	    return CC_AArch64_WebKit_JS;
	  case CallingConv::GHC:
	    return CC_AArch64_GHC;
	  case CallingConv::AArch64_VectorCall:
	    return CC_AArch64_AAPCS;
	  case CallingConv::Win64:
	    if (IsVarArg)
	      return CC_AArch64_Win64_VarArg;
	    return CC_AArch64_AAPCS;
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::PreserveMost:
	  case CallingConv::CXX_FAST_TLS:
	  case CallingConv::Swift:
	    // Variadic calls on Windows have a dedicated assignment function; apart
	    // from that only Darwin departs from the plain AAPCS function.
	    if (Subtarget->isTargetWindows() && IsVarArg)
	      return CC_AArch64_Win64_VarArg;
	    if (Subtarget->isTargetDarwin())
	      return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
	    return CC_AArch64_AAPCS;
	  default:
	    break;
	  }
	  report_fatal_error("Unsupported calling convention.");
	}

	/// Returns the CCAssignFn used to assign return values: the WebKit_JS
	/// specific function for that convention, the AAPCS one for everything else.
	CCAssignFn *
	AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
	  if (CC == CallingConv::WebKit_JS)
	    return RetCC_AArch64_WebKit_JS;
	  return RetCC_AArch64_AAPCS;
	}

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Every entry of \p Ins is assigned a location with the assignment function
	/// selected for \p CallConv, and exactly one SDValue per entry is appended to
	/// \p InVals: a frame index for byval arguments, a CopyFromReg (optionally
	/// bitcast) for register arguments, or a load from a fixed stack object for
	/// memory arguments. For variadic functions the vararg bookkeeping is stored
	/// in AArch64FunctionInfo, and the size of the incoming stack-argument area
	/// is always recorded there.
	///
	/// \param Chain     incoming chain.
	/// \param CallConv  calling convention used to assign argument locations.
	/// \param isVarArg  whether the function being lowered is variadic.
	/// \param Ins       descriptions of the incoming arguments.
	/// \param DL        debug location attached to the created nodes.
	/// \param DAG       DAG in which the nodes are created.
	/// \param InVals    output: one value per element of \p Ins.
	/// \returns the chain, updated when an InReg pointer copy was emitted on
	///          Win64 (saveVarArgRegisters may also update it by reference).
	SDValue AArch64TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // At this point, Ins[].VT may already be promoted to i32. To correctly
	  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
	  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
	  // LocVT.
	  unsigned NumArgs = Ins.size();
	  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
	  unsigned CurArgIdx = 0;
	  for (unsigned i = 0; i != NumArgs; ++i) {
	    MVT ValVT = Ins[i].VT;
	    if (Ins[i].isOrigArg()) {
	      // Move the IR argument iterator forward to the argument this Ins entry
	      // was derived from.
	      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
	      CurArgIdx = Ins[i].getOrigArgIndex();

	      // Get type of the original argument.
	      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
	                                  /*AllowUnknown*/ true);
	      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
	      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16 (the same
	      // ValVT is passed below as both ValVT and LocVT).
	      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
	        ValVT = MVT::i8;
	      else if (ActualMVT == MVT::i16)
	        ValVT = MVT::i16;
	    }
	    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
	    bool Res =
	        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
	    assert(!Res && "Call operand has unhandled type");
	    (void)Res;
	  }
	  assert(ArgLocs.size() == Ins.size());
	  SmallVector<SDValue, 16> ArgValues;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];

	    if (Ins[i].Flags.isByVal()) {
	      // Byval is used for HFAs in the PCS, but the system should work in a
	      // non-compliant manner for larger structs.
	      EVT PtrVT = getPointerTy(DAG.getDataLayout());
	      int Size = Ins[i].Flags.getByValSize();
	      unsigned NumRegs = (Size + 7) / 8;

	      // FIXME: This works on big-endian for composite byvals, which are the common
	      // case. It should also work for fundamental types too.
	      unsigned FrameIdx =
	          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
	      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
	      InVals.push_back(FrameIdxN);

	      continue;
	    }

	    if (VA.isRegLoc()) {
	      // Arguments stored in registers.
	      EVT RegVT = VA.getLocVT();

	      SDValue ArgValue;
	      const TargetRegisterClass *RC;

	      if (RegVT == MVT::i32)
	        RC = &AArch64::GPR32RegClass;
	      else if (RegVT == MVT::i64)
	        RC = &AArch64::GPR64RegClass;
	      else if (RegVT == MVT::f16)
	        RC = &AArch64::FPR16RegClass;
	      else if (RegVT == MVT::f32)
	        RC = &AArch64::FPR32RegClass;
	      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
	        RC = &AArch64::FPR64RegClass;
	      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
	        RC = &AArch64::FPR128RegClass;
	      else
	        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

	      // Transform the arguments in physical registers into virtual ones.
	      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

	      // If this is an 8, 16 or 32-bit value, it is really passed promoted
	      // to 64 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default:
	        llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full:
	        break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::AExt:
	      case CCValAssign::SExt:
	      case CCValAssign::ZExt:
	        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
	        // nodes after our lowering.
	        assert(RegVT == Ins[i].VT && "incorrect register location selected");
	        break;
	      }

	      InVals.push_back(ArgValue);

	    } else { // VA.isRegLoc()
	      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
	      unsigned ArgOffset = VA.getLocMemOffset();
	      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;

	      // On big-endian subtargets, a value smaller than 8 bytes that is not
	      // part of a consecutive-register group is addressed at the high end of
	      // its 8-byte slot.
	      uint32_t BEAlign = 0;
	      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
	          !Ins[i].Flags.isInConsecutiveRegs())
	        BEAlign = 8 - ArgSize;

	      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

	      // Create load nodes to retrieve arguments from the stack.
	      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	      SDValue ArgValue;

	      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
	      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	      MVT MemVT = VA.getValVT();

	      switch (VA.getLocInfo()) {
	      default:
	        break;
	      case CCValAssign::BCvt:
	        MemVT = VA.getLocVT();
	        break;
	      case CCValAssign::SExt:
	        ExtType = ISD::SEXTLOAD;
	        break;
	      case CCValAssign::ZExt:
	        ExtType = ISD::ZEXTLOAD;
	        break;
	      case CCValAssign::AExt:
	        ExtType = ISD::EXTLOAD;
	        break;
	      }

	      ArgValue = DAG.getExtLoad(
	          ExtType, DL, VA.getLocVT(), Chain, FIN,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
	          MemVT);

	      InVals.push_back(ArgValue);
	    }
	  }

	  // varargs
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  if (isVarArg) {
	    if (!Subtarget->isTargetDarwin() || IsWin64) {
	      // The AAPCS variadic function ABI is identical to the non-variadic
	      // one. As a result there may be more arguments in registers and we should
	      // save them for future reference.
	      // Win64 variadic functions also pass arguments in registers, but all float
	      // arguments are passed in integer registers.
	      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
	    }

	    // This will point to the next argument passed via stack.
	    unsigned StackOffset = CCInfo.getNextStackOffset();
	    // We currently pass all varargs at 8-byte alignment.
	    StackOffset = ((StackOffset + 7) & ~7);
	    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));

	    if (MFI.hasMustTailInVarArgFunc()) {
	      SmallVector<MVT, 2> RegParmTypes;
	      RegParmTypes.push_back(MVT::i64);
	      RegParmTypes.push_back(MVT::f128);
	      // Compute the set of forwarded registers. The rest are scratch.
	      SmallVectorImpl<ForwardedRegister> &Forwards =
	          FuncInfo->getForwardedMustTailRegParms();
	      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
	                                               CC_AArch64_AAPCS);

	      // Conservatively forward X8, since it might be used for aggregate return.
	      if (!CCInfo.isAllocated(AArch64::X8)) {
	        unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
	        Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
	      }
	    }
	  }

	  // On Windows, InReg pointers must be returned, so record the pointer in a
	  // virtual register at the start of the function so it can be returned in the
	  // epilogue.
	  if (IsWin64) {
	    for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	      if (Ins[I].Flags.isInReg()) {
	        assert(!FuncInfo->getSRetReturnReg());

	        MVT PtrTy = getPointerTy(DAG.getDataLayout());
	        unsigned Reg =
	            MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	        FuncInfo->setSRetReturnReg(Reg);

	        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
	        Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
	        // Only the first InReg argument is recorded.
	        break;
	      }
	    }
	  }

	  unsigned StackArgSize = CCInfo.getNextStackOffset();
	  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
	    // This is a non-standard ABI so by fiat I say we're allowed to make full
	    // use of the stack area to be popped, which must be aligned to 16 bytes in
	    // any case:
	    StackArgSize = alignTo(StackArgSize, 16);

	    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
	    // a multiple of 16.
	    FuncInfo->setArgumentStackToRestore(StackArgSize);

	    // This realignment carries over to the available bytes below. Our own
	    // callers will guarantee the space is free by giving an aligned value to
	    // CALLSEQ_START.
	  }
	  // Even if we're not expected to free up the space, it's useful to know how
	  // much is there while considering tail calls (because we can reuse it).
	  FuncInfo->setBytesInStackArgArea(StackArgSize);

	  if (Subtarget->hasCustomCallingConv())
	    Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

	  return Chain;
	}

	/// Store the argument registers that \p CCInfo reports as unallocated (from
	/// the first unallocated one onwards) to the stack, and record the frame index
	/// and byte size of each save area in AArch64FunctionInfo. X0-X7 are always
	/// considered; Q0-Q7 only when the subtarget has FP/ARMv8 and the function
	/// does not use the Win64 calling convention. When at least one store was
	/// emitted, \p Chain is replaced by a TokenFactor of those stores.
	void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
	                                                SelectionDAG &DAG,
	                                                const SDLoc &DL,
	                                                SDValue &Chain) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

	  // Every register store created below; merged into one chain at the end.
	  SmallVector<SDValue, 8> SaveStores;

	  // ---- General-purpose argument registers ----
	  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
	                                          AArch64::X3, AArch64::X4, AArch64::X5,
	                                          AArch64::X6, AArch64::X7 };
	  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
	  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
	  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);

	  int GPRIdx = 0;
	  if (GPRSaveSize != 0) {
	    if (!IsWin64) {
	      GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
	    } else {
	      GPRIdx = MFI.CreateFixedObject(GPRSaveSize,
	                                     -static_cast<int>(GPRSaveSize), false);
	      // The extra size here, if triggered, will always be 8.
	      if (GPRSaveSize & 15)
	        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
	                              -static_cast<int>(alignTo(GPRSaveSize, 16)),
	                              false);
	    }

	    SDValue SlotAddr = DAG.getFrameIndex(GPRIdx, PtrVT);
	    const SDValue SlotStride = DAG.getConstant(8, DL, PtrVT);

	    for (unsigned RegNo = FirstVariadicGPR; RegNo < NumGPRArgRegs; ++RegNo) {
	      unsigned LiveIn =
	          MF.addLiveIn(GPRArgRegs[RegNo], &AArch64::GPR64RegClass);
	      SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveIn, MVT::i64);

	      MachinePointerInfo SlotInfo =
	          IsWin64 ? MachinePointerInfo::getFixedStack(
	                        DAG.getMachineFunction(), GPRIdx,
	                        (RegNo - FirstVariadicGPR) * 8)
	                  : MachinePointerInfo::getStack(DAG.getMachineFunction(),
	                                                 RegNo * 8);
	      SaveStores.push_back(
	          DAG.getStore(RegVal.getValue(1), DL, RegVal, SlotAddr, SlotInfo));

	      // Step to the next 8-byte slot.
	      SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr, SlotStride);
	    }
	  }
	  FuncInfo->setVarArgsGPRIndex(GPRIdx);
	  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

	  // ---- Floating-point / SIMD argument registers ----
	  if (Subtarget->hasFPARMv8() && !IsWin64) {
	    static const MCPhysReg FPRArgRegs[] = {
	        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
	        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
	    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
	    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
	    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);

	    int FPRIdx = 0;
	    if (FPRSaveSize != 0) {
	      FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);

	      SDValue SlotAddr = DAG.getFrameIndex(FPRIdx, PtrVT);

	      for (unsigned RegNo = FirstVariadicFPR; RegNo < NumFPRArgRegs; ++RegNo) {
	        unsigned LiveIn =
	            MF.addLiveIn(FPRArgRegs[RegNo], &AArch64::FPR128RegClass);
	        SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveIn, MVT::f128);

	        MachinePointerInfo SlotInfo = MachinePointerInfo::getStack(
	            DAG.getMachineFunction(), RegNo * 16);
	        SaveStores.push_back(
	            DAG.getStore(RegVal.getValue(1), DL, RegVal, SlotAddr, SlotInfo));

	        // Step to the next 16-byte slot.
	        SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
	                               DAG.getConstant(16, DL, PtrVT));
	      }
	    }
	    FuncInfo->setVarArgsFPRIndex(FPRIdx);
	    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
	  }

	  if (SaveStores.empty())
	    return;
	  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SaveStores);
	}

	/// LowerCallResult - Copy the values returned by a call out of the physical
	/// registers assigned by the return calling convention, appending one SDValue
	/// per result to \p InVals. When \p isThisReturn is set, result 0 is taken
	/// from \p ThisVal instead of being copied out of a register. Returns the
	/// chain after the last register copy.
	SDValue AArch64TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
	    SDValue ThisVal) const {
	  // WebKit_JS has its own return convention; everything else uses AAPCS.
	  CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    RetCC = RetCC_AArch64_WebKit_JS;

	  // Work out where each returned value lives.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC);

	  // Pull every result out of the physical register it was assigned to.
	  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
	    const CCValAssign &Loc = RVLocs[Idx];

	    // Pass 'this' value directly from the argument to return value, to avoid
	    // reg unit interference
	    if (isThisReturn && Idx == 0) {
	      assert(!Loc.needsCustom() && Loc.getLocVT() == MVT::i64 &&
	             "unexpected return calling convention register assignment");
	      InVals.push_back(ThisVal);
	      continue;
	    }

	    SDValue Result = DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(),
	                                        Loc.getLocVT(), InFlag);
	    // Thread chain and glue through so the copies stay ordered.
	    Chain = Result.getValue(1);
	    InFlag = Result.getValue(2);

	    const CCValAssign::LocInfo Info = Loc.getLocInfo();
	    if (Info == CCValAssign::BCvt)
	      Result = DAG.getNode(ISD::BITCAST, DL, Loc.getValVT(), Result);
	    else if (Info != CCValAssign::Full)
	      llvm_unreachable("Unknown loc info!");

	    InVals.push_back(Result);
	  }

	  return Chain;
	}

	/// Tells whether tail-call optimisation can be guaranteed for \p CC; only the
	/// Fast calling convention qualifies.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  const bool IsFastCC = (CallingConv::Fast == CC);
	  return IsFastCC;
	}

	/// Tells whether a call using \p CC is ever a candidate for tail-call
	/// optimisation: C, PreserveMost and Swift always are, any other convention
	/// only if TCO can be guaranteed for it.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  if (CC == CallingConv::C || CC == CallingConv::PreserveMost ||
	      CC == CallingConv::Swift)
	    return true;
	  return canGuaranteeTCO(CC);
	}

	/// Decide whether a call may be lowered as a tail call.
	///
	/// Returns false when:
	///  - the callee's calling convention is never tail-called;
	///  - the caller has a byval or inreg argument;
	///  - the callee is a global with external weak linkage on a target matching
	///    the triple check below;
	///  - the call is variadic and any outgoing operand is assigned to memory;
	///  - results are not passed the same way by caller and callee;
	///  - the conventions differ and the callee does not preserve every register
	///    the caller must preserve;
	///  - the outgoing stack arguments do not fit in the caller's own incoming
	///    stack-argument area; or
	///  - parametersInCSRMatch rejects the argument assignment.
	/// When GuaranteedTailCallOpt is enabled the answer is instead whether TCO can
	/// be guaranteed for \p CalleeCC and it equals the caller's convention.
	///
	/// \param Callee    the call target.
	/// \param CalleeCC  calling convention of the call.
	/// \param isVarArg  whether the call is variadic.
	/// \param Outs      outgoing argument descriptions.
	/// \param OutVals   outgoing argument values.
	/// \param Ins       descriptions of the values returned by the call.
	/// \param DAG       DAG providing the machine function and context.
	bool AArch64TargetLowering::isEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &CallerF = MF.getFunction();
	  CallingConv::ID CallerCC = CallerF.getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;

	  // Byval parameters hand the function a pointer directly into the stack area
	  // we want to reuse during a tail call. Working around this is possible (see
	  // X86) but less efficient and uglier in LowerCall.
	  for (Function::const_arg_iterator i = CallerF.arg_begin(),
	                                    e = CallerF.arg_end();
	       i != e; ++i) {
	    if (i->hasByValAttr())
	      return false;

	    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
	    // In this case, it is necessary to save/restore X0 in the callee. Tail
	    // call opt interferes with this. So we disable tail call opt when the
	    // caller has an argument with "inreg" attribute.

	    // FIXME: Check whether the callee also has an "inreg" argument.
	    if (i->hasInRegAttr())
	      return false;
	  }

	  if (getTargetMachine().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Externally-defined functions with weak linkage should not be
	  // tail-called on AArch64 when the OS does not support dynamic
	  // pre-emption of symbols, as the AAELF spec requires normal calls
	  // to undefined weak functions to be replaced with a NOP or jump to the
	  // next instruction. The behaviour of branch instructions in this
	  // situation (as used for tail calls) is implementation-defined, so we
	  // cannot rely on the linker replacing the tail call with a return.
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();
	    const Triple &TT = getTargetMachine().getTargetTriple();
	    if (GV->hasExternalWeakLinkage() &&
	        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
	      return false;
	  }

	  // Now we search for cases where we can use a tail call without changing the
	  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
	  // concept.

	  // I want anyone implementing a new calling convention to think long and hard
	  // about this assert.
	  assert((!isVarArg || CalleeCC == CallingConv::C) &&
	         "Unexpected variadic calling convention");

	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // At least two cases here: if caller is fastcc then we can't have any
	    // memory arguments (we'd be expected to clean up the stack afterwards). If
	    // caller is C then we could potentially use its argument area.

	    // FIXME: for now we take the most conservative of these in both cases:
	    // disallow all variadic memory operands.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
	    for (const CCValAssign &ArgLoc : ArgLocs)
	      if (!ArgLoc.isRegLoc())
	        return false;
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  CCAssignFnForCall(CalleeCC, isVarArg),
	                                  CCAssignFnForCall(CallerCC, isVarArg)))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (Subtarget->hasCustomCallingConv()) {
	      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
	      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
	    }
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Nothing more to check if the callee is taking no arguments
	  if (Outs.empty())
	    return true;

	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

	  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	  // If the stack arguments for this call do not fit into our own save area then
	  // the call cannot be made tail.
	  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
	    return false;

	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	    return false;

	  return true;
	}

	/// Build a TokenFactor that joins \p Chain with the output chain of every
	/// load that (a) is a user of the DAG entry node, (b) is based directly on a
	/// frame index that is negative, and (c) reads a byte range overlapping the
	/// frame object \p ClobberedFI.
	///
	/// \param Chain        chain to include first in the TokenFactor.
	/// \param DAG          DAG whose entry-node users are scanned.
	/// \param MFI          frame info used to look up object offsets and sizes.
	/// \param ClobberedFI  frame index of the object about to be overwritten.
	/// \returns the TokenFactor node combining all collected chains.
	SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
	                                                   SelectionDAG &DAG,
	                                                   MachineFrameInfo &MFI,
	                                                   int ClobberedFI) const {
	  SmallVector<SDValue, 8> ArgChains;
	  // Inclusive byte range occupied by the object that will be clobbered.
	  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
	  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack-argument load that overlaps the
	  // clobbered object.
	  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
	                            UE = DAG.getEntryNode().getNode()->use_end();
	       U != UE; ++U)
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        if (FI->getIndex() < 0) {
	          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
	          int64_t InLastByte = InFirstByte;
	          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

	          // The two inclusive ranges overlap when either one starts inside
	          // the other.
	          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
	              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
	            ArgChains.push_back(SDValue(L, 1));
	        }

	  // Build a tokenfactor for all the chains.
	  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// Tells whether the callee restores the stack: true only when \p TailCallOpt
	/// is enabled and \p CallCC is the Fast calling convention.
	bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
	                                                   bool TailCallOpt) const {
	  if (CallCC != CallingConv::Fast)
	    return false;
	  return TailCallOpt;
	}

	/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
	/// and add input and output parameter nodes.
	SDValue
	AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &DL = CLI.DL;
	SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
	SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
	SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool IsThisReturn = false;

	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	bool IsSibCall = false;

	if (IsTailCall) {
	// Check if it's really possible to do a tail call.
	IsTailCall = isEligibleForTailCallOptimization(
	Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
	if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// A sibling call is one where we're under the usual C ABI and not planning
	// to change that but can still do a tail call:
	if (!TailCallOpt && IsTailCall)
	IsSibCall = true;

	if (IsTailCall)
	++NumTailCalls;
	}

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());

	if (IsVarArg) {
	// Handle fixed and variable vector arguments differently.
	// Variable vector arguments always go into memory.
	unsigned NumArgs = Outs.size();

	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ArgVT = Outs[i].VT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
	/IsVarArg=/ !Outs[i].IsFixed);
	bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	} else {
	// At this point, Outs[].VT may already be promoted to i32. To correctly
	// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
	// we use a special version of AnalyzeCallOperands to pass in ValVT and
	// LocVT.
	unsigned NumArgs = Outs.size();
	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ValVT = Outs[i].VT;
	// Get type of the original argument.
	EVT ActualVT = getValueType(DAG.getDataLayout(),
	CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
	/AllowUnknown/ true);
	MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	if (ActualMVT == MVT::i1 \|\| ActualMVT == MVT::i8)
	ValVT = MVT::i8;
	else if (ActualMVT == MVT::i16)
	ValVT = MVT::i16;

	CCAssignFn AssignFn = CCAssignFnForCall(CallConv, /IsVarArg=*/false);
	bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getNextStackOffset();

	if (IsSibCall) {
	// Since we're not changing the ABI to make this a tail call, the memory
	// operands are already available in the caller's incoming argument space.
	NumBytes = 0;
	}

	// FPDiff is the byte offset of the call's argument area from the callee's.
	// Stores to callee stack arguments will be placed in FixedStackSlots offset
	// by this amount for a tail call. In a sibling call it must be 0 because the
	// caller will deallocate the entire stack and the callee still expects its
	// arguments to begin at SP+0. Completely unused for non-tail calls.
	int FPDiff = 0;

	if (IsTailCall && !IsSibCall) {
	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

	// Since callee will pop argument stack as a tail call, we must keep the
	// popped size 16-byte aligned.
	NumBytes = alignTo(NumBytes, 16);

	// FPDiff will be negative if this tail call requires more space than we
	// would automatically have in our incoming argument space. Positive if we
	// can actually shrink the stack.
	FPDiff = NumReusableBytes - NumBytes;

	// The stack pointer must be 16-byte aligned at all times it's used for a
	// memory operation, which in practice means at all times and in
	// particular across call boundaries. Therefore our own arguments started at
	// a 16-byte aligned SP and the delta applied for the tail call should
	// satisfy the same constraint.
	assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
	}

	// Adjust the stack pointer for the new arguments...
	// These operations are automatically eliminated by the prolog/epilog pass
	if (!IsSibCall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

	SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
	getPointerTy(DAG.getDataLayout()));

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
	const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// Walk the register/memloc assignments, inserting copies/loads.
	for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
	++i, ++realArgIdx) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[realArgIdx];
	ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExt:
	if (Outs[realArgIdx].ArgVT == MVT::i1) {
	// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
	Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
	}
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::FPExt:
	Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	}

	if (VA.isRegLoc()) {
	if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
	Outs[0].VT == MVT::i64) {
	assert(VA.getLocVT() == MVT::i64 &&
	"unexpected calling convention register assignment");
	assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
	"unexpected use of 'returned'");
	IsThisReturn = true;
	}
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	} else {
	assert(VA.isMemLoc());

	SDValue DstAddr;
	MachinePointerInfo DstInfo;

	// FIXME: This works on big-endian for composite byvals, which are the
	// common case. It should also work for fundamental types too.
	uint32_t BEAlign = 0;
	unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
	: VA.getValVT().getSizeInBits();
	OpSize = (OpSize + 7) / 8;
	if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
	!Flags.isInConsecutiveRegs()) {
	if (OpSize < 8)
	BEAlign = 8 - OpSize;
	}
	unsigned LocMemOffset = VA.getLocMemOffset();
	int32_t Offset = LocMemOffset + BEAlign;
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
	PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

	if (IsTailCall) {
	Offset = Offset + FPDiff;
	int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

	DstAddr = DAG.getFrameIndex(FI, PtrVT);
	DstInfo =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

	// Make sure any stack arguments overlapping with where we're storing
	// are loaded before this eventual operation. Otherwise they'll be
	// clobbered.
	Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
	} else {
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

	DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
	DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
	LocMemOffset);
	}

	if (Outs[i].Flags.isByVal()) {
	SDValue SizeNode =
	DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
	SDValue Cpy = DAG.getMemcpy(
	Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
	/isVol = / false, /AlwaysInline = / false,
	/isTailCall = / false,
	DstInfo, MachinePointerInfo());

	MemOpChains.push_back(Cpy);
	} else {
	// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
	// promoted to a legal register type i32, we should truncate Arg back to
	// i1/i8/i16.
	if (VA.getValVT() == MVT::i1 \|\| VA.getValVT() == MVT::i8 \|\|
	VA.getValVT() == MVT::i16)
	Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

	SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
	MemOpChains.push_back(Store);
	}
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into the appropriate regs.
	SDValue InFlag;
	for (auto &RegToPass : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
	RegToPass.second, InFlag);
	InFlag = Chain.getValue(1);
	}

	// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	// node so that legalize doesn't hack it.
	if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	auto GV = G->getGlobal();
	if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
	AArch64II::MO_GOT) {
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
	assert(Subtarget->isTargetWindows() &&
	"Windows is the only supported COFF target");
	Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
	} else {
	const GlobalValue *GV = G->getGlobal();
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
	}
	} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	Subtarget->isTargetMachO()) {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
	}
	}

	// We don't usually want to end the call-sequence here because we would tidy
	// the frame up after the call, however in the ABI-changing tail-call case
	// we've carefully laid out the parameters so that when sp is reset they'll be
	// in the correct location.
	if (IsTailCall && !IsSibCall) {
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	InFlag = Chain.getValue(1);
	}

	std::vector<SDValue> Ops;
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (IsTailCall) {
	// Each tail call may have to adjust the stack by a different amount, so
	// this information must travel along with the operation for eventual
	// consumption by emitEpilogue.
	Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
	}

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (auto &RegToPass : RegsToPass)
	Ops.push_back(DAG.getRegister(RegToPass.first,
	RegToPass.second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	const uint32_t *Mask;
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	if (IsThisReturn) {
	// For 'this' returns, use the X0-preserving mask if applicable
	Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
	if (!Mask) {
	IsThisReturn = false;
	Mask = TRI->getCallPreservedMask(MF, CallConv);
	}
	} else
	Mask = TRI->getCallPreservedMask(MF, CallConv);

	if (Subtarget->hasCustomCallingConv())
	TRI->UpdateCustomCallPreservedMask(MF, &Mask);

	if (TRI->isAnyArgRegReserved(MF))
	TRI->emitReservedArgRegCallError(MF);

	assert(Mask && "Missing call preserved mask for calling convention");
	Ops.push_back(DAG.getRegisterMask(Mask));

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	// If we're doing a tail call, use a TC_RETURN here rather than an
	// actual call instruction.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
	}

	// Returns a chain and a flag for retval copy to use.
	Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	uint64_t CalleePopBytes =
	DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(CalleePopBytes, DL, true),
	InFlag, DL);
	if (!Ins.empty())
	InFlag = Chain.getValue(1);

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	InVals, IsThisReturn,
	IsThisReturn ? OutVals[0] : SDValue());
	}

	bool AArch64TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // WebKit JS has its own return convention; every other calling convention
	  // is checked against the AAPCS return convention.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  // Ask the calling-convention state whether all of Outs can be assigned.
	  SmallVector<CCValAssign, 16> Locs;
	  CCState State(CallConv, isVarArg, MF, Locs, Context);
	  return State.CheckReturn(Outs, AssignFn);
	}

	SDValue
	AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                   bool isVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   const SmallVectorImpl<SDValue> &OutVals,
	                                   const SDLoc &DL, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	  // Select the return-value convention: WebKit JS has a dedicated one, all
	  // other calling conventions share the AAPCS one.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, AssignFn);

	  // Operand 0 holds the incoming chain for now; it is replaced by the final
	  // chain once every copy has been emitted.
	  SDValue Glue;
	  SmallVector<SDValue, 4> RetOps(1, Chain);

	  // Copy the result values into the output registers.
	  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
	    CCValAssign &Loc = RVLocs[Idx];
	    assert(Loc.isRegLoc() && "Can only return in registers!");
	    SDValue Val = OutVals[Idx];

	    const CCValAssign::LocInfo Info = Loc.getLocInfo();
	    if (Info == CCValAssign::BCvt) {
	      Val = DAG.getNode(ISD::BITCAST, DL, Loc.getLocVT(), Val);
	    } else if (Info == CCValAssign::Full) {
	      if (Outs[Idx].ArgVT == MVT::i1) {
	        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
	        // value. This is strictly redundant on Darwin (which uses "zeroext
	        // i1"), but will be optimised out before ISel.
	        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
	        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, Loc.getLocVT(), Val);
	      }
	    } else {
	      llvm_unreachable("Unknown loc info!");
	    }

	    // Glue the copies together and record the register as a return operand.
	    Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), Val, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
	  }

	  // Windows AArch64 ABIs require that for returning structs by value we copy
	  // the sret argument into X0 for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into X0.
	  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	    // RetOps[0] is still the incoming chain at this point.
	    SDValue SRetPtr = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
	                                         getPointerTy(MF.getDataLayout()));

	    unsigned X0Reg = AArch64::X0;
	    Chain = DAG.getCopyToReg(Chain, DL, X0Reg, SRetPtr, Glue);
	    Glue = Chain.getValue(1);

	    RetOps.push_back(
	        DAG.getRegister(X0Reg, getPointerTy(DAG.getDataLayout())));
	  }

	  // Callee-saved registers that are preserved via copies are listed as
	  // additional return operands; the list is zero-terminated.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
	    for (; *CSR; ++CSR) {
	      if (AArch64::GPR64RegClass.contains(*CSR)) {
	        RetOps.push_back(DAG.getRegister(*CSR, MVT::i64));
	        continue;
	      }
	      if (AArch64::FPR64RegClass.contains(*CSR)) {
	        RetOps.push_back(
	            DAG.getRegister(*CSR, MVT::getFloatingPointVT(64)));
	        continue;
	      }
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  // Update chain.
	  RetOps[0] = Chain;

	  // Add the flag if we have it.
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Code
	//===----------------------------------------------------------------------===//

	SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  // Rebuild the global address as a target node with the same global and
	  // offset, tagged with the requested operand flags.
	  const GlobalValue *Global = N->getGlobal();
	  const int64_t Offset = N->getOffset();
	  return DAG.getTargetGlobalAddress(Global, SDLoc(N), Ty, Offset, Flag);
	}

	SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  // Rebuild the jump table reference as a target node with the given flags.
	  const int TableIdx = N->getIndex();
	  return DAG.getTargetJumpTable(TableIdx, Ty, Flag);
	}

	SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  // Rebuild the constant pool reference as a target node, carrying over the
	  // constant, its alignment and its offset.
	  const Constant *PoolVal = N->getConstVal();
	  const unsigned PoolAlign = N->getAlignment();
	  const int PoolOffset = N->getOffset();
	  return DAG.getTargetConstantPool(PoolVal, Ty, PoolAlign, PoolOffset, Flag);
	}

	SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  // Rebuild the block address as a target node; the offset is always zero.
	  const BlockAddress *Addr = N->getBlockAddress();
	  return DAG.getTargetBlockAddress(Addr, Ty, 0, Flag);
	}

	// (loadGOT sym)
	//
	// Builds a LOADgot node for the symbol referenced by N. NodeTy is any node
	// type for which a getTargetNode overload exists. MO_GOT is OR'ed into the
	// caller-supplied Flags before the target node is created.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
	unsigned Flags) const {
	LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	// Target node for the symbol, tagged as a GOT reference.
	SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT \| Flags);
	// FIXME: Once remat is capable of dealing with instructions with register
	// operands, expand this into two nodes instead of using a wrapper node.
	return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
	}

	// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
	//
	// Builds a WrapperLarge node from four target nodes for the same symbol,
	// tagged MO_G3, MO_G2, MO_G1 and MO_G0 respectively. All but the MO_G3
	// piece additionally carry MO_NC. The caller-supplied Flags are OR'ed into
	// every piece.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
	unsigned Flags) const {
	LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	const unsigned char MO_NC = AArch64II::MO_NC;
	return DAG.getNode(
	AArch64ISD::WrapperLarge, DL, Ty,
	getTargetNode(N, Ty, DAG, AArch64II::MO_G3 \| Flags),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G2 \| MO_NC \| Flags),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G1 \| MO_NC \| Flags),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G0 \| MO_NC \| Flags));
	}

	// (addlow (adrp %hi(sym)) %lo(sym))
	//
	// Builds an ADRP node from the MO_PAGE-tagged target node and combines it
	// with the MO_PAGEOFF | MO_NC-tagged target node through an ADDlow node.
	// The caller-supplied Flags are OR'ed into both target nodes.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
	unsigned Flags) const {
	LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	// Page part of the address.
	SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE \| Flags);
	// Offset-within-page part of the address.
	SDValue Lo = getTargetNode(N, Ty, DAG,
	AArch64II::MO_PAGEOFF \| AArch64II::MO_NC \| Flags);
	SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
	return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
	}

	// (adr sym)
	//
	// Wraps the target node for N's symbol, created with the caller's Flags
	// unchanged, in a single ADR node of pointer type.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
	                                           unsigned Flags) const {
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
	  SDLoc Loc(N);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());
	  SDValue SymNode = getTargetNode(N, PtrTy, DAG, Flags);
	  SDValue Adr = DAG.getNode(AArch64ISD::ADR, Loc, PtrTy, SymNode);
	  return Adr;
	}

	// Lowers a GlobalAddress node. The subtarget classifies the reference; a
	// GOT reference becomes a LOADgot, otherwise the address is materialized
	// according to the code model (large, tiny, or the default ADRP + ADDlow
	// form). References flagged MO_DLLIMPORT or MO_COFFSTUB get one extra load
	// through the computed address.
	SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
	SelectionDAG &DAG) const {
	GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
	const GlobalValue *GV = GN->getGlobal();
	unsigned char OpFlags =
	Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

	// Any reference that carries operand flags is expected to have no offset.
	if (OpFlags != AArch64II::MO_NO_FLAG)
	assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
	"unexpected offset in global node");

	// This also catches the large code model case for Darwin, and tiny code
	// model with got relocations.
	if ((OpFlags & AArch64II::MO_GOT) != 0) {
	return getGOT(GN, DAG, OpFlags);
	}

	// Direct reference: pick the addressing sequence from the code model.
	SDValue Result;
	if (getTargetMachine().getCodeModel() == CodeModel::Large) {
	Result = getAddrLarge(GN, DAG, OpFlags);
	} else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
	Result = getAddrTiny(GN, DAG, OpFlags);
	} else {
	Result = getAddr(GN, DAG, OpFlags);
	}
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(GN);
	// For dllimport / COFF stub references the computed address is loaded
	// from to obtain the final pointer.
	if (OpFlags & (AArch64II::MO_DLLIMPORT \| AArch64II::MO_COFFSTUB))
	Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	return Result;
	}

	/// Convert a TLS address reference into the correct sequence of loads
	/// and calls to compute the variable's address (for Darwin, currently) and
	/// return an SDValue containing the final node.

	/// Darwin only has one TLS scheme which must be capable of dealing with the
	/// fully general situation, in the worst case. This means:
	/// + "extern __thread" declaration.
	/// + Defined in a possibly unknown dynamic library.
	///
	/// The general system is that each __thread variable has a [3 x i64] descriptor
	/// which contains information used by the runtime to calculate the address. The
	/// only part of this the compiler needs to know about is the first xword, which
	/// contains a function pointer that must be called with the address of the
	/// entire descriptor in "x0".
	///
	/// Since this descriptor may be in a different unit, in general even the
	/// descriptor must be accessed via an indirect load. The "ideal" code sequence
	/// is:
	/// adrp x0, _var@TLVPPAGE
	/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
	/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
	/// ; the function pointer
	/// blr x1 ; Uses descriptor address in x0
	/// ; Address of _var is now in x0.
	///
	/// If the address of _var's descriptor is known to the linker, then it can
	/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
	/// a slight efficiency gain.
	SDValue
	AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetDarwin() &&
	"This function expects a Darwin target");

	SDLoc DL(Op);
	MVT PtrVT = getPointerTy(DAG.getDataLayout());
	const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

	// Address of the variable's TLS descriptor, obtained through a LOADgot of
	// the MO_TLS-tagged global address.
	SDValue TLVPAddr =
	DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

	// The first entry in the descriptor is a function pointer that we must call
	// to obtain the address of the variable.
	SDValue Chain = DAG.getEntryNode();
	SDValue FuncTLVGet = DAG.getLoad(
	MVT::i64, DL, Chain, DescAddr,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()),
	/* Alignment = */ 8,
	MachineMemOperand::MONonTemporal \| MachineMemOperand::MOInvariant \|
	MachineMemOperand::MODereferenceable);
	// Continue the chain from the load's chain result.
	Chain = FuncTLVGet.getValue(1);

	// Record in the frame info that this function adjusts the stack.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setAdjustsStack(true);

	// TLS calls preserve all registers except those that absolutely must be
	// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
	// silly).
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	const uint32_t *Mask = TRI->getTLSCallPreservedMask();
	if (Subtarget->hasCustomCallingConv())
	TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

	// Finally, we can make the call. This is just a degenerate version of a
	// normal AArch64 call node: x0 takes the address of the descriptor, and
	// returns the address of the variable in this thread.
	Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
	// Result 1 of the copy is passed as the call's last operand, tying the
	// copy into X0 to the call.
	Chain =
	DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
	DAG.getRegisterMask(Mask), Chain.getValue(1));
	// Read the variable's address back out of X0, glued to the call.
	return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
	}

	/// When accessing thread-local variables under either the general-dynamic or
	/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
	/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
	/// is a function pointer to carry out the resolution.
	///
	/// The sequence is:
	/// adrp x0, :tlsdesc:var
	/// ldr x1, [x0, #:tlsdesc_lo12:var]
	/// add x0, x0, #:tlsdesc_lo12:var
	/// .tlsdesccall var
	/// blr x1
	/// (TPIDR_EL0 offset now in x0)
	///
	/// The above sequence must be produced unscheduled, to enable the linker to
	/// optimize/relax this sequence.
	/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
	/// above sequence, and expanded really late in the compilation flow, to ensure
	/// the sequence is produced as per above.
	SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
	                                                      const SDLoc &DL,
	                                                      SelectionDAG &DAG) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Emit the TLSDESC_CALLSEQ pseudo, chained to the entry node, with the
	  // symbol address as its only other operand. It yields a chain and glue.
	  SDValue CallOps[] = {DAG.getEntryNode(), SymAddr};
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue CallSeq =
	      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, ChainAndGlue, CallOps);

	  // Read the result from X0, glued to the pseudo.
	  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
	                            CallSeq.getValue(1));
	}

	// Lowers a TLS global address on ELF targets. Depending on the TLS model
	// this either returns the thread pointer with the variable's offset added
	// directly (local-exec), or computes an offset TPOff (initial-exec,
	// local-dynamic, general-dynamic) and returns ThreadBase + TPOff. The large
	// code model is rejected with a fatal error.
	SDValue
	AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetELF() && "This function expects an ELF target");
	if (getTargetMachine().getCodeModel() == CodeModel::Large)
	report_fatal_error("ELF TLS only supported in small memory model");
	// Different choices can be made for the maximum size of the TLS area for a
	// module. For the small address model, the default TLS size is 16MiB and the
	// maximum TLS size is 4GiB.
	// FIXME: add -mtls-size command line option and make it control the 16MiB
	// vs. 4GiB code sequence generation.
	// FIXME: add tiny codemodel support. We currently generate the same code as
	// small, which may be larger than needed.
	const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

	// Unless local-dynamic generation is explicitly enabled, treat
	// local-dynamic accesses as general-dynamic.
	if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
	if (Model == TLSModel::LocalDynamic)
	Model = TLSModel::GeneralDynamic;
	}

	SDValue TPOff;
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(Op);
	const GlobalValue *GV = GA->getGlobal();

	SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

	if (Model == TLSModel::LocalExec) {
	// Local-exec: add the MO_HI12 and then the MO_PAGEOFF | MO_NC parts of the
	// variable's TLS offset to the thread pointer with two ADDXri nodes, and
	// return that directly.
	SDValue HiVar = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0, AArch64II::MO_TLS \| AArch64II::MO_HI12);
	SDValue LoVar = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0,
	AArch64II::MO_TLS \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);

	SDValue TPWithOff_lo =
	SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
	HiVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	SDValue TPWithOff =
	SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
	LoVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	return TPWithOff;
	} else if (Model == TLSModel::InitialExec) {
	// Initial-exec: the offset comes from a LOADgot of the MO_TLS-tagged
	// global address.
	TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
	} else if (Model == TLSModel::LocalDynamic) {
	// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
	// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
	// the beginning of the module's TLS region, followed by a DTPREL offset
	// calculation.

	// These accesses will need deduplicating if there's more than one.
	AArch64FunctionInfo *MFI =
	DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
	MFI->incNumLocalDynamicTLSAccesses();

	// The call needs a relocation too for linker relaxation. It doesn't make
	// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	// the address.
	SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
	AArch64II::MO_TLS);

	// Now we can calculate the offset from TPIDR_EL0 to this module's
	// thread-local area.
	TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

	// Now use :dtprel_whatever: operations to calculate this variable's offset
	// in its thread-storage area.
	SDValue HiVar = DAG.getTargetGlobalAddress(
	GV, DL, MVT::i64, 0, AArch64II::MO_TLS \| AArch64II::MO_HI12);
	SDValue LoVar = DAG.getTargetGlobalAddress(
	GV, DL, MVT::i64, 0,
	AArch64II::MO_TLS \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);

	// Accumulate both parts of the variable's offset onto the module base.
	TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	} else if (Model == TLSModel::GeneralDynamic) {
	// The call needs a relocation too for linker relaxation. It doesn't make
	// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	// the address.
	SDValue SymAddr =
	DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

	// Finally we can make a call to calculate the offset from tpidr_el0.
	TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
	} else
	llvm_unreachable("Unsupported ELF TLS access model");

	// All models other than local-exec end up here: thread pointer + offset.
	return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	}

	// Lowers a TLS global address on Windows. The sequence loads the TLS array
	// pointer from the TEB (register X18), loads the _tls_index symbol's value,
	// uses it to index the TLS array, and finally adds the variable's offset to
	// the loaded TLS block pointer. The three loads are chained one after the
	// other.
	SDValue
	AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

	SDValue Chain = DAG.getEntryNode();
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(Op);

	SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

	// Load the ThreadLocalStoragePointer from the TEB
	// A pointer to the TLS array is located at offset 0x58 from the TEB.
	SDValue TLSArray =
	DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
	TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
	Chain = TLSArray.getValue(1);

	// Load the TLS index from the C runtime;
	// This does the same as getAddr(), but without having a GlobalAddressSDNode.
	// This also does the same as LOADgot, but using a generic i32 load,
	// while LOADgot only loads i64.
	SDValue TLSIndexHi =
	DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
	SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
	"_tls_index", PtrVT, AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);
	SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
	SDValue TLSIndex =
	DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
	TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
	Chain = TLSIndex.getValue(1);

	// The pointer to the thread's TLS data area is at the TLS Index scaled by 8
	// offset into the TLSArray.
	TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
	// Shift left by 3 to scale the index by 8.
	SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
	DAG.getConstant(3, DL, PtrVT));
	SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
	DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
	MachinePointerInfo());
	Chain = TLS.getValue(1);

	// High (MO_HI12) and low (MO_PAGEOFF | MO_NC) parts of the variable's
	// TLS offset.
	const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	const GlobalValue *GV = GA->getGlobal();
	SDValue TGAHi = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0, AArch64II::MO_TLS \| AArch64II::MO_HI12);
	SDValue TGALo = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0,
	AArch64II::MO_TLS \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);

	// Add the offset from the start of the .tls section (section base).
	SDValue Addr =
	SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
	return Addr;
	}

	SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);

	  // Emulated TLS takes priority over every platform-specific scheme.
	  const bool UseEmulated = DAG.getTarget().useEmulatedTLS();
	  if (!UseEmulated) {
	    // Dispatch on the object format / OS, checked in this order: Darwin,
	    // ELF, Windows.
	    if (Subtarget->isTargetDarwin())
	      return LowerDarwinGlobalTLSAddress(Op, DAG);
	    if (Subtarget->isTargetELF())
	      return LowerELFGlobalTLSAddress(Op, DAG);
	    if (Subtarget->isTargetWindows())
	      return LowerWindowsGlobalTLSAddress(Op, DAG);

	    llvm_unreachable("Unexpected platform trying to use TLS");
	  }

	  return LowerToTLSEmulatedModel(GlobalNode, DAG);
	}

	// Lowers a BR_CC node (operands: chain, condition code, LHS, RHS,
	// destination) to AArch64 branch nodes. Integer comparisons against zero or
	// -1 are folded to CBZ/CBNZ/TBZ/TBNZ where allowed; everything else becomes
	// a comparison followed by one BRCOND, or two for floating-point conditions
	// that need a second condition code.
	SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	SDValue Chain = Op.getOperand(0);
	ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	SDValue LHS = Op.getOperand(2);
	SDValue RHS = Op.getOperand(3);
	SDValue Dest = Op.getOperand(4);
	SDLoc dl(Op);

	MachineFunction &MF = DAG.getMachineFunction();
	// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
	// will not be produced, as they are conditional branch instructions that do
	// not set flags.
	bool ProduceNonFlagSettingCondBr =
	!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);

	// Handle f128 first, since lowering it will result in comparing the return
	// value of a libcall against zero, which is just what the rest of LowerBR_CC
	// is expecting to deal with.
	if (LHS.getValueType() == MVT::f128) {
	softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	// If softenSetCCOperands returned a scalar, we need to compare the result
	// against zero to select between true and false values.
	if (!RHS.getNode()) {
	RHS = DAG.getConstant(0, dl, LHS.getValueType());
	CC = ISD::SETNE;
	}
	}

	// Optimize {s\|u}{add\|sub\|mul}.with.overflow feeding into a branch
	// instruction.
	if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
	(CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {
	// Only lower legal XALUO ops.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
	return SDValue();

	// The actual operation with overflow check.
	AArch64CC::CondCode OFCC;
	SDValue Value, Overflow;
	std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

	// The overflow result is compared against 1, so SETNE means "branch if
	// no overflow": invert the condition code.
	if (CC == ISD::SETNE)
	OFCC = getInvertedCondCode(OFCC);
	SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	Overflow);
	}

	if (LHS.getValueType().isInteger()) {
	assert((LHS.getValueType() == RHS.getValueType()) &&
	(LHS.getValueType() == MVT::i32 \|\| LHS.getValueType() == MVT::i64));

	// If the RHS of the comparison is zero, we can potentially fold this
	// to a specialized branch.
	const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
	if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
	if (CC == ISD::SETEQ) {
	// See if we can use a TBZ to fold in an AND as well.
	// TBZ has a smaller branch displacement than CBZ. If the offset is
	// out of bounds, a late MI-layer pass rewrites branches.
	// 403.gcc is an example that hits this case.
	if (LHS.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(LHS.getOperand(1)) &&
	isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	// The mask has a single bit set, so test that bit directly.
	SDValue Test = LHS.getOperand(0);
	uint64_t Mask = LHS.getConstantOperandVal(1);
	return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
	DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
	Dest);
	}

	return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
	} else if (CC == ISD::SETNE) {
	// See if we can use a TBNZ to fold in an AND as well.
	// TBNZ has a smaller branch displacement than CBNZ. If the offset is
	// out of bounds, a late MI-layer pass rewrites branches.
	// 403.gcc is an example that hits this case.
	if (LHS.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(LHS.getOperand(1)) &&
	isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	// The mask has a single bit set, so test that bit directly.
	SDValue Test = LHS.getOperand(0);
	uint64_t Mask = LHS.getConstantOperandVal(1);
	return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
	DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
	Dest);
	}

	return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
	} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
	// Don't combine AND since emitComparison converts the AND to an ANDS
	// (a.k.a. TST) and the test in the test bit and branch instruction
	// becomes redundant. This would also increase register pressure.
	// "x < 0" is a test of the top bit; despite its name, Mask holds that
	// bit's index, not a bit mask.
	uint64_t Mask = LHS.getValueSizeInBits() - 1;
	return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
	DAG.getConstant(Mask, dl, MVT::i64), Dest);
	}
	}
	if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
	LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
	// Don't combine AND since emitComparison converts the AND to an ANDS
	// (a.k.a. TST) and the test in the test bit and branch instruction
	// becomes redundant. This would also increase register pressure.
	// "x > -1" is a test that the top bit is clear; Mask holds that bit's
	// index.
	uint64_t Mask = LHS.getValueSizeInBits() - 1;
	return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
	DAG.getConstant(Mask, dl, MVT::i64), Dest);
	}

	// General integer case: emit a compare and branch on its condition code.
	SDValue CCVal;
	SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	Cmp);
	}

	assert(LHS.getValueType() == MVT::f16 \|\| LHS.getValueType() == MVT::f32 \|\|
	LHS.getValueType() == MVT::f64);

	// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	// clean. Some of them require two branches to implement.
	SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	AArch64CC::CondCode CC1, CC2;
	changeFPCCToAArch64CC(CC, CC1, CC2);
	SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	SDValue BR1 =
	DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
	// CC2 == AL means no second condition is needed. Otherwise a second branch
	// to the same destination is chained after the first, reusing Cmp.
	if (CC2 != AArch64CC::AL) {
	SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
	Cmp);
	}

	return BR1;
	}

	// Lowers FCOPYSIGN through an AArch64ISD::BIT node on integer vectors. The
	// second operand is first converted to the result type if its width
	// differs. Scalars are inserted into an undef vector via a subregister
	// insert and extracted again afterwards; vectors are bitcast. The third BIT
	// operand is a per-element mask with only the top (sign) bit set.
	// NOTE(review): presumably operand 0 supplies the magnitude and operand 1
	// the sign, per the usual copysign convention — confirm against the BIT
	// node's operand semantics.
	SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
	SelectionDAG &DAG) const {
	EVT VT = Op.getValueType();
	SDLoc DL(Op);

	SDValue In1 = Op.getOperand(0);
	SDValue In2 = Op.getOperand(1);
	EVT SrcVT = In2.getValueType();

	// Bring the second operand to the result type.
	if (SrcVT.bitsLT(VT))
	In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
	else if (SrcVT.bitsGT(VT))
	In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));

	EVT VecVT;
	uint64_t EltMask;
	SDValue VecVal1, VecVal2;

	// Places In1/In2 into values of type VecVT. Captures by reference, so VecVT
	// must be assigned before this is called. Idx is the subregister index used
	// for the scalar case and is ignored for vectors.
	auto setVecVal = [&] (int Idx) {
	if (!VT.isVector()) {
	VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
	DAG.getUNDEF(VecVT), In1);
	VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
	DAG.getUNDEF(VecVT), In2);
	} else {
	VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
	VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
	}
	};

	// Choose the integer vector type, sign-bit mask and subregister index for
	// the element width. Scalars use the same vector type as the wider vector
	// form (f32 -> v4i32, f16 -> v8i16).
	if (VT == MVT::f32 \|\| VT == MVT::v2f32 \|\| VT == MVT::v4f32) {
	VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
	EltMask = 0x80000000ULL;
	setVecVal(AArch64::ssub);
	} else if (VT == MVT::f64 \|\| VT == MVT::v2f64) {
	VecVT = MVT::v2i64;

	// We want to materialize a mask with the high bit set, but the AdvSIMD
	// immediate moves cannot materialize that in a single instruction for
	// 64-bit elements. Instead, materialize zero and then negate it.
	EltMask = 0;

	setVecVal(AArch64::dsub);
	} else if (VT == MVT::f16 \|\| VT == MVT::v4f16 \|\| VT == MVT::v8f16) {
	VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
	EltMask = 0x8000ULL;
	setVecVal(AArch64::hsub);
	} else {
	llvm_unreachable("Invalid type for copysign!");
	}

	SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);

	// If we couldn't materialize the mask above, then the mask vector will be
	// the zero vector, and we need to negate it here.
	if (VT == MVT::f64 \|\| VT == MVT::v2f64) {
	BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
	BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
	BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
	}

	SDValue Sel =
	DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

	// Convert back to the requested type: extract the scalar subregister, or
	// bitcast for vector results.
	if (VT == MVT::f16)
	return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
	if (VT == MVT::f32)
	return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
	else if (VT == MVT::f64)
	return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
	else
	return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
	}

	// Custom lowering for CTPOP using vector byte-wise popcount. Returns an
	// empty SDValue when the function has the NoImplicitFloat attribute or the
	// subtarget lacks NEON. Scalar i32/i64 inputs are counted per byte in a
	// v8i8 and summed with the aarch64_neon_uaddlv intrinsic; vector inputs are
	// counted per byte and widened back to the element size with repeated
	// aarch64_neon_uaddlp intrinsics.
	SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
	if (DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat))
	return SDValue();

	if (!Subtarget->hasNEON())
	return SDValue();

	// While there is no integer popcount instruction, it can
	// be more efficiently lowered to the following sequence that uses
	// AdvSIMD registers/instructions as long as the copies to/from
	// the AdvSIMD registers are cheap.
	// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
	// CNT V0.8B, V0.8B // 8xbyte pop-counts
	// ADDV B0, V0.8B // sum 8xbyte pop-counts
	// UMOV X0, V0.B[0] // copy byte result back to integer reg
	SDValue Val = Op.getOperand(0);
	SDLoc DL(Op);
	EVT VT = Op.getValueType();

	if (VT == MVT::i32 \|\| VT == MVT::i64) {
	// Widen i32 to i64 so the value can be reinterpreted as eight bytes.
	if (VT == MVT::i32)
	Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
	Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);

	SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
	// Sum the per-byte counts into an i32 via the uaddlv intrinsic.
	SDValue UaddLV = DAG.getNode(
	ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
	DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

	// The intrinsic result is i32; extend it when an i64 was requested.
	if (VT == MVT::i64)
	UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
	return UaddLV;
	}

	assert((VT == MVT::v1i64 \|\| VT == MVT::v2i64 \|\| VT == MVT::v2i32 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v4i16 \|\| VT == MVT::v8i16) &&
	"Unexpected type for custom ctpop lowering");

	// Count bits per byte in a byte vector of the same total width.
	EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
	Val = DAG.getBitcast(VT8Bit, Val);
	Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);

	// Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
	// Each step doubles the element size and halves the element count.
	unsigned EltSize = 8;
	unsigned NumElts = VT.is64BitVector() ? 8 : 16;
	while (EltSize != VT.getScalarSizeInBits()) {
	EltSize *= 2;
	NumElts /= 2;
	MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
	Val = DAG.getNode(
	ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
	DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
	}

	return Val;
	}

	/// Lower an ISD::SETCC node.
	///
	/// Vector results are forwarded to LowerVSETCC. For scalars the comparison is
	/// emitted and its flags feed one (or, for some FP predicates, two)
	/// AArch64ISD::CSEL nodes that pick between the constants 1 and 0.
	SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	  if (Op.getValueType().isVector())
	    return LowerVSETCC(Op, DAG);

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDLoc dl(Op);

	  // We chose ZeroOrOneBooleanContents, so use zero and one.
	  EVT VT = Op.getValueType();
	  SDValue TVal = DAG.getConstant(1, dl, VT);
	  SDValue FVal = DAG.getConstant(0, dl, VT);

	  // Handle f128 first, since one possible outcome is a normal integer
	  // comparison which gets picked up by the next if statement.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, use it.
	    if (!RHS.getNode()) {
	      assert(LHS.getValueType() == Op.getValueType() &&
	             "Unexpected setcc expansion!");
	      return LHS;
	    }
	  }

	  if (LHS.getValueType().isInteger()) {
	    SDValue CCVal;
	    SDValue Cmp =
	        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here. This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
	         LHS.getValueType() == MVT::f64);

	  // FP compares are lowered as a comparison followed by CSEL(s). Go ahead
	  // and do the comparison.
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);
	  if (CC2 == AArch64CC::AL) {
	    // A single AArch64 condition is enough: recompute it for the inverted
	    // predicate so the operands can be swapped below.
	    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here. This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
	  } else {
	    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
	    // totally clean. Some of them require two CSELs to implement. As is in
	    // this case, we emit the first CSEL and then emit a second using the output
	    // of the first as the RHS. We're effectively OR'ing the two CC's together.

	    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	    SDValue CS1 =
	        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }
	}

	/// Lower "CC(LHS, RHS) ? TVal : FVal" to a compare followed by a conditional
	/// select.
	///
	/// f128 operands are softened first, and f16 operands are extended to f32
	/// when the subtarget lacks full FP16 support. Integer compares try to turn
	/// the select into CSINV/CSNEG/CSINC (swapping the arms and inverting the
	/// condition where needed) and to reuse LHS instead of materializing a
	/// constant. FP compares emit one CSEL, or two when the predicate maps to two
	/// AArch64 condition codes.
	SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
	                                              SDValue RHS, SDValue TVal,
	                                              SDValue FVal, const SDLoc &dl,
	                                              SelectionDAG &DAG) const {
	  // Handle f128 first, because it will result in a comparison of some RTLIB
	  // call result against zero.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, we need to compare the result
	    // against zero to select between true and false values.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Also handle f16, for which we need to do a f32 comparison.
	  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
	    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	  }

	  // Next, handle integers.
	  if (LHS.getValueType().isInteger()) {
	    assert((LHS.getValueType() == RHS.getValueType()) &&
	           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

	    unsigned Opcode = AArch64ISD::CSEL;

	    // If both the TVal and the FVal are constants, see if we can swap them in
	    // order to form a CSINV or CSINC out of them.
	    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	      std::swap(TVal, FVal);
	      std::swap(CTVal, CFVal);
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
	      std::swap(TVal, FVal);
	      std::swap(CTVal, CFVal);
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (TVal.getOpcode() == ISD::XOR) {
	      // If TVal is a NOT we want to swap TVal and FVal so that we can match
	      // with a CSINV rather than a CSEL.
	      if (isAllOnesConstant(TVal.getOperand(1))) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }
	    } else if (TVal.getOpcode() == ISD::SUB) {
	      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
	      // that we can match with a CSNEG rather than a CSEL.
	      if (isNullConstant(TVal.getOperand(0))) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }
	    } else if (CTVal && CFVal) {
	      const int64_t TrueVal = CTVal->getSExtValue();
	      const int64_t FalseVal = CFVal->getSExtValue();
	      // Unsigned views of the same bit patterns. Arithmetic on these wraps
	      // around instead of invoking signed-overflow undefined behaviour.
	      const uint64_t TrueVal64 = TrueVal;
	      const uint64_t FalseVal64 = FalseVal;
	      bool Swap = false;

	      // If both TVal and FVal are constants, see if FVal is the
	      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
	      // instead of a CSEL in that case.
	      if (TrueVal == ~FalseVal) {
	        Opcode = AArch64ISD::CSINV;
	      } else if (FalseVal64 != (uint64_t(1) << 63) && TrueVal == -FalseVal) {
	        // The guard above excludes the minimum int64_t value, whose negation
	        // is not representable.
	        Opcode = AArch64ISD::CSNEG;
	      } else if (TVal.getValueType() == MVT::i32) {
	        // If our operands are only 32-bit wide, make sure we use 32-bit
	        // arithmetic for the check whether we can use CSINC. This ensures that
	        // the addition in the check will wrap around properly in case there is
	        // an overflow (which would not be the case if we do the check with
	        // 64-bit arithmetic).
	        const uint32_t TrueVal32 = CTVal->getZExtValue();
	        const uint32_t FalseVal32 = CFVal->getZExtValue();

	        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
	          Opcode = AArch64ISD::CSINC;

	          if (TrueVal32 > FalseVal32) {
	            Swap = true;
	          }
	        }
	        // 64-bit check whether we can use CSINC. Done on the unsigned copies
	        // so the "+ 1" cannot overflow a signed type.
	      } else if ((TrueVal64 == FalseVal64 + 1) ||
	                 (TrueVal64 + 1 == FalseVal64)) {
	        Opcode = AArch64ISD::CSINC;

	        if (TrueVal > FalseVal) {
	          Swap = true;
	        }
	      }

	      // Swap TVal and FVal if necessary.
	      if (Swap) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }

	      if (Opcode != AArch64ISD::CSEL) {
	        // Drop FVal since we can get its value by simply inverting/negating
	        // TVal.
	        FVal = TVal;
	      }
	    }

	    // Avoid materializing a constant when possible by reusing a known value in
	    // a register. However, don't perform this optimization if the known value
	    // is one, zero or negative one in the case of a CSEL. We can always
	    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
	    // FVal, respectively.
	    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
	    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
	        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
	      // "a != C ? x : a" to avoid materializing C.
	      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
	        TVal = LHS;
	      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
	        FVal = LHS;
	    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
	      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
	      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
	      // avoid materializing C.
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
	        Opcode = AArch64ISD::CSINV;
	        TVal = LHS;
	        FVal = DAG.getConstant(0, dl, FVal.getValueType());
	      }
	    }

	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	    EVT VT = TVal.getValueType();
	    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
	         LHS.getValueType() == MVT::f64);
	  assert(LHS.getValueType() == RHS.getValueType());
	  EVT VT = TVal.getValueType();
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two CSELs to implement.
	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);

	  if (DAG.getTarget().Options.UnsafeFPMath) {
	    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
	    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
	    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
	    if (RHSVal && RHSVal->isZero()) {
	      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
	      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

	      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
	          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
	        TVal = LHS;
	      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
	               CFVal && CFVal->isZero() &&
	               FVal.getValueType() == LHS.getValueType())
	        FVal = LHS;
	    }
	  }

	  // Emit first, and possibly only, CSEL.
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	  // If we need a second CSEL, emit it, using the output of the first as the
	  // RHS. We're effectively OR'ing the two CC's together.
	  if (CC2 != AArch64CC::AL) {
	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }

	  // Otherwise, return the output of the first CSEL.
	  return CS1;
	}

	/// Node-based entry point: unpack the five operands of \p Op
	/// (lhs, rhs, true value, false value, condition code) and forward them to
	/// the operand-based LowerSELECT_CC overload.
	SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc Loc(Op);
	  const ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  SDValue CmpLHS = Op.getOperand(0);
	  SDValue CmpRHS = Op.getOperand(1);
	  SDValue IfTrue = Op.getOperand(2);
	  SDValue IfFalse = Op.getOperand(3);
	  return LowerSELECT_CC(Cond, CmpLHS, CmpRHS, IfTrue, IfFalse, Loc, DAG);
	}

	/// Lower an ISD::SELECT whose operands are (condition, true value, false
	/// value).
	///
	/// A condition that is the overflow result of an arithmetic-with-overflow
	/// node becomes a CSEL on the flags of that operation. Every other condition
	/// is rewritten as a SELECT_CC and handed to LowerSELECT_CC.
	SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue CCVal = Op->getOperand(0);
	  SDValue TVal = Op->getOperand(1);
	  SDValue FVal = Op->getOperand(2);
	  SDLoc DL(Op);

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
	  // instruction.
	  if (isOverflowIntrOpRes(CCVal)) {
	    // Only legal XALUO ops are lowered here.
	    const bool IsLegal =
	        DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0));
	    if (!IsLegal)
	      return SDValue();

	    AArch64CC::CondCode OFCC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
	    SDValue OverflowCC = DAG.getConstant(OFCC, DL, MVT::i32);

	    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
	                       OverflowCC, Overflow);
	  }

	  // A plain boolean condition is treated as "CCVal != 0".
	  if (CCVal.getOpcode() != ISD::SETCC) {
	    SDValue Zero = DAG.getConstant(0, DL, CCVal.getValueType());
	    return LowerSELECT_CC(ISD::SETNE, CCVal, Zero, TVal, FVal, DL, DAG);
	  }

	  // A SETCC condition: reuse its operands and predicate directly.
	  SDValue CmpLHS = CCVal.getOperand(0);
	  SDValue CmpRHS = CCVal.getOperand(1);
	  const ISD::CondCode Pred = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
	  return LowerSELECT_CC(Pred, CmpLHS, CmpRHS, TVal, FVal, DL, DAG);
	}

	/// Produce the address of the jump table referenced by \p Op, choosing the
	/// addressing helper from the code model (large on non-MachO targets, tiny,
	/// or the default).
	SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Jump table entries are PC-relative offsets, so only the address of the
	  // table itself is needed here.
	  JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  if (Model == CodeModel::Large && !Subtarget->isTargetMachO())
	    return getAddrLarge(Table, DAG);
	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Table, DAG);
	  return getAddr(Table, DAG);
	}

	/// Lower ISD::BR_JT (chain, jump table, entry index) into a JumpTableDest32
	/// machine node that computes the destination, followed by an indirect
	/// branch to it.
	SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Table = Op.getOperand(1);
	  SDValue Index = Op.getOperand(2);
	  const int TableIdx = cast<JumpTableSDNode>(Table.getNode())->getIndex();

	  // The pseudo takes the table address, the entry index and the table itself
	  // as a target operand.
	  SDValue TargetTable = DAG.getTargetJumpTable(TableIdx, MVT::i32);
	  SDNode *Dest = DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64,
	                                    MVT::i64, Table, Index, TargetTable);
	  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, SDValue(Dest, 0));
	}

	/// Produce the address of a constant-pool entry, choosing the addressing
	/// helper from the code model.
	SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  ConstantPoolSDNode *Pool = cast<ConstantPoolSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  if (Model == CodeModel::Large) {
	    // Use the GOT for the large code model on iOS.
	    return Subtarget->isTargetMachO() ? getGOT(Pool, DAG)
	                                      : getAddrLarge(Pool, DAG);
	  }
	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Pool, DAG);
	  return getAddr(Pool, DAG);
	}

	/// Produce the address of a basic block, choosing the addressing helper from
	/// the code model (large on non-MachO targets, tiny, or the default).
	SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  BlockAddressSDNode *Block = cast<BlockAddressSDNode>(Op);
	  const auto Model = getTargetMachine().getCodeModel();

	  if (Model == CodeModel::Large && !Subtarget->isTargetMachO())
	    return getAddrLarge(Block, DAG);
	  if (Model == CodeModel::Tiny)
	    return getAddrTiny(Block, DAG);
	  return getAddr(Block, DAG);
	}

	/// Darwin va_start: store the frame index of the variadic stack area into
	/// the va_list pointed to by operand 1.
	SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  AArch64FunctionInfo *AFI =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  SDValue StackArea = DAG.getFrameIndex(AFI->getVarArgsStackIndex(),
	                                        getPointerTy(DAG.getDataLayout()));
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

	  SDValue Chain = Op.getOperand(0);
	  SDValue VAListPtr = Op.getOperand(1);
	  return DAG.getStore(Chain, DL, StackArea, VAListPtr,
	                      MachinePointerInfo(SrcVal));
	}

	/// Win64 va_start: store into the va_list (operand 1) the frame index of the
	/// GPR save area when that area is non-empty, otherwise the frame index of
	/// the variadic stack area.
	SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  AArch64FunctionInfo *AFI =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  const auto FirstArgFI = AFI->getVarArgsGPRSize() > 0
	                              ? AFI->getVarArgsGPRIndex()
	                              : AFI->getVarArgsStackIndex();
	  SDValue FirstArg =
	      DAG.getFrameIndex(FirstArgFI, getPointerTy(DAG.getDataLayout()));
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

	  SDValue Chain = Op.getOperand(0);
	  SDValue VAListPtr = Op.getOperand(1);
	  return DAG.getStore(Chain, DL, FirstArg, VAListPtr,
	                      MachinePointerInfo(SrcVal));
	}

	/// AAPCS va_start: initialise the five fields of the va_list struct pointed
	/// to by operand 1 (__stack, __gr_top, __vr_top, __gr_offs, __vr_offs).
	///
	/// Each field is written by an independent store hanging off the incoming
	/// chain; the stores are joined with a TokenFactor, which is returned.
	SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
	SelectionDAG &DAG) const {
	// The layout of the va_list struct is specified in the AArch64 Procedure Call
	// Standard, section B.3.
	MachineFunction &MF = DAG.getMachineFunction();
	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(Op);

	SDValue Chain = Op.getOperand(0);
	SDValue VAList = Op.getOperand(1);
	const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	// One store per va_list field that gets written.
	SmallVector<SDValue, 4> MemOps;

	// void *__stack at offset 0
	SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
	MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
	MachinePointerInfo(SV), /* Alignment = */ 8));

	// void *__gr_top at offset 8
	// Only written when a GPR save area exists; it points GPRSize bytes past
	// the start of that area.
	int GPRSize = FuncInfo->getVarArgsGPRSize();
	if (GPRSize > 0) {
	SDValue GRTop, GRTopAddr;

	GRTopAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));

	GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
	GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
	DAG.getConstant(GPRSize, DL, PtrVT));

	MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
	MachinePointerInfo(SV, 8),
	/* Alignment = */ 8));
	}

	// void *__vr_top at offset 16
	// Only written when an FPR save area exists; it points FPRSize bytes past
	// the start of that area.
	int FPRSize = FuncInfo->getVarArgsFPRSize();
	if (FPRSize > 0) {
	SDValue VRTop, VRTopAddr;
	VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	DAG.getConstant(16, DL, PtrVT));

	VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
	VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
	DAG.getConstant(FPRSize, DL, PtrVT));

	MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
	MachinePointerInfo(SV, 16),
	/* Alignment = */ 8));
	}

	// int __gr_offs at offset 24
	// Stored as the negated size of the GPR save area.
	SDValue GROffsAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
	MemOps.push_back(DAG.getStore(
	Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
	MachinePointerInfo(SV, 24), /* Alignment = */ 4));

	// int __vr_offs at offset 28
	// Stored as the negated size of the FPR save area.
	SDValue VROffsAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
	MemOps.push_back(DAG.getStore(
	Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
	MachinePointerInfo(SV, 28), /* Alignment = */ 4));

	// Join all field stores into a single chain result.
	return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Dispatch va_start lowering: the Win64 calling convention is checked
	/// first, then Darwin targets, and everything else uses the AAPCS layout.
	SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const auto CallConv = MF.getFunction().getCallingConv();

	  if (Subtarget->isCallingConvWin64(CallConv))
	    return LowerWin64_VASTART(Op, DAG);
	  if (Subtarget->isTargetDarwin())
	    return LowerDarwin_VASTART(Op, DAG);
	  return LowerAAPCS_VASTART(Op, DAG);
	}

	/// Lower va_copy as a memcpy of the whole va_list object from the source
	/// (operand 2) to the destination (operand 1).
	SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
	  // pointer. Windows targets use the same 8-byte size as Darwin here.
	  SDLoc DL(Op);
	  unsigned VaListSize =
	      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? 8 : 32;
	  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

	  // The literal 8 is the alignment argument of the copy.
	  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
	                       Op.getOperand(2),
	                       DAG.getConstant(VaListSize, DL, MVT::i32),
	                       8, false, false, false, MachinePointerInfo(DestSV),
	                       MachinePointerInfo(SrcSV));
	}

	/// Lower va_arg for Darwin, where the va_list is a single pointer.
	///
	/// Loads the current argument pointer, realigns it when the requested
	/// alignment exceeds 8, stores back the pointer advanced past the argument,
	/// and finally loads the argument itself. Scalar FP types other than f64 are
	/// loaded as f64 and rounded to the requested type.
	SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "automatic va_arg instruction only works on Darwin");

	  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Addr = Op.getOperand(1);
	  unsigned Align = Op.getConstantOperandVal(3);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
	  Chain = VAList.getValue(1);

	  if (Align > 8) {
	    // Round the pointer up to the next multiple of Align.
	    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
	    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                         DAG.getConstant(Align - 1, DL, PtrVT));
	    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
	                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
	  }

	  // getTypeForEVT takes the LLVMContext by reference and returns a Type
	  // pointer.
	  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
	  uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

	  // Scalar integer and FP values smaller than 64 bits are implicitly extended
	  // up to 64 bits. At the very least, we have to increase the striding of the
	  // vaargs list to match this, and for FP values we need to introduce
	  // FP_ROUND nodes as well.
	  if (VT.isInteger() && !VT.isVector())
	    ArgSize = 8;
	  bool NeedFPTrunc = false;
	  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
	    ArgSize = 8;
	    NeedFPTrunc = true;
	  }

	  // Increment the pointer, VAList, to the next vaarg
	  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                               DAG.getConstant(ArgSize, DL, PtrVT));
	  // Store the incremented VAList to the legalized pointer
	  SDValue APStore =
	      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

	  // Load the actual argument out of the pointer VAList
	  if (NeedFPTrunc) {
	    // Load the value as an f64.
	    SDValue WideFP =
	        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
	    // Round the value down to the requested narrower FP type.
	    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
	                                   DAG.getIntPtrConstant(1, DL));
	    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
	    // Merge the rounded value with the chain output of the load.
	    return DAG.getMergeValues(Ops, DL);
	  }

	  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
	}

	/// Lower ISD::FRAMEADDR: start from the frame pointer register and follow
	/// the chain of saved frame pointers once per requested depth level.
	SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
	  // Each load reads the value stored at the current frame address.
	  for (unsigned Level = Depth; Level != 0; --Level)
	    Frame = DAG.getLoad(VT, DL, DAG.getEntryNode(), Frame,
	                        MachinePointerInfo());
	  return Frame;
	}

	/// Lower the sponentry request by creating a fixed stack object
	/// (size 4, offset 0, not immutable) and returning its frame index as a
	/// pointer-typed value.
	SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  const int EntryFI = FrameInfo.CreateFixedObject(4, 0, false);
	  return DAG.getFrameIndex(EntryFI, PtrVT);
	}

	#define GET_REGISTER_MATCHER
	#include "AArch64GenAsmMatcher.inc"

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	SelectionDAG &DAG) const {
	unsigned Reg = MatchRegisterName(RegName);
	if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
	const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
	unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
	if (!Subtarget->isXRegisterReserved(DwarfRegNum))
	Reg = 0;
	}
	if (Reg)
	return Reg;
	report_fatal_error(Twine("Invalid register name \""
	+ StringRef(RegName) + "\"."));
	}

	/// Compute the address of the return-address slot as the frame pointer
	/// plus 8.
	SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setFrameAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  SDValue FramePtr =
	      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
	  SDValue Eight = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
	  return DAG.getNode(ISD::ADD, DL, VT, FramePtr, Eight);
	}

	/// Lower ISD::RETURNADDR.
	///
	/// Depth 0 reads LR, which is added as a function live-in. For deeper frames
	/// the return address is loaded from offset 8 of the frame address computed
	/// by LowerFRAMEADDR.
	SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  if (Depth == 0) {
	    // Return LR, which contains the return address. Mark it an implicit
	    // live-in.
	    unsigned LiveInReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LiveInReg, VT);
	  }

	  SDValue Frame = LowerFRAMEADDR(Op, DAG);
	  SDValue Eight = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, Frame, Eight);
	  return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are (lo, hi, amount). The result is the merged pair {Lo, Hi};
	/// each half is chosen by a CSEL between the "normal" form (amount < VTBits)
	/// and the "big" form (amount >= VTBits).
	SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  EVT VT = Op.getValueType();
	  unsigned VTBits = VT.getSizeInBits();
	  SDLoc dl(Op);
	  SDValue ShOpLo = Op.getOperand(0);
	  SDValue ShOpHi = Op.getOperand(1);
	  SDValue ShAmt = Op.getOperand(2);
	  // Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
	  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

	  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

	  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

	  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
	  // is "undef". We wanted 0, so CSEL it directly.
	  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	                               ISD::SETEQ, dl, DAG);
	  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	  HiBitsForLo =
	      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	                  HiBitsForLo, CCVal, Cmp);

	  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	                                   DAG.getConstant(VTBits, dl, MVT::i64));

	  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
	  SDValue LoForNormalShift =
	      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);

	  // ExtraShAmt >= 0 means the shift amount is at least VTBits. The same
	  // Cmp/CCVal pair selects both the low and the high half below.
	  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	                       dl, DAG);
	  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
	  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	                           LoForNormalShift, CCVal, Cmp);

	  // AArch64 shifts larger than the register width are wrapped rather than
	  // clamped, so we can't just emit "hi >> x".
	  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
	  SDValue HiForBigShift =
	      Opc == ISD::SRA
	          ? DAG.getNode(Opc, dl, VT, ShOpHi,
	                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
	          : DAG.getConstant(0, dl, VT);
	  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	                           HiForNormalShift, CCVal, Cmp);

	  SDValue Ops[2] = { Lo, Hi };
	  return DAG.getMergeValues(Ops, dl);
	}

	/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are (lo, hi, amount). The result is the merged pair {Lo, Hi};
	/// each half is chosen by a CSEL between the "normal" form (amount < VTBits)
	/// and the "big" form (amount >= VTBits).
	SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);

	assert(Op.getOpcode() == ISD::SHL_PARTS);
	// Bits of the low half that move into the high half: lo >> (VTBits - amt).
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	LoBitsForHi =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	LoBitsForHi, CCVal, Cmp);

	// ExtraShAmt = amt - VTBits; non-negative when the shift is "big".
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));
	SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
	SDValue HiForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

	SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

	// The same Cmp/CCVal pair selects both the high and the low half below.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	// AArch64 shifts of larger than register sizes are wrapped rather than
	// clamped, so we can't just emit "lo << a" if a is too big.
	SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
	SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// Always reports that folding an offset into a global address is not legal
	/// at this point; \p GA is not inspected.
	bool AArch64TargetLowering::isOffsetFoldingLegal(
	    const GlobalAddressSDNode *GA) const {
	  // Offset folding is performed later, in the DAG combine, where the uses
	  // are known and an offset can be chosen intelligently.
	  return false;
	}

	/// Return true if the FP immediate \p Imm of type \p VT can be materialized
	/// without a constant-pool load.
	///
	/// It is legal when it is encodable as an fmov immediate or is +0.0 (f64,
	/// f32, and f16 with full FP16 support), or, for f64/f32, when its bit
	/// pattern can be built with a short enough integer move sequence.
	bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	                                         bool OptForSize) const {
	  bool IsLegal = false;
	  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
	  // 16-bit case when target has full fp16 support.
	  // FIXME: We should be able to handle f128 as well with a clever lowering.
	  const APInt ImmInt = Imm.bitcastToAPInt();
	  if (VT == MVT::f64)
	    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
	  else if (VT == MVT::f32)
	    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
	  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
	    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
	  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
	  // generate that fmov.

	  // If we can not materialize in immediate field for fmov, check if the
	  // value can be encoded as the immediate operand of a logical instruction.
	  // The immediate value will be created with either MOVZ, MOVN, or ORR.
	  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
	    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
	    // however the mov+fmov sequence is always better because of the reduced
	    // cache pressure. The timings are still the same if you consider
	    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
	    // movw+movk is fused). So the sequence length is capped: 1 instruction
	    // when optimizing for size, otherwise 5 when the subtarget fuses
	    // literals and 2 when it does not.
	    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
	    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
	                              Insn);
	    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
	    IsLegal = Insn.size() <= Limit;
	  }

	  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
	                    << " imm value: "; Imm.dump(););
	  return IsLegal;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Build the initial estimate node \p Opcode for \p Operand.
	///
	/// Returns an empty SDValue unless the subtarget has NEON and the operand is
	/// one of the supported f32/f64 scalar or vector types. When \p ExtraSteps
	/// is Unspecified on entry it is set to the default number of refinement
	/// steps for the element type.
	static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
	                           SDValue Operand, SelectionDAG &DAG,
	                           int &ExtraSteps) {
	  EVT VT = Operand.getValueType();
	  if (ST->hasNEON() &&
	      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
	       VT == MVT::f32 || VT == MVT::v1f32 ||
	       VT == MVT::v2f32 || VT == MVT::v4f32)) {
	    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
	      // For the reciprocal estimates, convergence is quadratic, so the number
	      // of digits is doubled after each iteration. In ARMv8, the accuracy of
	      // the initial estimate is 2^-8. Thus the number of extra steps to refine
	      // the result for float (23 mantissa bits) is 2 and for double (52
	      // mantissa bits) is 3.
	      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

	    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
	  }

	  return SDValue();
	}

	/// Build an estimate of sqrt(Operand), or of 1/sqrt(Operand) when
	/// \p Reciprocal is set, from FRSQRTE plus \p ExtraSteps refinement steps.
	///
	/// Used when \p Enabled is Enabled, or when it is Unspecified and the
	/// subtarget prefers rsqrt. On success \p ExtraSteps is reset to 0 because
	/// the refinement has already been emitted here; otherwise an empty SDValue
	/// is returned. \p UseOneConst is not touched.
	SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
	                                               SelectionDAG &DAG, int Enabled,
	                                               int &ExtraSteps,
	                                               bool &UseOneConst,
	                                               bool Reciprocal) const {
	  if (Enabled == ReciprocalEstimate::Enabled ||
	      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) {
	    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
	                                       DAG, ExtraSteps)) {
	      SDLoc DL(Operand);
	      EVT VT = Operand.getValueType();

	      SDNodeFlags Flags;
	      Flags.setAllowReassociation(true);

	      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
	      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
	      for (int i = ExtraSteps; i > 0; --i) {
	        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
	                                   Flags);
	        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
	        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
	      }
	      if (!Reciprocal) {
	        EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                      VT);
	        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	        SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

	        // sqrt(X) = X * (1 / sqrt(X)).
	        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
	        // Correct the result if the operand is 0.0.
	        Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
	                               VT, Eq, Operand, Estimate);
	      }

	      ExtraSteps = 0;
	      return Estimate;
	    }
	  }

	  return SDValue();
	}

	/// Build an estimate of 1/Operand from FRECPE plus \p ExtraSteps refinement
	/// steps.
	///
	/// Only used when \p Enabled is explicitly Enabled. On success
	/// \p ExtraSteps is reset to 0 because the refinement has already been
	/// emitted here; otherwise an empty SDValue is returned.
	SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
	                                                SelectionDAG &DAG, int Enabled,
	                                                int &ExtraSteps) const {
	  if (Enabled != ReciprocalEstimate::Enabled)
	    return SDValue();

	  SDValue Result =
	      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
	  if (!Result)
	    return SDValue();

	  SDLoc DL(Operand);
	  EVT VT = Operand.getValueType();

	  SDNodeFlags Flags;
	  Flags.setAllowReassociation(true);

	  // Newton reciprocal iteration: E * (2 - X * E)
	  // AArch64 reciprocal iteration instruction: (2 - M * N)
	  for (int Remaining = ExtraSteps; Remaining > 0; --Remaining) {
	    SDValue Correction =
	        DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Result, Flags);
	    Result = DAG.getNode(ISD::FMUL, DL, VT, Result, Correction, Flags);
	  }

	  ExtraSteps = 0;
	  return Result;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Table of Constraints
	// TODO: This is the current set of constraints supported by ARM for the
	// compiler, not all of them may make sense.
	//
	// r - A general register
	// w - An FP/SIMD register of some size in the range v0-v31
	// x - An FP/SIMD register of some size in the range v0-v15
	// I - Constant that can be used with an ADD instruction
	// J - Constant that can be used with a SUB instruction
	// K - Constant that can be used with a 32-bit logical instruction
	// L - Constant that can be used with a 64-bit logical instruction
	// M - Constant that can be used as a 32-bit MOV immediate
	// N - Constant that can be used as a 64-bit MOV immediate
	// Q - A memory reference with base register and no offset
	// S - A symbolic address
	// Y - Floating point constant zero
	// Z - Integer constant zero
	//
	// Note that general register operands will be output using their 64-bit x
	// register name, whatever the size of the variable, unless the asm operand
	// is prefixed by the %w modifier. Floating-point and SIMD register operands
	// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
	// %q modifier.
	/// Lower the generic "X" inline-asm constraint to a concrete one.
	///
	/// Returns "w" for floating-point types and for 64- or 128-bit vectors when
	/// the subtarget has FP/ARMv8 support, and "r" in every other case.
	const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  // At this point, we have to lower this constraint to something else, so we
	  // lower it to an "r" or "w". However, by doing this we will force the result
	  // to be in register, while the X constraint is much more permissive.
	  //
	  // Although we are correct (we are free to emit anything, without
	  // constraints), we might break use cases that would expect us to be more
	  // efficient and emit something else.
	  if (!Subtarget->hasFPARMv8())
	    return "r";

	  if (ConstraintVT.isFloatingPoint())
	    return "w";

	  if (ConstraintVT.isVector() &&
	      (ConstraintVT.getSizeInBits() == 64 ||
	       ConstraintVT.getSizeInBits() == 128))
	    return "w";

	  return "r";
	}

	/// getConstraintType - Given a constraint letter, return the type of
	/// constraint it is for this target.
	///
	/// Only single-character constraints are classified here:
	///   'x', 'w'                                -> C_RegisterClass
	///   'Q'                                     -> C_Memory
	///   'I', 'J', 'K', 'L', 'M', 'N', 'Y', 'Z'  -> C_Immediate
	///   'z', 'S'                                -> C_Other
	/// Anything else (including multi-character constraints) is forwarded to
	/// TargetLowering::getConstraintType.
	///
	/// NOTE(review): this block is shown in diff form; lines prefixed with '-'
	/// are removed and lines prefixed with '+' are added by the change.
	AArch64TargetLowering::ConstraintType
	AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	default:
	break;
	- case 'z':
	- return C_Other;
	case 'x':
	case 'w':
	return C_RegisterClass;
	// An address with a single base register. Due to the way we
	// currently handle addresses it is the same as 'r'.
	case 'Q':
	return C_Memory;
	+ case 'I':
	+ case 'J':
	+ case 'K':
	+ case 'L':
	+ case 'M':
	+ case 'N':
	+ case 'Y':
	+ case 'Z':
	+ return C_Immediate;
	+ case 'z':
	case 'S': // A symbolic address
	return C_Other;
	}
	}
	return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// - No call operand value: CW_Default.
	/// - 'x' / 'w': CW_Register for floating-point or vector operands, otherwise
	///   CW_Invalid.
	/// - 'z': CW_Constant.
	/// - Anything else: whatever the generic TargetLowering implementation says.
	TargetLowering::ConstraintWeight
	AArch64TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    break;
	  case 'x':
	  case 'w':
	    if (type->isFloatingPointTy() || type->isVectorTy())
	      weight = CW_Register;
	    break;
	  case 'z':
	    weight = CW_Constant;
	    break;
	  }
	  return weight;
	}

	/// Map an inline-asm register constraint to a (register, register class)
	/// pair.
	///
	/// Single-letter constraints are resolved by the size of \p VT:
	///   'r' -> GPR64common for 64-bit values, GPR32common otherwise
	///   'w' -> FPR16/FPR32/FPR64/FPR128 for 16/32/64/128-bit values
	///   'x' -> FPR128_lo for 128-bit values
	/// 'w' and 'x' are only honoured when the subtarget has FP/ARMv8 support.
	/// "{cc}" (case-insensitive) maps to NZCV. Everything else goes through the
	/// generic TargetLowering implementation first and, if that finds no class,
	/// "{vN}" with N in [0, 31] is accepted as an FPR64 or FPR128 register.
	///
	/// Returns (0, nullptr) when the subtarget has no FP/ARMv8 support and the
	/// class found is rejected by both the GPR32all and GPR64all hasSubClassEq
	/// checks.
	std::pair<unsigned, const TargetRegisterClass *>
	AArch64TargetLowering::getRegForInlineAsmConstraint(
	    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	  if (Constraint.size() == 1) {
	    switch (Constraint[0]) {
	    case 'r':
	      if (VT.getSizeInBits() == 64)
	        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
	      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
	    case 'w':
	      if (!Subtarget->hasFPARMv8())
	        break;
	      if (VT.getSizeInBits() == 16)
	        return std::make_pair(0U, &AArch64::FPR16RegClass);
	      if (VT.getSizeInBits() == 32)
	        return std::make_pair(0U, &AArch64::FPR32RegClass);
	      if (VT.getSizeInBits() == 64)
	        return std::make_pair(0U, &AArch64::FPR64RegClass);
	      if (VT.getSizeInBits() == 128)
	        return std::make_pair(0U, &AArch64::FPR128RegClass);
	      break;
	    // The instructions that this constraint is designed for can
	    // only take 128-bit registers so just use that regclass.
	    case 'x':
	      if (!Subtarget->hasFPARMv8())
	        break;
	      if (VT.getSizeInBits() == 128)
	        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
	      break;
	    }
	  }
	  if (StringRef("{cc}").equals_lower(Constraint))
	    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

	  // Use the default implementation in TargetLowering to convert the register
	  // constraint into a member of a register class.
	  std::pair<unsigned, const TargetRegisterClass *> Res;
	  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	  // Not found as a standard register?
	  if (!Res.second) {
	    unsigned Size = Constraint.size();
	    // Accept "{vN}" / "{vNN}" (the 'v' may be upper or lower case).
	    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
	        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
	      int RegNo;
	      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
	      if (!Failed && RegNo >= 0 && RegNo <= 31) {
	        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
	        // By default we'll emit v0-v31 for this unless there's a modifier where
	        // we'll emit the correct register as well.
	        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
	          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
	          Res.second = &AArch64::FPR64RegClass;
	        } else {
	          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
	          Res.second = &AArch64::FPR128RegClass;
	        }
	      }
	    }
	  }

	  // Without FP/ARMv8 support only general-purpose register classes are
	  // usable; reject anything else.
	  if (Res.second && !Subtarget->hasFPARMv8() &&
	      !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
	      !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
	    return std::make_pair(0U, nullptr);

	  return Res;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Only single-character constraints are handled. 'z', 'S' and the immediate
	/// constraints 'I', 'J', 'K', 'L', 'M', 'N' are validated here; an operand
	/// that fails validation is dropped (nothing is pushed). Any other letter is
	/// forwarded to TargetLowering::LowerAsmOperandForConstraint.
	void AArch64TargetLowering::LowerAsmOperandForConstraint(
	    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
	    SelectionDAG &DAG) const {
	  SDValue Result;

	  // Currently only support length 1 constraints.
	  if (Constraint.length() != 1)
	    return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default:
	    break;

	  // This set of constraints deal with valid constants for various instructions.
	  // Validate and return a target constant for them if we can.
	  case 'z': {
	    // 'z' maps to xzr or wzr so it needs an input of 0.
	    if (!isNullConstant(Op))
	      return;

	    if (Op.getValueType() == MVT::i64)
	      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
	    else
	      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
	    break;
	  }
	  case 'S': {
	    // An absolute symbolic address or label reference.
	    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
	      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
	                                          GA->getValueType(0));
	    } else if (const BlockAddressSDNode *BA =
	                   dyn_cast<BlockAddressSDNode>(Op)) {
	      Result =
	          DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
	    } else if (const ExternalSymbolSDNode *ES =
	                   dyn_cast<ExternalSymbolSDNode>(Op)) {
	      Result =
	          DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
	    } else
	      return;
	    break;
	  }

	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	  case 'N':
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	    if (!C)
	      return;

	    // Grab the value and do some validation.
	    uint64_t CVal = C->getZExtValue();
	    switch (ConstraintLetter) {
	    // The I constraint applies only to simple ADD or SUB immediate operands:
	    // i.e. 0 to 4095 with optional shift by 12
	    // The J constraint applies only to ADD or SUB immediates that would be
	    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
	    // instruction [or vice versa], in other words -1 to -4095 with optional
	    // left shift by 12.
	    case 'I':
	      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
	        break;
	      return;
	    case 'J': {
	      // Negate in the unsigned domain: negating the signed value directly is
	      // undefined for INT64_MIN.
	      uint64_t NVal = 0 - static_cast<uint64_t>(C->getSExtValue());
	      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
	        CVal = C->getSExtValue();
	        break;
	      }
	      return;
	    }
	    // The K and L constraints apply only to logical immediates, including
	    // what used to be the MOVI alias for ORR (though the MOVI alias has now
	    // been removed and MOV should be used). So these constraints have to
	    // distinguish between bit patterns that are valid 32-bit or 64-bit
	    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
	    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
	    // versa.
	    case 'K':
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      return;
	    case 'L':
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      return;
	    // The M and N constraints are a superset of K and L respectively, for use
	    // with the MOV (immediate) alias. As well as the logical immediates they
	    // also match 32 or 64-bit immediates that can be loaded either using a
	    // single MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
	    // (M) or 64-bit 0x1234000000000000 (N) etc.
	    // As a note some of this code is liberally stolen from the asm parser.
	    case 'M': {
	      if (!isUInt<32>(CVal))
	        return;
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      if ((CVal & 0xFFFF) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      uint64_t NCVal = ~(uint32_t)CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      return;
	    }
	    case 'N': {
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      if ((CVal & 0xFFFFULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF00000000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF000000000000ULL) == CVal)
	        break;
	      uint64_t NCVal = ~CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
	        break;
	      return;
	    }
	    default:
	      return;
	    }

	    // All assembler immediates are 64-bit integers.
	    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
	    break;
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }

	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Advanced SIMD Support
	//===----------------------------------------------------------------------===//

	/// WidenVector - Given a value in the V64 register class, produce the
	/// equivalent value in the V128 register class.
	///
	/// The result has the same element type and twice the element count; the
	/// input is inserted at index 0 of an undef vector of the wide type.
	static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
	  SDLoc DL(V64Reg);
	  EVT NarrowVT = V64Reg.getValueType();
	  MVT ScalarTy = NarrowVT.getVectorElementType().getSimpleVT();
	  MVT WideTy = MVT::getVectorVT(ScalarTy, 2 * NarrowVT.getVectorNumElements());

	  SDValue WideUndef = DAG.getUNDEF(WideTy);
	  SDValue InsertIdx = DAG.getConstant(0, DL, MVT::i32);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, WideUndef, V64Reg,
	                     InsertIdx);
	}

	/// getExtFactor - Determine the adjustment factor for the position when
	/// generating an "extract from vector registers" instruction.
	///
	/// The factor is the size of one vector element of \p V in bytes.
	static unsigned getExtFactor(SDValue &V) {
	  const unsigned EltBits =
	      V.getValueType().getVectorElementType().getSizeInBits();
	  return EltBits / 8;
	}

	/// NarrowVector - Given a value in the V128 register class, produce the
	/// equivalent value in the V64 register class.
	///
	/// The result keeps the element type, halves the element count and is
	/// obtained by extracting the dsub subregister.
	static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
	  SDLoc DL(V128Reg);
	  EVT WideVT = V128Reg.getValueType();
	  MVT ScalarTy = WideVT.getVectorElementType().getSimpleVT();
	  MVT HalfTy = MVT::getVectorVT(ScalarTy, WideVT.getVectorNumElements() / 2);

	  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, HalfTy, V128Reg);
	}

	// Gather data to see if the operation can be modelled as a
	// shuffle in combination with VEXTs.
	//
	// Op must be a BUILD_VECTOR. Every non-undef operand has to be an
	// EXTRACT_VECTOR_ELT with a constant index, and at most two distinct source
	// vectors may be involved. Returns the replacement value (a VECTOR_SHUFFLE
	// bitcast back to Op's type) or an empty SDValue if no legal shuffle could be
	// formed.
	SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  unsigned NumElts = VT.getVectorNumElements();

	  struct ShuffleSourceInfo {
	    SDValue Vec;
	    unsigned MinElt;
	    unsigned MaxElt;

	    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
	    // be compatible with the shuffle we intend to construct. As a result
	    // ShuffleVec will be some sliding window into the original Vec.
	    SDValue ShuffleVec;

	    // Code should guarantee that element i in Vec starts at element "WindowBase
	    // + i * WindowScale in ShuffleVec".
	    int WindowBase;
	    int WindowScale;

	    ShuffleSourceInfo(SDValue Vec)
	        : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
	          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

	    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
	  };

	  // First gather all vectors used as an immediate source for this BUILD_VECTOR
	  // node.
	  SmallVector<ShuffleSourceInfo, 2> Sources;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    SDValue V = Op.getOperand(i);
	    if (V.isUndef())
	      continue;
	    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	             !isa<ConstantSDNode>(V.getOperand(1))) {
	      LLVM_DEBUG(
	          dbgs() << "Reshuffle failed: "
	                    "a shuffle can only come from building a vector from "
	                    "various elements of other vectors, provided their "
	                    "indices are constant\n");
	      return SDValue();
	    }

	    // Add this element source to the list if it's not already there.
	    SDValue SourceVec = V.getOperand(0);
	    auto Source = find(Sources, SourceVec);
	    if (Source == Sources.end())
	      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

	    // Update the minimum and maximum lane number seen.
	    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
	    Source->MinElt = std::min(Source->MinElt, EltNo);
	    Source->MaxElt = std::max(Source->MaxElt, EltNo);
	  }

	  if (Sources.size() > 2) {
	    LLVM_DEBUG(
	        dbgs() << "Reshuffle failed: currently only do something sane when at "
	                  "most two source vectors are involved\n");
	    return SDValue();
	  }

	  // Find out the smallest element size among result and two sources, and use
	  // it as element size to build the shuffle_vector.
	  EVT SmallestEltTy = VT.getVectorElementType();
	  for (auto &Source : Sources) {
	    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
	    if (SrcEltTy.bitsLT(SmallestEltTy)) {
	      SmallestEltTy = SrcEltTy;
	    }
	  }
	  unsigned ResMultiplier =
	      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
	  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
	  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

	  // If the source vector is too wide or too narrow, we may nevertheless be able
	  // to construct a compatible shuffle either by concatenating it with UNDEF or
	  // extracting a suitable range of elements.
	  for (auto &Src : Sources) {
	    EVT SrcVT = Src.ShuffleVec.getValueType();

	    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
	      continue;

	    // This stage of the search produces a source with the same element type as
	    // the original, but with a total width matching the BUILD_VECTOR output.
	    EVT EltVT = SrcVT.getVectorElementType();
	    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
	    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

	    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
	      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
	      // We can pad out the smaller vector for free, so if it's part of a
	      // shuffle...
	      Src.ShuffleVec =
	          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
	                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
	      continue;
	    }

	    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

	    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
	      LLVM_DEBUG(
	          dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
	      return SDValue();
	    }

	    if (Src.MinElt >= NumSrcElts) {
	      // The extraction can just take the second half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      Src.WindowBase = -NumSrcElts;
	    } else if (Src.MaxElt < NumSrcElts) {
	      // The extraction can just take the first half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	    } else {
	      // An actual VEXT is needed
	      SDValue VEXTSrc1 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	      SDValue VEXTSrc2 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

	      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
	                                   VEXTSrc2,
	                                   DAG.getConstant(Imm, dl, MVT::i32));
	      Src.WindowBase = -Src.MinElt;
	    }
	  }

	  // Another possible incompatibility occurs from the vector element types. We
	  // can fix this by bitcasting the source vectors to the same type we intend
	  // for the shuffle.
	  for (auto &Src : Sources) {
	    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
	    if (SrcEltTy == SmallestEltTy)
	      continue;
	    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
	    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
	    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
	    Src.WindowBase *= Src.WindowScale;
	  }

	  // Final sanity check before we try to actually produce a shuffle.
	  LLVM_DEBUG(for (auto Src
	                  : Sources)
	                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

	  // The stars all align, our next step is to produce the mask for the shuffle.
	  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
	  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
	  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	    SDValue Entry = Op.getOperand(i);
	    if (Entry.isUndef())
	      continue;

	    auto Src = find(Sources, Entry.getOperand(0));
	    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

	    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
	    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
	    // segment.
	    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
	    int BitsDefined =
	        std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
	    int LanesDefined = BitsDefined / BitsPerShuffleLane;

	    // This source is expected to fill ResMultiplier lanes of the final shuffle,
	    // starting at the appropriate offset.
	    int *LaneMask = &Mask[i * ResMultiplier];

	    // Lanes of the second source are numbered after all NumElts lanes of the
	    // first one, hence the offset by the source's position in Sources.
	    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
	    ExtractBase += NumElts * (Src - Sources.begin());
	    for (int j = 0; j < LanesDefined; ++j)
	      LaneMask[j] = ExtractBase + j;
	  }

	  // Final check before we try to produce nonsense...
	  if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
	    LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
	    return SDValue();
	  }

	  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
	  for (unsigned i = 0; i < Sources.size(); ++i)
	    ShuffleOps[i] = Sources[i].ShuffleVec;

	  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
	                                         ShuffleOps[1], Mask);
	  SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);

	  LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
	             dbgs() << "Reshuffle, creating node: "; V.dump(););

	  return V;
	}

	// Check whether an EXT instruction can implement the shuffle mask when both
	// vector sources of the shuffle are the same.
	//
	// On entry M[0] must be defined (otherwise the answer is "no"). Imm is set to
	// M[0]; every later defined index must then follow on consecutively, wrapping
	// back to 0 once NumElts is reached. Undefined indices match anything.
	static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
	  const unsigned NumElts = VT.getVectorNumElements();

	  // The first shuffle index anchors the rotation, so it cannot be UNDEF.
	  if (M[0] < 0)
	    return false;

	  Imm = M[0];

	  unsigned Want = Imm;
	  for (unsigned Pos = 1; Pos < NumElts; ++Pos) {
	    // Advance the expected index, wrapping around to element zero.
	    Want = (Want + 1 == NumElts) ? 0 : Want + 1;

	    // UNDEF indices are compatible with any rotation.
	    if (M[Pos] >= 0 && static_cast<unsigned>(M[Pos]) != Want)
	      return false;
	  }

	  return true;
	}

	// Check whether an EXT instruction can implement the shuffle mask when the
	// two vector sources of the shuffle are different.
	//
	// On success Imm holds the EXT start index (relative to the first operand
	// after any swap) and ReverseEXT is set to true when the two inputs must be
	// swapped. ReverseEXT is left untouched otherwise.
	static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
	                      unsigned &Imm) {
	  // Locate the first defined (non-undef) mask entry.
	  const int *FirstDefined = find_if(M, [](int Idx) { return Idx >= 0; });

	  // Use an APInt of just enough bits so that the expected index wraps around
	  // at 2 * NumElts on its own.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
	  APInt Expected = APInt(MaskBits, *FirstDefined + 1);

	  // Every entry after the first defined one must either be undef or continue
	  // the consecutive run.
	  for (const int *It = FirstDefined + 1; It != M.end(); ++It) {
	    const int Idx = *It;
	    if (Idx != Expected++ && Idx != -1)
	      return false;
	  }

	  // The index of an EXT is the first element if it is not UNDEF.
	  // Watch out for the beginning UNDEFs. The EXT index should be the expected
	  // value of the first element. E.g.
	  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
	  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
	  // Expected is the last mask index plus 1.
	  Imm = Expected.getZExtValue();

	  // There are two different cases that require reversing the input vectors.
	  // For example, for vector <4 x i32> we have the following cases,
	  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
	  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
	  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
	  // to reverse two input vectors.
	  if (Imm < NumElts)
	    ReverseEXT = true;
	  else
	    Imm -= NumElts;

	  return true;
	}

	/// isREVMask - Check if a vector shuffle corresponds to a REV
	/// instruction with the specified blocksize. (The order of the elements
	/// within each block of the vector is reversed.)
	///
	/// \p BlockSize is in bits and must be 16, 32 or 64. Vectors with 64-bit
	/// elements never match. Undefined mask entries match anything.
	static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
	  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
	         "Only possible block sizes for REV are: 16, 32, 64");

	  unsigned EltSz = VT.getScalarSizeInBits();
	  if (EltSz == 64)
	    return false;

	  unsigned NumElts = VT.getVectorNumElements();
	  // In a reversed block the first index is the last element of the block, so
	  // M[0] + 1 is the number of elements per block.
	  unsigned BlockElts = M[0] + 1;
	  // If the first shuffle index is UNDEF, be optimistic.
	  if (M[0] < 0)
	    BlockElts = BlockSize / EltSz;

	  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
	    return false;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    if (M[i] < 0)
	      continue; // ignore UNDEF indices
	    // Expected index: start of the block plus the mirrored offset within it.
	    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
	      return false;
	  }

	  return true;
	}

	/// isZIPMask - Check if a two-input shuffle mask interleaves the low halves
	/// (WhichResult == 0) or the high halves (WhichResult == 1) of its inputs,
	/// e.g. <0, 4, 1, 5> or <2, 6, 3, 7> for four elements.
	///
	/// WhichResult is chosen from M[0] (0 if M[0] == 0, else 1) and is written
	/// even when the function returns false. Undefined entries match anything.
	static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  // The loop below consumes the mask in pairs; an odd element count would
	  // step past the end of the mask.
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZPMask - Check if a two-input shuffle mask selects every other element
	/// of the concatenated inputs: the even ones (WhichResult == 0) or the odd
	/// ones (WhichResult == 1).
	///
	/// WhichResult is chosen from M[0] (0 if M[0] == 0, else 1). Undefined
	/// entries match anything.
	static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned NumElts = VT.getVectorNumElements();
	  WhichResult = (M[0] == 0 ? 0 : 1);

	  for (unsigned Pos = 0; Pos != NumElts; ++Pos) {
	    const int Idx = M[Pos];
	    // UNDEF indices are ignored; defined ones must equal 2 * Pos + offset.
	    if (Idx >= 0 && (unsigned)Idx != 2 * Pos + WhichResult)
	      return false;
	  }

	  return true;
	}

	/// isTRNMask - Check if a two-input shuffle mask pairs up the even-numbered
	/// (WhichResult == 0) or odd-numbered (WhichResult == 1) elements of its
	/// inputs, e.g. <0, 4, 2, 6> or <1, 5, 3, 7> for four elements.
	///
	/// Vectors with an odd element count never match. WhichResult is chosen from
	/// M[0] (0 if M[0] == 0, else 1). Undefined entries match anything.
	static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
	///
	/// Vectors with an odd element count never match. WhichResult is chosen from
	/// M[0] (0 if M[0] == 0, else 1). Undefined entries match anything.
	static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    // Both entries of a pair refer to the same element of the single input.
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
	///
	/// WhichResult is chosen from M[0] (0 if M[0] == 0, else 1). Undefined
	/// entries match anything.
	static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned Half = VT.getVectorNumElements() / 2;
	  WhichResult = (M[0] == 0 ? 0 : 1);

	  // Both halves of the mask must repeat the same stride-2 sequence that
	  // starts at WhichResult.
	  for (unsigned Pos = 0; Pos != 2 * Half; ++Pos) {
	    const int MIdx = M[Pos];
	    const unsigned Want = WhichResult + 2 * (Pos % Half);
	    if (MIdx >= 0 && (unsigned)MIdx != Want)
	      return false;
	  }

	  return true;
	}

	/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
	///
	/// Vectors with an odd element count never match. WhichResult is chosen from
	/// M[0] (0 if M[0] == 0, else 1). Undefined entries match anything.
	static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    // Both entries of a pair refer to the same element of the single input.
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isINSMask - Check if a shuffle mask is an identity copy of one input with
	/// exactly one lane taken from elsewhere.
	///
	/// The mask must have exactly \p NumInputElements entries. If all but one
	/// lane match the left input (M[i] == i or undef), DstIsLeft is set to true;
	/// otherwise, if all but one lane match the right input
	/// (M[i] == i + NumInputElements or undef), DstIsLeft is set to false. In
	/// both cases Anomaly receives the index of the last mismatching lane and the
	/// function returns true. The left input is preferred when both qualify.
	static bool isINSMask(ArrayRef<int> M, int NumInputElements,
	                      bool &DstIsLeft, int &Anomaly) {
	  if (M.size() != static_cast<size_t>(NumInputElements))
	    return false;

	  int LeftHits = 0;
	  int RightHits = 0;
	  int LeftOddLane = -1;
	  int RightOddLane = -1;

	  for (int Lane = 0; Lane < NumInputElements; ++Lane) {
	    const int Idx = M[Lane];

	    // An undef lane is compatible with either input.
	    if (Idx == -1) {
	      ++LeftHits;
	      ++RightHits;
	      continue;
	    }

	    if (Idx == Lane)
	      ++LeftHits;
	    else
	      LeftOddLane = Lane;

	    if (Idx == Lane + NumInputElements)
	      ++RightHits;
	    else
	      RightOddLane = Lane;
	  }

	  const int Needed = NumInputElements - 1;
	  if (LeftHits == Needed) {
	    DstIsLeft = true;
	    Anomaly = LeftOddLane;
	    return true;
	  }
	  if (RightHits == Needed) {
	    DstIsLeft = false;
	    Anomaly = RightOddLane;
	    return true;
	  }

	  return false;
	}

	/// isConcatMask - Check if a shuffle mask of a 128-bit vector describes the
	/// concatenation of the low half of the first input with the low half of the
	/// second input.
	///
	/// The first NumElts / 2 entries must be the identity. The remaining entries
	/// must be the identity as well, shifted up by NumElts / 2 when \p SplitLHS
	/// is true. Undefined entries do not match.
	static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
	  if (VT.getSizeInBits() != 128)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const int Half = NumElts / 2;
	  const int Last = NumElts;

	  // Lower half: taken unchanged from the first input.
	  for (int I = 0; I != Half; ++I)
	    if (Mask[I] != I)
	      return false;

	  // Upper half: low elements of the second input.
	  const int UpperBias = SplitLHS ? Half : 0;
	  for (int I = Half; I != Last; ++I)
	    if (Mask[I] != I + UpperBias)
	      return false;

	  return true;
	}

	/// tryFormConcatFromShuffle - Try to turn a VECTOR_SHUFFLE whose mask passes
	/// isConcatMask into a CONCAT_VECTORS of the low halves of its operands.
	///
	/// Returns an empty SDValue when the operand element types differ from the
	/// result element type or when the mask is not a concat mask. 128-bit
	/// operands are first narrowed with EXTRACT_SUBVECTOR at index 0.
	static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue V0 = Op.getOperand(0);
	  SDValue V1 = Op.getOperand(1);
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

	  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
	      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
	    return SDValue();

	  bool SplitV0 = V0.getValueSizeInBits() == 128;

	  if (!isConcatMask(Mask, VT, SplitV0))
	    return SDValue();

	  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);
	  if (SplitV0) {
	    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  if (V1.getValueSizeInBits() == 128) {
	    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// \p PFEntry packs an operation number (bits 26-29) and two 13-bit operand
	/// IDs (bits 13-25 for the left operand, bits 0-12 for the right operand).
	/// For every operation except OP_COPY both operand IDs are used as indices
	/// into PerfectShuffleTable and expanded recursively.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
	  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

	  enum {
	    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	    OP_VREV,
	    OP_VDUP0,
	    OP_VDUP1,
	    OP_VDUP2,
	    OP_VDUP3,
	    OP_VEXT1,
	    OP_VEXT2,
	    OP_VEXT3,
	    OP_VUZPL, // VUZP, left result
	    OP_VUZPR, // VUZP, right result
	    OP_VZIPL, // VZIP, left result
	    OP_VZIPR, // VZIP, right result
	    OP_VTRNL, // VTRN, left result
	    OP_VTRNR  // VTRN, right result
	  };

	  if (OpNum == OP_COPY) {
	    // The operand ID is the mask <0,1,2,3> (LHS) or <4,5,6,7> (RHS) written
	    // as a base-9 number.
	    if (LHSID == (1 * 9 + 2) * 9 + 3)
	      return LHS;
	    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
	  EVT VT = OpLHS.getValueType();

	  switch (OpNum) {
	  default:
	    llvm_unreachable("Unknown shuffle opcode!");
	  case OP_VREV:
	    // VREV divides the vector in half and swaps within the half.
	    if (VT.getVectorElementType() == MVT::i32 ||
	        VT.getVectorElementType() == MVT::f32)
	      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
	    // vrev <4 x i16> -> REV32
	    if (VT.getVectorElementType() == MVT::i16 ||
	        VT.getVectorElementType() == MVT::f16)
	      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
	    // vrev <4 x i8> -> REV16
	    assert(VT.getVectorElementType() == MVT::i8);
	    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
	  case OP_VDUP0:
	  case OP_VDUP1:
	  case OP_VDUP2:
	  case OP_VDUP3: {
	    EVT EltTy = VT.getVectorElementType();
	    unsigned Opcode;
	    if (EltTy == MVT::i8)
	      Opcode = AArch64ISD::DUPLANE8;
	    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
	      Opcode = AArch64ISD::DUPLANE16;
	    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
	      Opcode = AArch64ISD::DUPLANE32;
	    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
	      Opcode = AArch64ISD::DUPLANE64;
	    else
	      llvm_unreachable("Invalid vector element type?");

	    // A 64-bit source is widened first; the result type stays VT.
	    if (VT.getSizeInBits() == 64)
	      OpLHS = WidenVector(OpLHS, DAG);
	    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
	    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
	  }
	  case OP_VEXT1:
	  case OP_VEXT2:
	  case OP_VEXT3: {
	    // The EXT immediate is the element offset scaled by getExtFactor().
	    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
	    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  }
	  case OP_VUZPL:
	    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VUZPR:
	    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPL:
	    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPR:
	    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNL:
	    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNR:
	    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  }
	}

	/// GenerateTBL - Lower the shuffle \p Op with mask \p ShuffleMask to a NEON
	/// table lookup (the aarch64_neon_tbl1 or aarch64_neon_tbl2 intrinsic).
	///
	/// The element mask is expanded to a per-byte index vector. Results of 64
	/// bits use a v8i8 index vector and a single 128-bit table formed by
	/// concatenating the two inputs (or the first input with itself when the
	/// second is undef). 128-bit results use tbl1 with one input or tbl2 with
	/// both. The lookup result is bitcast back to Op's type.
	static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
	                           SelectionDAG &DAG) {
	  // Check to see if we can use the TBL instruction.
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  SDLoc DL(Op);

	  EVT EltVT = Op.getValueType().getVectorElementType();
	  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

	  // Expand each element index into the byte indices it covers.
	  SmallVector<SDValue, 8> TBLMask;
	  for (int Val : ShuffleMask) {
	    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
	      unsigned Offset = Byte + Val * BytesPerElt;
	      TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
	    }
	  }

	  const bool Is128Bit = Op.getValueSizeInBits() == 128;
	  MVT IndexVT = Is128Bit ? MVT::v16i8 : MVT::v8i8;
	  unsigned IndexLen = Is128Bit ? 16 : 8;

	  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
	  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

	  auto IntrinsicID = [&](unsigned ID) {
	    return DAG.getConstant(ID, DL, MVT::i32);
	  };
	  auto IndexVector = [&]() {
	    return DAG.getBuildVector(IndexVT, DL,
	                              makeArrayRef(TBLMask.data(), IndexLen));
	  };

	  const bool SingleInput = V2.getNode()->isUndef();

	  SDValue Shuffle;
	  if (IndexLen == 8) {
	    // Pack both 64-bit inputs (or the one input twice) into a single 128-bit
	    // table and look it up with tbl1.
	    SDValue Table = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst,
	                                SingleInput ? V1Cst : V2Cst);
	    Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	                          IntrinsicID(Intrinsic::aarch64_neon_tbl1), Table,
	                          IndexVector());
	  } else if (SingleInput) {
	    Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	                          IntrinsicID(Intrinsic::aarch64_neon_tbl1), V1Cst,
	                          IndexVector());
	  } else {
	    // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
	    // cannot currently represent the register constraints on the input
	    // table registers.
	    // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
	    // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
	    // IndexLen));
	    Shuffle = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	                          IntrinsicID(Intrinsic::aarch64_neon_tbl2), V1Cst,
	                          V2Cst, IndexVector());
	  }
	  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
	}

	// Map a vector element type to the AArch64ISD::DUPLANE<N> opcode of matching
	// lane width. Element types other than i8, i16/f16, i32/f32 and i64/f64 are
	// not expected and hit llvm_unreachable.
	static unsigned getDUPLANEOp(EVT EltType) {
	  unsigned Opcode;
	  if (EltType == MVT::i8)
	    Opcode = AArch64ISD::DUPLANE8;
	  else if (EltType == MVT::i16 || EltType == MVT::f16)
	    Opcode = AArch64ISD::DUPLANE16;
	  else if (EltType == MVT::i32 || EltType == MVT::f32)
	    Opcode = AArch64ISD::DUPLANE32;
	  else if (EltType == MVT::i64 || EltType == MVT::f64)
	    Opcode = AArch64ISD::DUPLANE64;
	  else
	    llvm_unreachable("Invalid vector element type?");
	  return Opcode;
	}

	// Custom lowering for a vector shuffle node. The shuffle mask is tested
	// against a series of patterns, in this order: splat, REV64/32/16, EXT,
	// ZIP/UZP/TRN (two-operand, then single-operand forms), concatenation, a
	// single-lane insert, and (for 4-element vectors) the perfect-shuffle table.
	// The first pattern that matches produces the returned node; anything left
	// over is lowered through GenerateTBL.
	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();

	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

	// Convert shuffles that are directly supported on NEON to target-specific
	// DAG nodes, instead of keeping them as shuffles and matching them again
	// during code selection. This is more efficient and avoids the possibility
	// of inconsistencies between legalization and selection.
	ArrayRef<int> ShuffleMask = SVN->getMask();

	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);

	// Splat: every result lane reads the same source lane.
	if (SVN->isSplat()) {
	int Lane = SVN->getSplatIndex();
	// If this is undef splat, generate it via "just" vdup, if possible.
	if (Lane == -1)
	Lane = 0;

	// Splat of lane 0 of a SCALAR_TO_VECTOR: duplicate the scalar directly.
	if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
	V1.getOperand(0));
	// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
	// constant. If so, we can just reference the lane's definition directly.
	if (V1.getOpcode() == ISD::BUILD_VECTOR &&
	!isa<ConstantSDNode>(V1.getOperand(Lane)))
	return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

	// Otherwise, duplicate from the lane of the input vector.
	unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

	// SelectionDAGBuilder may have "helpfully" already extracted or concatenated
	// to make a vector of the same size as this SHUFFLE. We can ignore the
	// extract entirely, and canonicalise the concat using WidenVector.
	if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	// Rebase the lane onto the vector the subvector was extracted from.
	Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
	V1 = V1.getOperand(0);
	} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
	// Idx selects the concat operand holding the lane (0 = first half,
	// 1 = second half); Lane becomes relative to that operand.
	unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
	Lane -= Idx * VT.getVectorNumElements() / 2;
	V1 = WidenVector(V1.getOperand(Idx), DAG);
	} else if (VT.getSizeInBits() == 64)
	V1 = WidenVector(V1, DAG);

	return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
	}

	// Element reversal within 64-, 32- or 16-bit blocks.
	if (isREVMask(ShuffleMask, VT, 64))
	return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 32))
	return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 16))
	return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

	// EXT: when the matcher sets ReverseEXT the operands are swapped first. The
	// immediate returned by the matcher is scaled by getExtFactor(V1) before it
	// is attached to the node.
	bool ReverseEXT = false;
	unsigned Imm;
	if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
	if (ReverseEXT)
	std::swap(V1, V2);
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
	DAG.getConstant(Imm, dl, MVT::i32));
	} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
	// Single-input form: V1 is used for both EXT operands.
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
	DAG.getConstant(Imm, dl, MVT::i32));
	}

	// Two-operand ZIP/UZP/TRN. WhichResult == 0 selects the "1" variant of the
	// instruction, any other value the "2" variant.
	unsigned WhichResult;
	if (isZIPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isUZPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isTRNMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}

	// "v_undef" forms of the same patterns: V1 is passed as both operands.
	if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}

	if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
	return Concat;

	// Single-lane insert: the result is DstVec (V1 or V2, chosen by DstIsLeft)
	// with lane Anomaly replaced by one element extracted from SrcVec.
	bool DstIsLeft;
	int Anomaly;
	int NumInputElements = V1.getValueType().getVectorNumElements();
	if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
	SDValue DstVec = DstIsLeft ? V1 : V2;
	SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

	// Mask values at or beyond NumInputElements refer to lanes of V2.
	SDValue SrcVec = V1;
	int SrcLane = ShuffleMask[Anomaly];
	if (SrcLane >= NumInputElements) {
	SrcVec = V2;
	SrcLane -= VT.getVectorNumElements();
	}
	SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

	EVT ScalarVT = VT.getVectorElementType();

	// Integer elements narrower than 32 bits are extracted as i32.
	if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
	ScalarVT = MVT::i32;

	return DAG.getNode(
	ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
	DstLaneV);
	}

	// If the shuffle is not directly supported and it has 4 elements, use
	// the PerfectShuffle-generated table to synthesize it from other shuffles.
	unsigned NumElts = VT.getVectorNumElements();
	if (NumElts == 4) {
	// Negative (undef) mask entries are encoded as 8; the four entries then
	// form a base-9 index into the table.
	unsigned PFIndexes[4];
	for (unsigned i = 0; i != 4; ++i) {
	if (ShuffleMask[i] < 0)
	PFIndexes[i] = 8;
	else
	PFIndexes[i] = ShuffleMask[i];
	}

	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	PFIndexes[2] * 9 + PFIndexes[3];
	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	// The cost is stored in the bits of the entry above bit 29.
	unsigned Cost = (PFEntry >> 30);

	if (Cost <= 4)
	return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	}

	// No dedicated pattern matched: fall back to a table lookup.
	return GenerateTBL(Op, ShuffleMask, DAG);
	}

	// If BVN is a constant splat, replicate the splat across the full width of
	// the vector type, accumulating into CnstBits and UndefBits, and return true.
	// CnstBits receives the splat value; UndefBits receives the splat value
	// XOR'ed with the splat's undef mask. Returns false (leaving both outputs
	// untouched) when BVN is not a constant splat.
	static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
	                               APInt &UndefBits) {
	  EVT VT = BVN->getValueType(0);

	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                            HasAnyUndefs))
	    return false;

	  const unsigned VecBits = VT.getSizeInBits();
	  // Shift in one copy of the splat per iteration until the vector is full.
	  for (unsigned Remaining = VecBits / SplatBitSize; Remaining != 0;
	       --Remaining) {
	    CnstBits <<= SplatBitSize;
	    UndefBits <<= SplatBitSize;
	    CnstBits |= SplatBits.zextOrTrunc(VecBits);
	    UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VecBits);
	  }
	  return true;
	}

	// Try 64-bit splatted SIMD immediate.
	// Succeeds only when the high and low 64 bits of Bits are equal and that
	// value is a "type 10" modified immediate. On success returns a NewOp node
	// (v2i64 for 128-bit results, f64 otherwise) wrapped in an NVCAST to the type
	// of Op; otherwise returns an empty SDValue.
	static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  if (!AArch64_AM::isAdvSIMDModImmType10(Value))
	    return SDValue();
	  Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;

	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Try 32-bit splatted SIMD immediate.
	// Tests the (64-bit splatted) value against modified-immediate types 1-4,
	// which are paired with shift operands 0, 8, 16 and 24 respectively. On a
	// match, builds NewOp on v4i32/v2i32 with the encoded immediate and shift as
	// operands (preceded by *LHS when LHS is non-null) and NVCASTs the result to
	// the type of Op. Returns an empty SDValue when nothing matches.
	static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits,
	                                  const SDValue *LHS = nullptr) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType1(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
	    Shift = 0;
	  } else if (AArch64_AM::isAdvSIMDModImmType2(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
	    Shift = 8;
	  } else if (AArch64_AM::isAdvSIMDModImmType3(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
	    Shift = 16;
	  } else if (AArch64_AM::isAdvSIMDModImmType4(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
	    Shift = 24;
	  } else {
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;

	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue ShiftImm = DAG.getConstant(Shift, dl, MVT::i32);
	  SDValue Mov =
	      LHS ? DAG.getNode(NewOp, dl, MovTy, *LHS, EncodedImm, ShiftImm)
	          : DAG.getNode(NewOp, dl, MovTy, EncodedImm, ShiftImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Try 16-bit splatted SIMD immediate.
	// Tests the (64-bit splatted) value against modified-immediate types 5 and 6,
	// paired with shift operands 0 and 8. On a match, builds NewOp on
	// v8i16/v4i16 with the encoded immediate and shift as operands (preceded by
	// *LHS when LHS is non-null) and NVCASTs the result to the type of Op.
	// Returns an empty SDValue when nothing matches.
	static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits,
	                                  const SDValue *LHS = nullptr) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType5(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
	    Shift = 0;
	  } else if (AArch64_AM::isAdvSIMDModImmType6(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
	    Shift = 8;
	  } else {
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;

	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue ShiftImm = DAG.getConstant(Shift, dl, MVT::i32);
	  SDValue Mov =
	      LHS ? DAG.getNode(NewOp, dl, MovTy, *LHS, EncodedImm, ShiftImm)
	          : DAG.getNode(NewOp, dl, MovTy, EncodedImm, ShiftImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Try 32-bit splatted SIMD immediate with shifted ones.
	// Tests the (64-bit splatted) value against modified-immediate types 7 and 8,
	// which are paired with shift operands 264 and 272. On a match, builds NewOp
	// on v4i32/v2i32 and NVCASTs the result to the type of Op. Returns an empty
	// SDValue when nothing matches.
	static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
	                                    SelectionDAG &DAG, const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  uint64_t Shift;
	  if (AArch64_AM::isAdvSIMDModImmType7(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
	    Shift = 264;
	  } else if (AArch64_AM::isAdvSIMDModImmType8(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
	    Shift = 272;
	  } else {
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;

	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue ShiftImm = DAG.getConstant(Shift, dl, MVT::i32);
	  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm, ShiftImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Try 8-bit splatted SIMD immediate.
	// Succeeds only when the high and low 64 bits of Bits are equal and that
	// value is a "type 9" modified immediate. On success returns a NewOp node on
	// v16i8/v8i8 wrapped in an NVCAST to the type of Op; otherwise returns an
	// empty SDValue.
	static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                 const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  if (!AArch64_AM::isAdvSIMDModImmType9(Value))
	    return SDValue();
	  Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;

	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Try FP splatted SIMD immediate.
	// A "type 11" immediate is materialized on v4f32/v2f32; a "type 12"
	// immediate is only considered for 128-bit results and is materialized on
	// v2f64. On a match, returns NewOp on the chosen type wrapped in an NVCAST to
	// the type of Op; otherwise returns an empty SDValue.
	static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
	                                  const APInt &Bits) {
	  if (Bits.getHiBits(64) != Bits.getLoBits(64))
	    return SDValue();

	  uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
	  EVT VT = Op.getValueType();
	  const bool isWide = (VT.getSizeInBits() == 128);

	  MVT MovTy;
	  if (AArch64_AM::isAdvSIMDModImmType11(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
	    MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
	  } else if (isWide && AArch64_AM::isAdvSIMDModImmType12(Value)) {
	    Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
	    MovTy = MVT::v2f64;
	  } else {
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  SDValue EncodedImm = DAG.getConstant(Value, dl, MVT::i32);
	  SDValue Mov = DAG.getNode(NewOp, dl, MovTy, EncodedImm);
	  return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Quick test for a BuildVector whose operands are all the very same constant
	// integer node. On success the constant's zero-extended value is stored in
	// ConstVal and true is returned; otherwise ConstVal is left untouched and
	// false is returned.
	static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
	                                     uint64_t &ConstVal) {
	  auto *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
	  if (!Bvec)
	    return false;

	  auto *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
	  if (!FirstElt)
	    return false;

	  // Every remaining operand has to be the same node as the first one.
	  const unsigned NumElts = Bvec->getValueType(0).getVectorNumElements();
	  unsigned Idx = 1;
	  while (Idx < NumElts) {
	    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(Idx)) != FirstElt)
	      return false;
	    ++Idx;
	  }

	  ConstVal = FirstElt->getZExtValue();
	  return true;
	}

	// Return the intrinsic ID carried in operand 0 of an INTRINSIC_WO_CHAIN
	// node. Any other opcode, or an ID that is not below
	// Intrinsic::num_intrinsics, yields Intrinsic::not_intrinsic.
	static unsigned getIntrinsicID(const SDNode *N) {
	  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
	    return Intrinsic::not_intrinsic;

	  const unsigned IID =
	      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	  if (IID >= Intrinsic::num_intrinsics)
	    return Intrinsic::not_intrinsic;
	  return IID;
	}

	// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
	// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
	// BUILD_VECTOR with constant element C1, C2 is a constant shift amount, and
	// C1 is exactly the mask of the bits that the shifted value does not write:
	//   - shift left  (SLI): C1 must have only its low  C2 bits set;
	//   - shift right (SRI): C1 must have only its high C2 bits set.
	// Returns the replacement intrinsic node, or an empty SDValue when N does not
	// have this shape.
	static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  if (!VT.isVector())
	    return SDValue();

	  SDLoc DL(N);

	  // Is the first op an AND?
	  const SDValue And = N->getOperand(0);
	  if (And.getOpcode() != ISD::AND)
	    return SDValue();

	  // Is the second op an shl or lshr?
	  SDValue Shift = N->getOperand(1);
	  // This will have been turned into: AArch64ISD::VSHL vector, #shift
	  // or AArch64ISD::VLSHR vector, #shift
	  unsigned ShiftOpc = Shift.getOpcode();
	  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
	    return SDValue();
	  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

	  // Is the shift amount constant?
	  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!C2node)
	    return SDValue();

	  // Is the and mask vector all constant?
	  uint64_t C1;
	  if (!isAllConstantBuildVector(And.getOperand(1), C1))
	    return SDValue();

	  // Reject shift amounts larger than the element width.
	  uint64_t C2 = C2node->getZExtValue();
	  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
	  if (C2 > ElemSizeInBits)
	    return SDValue();

	  // The AND mask must keep exactly the bits of X that the shifted Y leaves
	  // alone. The comparison is done on APInts of element width: building the
	  // mask as (1 << ElemSizeInBits) - 1 on a plain int is undefined for 32- and
	  // 64-bit elements, and comparing C1 against ~C2 (the complement of the shift
	  // *amount*) does not describe the bits S[LR]I preserves.
	  APInt C1AsAPInt(ElemSizeInBits, C1);
	  APInt RequiredC1 = IsShiftRight
	                         ? APInt::getHighBitsSet(ElemSizeInBits, C2)
	                         : APInt::getLowBitsSet(ElemSizeInBits, C2);
	  if (C1AsAPInt != RequiredC1)
	    return SDValue();

	  SDValue X = And.getOperand(0);
	  SDValue Y = Shift.getOperand(0);

	  unsigned Intrin =
	      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
	  SDValue ResultSLI =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
	                  Shift.getOperand(1));

	  LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
	  LLVM_DEBUG(N->dump(&DAG));
	  LLVM_DEBUG(dbgs() << "into: \n");
	  LLVM_DEBUG(ResultSLI->dump(&DAG));

	  ++NumShiftInserts;
	  return ResultSLI;
	}

	// Custom lowering for a vector OR. When EnableAArch64SlrGeneration is set,
	// first try to turn the node into an S[LR]I. Otherwise, if one operand is a
	// constant-splat BUILD_VECTOR that fits a 32- or 16-bit modified immediate,
	// emit ORRi with the other operand; if not, return Op unchanged.
	SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
	  if (EnableAArch64SlrGeneration)
	    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
	      return Res;

	  EVT VT = Op.getValueType();

	  // Look for the BUILD_VECTOR on the right first; OR commutes, so fall back
	  // to the left operand. LHS always names the *other* operand.
	  SDValue LHS = Op.getOperand(0);
	  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
	  if (!BVN) {
	    LHS = Op.getOperand(1);
	    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
	    if (!BVN)
	      return Op;
	  }

	  const unsigned VecBits = VT.getSizeInBits();
	  APInt DefBits(VecBits, 0);
	  APInt UndefBits(VecBits, 0);
	  if (!resolveBuildVector(BVN, DefBits, UndefBits))
	    return Op; // We can always fall back to a non-immediate OR.

	  // Try the 32-bit immediate forms, then the 16-bit ones, for one bit pattern.
	  auto TryORRImm = [&](const APInt &Bits) -> SDValue {
	    if (SDValue Imm32 =
	            tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, Bits, &LHS))
	      return Imm32;
	    return tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, Bits, &LHS);
	  };

	  if (SDValue NewOp = TryORRImm(DefBits))
	    return NewOp;
	  if (SDValue NewOp = TryORRImm(UndefBits))
	    return NewOp;

	  // We can always fall back to a non-immediate OR.
	  return Op;
	}

	// Normalize the operands of BUILD_VECTOR. The value of constant operands will
	// be truncated to fit element width.
	// Vectors with floating-point elements, or elements wider than 16 bits, are
	// returned unchanged. Otherwise a new BUILD_VECTOR of the same type is
	// returned whose constant and undef operands have been rebuilt as i32.
	static SDValue NormalizeBuildVector(SDValue Op,
	                                    SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  EVT EltTy = VT.getVectorElementType();

	  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
	    return Op;

	  SmallVector<SDValue, 16> Ops;
	  for (SDValue Operand : Op->ops()) {
	    // For integer vectors, type legalization would have promoted the
	    // operands already. Otherwise, if Op is a floating-point splat
	    // (with operands cast to integers), then the only possibilities
	    // are constants and UNDEFs.
	    if (auto *CstLane = dyn_cast<ConstantSDNode>(Operand)) {
	      // Keep only the low element-width bits of the constant.
	      APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue());
	      Ops.push_back(DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32));
	      continue;
	    }
	    if (Operand.getNode()->isUndef()) {
	      Ops.push_back(DAG.getUNDEF(MVT::i32));
	      continue;
	    }
	    assert(Operand.getValueType() == MVT::i32 &&
	           "Unexpected BUILD_VECTOR operand type");
	    Ops.push_back(Operand);
	  }
	  return DAG.getBuildVector(VT, dl, Ops);
	}

	// Try to materialize a constant-splat BUILD_VECTOR with a single modified-
	// immediate instruction. Four bit patterns are tried in order: the defined
	// bits (MOVI/FMOV forms), their complement (MVNI forms), then the same two
	// steps for the alternative pattern produced by resolveBuildVector for undef
	// lanes. Returns an empty SDValue when Op is not a constant splat or no
	// encoding fits.
	static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();

	  APInt DefBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
	  if (!resolveBuildVector(BVN, DefBits, UndefBits))
	    return SDValue();

	  // Move-immediate encodings, in order of preference.
	  auto TryMovImm = [&](const APInt &Bits) -> SDValue {
	    if (SDValue R = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, Bits))
	      return R;
	    if (SDValue R = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, Bits))
	      return R;
	    if (SDValue R = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, Bits))
	      return R;
	    if (SDValue R = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, Bits))
	      return R;
	    if (SDValue R = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, Bits))
	      return R;
	    return tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, Bits);
	  };

	  // Move-inverted-immediate encodings; Bits is the already inverted pattern.
	  auto TryMvnImm = [&](const APInt &Bits) -> SDValue {
	    if (SDValue R = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, Bits))
	      return R;
	    if (SDValue R = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, Bits))
	      return R;
	    return tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, Bits);
	  };

	  if (SDValue NewOp = TryMovImm(DefBits))
	    return NewOp;
	  if (SDValue NewOp = TryMvnImm(~DefBits))
	    return NewOp;
	  if (SDValue NewOp = TryMovImm(UndefBits))
	    return NewOp;
	  if (SDValue NewOp = TryMvnImm(~UndefBits))
	    return NewOp;

	  return SDValue();
	}

	// Custom lowering for BUILD_VECTOR.
	//
	// After normalizing the operands, the following strategies are tried in
	// order: pass-through of all-zeros / all-ones integer splats, a single
	// modified-immediate instruction, SCALAR_TO_VECTOR when only lane 0 is
	// defined, UZP1/UZP2 when every lane is an even/odd extract from one vector,
	// DUP / DUPLANE for splats, "splat the one constant then insert the rest",
	// shuffle reconstruction, and finally a chain of INSERT_VECTOR_ELT.
	//
	// Returns the lowered value, or an empty SDValue to request the default
	// expansion.
	SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();

	  // Try to build a simple constant vector.
	  Op = NormalizeBuildVector(Op, DAG);
	  if (VT.isInteger()) {
	    // Certain vector constants, used to express things like logical NOT and
	    // arithmetic NEG, are passed through unmodified.  This allows special
	    // patterns for these operations to match, which will lower these constants
	    // to whatever is proven necessary.
	    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
	    if (BVN->isConstant())
	      if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
	        unsigned BitSize = VT.getVectorElementType().getSizeInBits();
	        APInt Val(BitSize,
	                  Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
	        if (Val.isNullValue() || Val.isAllOnesValue())
	          return Op;
	      }
	  }

	  if (SDValue V = ConstantBuildVector(Op, DAG))
	    return V;

	  // Scan through the operands to find some interesting properties we can
	  // exploit:
	  //   1) If only one value is used, we can use a DUP, or
	  //   2) if only the low element is not undef, we can just insert that, or
	  //   3) if only one constant value is used (w/ some non-constant lanes),
	  //      we can splat the constant value into the whole vector then fill
	  //      in the non-constant lanes.
	  //   4) FIXME: If different constant values are used, but we can intelligently
	  //             select the values we'll be overwriting for the non-constant
	  //             lanes such that we can directly materialize the vector
	  //             some other way (MOVI, e.g.), we can be sneaky.
	  //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
	  SDLoc dl(Op);
	  unsigned NumElts = VT.getVectorNumElements();
	  bool isOnlyLowElement = true;
	  bool usesOnlyOneValue = true;
	  bool usesOnlyOneConstantValue = true;
	  bool isConstant = true;
	  bool AllLanesExtractElt = true;
	  unsigned NumConstantLanes = 0;
	  SDValue Value;
	  SDValue ConstantValue;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    SDValue V = Op.getOperand(i);
	    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      AllLanesExtractElt = false;
	    if (V.isUndef())
	      continue;
	    if (i > 0)
	      isOnlyLowElement = false;
	    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	      isConstant = false;

	    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
	      ++NumConstantLanes;
	      if (!ConstantValue.getNode())
	        ConstantValue = V;
	      else if (ConstantValue != V)
	        usesOnlyOneConstantValue = false;
	    }

	    if (!Value.getNode())
	      Value = V;
	    else if (V != Value)
	      usesOnlyOneValue = false;
	  }

	  // Every lane was undef.
	  if (!Value.getNode()) {
	    LLVM_DEBUG(
	        dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
	    return DAG.getUNDEF(VT);
	  }

	  // Convert BUILD_VECTOR where all elements but the lowest are undef into
	  // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
	  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
	  if (isOnlyLowElement && !(NumElts == 1 && isa<ConstantSDNode>(Value))) {
	    LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
	                         "SCALAR_TO_VECTOR node\n");
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
	  }

	  if (AllLanesExtractElt) {
	    SDNode *Vector = nullptr;
	    bool Even = false;
	    bool Odd = false;
	    // Check whether the extract elements match the Even pattern <0,2,4,...> or
	    // the Odd pattern <1,3,5,...>.
	    for (unsigned i = 0; i < NumElts; ++i) {
	      SDValue V = Op.getOperand(i);
	      const SDNode *N = V.getNode();
	      if (!isa<ConstantSDNode>(N->getOperand(1))) {
	        // A variable extract index cannot be part of either pattern. Clear
	        // the flags so that lanes matched on earlier iterations do not make
	        // the whole vector look like an Even/Odd extract.
	        Even = false;
	        Odd = false;
	        break;
	      }
	      SDValue N0 = N->getOperand(0);

	      // All elements are extracted from the same vector.
	      if (!Vector) {
	        Vector = N0.getNode();
	        // Check that the type of EXTRACT_VECTOR_ELT matches the type of
	        // BUILD_VECTOR.
	        if (VT.getVectorElementType() !=
	            N0.getValueType().getVectorElementType())
	          break;
	      } else if (Vector != N0.getNode()) {
	        Odd = false;
	        Even = false;
	        break;
	      }

	      // Extracted values are either at Even indices <0,2,4,...> or at Odd
	      // indices <1,3,5,...>.
	      uint64_t Val = N->getConstantOperandVal(1);
	      if (Val == 2 * i) {
	        Even = true;
	        continue;
	      }
	      if (Val - 1 == 2 * i) {
	        Odd = true;
	        continue;
	      }

	      // Something does not match: abort.
	      Odd = false;
	      Even = false;
	      break;
	    }
	    if (Even || Odd) {
	      // Split the source vector into its two VT-sized pieces and unzip them.
	      SDValue LHS =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
	                      DAG.getConstant(0, dl, MVT::i64));
	      SDValue RHS =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
	                      DAG.getConstant(NumElts, dl, MVT::i64));

	      // A mix of even and odd lanes (both flags set) matches neither form.
	      if (Even && !Odd)
	        return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
	                           RHS);
	      if (Odd && !Even)
	        return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
	                           RHS);
	    }
	  }

	  // Use DUP for non-constant splats. For f32 constant splats, reduce to
	  // i32 and try again.
	  if (usesOnlyOneValue) {
	    if (!isConstant) {
	      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	          Value.getValueType() != VT) {
	        LLVM_DEBUG(
	            dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
	        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
	      }

	      // This is actually a DUPLANExx operation, which keeps everything vectory.

	      SDValue Lane = Value.getOperand(1);
	      Value = Value.getOperand(0);
	      if (Value.getValueSizeInBits() == 64) {
	        LLVM_DEBUG(
	            dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
	                      "widening it\n");
	        Value = WidenVector(Value, DAG);
	      }

	      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
	      return DAG.getNode(Opcode, dl, VT, Value, Lane);
	    }

	    if (VT.getVectorElementType().isFloatingPoint()) {
	      SmallVector<SDValue, 8> Ops;
	      EVT EltTy = VT.getVectorElementType();
	      assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
	              "Unsupported floating-point vector type");
	      LLVM_DEBUG(
	          dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
	                    "BITCASTS, and try again\n");
	      MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
	      for (unsigned i = 0; i < NumElts; ++i)
	        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
	      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
	      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
	      LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
	                 Val.dump(););
	      Val = LowerBUILD_VECTOR(Val, DAG);
	      if (Val.getNode())
	        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	    }
	  }

	  // If there was only one constant value used and for more than one lane,
	  // start by splatting that value, then replace the non-constant lanes. This
	  // is better than the default, which will perform a separate initialization
	  // for each lane.
	  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
	    // Firstly, try to materialize the splat constant.
	    SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
	            Val = ConstantBuildVector(Vec, DAG);
	    if (!Val) {
	      // Otherwise, materialize the constant and splat it.
	      Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
	      DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
	    }

	    // Now insert the non-constant lanes.
	    for (unsigned i = 0; i < NumElts; ++i) {
	      SDValue V = Op.getOperand(i);
	      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
	        // Note that type legalization likely mucked about with the VT of the
	        // source operand, so we may have to convert it here before inserting.
	        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
	    }
	    return Val;
	  }

	  // This will generate a load from the constant pool.
	  if (isConstant) {
	    LLVM_DEBUG(
	        dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
	                  "expansion\n");
	    return SDValue();
	  }

	  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
	  if (NumElts >= 4) {
	    if (SDValue shuffle = ReconstructShuffle(Op, DAG))
	      return shuffle;
	  }

	  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
	  // know the default expansion would otherwise fall back on something even
	  // worse. For a vector with one or two non-undef values, that's
	  // scalar_to_vector for the elements followed by a shuffle (provided the
	  // shuffle is valid for the target) and materialization element by element
	  // on the stack followed by a load for everything else.
	  if (!isConstant && !usesOnlyOneValue) {
	    LLVM_DEBUG(
	        dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
	                  "of INSERT_VECTOR_ELT\n");

	    SDValue Vec = DAG.getUNDEF(VT);
	    SDValue Op0 = Op.getOperand(0);
	    unsigned i = 0;

	    // Use SCALAR_TO_VECTOR for lane zero to
	    // a) Avoid a RMW dependency on the full vector register, and
	    // b) Allow the register coalescer to fold away the copy if the
	    //    value is already in an S or D register, and we're forced to emit an
	    //    INSERT_SUBREG that we can't fold anywhere.
	    //
	    // We also allow types like i8 and i16 which are illegal scalar but legal
	    // vector element types. After type-legalization the inserted value is
	    // extended (i32) and it is safe to cast them to the vector type by ignoring
	    // the upper bits of the lowest lane (e.g. v8i8, v4i16).
	    if (!Op0.isUndef()) {
	      LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
	      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
	      ++i;
	    }
	    LLVM_DEBUG(if (i < NumElts) dbgs()
	                   << "Creating nodes for the other vector elements:\n";);
	    for (; i < NumElts; ++i) {
	      SDValue V = Op.getOperand(i);
	      if (V.isUndef())
	        continue;
	      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
	    }
	    return Vec;
	  }

	  LLVM_DEBUG(
	      dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
	                "better alternative\n");
	  return SDValue();
	}

	// Custom lowering for INSERT_VECTOR_ELT. Returns Op unchanged for the listed
	// 128-bit vector types. For the listed 64-bit types the insertion is done on
	// the widened vector and the result narrowed again. Returns an empty SDValue
	// for a non-constant or out-of-range lane, and for any other vector type.
	SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

	  // Check for non-constant or out of range lane.
	  EVT VT = Op.getOperand(0).getValueType();
	  auto *LaneCst = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	  if (!LaneCst || LaneCst->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  const bool IsLegalV128 =
	      VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
	      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
	      VT == MVT::v8f16;
	  if (IsLegalV128)
	    return Op;

	  const bool IsHandledV64 =
	      VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32 ||
	      VT == MVT::v1i64 || VT == MVT::v2f32 || VT == MVT::v4f16;
	  if (!IsHandledV64)
	    return SDValue();

	  // For V64 types, we perform insertion by expanding the value
	  // to a V128 type and perform the insertion on that.
	  SDLoc DL(Op);
	  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
	  SDValue WideInsert =
	      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideVec.getValueType(), WideVec,
	                  Op.getOperand(1), Op.getOperand(2));

	  // Re-narrow the resultant vector.
	  return NarrowVector(WideInsert, DAG);
	}

	// Custom lowering for EXTRACT_VECTOR_ELT. Returns Op unchanged for the listed
	// 128-bit vector types. For the listed 64-bit types the extraction is done
	// on the widened vector, with i8/i16 elements extracted as i32. Returns an
	// empty SDValue for a non-constant or out-of-range lane, and for any other
	// vector type.
	SDValue
	AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

	  // Check for non-constant or out of range lane.
	  EVT VT = Op.getOperand(0).getValueType();
	  auto *LaneCst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!LaneCst || LaneCst->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  const bool IsLegalV128 =
	      VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
	      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
	      VT == MVT::v8f16;
	  if (IsLegalV128)
	    return Op;

	  const bool IsHandledV64 =
	      VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32 ||
	      VT == MVT::v1i64 || VT == MVT::v2f32 || VT == MVT::v4f16;
	  if (!IsHandledV64)
	    return SDValue();

	  // For V64 types, we perform extraction by expanding the value
	  // to a V128 type and perform the extraction on that.
	  SDLoc DL(Op);
	  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);

	  // i8 and i16 elements are extracted as i32.
	  EVT ExtrTy = WideVec.getValueType().getVectorElementType();
	  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
	    ExtrTy = MVT::i32;

	  // For extractions, we just return the result directly.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
	                     Op.getOperand(1));
	}

	/// Custom-lower EXTRACT_SUBVECTOR.
	///
	/// The node is kept as-is (returned unchanged) when the constant start index
	/// is 0, or when a 64-bit result starts exactly 64 bits into the source
	/// vector. Every other case, including a non-vector source or a non-constant
	/// index, yields SDValue().
	SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT SrcVT = Op.getOperand(0).getValueType();
	  SDLoc dl(Op);

	  // Just in case...
	  if (!SrcVT.isVector())
	    return SDValue();

	  auto *StartIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!StartIdx)
	    return SDValue();

	  unsigned FirstElt = StartIdx->getZExtValue();
	  unsigned ResultBits = Op.getValueSizeInBits();

	  // Index 0 will get lowered to an appropriate EXTRACT_SUBREG in ISel.
	  bool ExtractsLowPart = FirstElt == 0;

	  // Extracting the upper 64 bits of a 128-bit vector is matched directly.
	  bool ExtractsUpper64 =
	      ResultBits == 64 && FirstElt * SrcVT.getScalarSizeInBits() == 64;

	  return (ExtractsLowPart || ExtractsUpper64) ? Op : SDValue();
	}

	/// Return true if the shuffle mask \p M is considered legal for type \p VT.
	///
	/// Four-element 64/128-bit vectors are first looked up in
	/// PerfectShuffleTable and accepted when the encoded cost is at most 4.
	/// Otherwise the mask is accepted if any of the mask predicates below
	/// (splat, REV, EXT, TRN, UZP, ZIP, INS, concat) matches.
	bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  if (VT.getVectorNumElements() == 4 &&
	      (VT.is128BitVector() || VT.is64BitVector())) {
	    // Negative (undef) mask entries are encoded as index 8.
	    unsigned PFIndexes[4];
	    for (unsigned Lane = 0; Lane != 4; ++Lane)
	      PFIndexes[Lane] = M[Lane] < 0 ? 8 : M[Lane];

	    // Compute the index in the perfect shuffle table (base-9 digits).
	    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	                            PFIndexes[2] * 9 + PFIndexes[3];
	    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];

	    // The cost lives in the top two bits of the table entry.
	    unsigned Cost = (PFEntry >> 30);
	    if (Cost <= 4)
	      return true;
	  }

	  // Out-parameters of the mask predicates; their values are not needed here.
	  bool DummyBool;
	  int DummyInt;
	  unsigned DummyUnsigned;

	  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
	          isREVMask(M, VT, 64) || isREVMask(M, VT, 32) ||
	          isREVMask(M, VT, 16) ||
	          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
	          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
	          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
	          isZIPMask(M, VT, DummyUnsigned) ||
	          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
	          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
	          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
	          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
	          isConcatMask(M, VT, VT.getSizeInBits() == 128));
	}

	/// getVShiftImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift operation, where all the elements of the
	/// build_vector must have the same constant integer value.
	///
	/// \p Op is looked through any chain of BITCASTs. On success the
	/// sign-extended splat value is stored in \p Cnt and true is returned;
	/// otherwise \p Cnt is left untouched and false is returned.
	static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
	  // Ignore bit_converts.
	  while (Op.getOpcode() == ISD::BITCAST)
	    Op = Op.getOperand(0);

	  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  if (!BVN)
	    return false;

	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
	                            ElementBits))
	    return false;

	  // The splat must not be wider than one vector element.
	  if (SplatBitSize > ElementBits)
	    return false;

	  Cnt = SplatBits.getSExtValue();
	  return true;
	}

	/// isVShiftLImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift left operation. That value must be in the range:
	///   0 <= Value < ElementBits for a left shift; or
	///   0 <= Value <= ElementBits for a long left shift.
	/// On success the shift amount is returned through \p Cnt.
	static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  if (Cnt < 0)
	    return false;
	  // A long shift additionally admits Cnt == EltBits.
	  return isLong ? Cnt <= EltBits : Cnt < EltBits;
	}

	/// isVShiftRImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift right operation. The value must be in the range:
	///   1 <= Value <= ElementBits for a right shift; or
	///   1 <= Value <= ElementBits/2 when \p isNarrow is set.
	/// On success the shift amount is returned through \p Cnt.
	static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  const int64_t MaxShift = isNarrow ? EltBits / 2 : EltBits;
	  return Cnt >= 1 && Cnt <= MaxShift;
	}

	/// Custom-lower vector SHL / SRA / SRL.
	///
	/// If the shift-amount operand is not a vector the node is returned
	/// unchanged. Constant splat amounts accepted by isVShiftLImm/isVShiftRImm
	/// become AArch64ISD::VSHL / VASHR / VLSHR with an i32 immediate. All other
	/// amounts are lowered to the aarch64_neon_ushl / aarch64_neon_sshl
	/// intrinsics, negating the amount first for right shifts.
	SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  int64_t Cnt;

	  SDValue Src = Op.getOperand(0);
	  SDValue Amount = Op.getOperand(1);
	  if (!Amount.getValueType().isVector())
	    return Op;
	  unsigned EltSize = VT.getScalarSizeInBits();

	  const unsigned ShiftOpc = Op.getOpcode();

	  if (ShiftOpc == ISD::SHL) {
	    // Left shift immediate.
	    if (isVShiftLImm(Amount, VT, false, Cnt) && Cnt < EltSize) {
	      SDValue Imm = DAG.getConstant(Cnt, DL, MVT::i32);
	      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Src, Imm);
	    }
	    // Left shift register.
	    SDValue IntrinsicID =
	        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, IntrinsicID, Src,
	                       Amount);
	  }

	  if (ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL) {
	    const bool IsArithmetic = ShiftOpc == ISD::SRA;

	    // Right shift immediate
	    if (isVShiftRImm(Amount, VT, false, Cnt) && Cnt < EltSize) {
	      unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
	      SDValue Imm = DAG.getConstant(Cnt, DL, MVT::i32);
	      return DAG.getNode(ImmOpc, DL, VT, Src, Imm);
	    }

	    // Right shift register. Note, there is not a shift right register
	    // instruction, but the shift left register instruction takes a signed
	    // value, where negative numbers specify a right shift.
	    unsigned RegOpc = IsArithmetic ? Intrinsic::aarch64_neon_sshl
	                                   : Intrinsic::aarch64_neon_ushl;
	    // negate the shift amount
	    SDValue NegAmount = DAG.getNode(AArch64ISD::NEG, DL, VT, Amount);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                       DAG.getConstant(RegOpc, DL, MVT::i32), Src, NegAmount);
	  }

	  llvm_unreachable("unexpected shift opcode");
	}

	/// Emit an AArch64ISD vector comparison node for "LHS CC RHS" with result
	/// type \p VT.
	///
	/// \p VT must have the same total size in bits as the type of \p LHS.
	/// When \p RHS is a build_vector for which resolveBuildVector succeeds with
	/// all-zero constant bits, the single-operand "compare against zero" node
	/// forms (the opcodes ending in 'z') are used where one exists. Condition
	/// codes that have no direct node are emitted by swapping the operands of the
	/// opposite comparison. \p NoNans only affects the floating-point LT case.
	///
	/// \returns the comparison node, or SDValue() if \p CC is not handled for the
	/// element kind (floating point vs. integer) of \p LHS.
	static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
	AArch64CC::CondCode CC, bool NoNans, EVT VT,
	const SDLoc &dl, SelectionDAG &DAG) {
	EVT SrcVT = LHS.getValueType();
	assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	"function only supposed to emit natural comparisons");

	// Determine whether RHS is a constant build_vector whose bits are all zero.
	BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
	APInt CnstBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
	bool IsZero = IsCnst && (CnstBits == 0);

	// Floating-point element types use the FCM* nodes.
	if (SrcVT.getVectorElementType().isFloatingPoint()) {
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// There is no direct "not equal" node here: compare for equality and invert.
	SDValue Fcmeq;
	if (IsZero)
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	else
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
	case AArch64CC::LS:
	// Emitted as FCMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	if (!NoNans)
	return SDValue();
	// If we ignore NaNs then we can use the MI implementation.
	LLVM_FALLTHROUGH;
	case AArch64CC::MI:
	// Emitted as FCMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
	}
	}

	// Integer element types use the CM* nodes.
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// Compare for equality and invert the result.
	SDValue Cmeq;
	if (IsZero)
	Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	else
	Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
	case AArch64CC::LE:
	// Emitted as CMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
	case AArch64CC::LS:
	// Emitted as CMHS with the operands swapped; no zero form is used.
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
	case AArch64CC::LO:
	// Emitted as CMHI with the operands swapped; no zero form is used.
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	// Emitted as CMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
	case AArch64CC::HI:
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
	case AArch64CC::HS:
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
	}
	}

	/// Custom-lower a vector SETCC.
	///
	/// Integer element types are compared directly through EmitVectorComparison.
	/// For floating-point element types the condition code is translated by
	/// changeVectorFPCCToAArch64CC into one or two AArch64 condition codes (the
	/// two results are ORed together) plus an optional final inversion.
	///
	/// \returns SDValue() when a required comparison cannot be emitted, or for
	/// f16 vectors that do not have exactly 4 elements when the subtarget lacks
	/// full FP16 support.
	SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
	  SDLoc dl(Op);

	  if (LHS.getValueType().getVectorElementType().isInteger()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	    SDValue Cmp =
	        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
	    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
	  }

	  const bool FullFP16 =
	      static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();

	  // Make v4f16 (only) fcmp operations utilise vector instructions.
	  // v8f16 support will be a little more complicated.
	  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
	    if (LHS.getValueType().getVectorNumElements() != 4)
	      return SDValue();

	    // Extend both operands to v4f32 and compare in that type instead.
	    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
	    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
	    SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
	    DAG.ReplaceAllUsesWith(Op, NewSetcc);
	    CmpVT = MVT::v4i32;
	  }

	  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
	         LHS.getValueType().getVectorElementType() != MVT::f128);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two branches to implement.
	  AArch64CC::CondCode CC1, CC2;
	  bool ShouldInvert;
	  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

	  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
	  SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
	  if (!Cmp.getNode())
	    return SDValue();

	  // AL as the second code means a single comparison is sufficient.
	  if (CC2 != AArch64CC::AL) {
	    SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
	    if (!Cmp2.getNode())
	      return SDValue();

	    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
	  }

	  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

	  if (ShouldInvert)
	    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());

	  return Cmp;
	}

	/// Build the across-vector reduction node \p Op over the vector operand of
	/// \p ScalarOp, then extract lane 0 of the result as the scalar value.
	static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
	                                  SelectionDAG &DAG) {
	  SDValue Vec = ScalarOp.getOperand(0);
	  // The reduction node produces a value of the same vector type as its input.
	  SDValue Reduced = DAG.getNode(Op, DL, Vec.getSimpleValueType(), Vec);
	  SDValue Lane0 = DAG.getConstant(0, DL, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
	                     Reduced, Lane0);
	}

	/// Custom-lower the VECREDUCE_* nodes.
	///
	/// Integer add/min/max reductions become the corresponding AArch64ISD
	/// across-vector node followed by an extract of lane 0. FMAX/FMIN reductions
	/// become calls to the aarch64_neon_fmaxnmv / aarch64_neon_fminnmv
	/// intrinsics and assert that the node carries the no-NaNs flag.
	SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  // Shared emission for the two floating-point reductions.
	  auto EmitFPReduction = [&](unsigned IntrinsicID) {
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
	                       DAG.getConstant(IntrinsicID, dl, MVT::i32),
	                       Op.getOperand(0));
	  };

	  switch (Op.getOpcode()) {
	  case ISD::VECREDUCE_ADD:
	    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
	  case ISD::VECREDUCE_SMAX:
	    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
	  case ISD::VECREDUCE_SMIN:
	    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
	  case ISD::VECREDUCE_UMAX:
	    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
	  case ISD::VECREDUCE_UMIN:
	    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
	  case ISD::VECREDUCE_FMAX:
	    assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
	    return EmitFPReduction(Intrinsic::aarch64_neon_fmaxnmv);
	  case ISD::VECREDUCE_FMIN:
	    assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
	    return EmitFPReduction(Intrinsic::aarch64_neon_fminnmv);
	  default:
	    llvm_unreachable("Unhandled reduction");
	  }
	}

	/// Custom-lower ATOMIC_LOAD_SUB for subtargets with LSE by rewriting it as
	/// an ATOMIC_LOAD_ADD of the negated operand. Returns SDValue() without LSE.
	SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (!ST.hasLSE())
	    return SDValue();

	  // LSE has an atomic load-add instruction, but not a load-sub.
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

	  // Negate the value operand: 0 - RHS.
	  SDValue Zero = DAG.getConstant(0, dl, VT);
	  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Op.getOperand(2));

	  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, Atomic->getMemoryVT(),
	                       Op.getOperand(0), Op.getOperand(1), Negated,
	                       Atomic->getMemOperand());
	}

	/// Custom-lower ATOMIC_LOAD_AND for subtargets with LSE by rewriting it as
	/// an ATOMIC_LOAD_CLR of the bitwise-inverted operand. Returns SDValue()
	/// without LSE.
	SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
	  if (!ST.hasLSE())
	    return SDValue();

	  // LSE has an atomic load-clear instruction, but not a load-and.
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());

	  // Invert the value operand: all-ones XOR RHS.
	  SDValue AllOnes = DAG.getConstant(-1ULL, dl, VT);
	  SDValue Inverted = DAG.getNode(ISD::XOR, dl, VT, AllOnes, Op.getOperand(2));

	  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, Atomic->getMemoryVT(),
	                       Op.getOperand(0), Op.getOperand(1), Inverted,
	                       Atomic->getMemOperand());
	}

	/// Emit the call to "__chkstk" used for Windows dynamic stack allocation.
	///
	/// \p Size is shifted right by 4, copied into X15 for the call, and shifted
	/// left by 4 again afterwards, so on return \p Size holds the input value
	/// with its low four bits cleared. \returns the chain after the call.
	SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
	    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);

	  // Registers preserved across the stack probe call.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
	  if (Subtarget->hasCustomCallingConv())
	    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

	  // Pass Size >> 4 to the probe in X15.
	  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
	                     DAG.getConstant(4, dl, MVT::i64));
	  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());

	  SDValue InGlue = Chain.getValue(1);
	  SDValue CallOps[] = {Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
	                       DAG.getRegisterMask(Mask), InGlue};
	  Chain = DAG.getNode(AArch64ISD::CALL, dl,
	                      DAG.getVTList(MVT::Other, MVT::Glue), CallOps);
	  // To match the actual intent better, we should read the output from X15 here
	  // again (instead of potentially spilling it to the stack), but rereading Size
	  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
	  // here.

	  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
	                     DAG.getConstant(4, dl, MVT::i64));
	  return Chain;
	}

	/// Custom-lower DYNAMIC_STACKALLOC (asserted to be Windows-only).
	///
	/// With the "no-stack-arg-probe" function attribute SP is simply decremented
	/// by the size and masked with the alignment. Otherwise the same SP
	/// adjustment is wrapped in a CALLSEQ_START/CALLSEQ_END pair and preceded by
	/// the stack probe emitted by LowerWindowsDYNAMIC_STACKALLOC.
	/// \returns merged values {new SP, chain}.
	SDValue
	AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetWindows() &&
	         "Only Windows alloca probing supported");
	  SDLoc dl(Op);
	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size = Op.getOperand(1);
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  EVT VT = Node->getValueType(0);

	  // Read SP, subtract Size, apply the alignment mask (if any) and write SP
	  // back. Threads Chain through every step and returns the new SP value.
	  auto AdjustSP = [&]() -> SDValue {
	    SDValue NewSP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
	    Chain = NewSP.getValue(1);
	    NewSP = DAG.getNode(ISD::SUB, dl, MVT::i64, NewSP, Size);
	    if (Align)
	      NewSP = DAG.getNode(ISD::AND, dl, VT, NewSP.getValue(0),
	                          DAG.getConstant(-(uint64_t)Align, dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, NewSP);
	    return NewSP;
	  };

	  const bool SkipProbe =
	      DAG.getMachineFunction().getFunction().hasFnAttribute(
	          "no-stack-arg-probe");
	  if (SkipProbe) {
	    SDValue SP = AdjustSP();
	    SDValue Ops[2] = {SP, Chain};
	    return DAG.getMergeValues(Ops, dl);
	  }

	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  // May rewrite Size (passed by reference) before SP is adjusted.
	  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);

	  SDValue SP = AdjustSP();

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {SP, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
	/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
	/// specified in the intrinsic calls.
	///
	/// Also handles the exclusive load/store intrinsics (ldxr/ldaxr, stxr/stlxr
	/// and their pair forms), which are marked volatile.
	///
	/// \param Info      filled in with opcode, memory type, pointer, offset,
	///                  alignment and flags when the intrinsic is recognised.
	/// \param I         the intrinsic call.
	/// \param MF        unused here.
	/// \param Intrinsic the intrinsic ID of \p I.
	/// \returns true if \p Info was populated, false for any other intrinsic.
	bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                               const CallInst &I,
	                                               MachineFunction &MF,
	                                               unsigned Intrinsic) const {
	  auto &DL = I.getModule()->getDataLayout();
	  switch (Intrinsic) {
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	  case Intrinsic::aarch64_neon_ld1x2:
	  case Intrinsic::aarch64_neon_ld1x3:
	  case Intrinsic::aarch64_neon_ld1x4:
	  case Intrinsic::aarch64_neon_ld2lane:
	  case Intrinsic::aarch64_neon_ld3lane:
	  case Intrinsic::aarch64_neon_ld4lane:
	  case Intrinsic::aarch64_neon_ld2r:
	  case Intrinsic::aarch64_neon_ld3r:
	  case Intrinsic::aarch64_neon_ld4r: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded,
	    // expressed as a vector of i64.
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The pointer is the last argument.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    // volatile loads with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOLoad;
	    return true;
	  }
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4:
	  case Intrinsic::aarch64_neon_st1x2:
	  case Intrinsic::aarch64_neon_st1x3:
	  case Intrinsic::aarch64_neon_st1x4:
	  case Intrinsic::aarch64_neon_st2lane:
	  case Intrinsic::aarch64_neon_st3lane:
	  case Intrinsic::aarch64_neon_st4lane: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored: sum the
	    // sizes of the leading vector arguments, in units of 64 bits.
	    unsigned NumElts = 0;
	    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The pointer is the last argument.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    // volatile stores with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOStore;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxr:
	  case Intrinsic::aarch64_ldxr: {
	    // The memory type and alignment come from the pointee type of operand 0.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::aarch64_stlxr:
	  case Intrinsic::aarch64_stxr: {
	    // The memory type and alignment come from the pointee type of operand 1.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxp:
	  case Intrinsic::aarch64_ldxp:
	    // Pair loads are modelled as a single 16-byte-aligned i128 access.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;
	  case Intrinsic::aarch64_stlxp:
	  case Intrinsic::aarch64_stxp:
	    // Pair stores are modelled as a single 16-byte-aligned i128 access; the
	    // pointer is operand 2.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;
	  default:
	    break;
	  }

	  return false;
	}

	/// Decide whether narrowing \p Load to \p NewVT is worthwhile.
	///
	/// Defers to the base-class answer first. Extending loads are always
	/// narrowed. A non-extending load is kept at its current width when its
	/// address is "base + (x << C)" with a single-use shift whose constant amount
	/// equals log2 of the load size in bytes, since that shift can be combined
	/// into the addressing offset.
	bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                                  ISD::LoadExtType ExtTy,
	                                                  EVT NewVT) const {
	  // TODO: This may be worth removing. Check regression tests for diffs.
	  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
	    return false;

	  // If we're reducing the load width in order to avoid having to use an extra
	  // instruction to do extension then it's probably a good idea.
	  if (ExtTy != ISD::NON_EXTLOAD)
	    return true;

	  // Don't reduce load width if it would prevent us from combining a shift into
	  // the offset. Load is required to be a memory node; cast<> asserts that.
	  MemSDNode *Mem = cast<MemSDNode>(Load);
	  const SDValue &Base = Mem->getBasePtr();
	  if (Base.getOpcode() == ISD::ADD &&
	      Base.getOperand(1).getOpcode() == ISD::SHL &&
	      Base.getOperand(1).hasOneUse() &&
	      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
	    // The shift can be combined if it matches the size of the value being
	    // loaded (and so reducing the width would make it not match).
	    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
	    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
	    if (ShiftAmount == Log2_32(LoadBytes))
	      return false;
	  }

	  // We have no reason to disallow reducing the load width, so allow it.
	  return true;
	}

	// Truncations from 64-bit GPR to 32-bit GPR is free.
	bool AArch64TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}
	// EVT flavour of isTruncateFree: true when both types are scalar integers
	// and VT1 has more bits than VT2; false for vectors and non-integer types.
	bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector())
	    return false;
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  const unsigned SrcBits = VT1.getSizeInBits();
	  const unsigned DstBits = VT2.getSizeInBits();
	  return SrcBits > DstBits;
	}

	/// Check if it is profitable to hoist instruction in then/else to if.
	/// Not profitable if I and its user can form a FMA instruction
	/// because we prefer FMSUB/FMADD.
	///
	/// Hoisting is reported as profitable unless \p I is an FMul with exactly
	/// one use, that use is an FAdd or FSub, and a fused multiply-add is both
	/// legal/custom and faster for the type, with FP fusion permitted by the
	/// target options.
	bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
	  if (I->getOpcode() != Instruction::FMul)
	    return true;

	  if (!I->hasOneUse())
	    return true;

	  Instruction *User = I->user_back();

	  // Only an FAdd/FSub user could be fused with the multiply.
	  if (User && User->getOpcode() != Instruction::FSub &&
	      User->getOpcode() != Instruction::FAdd)
	    return true;

	  const TargetOptions &Options = getTargetMachine().Options;
	  const DataLayout &DL = I->getModule()->getDataLayout();
	  EVT VT = getValueType(DL, User->getOperand(0)->getType());

	  const bool FusionAllowed =
	      Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
	  const bool WouldFormFMA = isFMAFasterThanFMulAndFAdd(VT) &&
	                            isOperationLegalOrCustom(ISD::FMA, VT) &&
	                            FusionAllowed;
	  return !WouldFormFMA;
	}

	// All 32-bit GPR operations implicitly zero the high-half of the corresponding
	// 64-bit GPR.
	bool AArch64TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 == 32 && NumBits2 == 64;
	}
	// EVT flavour of isZExtFree: true only for a scalar integer extension from
	// exactly 32 bits to exactly 64 bits; false for vectors and non-integers.
	bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector())
	    return false;
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  const unsigned SrcBits = VT1.getSizeInBits();
	  const unsigned DstBits = VT2.getSizeInBits();
	  return SrcBits == 32 && DstBits == 64;
	}

	// SDValue flavour of isZExtFree: in addition to the type-based rule, a
	// zero-extension of a LOAD result is free when both types are simple scalar
	// integers and the loaded type is at most 32 bits wide.
	bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  EVT SrcVT = Val.getValueType();
	  if (isZExtFree(SrcVT, VT2))
	    return true;

	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
	  const bool SrcIsSimpleScalarInt =
	      SrcVT.isSimple() && !SrcVT.isVector() && SrcVT.isInteger();
	  if (!SrcIsSimpleScalarInt)
	    return false;
	  const bool DstIsSimpleScalarInt =
	      VT2.isSimple() && !VT2.isVector() && VT2.isInteger();
	  if (!DstIsSimpleScalarInt)
	    return false;
	  return SrcVT.getSizeInBits() <= 32;
	}

	/// Return true if the extension \p Ext is free because every one of its uses
	/// can absorb it.
	///
	/// FP extensions and vector extensions are never free. A use qualifies when
	/// it is a shift-left by a constant, a GEP index whose implied scaling shift
	/// is between 1 and 4, or a truncation back to the extension's source type.
	/// Any other user makes the extension non-free.
	bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
	  if (isa<FPExtInst>(Ext))
	    return false;

	  // Vector types are not free.
	  if (Ext->getType()->isVectorTy())
	    return false;

	  for (const Use &U : Ext->uses()) {
	    // The extension is free if we can fold it with a left shift in an
	    // addressing mode or an arithmetic operation: add, sub, and cmp.

	    // Is there a shift?
	    const Instruction *Instr = cast<Instruction>(U.getUser());

	    // Is this a constant shift?
	    switch (Instr->getOpcode()) {
	    case Instruction::Shl:
	      if (!isa<ConstantInt>(Instr->getOperand(1)))
	        return false;
	      break;
	    case Instruction::GetElementPtr: {
	      // Find the type indexed by the GEP operand that uses the extension.
	      gep_type_iterator GTI = gep_type_begin(Instr);
	      auto &DL = Ext->getModule()->getDataLayout();
	      std::advance(GTI, U.getOperandNo() - 1);
	      Type *IdxTy = GTI.getIndexedType();
	      // This extension will end up with a shift because of the scaling factor.
	      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
	      // Get the shift amount based on the scaling factor:
	      // log2(sizeof(IdxTy)) - log2(8).
	      uint64_t ShiftAmt =
	          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
	      // Is the constant foldable in the shift of the addressing mode?
	      // I.e., shift amount is between 1 and 4 inclusive.
	      if (ShiftAmt == 0 || ShiftAmt > 4)
	        return false;
	      break;
	    }
	    case Instruction::Trunc:
	      // Check if this is a noop.
	      // trunc(sext ty1 to ty2) to ty1.
	      if (Instr->getType() == Ext->getOperand(0)->getType())
	        continue;
	      LLVM_FALLTHROUGH;
	    default:
	      return false;
	    }

	    // At this point we can use the bfm family, so this extension is free
	    // for that use.
	  }
	  return true;
	}

	/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
	/// or upper half of the vector elements.
	static bool areExtractShuffleVectors(Value Op1, Value Op2) {
	auto areTypesHalfed = [](Value FullV, Value HalfV) {
	auto *FullVT = cast<VectorType>(FullV->getType());
	auto *HalfVT = cast<VectorType>(HalfV->getType());
	return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
	};

	auto extractHalf = [](Value FullV, Value HalfV) {
	auto *FullVT = cast<VectorType>(FullV->getType());
	auto *HalfVT = cast<VectorType>(HalfV->getType());
	return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
	};

	Constant M1, M2;
	Value S1Op1, S2Op1;
	if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) \|\|
	!match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
	return false;

	// Check that the operands are half as wide as the result and we extract
	// half of the elements of the input vectors.
	if (!areTypesHalfed(S1Op1, Op1) \|\| !areTypesHalfed(S2Op1, Op2) \|\|
	!extractHalf(S1Op1, Op1) \|\| !extractHalf(S2Op1, Op2))
	return false;

	// Check the mask extracts either the lower or upper half of vector
	// elements.
	int M1Start = -1;
	int M2Start = -1;
	int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
	if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) \|\|
	!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) \|\|
	M1Start != M2Start \|\| (M1Start != 0 && M2Start != (NumElements / 2)))
	return false;

	return true;
	}

	/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
	/// of the vector elements.
	static bool areExtractExts(Value Ext1, Value Ext2) {
	auto areExtDoubled = [](Instruction *Ext) {
	return Ext->getType()->getScalarSizeInBits() ==
	2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
	};

	if (!match(Ext1, m_ZExtOrSExt(m_Value())) \|\|
	!match(Ext2, m_ZExtOrSExt(m_Value())) \|\|
	!areExtDoubled(cast<Instruction>(Ext1)) \|\|
	!areExtDoubled(cast<Instruction>(Ext2)))
	return false;

	return true;
	}

	/// Check if sinking \p I's operands to I's basic block is profitable, because
	/// the operands can be folded into a target instruction, e.g.
	/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
	bool AArch64TargetLowering::shouldSinkOperands(
	Instruction I, SmallVectorImpl<Use > &Ops) const {
	if (!I->getType()->isVectorTy())
	return false;

	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
	switch (II->getIntrinsicID()) {
	case Intrinsic::aarch64_neon_umull:
	if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
	return false;
	Ops.push_back(&II->getOperandUse(0));
	Ops.push_back(&II->getOperandUse(1));
	return true;
	default:
	return false;
	}
	}

	switch (I->getOpcode()) {
	case Instruction::Sub:
	case Instruction::Add: {
	if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
	return false;

	// If the exts' operands extract either the lower or upper elements, we
	// can sink them too.
	auto Ext1 = cast<Instruction>(I->getOperand(0));
	auto Ext2 = cast<Instruction>(I->getOperand(1));
	if (areExtractShuffleVectors(Ext1, Ext2)) {
	Ops.push_back(&Ext1->getOperandUse(0));
	Ops.push_back(&Ext2->getOperandUse(0));
	}

	Ops.push_back(&I->getOperandUse(0));
	Ops.push_back(&I->getOperandUse(1));

	return true;
	}
	default:
	return false;
	}
	return false;
	}

	/// Return true if a paired load is available for \p LoadedType, i.e. for
	/// simple integer or floating-point types that are 32 or 64 bits wide.
	///
	/// \p RequiredAligment is set to 0 whenever the type passes the initial
	/// simple integer/FP check (even if the size check then fails); it is left
	/// untouched when that initial check fails.
	bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
	                                          unsigned &RequiredAligment) const {
	  if (!LoadedType.isSimple())
	    return false;
	  if (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())
	    return false;

	  // Cyclone supports unaligned accesses.
	  RequiredAligment = 0;
	  const unsigned Bits = LoadedType.getSizeInBits();
	  return Bits == 32 || Bits == 64;
	}

	/// A helper function for determining the number of interleaved accesses we
	/// will generate when lowering accesses of the given type: the vector's size
	/// in bits divided by 128, rounded up.
	unsigned
	AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
	                                                 const DataLayout &DL) const {
	  const auto SizeInBits = DL.getTypeSizeInBits(VecTy);
	  // Round up to whole 128-bit chunks.
	  return (SizeInBits + 127) / 128;
	}

	/// Return target-specific MachineMemOperand flags for \p I:
	/// MOStridedAccess when the processor family is Falkor and the instruction
	/// carries FALKOR_STRIDED_ACCESS_MD metadata, MONone otherwise.
	MachineMemOperand::Flags
	AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
	  const bool IsFalkor =
	      Subtarget->getProcFamily() == AArch64Subtarget::Falkor;
	  if (IsFalkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
	    return MOStridedAccess;
	  return MachineMemOperand::MONone;
	}

	/// Return true if \p VecTy can be used for an interleaved access: it must
	/// have at least two elements, an element size of 8, 16, 32 or 64 bits, and
	/// a total size of either 64 bits or a multiple of 128 bits.
	bool AArch64TargetLowering::isLegalInterleavedAccessType(
	    VectorType *VecTy, const DataLayout &DL) const {

	  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
	  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

	  // Ensure the number of vector elements is greater than 1.
	  if (VecTy->getNumElements() < 2)
	    return false;

	  // Ensure the element type is legal.
	  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
	    return false;

	  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
	  // 128 will be split into multiple interleaved accesses.
	  return VecSize == 64 || VecSize % 128 == 0;
	}

	/// Lower an interleaved load into a ldN intrinsic.
	///
	/// E.g. Lower an interleaved load (Factor = 2):
	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
	/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
	/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
	///
	/// Into:
	/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
	/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
	/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
	bool AArch64TargetLowering::lowerInterleavedLoad(
	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
	ArrayRef<unsigned> Indices, unsigned Factor) const {
	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	"Invalid interleave factor");
	assert(!Shuffles.empty() && "Empty shufflevector input");
	assert(Shuffles.size() == Indices.size() &&
	"Unmatched number of shufflevectors and indices");

	const DataLayout &DL = LI->getModule()->getDataLayout();

	VectorType *VecTy = Shuffles[0]->getType();

	// Skip if we do not have NEON and skip illegal vector types. We can
	// "legalize" wide vector types into multiple interleaved accesses as long as
	// the vector types are divisible by 128.
	if (!Subtarget->hasNEON() \|\| !isLegalInterleavedAccessType(VecTy, DL))
	return false;

	unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

	// A pointer vector can not be the return type of the ldN intrinsics. Need to
	// load integer vectors first and then convert to pointer vectors.
	Type *EltTy = VecTy->getVectorElementType();
	if (EltTy->isPointerTy())
	VecTy =
	VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

	IRBuilder<> Builder(LI);

	// The base address of the load.
	Value *BaseAddr = LI->getPointerOperand();

	if (NumLoads > 1) {
	// If we're going to generate more than one load, reset the sub-vector type
	// to something legal.
	VecTy = VectorType::get(VecTy->getVectorElementType(),
	VecTy->getVectorNumElements() / NumLoads);

	// We will compute the pointer operand of each load from the original base
	// address using GEPs. Cast the base address to a pointer to the scalar
	// element type.
	BaseAddr = Builder.CreateBitCast(
	BaseAddr, VecTy->getVectorElementType()->getPointerTo(
	LI->getPointerAddressSpace()));
	}

	Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
	Type *Tys[2] = {VecTy, PtrTy};
	static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
	Intrinsic::aarch64_neon_ld3,
	Intrinsic::aarch64_neon_ld4};
	Function *LdNFunc =
	Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

	// Holds sub-vectors extracted from the load intrinsic return values. The
	// sub-vectors are associated with the shufflevector instructions they will
	// replace.
	DenseMap<ShuffleVectorInst , SmallVector<Value , 4>> SubVecs;

	for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

	// If we're generating more than one load, compute the base address of
	// subsequent loads as an offset from the previous.
	if (LoadCount > 0)
	BaseAddr =
	Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
	VecTy->getVectorNumElements() * Factor);

	CallInst *LdN = Builder.CreateCall(
	LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");

	// Extract and store the sub-vectors returned by the load intrinsic.
	for (unsigned i = 0; i < Shuffles.size(); i++) {
	ShuffleVectorInst *SVI = Shuffles[i];
	unsigned Index = Indices[i];

	Value *SubVec = Builder.CreateExtractValue(LdN, Index);

	// Convert the integer vector to pointer vector if the element is pointer.
	if (EltTy->isPointerTy())
	SubVec = Builder.CreateIntToPtr(
	SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
	VecTy->getVectorNumElements()));
	SubVecs[SVI].push_back(SubVec);
	}
	}

	// Replace uses of the shufflevector instructions with the sub-vectors
	// returned by the load intrinsic. If a shufflevector instruction is
	// associated with more than one sub-vector, those sub-vectors will be
	// concatenated into a single wide vector.
	for (ShuffleVectorInst *SVI : Shuffles) {
	auto &SubVec = SubVecs[SVI];
	auto *WideVec =
	SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
	SVI->replaceAllUsesWith(WideVec);
	}

	return true;
	}

	/// Lower an interleaved store into a stN intrinsic.
	///
	/// E.g. Lower an interleaved store (Factor = 3):
	///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
	///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
	///        store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	///      Into:
	///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
	///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
	///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
	///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// Note that the new shufflevectors will be removed and we'll only generate one
	/// st3 instruction in CodeGen.
	///
	/// Example for a more general valid mask (Factor 3). Lower:
	///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
	///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
	///        store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	///      Into:
	///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
	///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
	///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
	///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// \param SI     The store of the interleaved vector.
	/// \param SVI    The shufflevector producing the stored value; its element
	///               count must be a multiple of \p Factor.
	/// \param Factor The interleave factor, in
	///               [2, getMaxSupportedInterleaveFactor()].
	/// \returns false, without emitting anything, when NEON is unavailable or the
	///          per-field sub-vector type is not a legal interleaved access type;
	///          true once the stN call(s) have been emitted before \p SI.
	bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
	                                                  ShuffleVectorInst *SVI,
	                                                  unsigned Factor) const {
	  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	         "Invalid interleave factor");

	  VectorType *VecTy = SVI->getType();
	  assert(VecTy->getVectorNumElements() % Factor == 0 &&
	         "Invalid interleaved store");

	  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
	  Type *EltTy = VecTy->getVectorElementType();
	  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

	  const DataLayout &DL = SI->getModule()->getDataLayout();

	  // Skip if we do not have NEON and skip illegal vector types. We can
	  // "legalize" wide vector types into multiple interleaved accesses as long as
	  // the vector types are divisible by 128.
	  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
	    return false;

	  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

	  Value *Op0 = SVI->getOperand(0);
	  Value *Op1 = SVI->getOperand(1);
	  IRBuilder<> Builder(SI);

	  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
	  // vectors to integer vectors.
	  if (EltTy->isPointerTy()) {
	    Type *IntTy = DL.getIntPtrType(EltTy);
	    unsigned NumOpElts = Op0->getType()->getVectorNumElements();

	    // Convert to the corresponding integer vector.
	    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
	    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
	    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

	    SubVecTy = VectorType::get(IntTy, LaneLen);
	  }

	  // The base address of the store.
	  Value *BaseAddr = SI->getPointerOperand();

	  if (NumStores > 1) {
	    // If we're going to generate more than one store, reset the lane length
	    // and sub-vector type to something legal.
	    LaneLen /= NumStores;
	    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

	    // We will compute the pointer operand of each store from the original base
	    // address using GEPs. Cast the base address to a pointer to the scalar
	    // element type.
	    BaseAddr = Builder.CreateBitCast(
	        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
	                      SI->getPointerAddressSpace()));
	  }

	  auto Mask = SVI->getShuffleMask();

	  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
	  Type *Tys[2] = {SubVecTy, PtrTy};
	  // Indexed by Factor - 2.
	  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
	                                             Intrinsic::aarch64_neon_st3,
	                                             Intrinsic::aarch64_neon_st4};
	  Function *StNFunc =
	      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

	  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

	    SmallVector<Value *, 5> Ops;

	    // Split the shufflevector operands into sub vectors for the new stN call.
	    for (unsigned i = 0; i < Factor; i++) {
	      // Mask position of element 0 of field i within this store's chunk.
	      unsigned IdxI = StoreCount * LaneLen * Factor + i;
	      if (Mask[IdxI] >= 0) {
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
	      } else {
	        // Element 0 of this field is undef: find the first defined element j
	        // of the same field in the same chunk and derive the start of the
	        // sequential mask from it. Element j of field i lives at mask
	        // position (chunk base) + j * Factor + i.
	        unsigned StartMask = 0;
	        for (unsigned j = 1; j < LaneLen; j++) {
	          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
	          if (Mask[IdxJ] >= 0) {
	            StartMask = Mask[IdxJ] - j;
	            break;
	          }
	        }
	        // Note: Filling undef gaps with random elements is ok, since
	        // those elements were being written anyway (with undefs).
	        // In the case of all undefs we're defaulting to using elems from 0
	        // Note: StartMask cannot be negative, it's checked in
	        // isReInterleaveMask
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
	      }
	    }

	    // If we're generating more than one store, we compute the base address of
	    // subsequent stores as an offset from the previous.
	    if (StoreCount > 0)
	      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
	                                            BaseAddr, LaneLen * Factor);

	    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
	    Builder.CreateCall(StNFunc, Ops);
	  }
	  return true;
	}

	/// Return true when both \p DstAlign and \p SrcAlign are acceptable for an
	/// access that requires \p AlignCheck alignment.
	///
	/// An alignment value is acceptable when it is 0 or a multiple of
	/// \p AlignCheck. \p AlignCheck itself must be non-zero.
	static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
	                       unsigned AlignCheck) {
	  assert(AlignCheck != 0 && "AlignCheck must be non-zero");
	  auto IsAcceptable = [AlignCheck](unsigned Align) {
	    return Align == 0 || Align % AlignCheck == 0;
	  };
	  return IsAcceptable(SrcAlign) && IsAcceptable(DstAlign);
	}

	/// Pick the value type used to expand a memory operation of \p Size bytes.
	///
	/// Candidates are tried from widest to narrowest: v2i64 (memset only, NEON),
	/// f128 (FP), i64, then i32. A candidate is taken when the alignments pass
	/// memOpAlign() for it, or when misaligned accesses of that type are allowed
	/// and reported as fast. NEON/FP candidates are skipped when the function has
	/// the NoImplicitFloat attribute. Returns MVT::Other when nothing fits.
	EVT AArch64TargetLowering::getOptimalMemOpType(
	    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
	    bool ZeroMemset, bool MemcpyStrSrc,
	    const AttributeList &FuncAttributes) const {
	  bool CanImplicitFloat =
	      !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
	  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
	  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
	  // taken one instruction to materialize the v2i64 zero and one store (with
	  // restrictive addressing mode). Just do i64 stores.
	  bool IsSmallMemset = IsMemset && Size < 32;
	  auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
	    // Arguments follow memOpAlign's (DstAlign, SrcAlign, AlignCheck) order.
	    if (memOpAlign(DstAlign, SrcAlign, AlignCheck))
	      return true;
	    // Initialised so the result never depends on an indeterminate value.
	    bool Fast = false;
	    return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
	                                          &Fast) &&
	           Fast;
	  };

	  if (CanUseNEON && IsMemset && !IsSmallMemset &&
	      AlignmentIsAcceptable(MVT::v2i64, 16))
	    return MVT::v2i64;
	  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
	    return MVT::f128;
	  if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
	    return MVT::i64;
	  if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
	    return MVT::i32;
	  return MVT::Other;
	}

	// An add immediate is legal when its magnitude fits in 12 bits, optionally
	// shifted left by 12. INT64_MIN is rejected up front because it cannot be
	// negated.
	bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
	  const bool IsInt64Min = Immed == std::numeric_limits<int64_t>::min();
	  if (IsInt64Min) {
	    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
	                      << ": avoid UB for INT64_MIN\n");
	    return false;
	  }

	  // add and sub share an encoding, so only the magnitude matters.
	  Immed = Immed < 0 ? -Immed : Immed;

	  // Plain 12-bit value ...
	  const bool FitsUnshifted = (Immed >> 12) == 0;
	  // ... or a 12-bit value shifted left by 12 (low 12 bits clear).
	  const bool FitsShifted = (Immed & 0xfff) == 0 && (Immed >> 24) == 0;
	  const bool IsLegal = FitsUnshifted || FitsShifted;

	  LLVM_DEBUG(dbgs() << "Is " << Immed
	                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
	  return IsLegal;
	}

	// ADDS/SUBS implement integer comparisons, so a compare immediate is legal
	// exactly when it is a legal add/sub immediate.
	bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
	  const bool LegalAsAdd = isLegalAddImmediate(Immed);
	  return LegalAsAdd;
	}

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	///
	/// AArch64 has five basic addressing modes:
	///   reg
	///   reg + 9-bit signed offset
	///   reg + SIZE_IN_BYTES * 12-bit unsigned offset
	///   reg1 + reg2
	///   reg + SIZE_IN_BYTES * reg
	bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                                  const AddrMode &AM, Type *Ty,
	                                                  unsigned AS, Instruction *I) const {
	  // A global can never serve as the base.
	  if (AM.BaseGV)
	    return false;

	  // reg + reg + imm does not exist.
	  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
	    return false;

	  // Access size in bytes; stays 0 for unsized types and for types whose bit
	  // size is not a power of two.
	  uint64_t NumBytes = 0;
	  if (Ty->isSized()) {
	    const uint64_t NumBits = DL.getTypeSizeInBits(Ty);
	    if (isPowerOf2_64(NumBits))
	      NumBytes = NumBits / 8;
	  }

	  if (AM.Scale) {
	    // reg1 + reg2, or reg1 + SIZE_IN_BYTES * reg2.
	    if (AM.Scale == 1)
	      return true;
	    return AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes;
	  }

	  // reg + imm: reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
	  const int64_t Offset = AM.BaseOffs;

	  // 9-bit signed offset.
	  if (isInt<9>(Offset))
	    return true;

	  // 12-bit unsigned offset, scaled by the access size.
	  if (!NumBytes || Offset <= 0)
	    return false;
	  if ((Offset / NumBytes) > (1LL << 12) - 1)
	    return false;

	  // Must be a multiple of NumBytes (NumBytes is a power of 2).
	  const unsigned Shift = Log2_64(NumBytes);
	  return (Offset >> Shift) << Shift == Offset;
	}

	/// Always answers yes: splitting a large struct or array offset should be
	/// considered on this target.
	bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
	  const bool ConsiderSplit = true;
	  return ConsiderSplit;
	}

	/// Cost of the scaling factor in addressing mode \p AM for type \p Ty.
	///
	/// Returns -1 when the addressing mode is not legal, 1 when it is legal and
	/// uses a scale other than 0 or 1, and 0 otherwise.
	int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                                const AddrMode &AM, Type *Ty,
	                                                unsigned AS) const {
	  // Scaling factors are not free at all.
	  // Operands                     | Rt Latency
	  // -------------------------------------------
	  // Rt, [Xn, Xm]                 | 4
	  // -------------------------------------------
	  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
	  // Rt, [Xn, Wm, <extend> #imm]  |
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scale represents reg2 * scale, so charge 1 unless it is 0 or 1.
	  const bool HasRealScale = AM.Scale != 0 && AM.Scale != 1;
	  return HasRealScale ? 1 : 0;
	}

	/// Returns true when the scalar type of \p VT is f32 or f64, and false for
	/// every other (including non-simple) type.
	bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  const EVT ScalarVT = VT.getScalarType();
	  if (!ScalarVT.isSimple())
	    return false;

	  const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
	  return Ty == MVT::f32 || Ty == MVT::f64;
	}

	/// Returns a static list of scratch registers (X16, X17, LR) ending in a 0
	/// entry. The calling convention argument is ignored.
	const MCPhysReg *
	AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  // LR is callee-saved, yet every call site must be treated as clobbering it.
	  // It is therefore listed with the scratch registers, which in turn are
	  // added as implicit-defs for stackmaps and patchpoints.
	  static const MCPhysReg Regs[] = {AArch64::X16, AArch64::X17, AArch64::LR,
	                                   0};
	  return Regs;
	}

	bool
	AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
	CombineLevel Level) const {
	N = N->getOperand(0).getNode();
	EVT VT = N->getValueType(0);
	// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
	// it with shift to let it be lowered to UBFX.
	if (N->getOpcode() == ISD::AND && (VT == MVT::i32 \|\| VT == MVT::i64) &&
	isa<ConstantSDNode>(N->getOperand(1))) {
	uint64_t TruncMask = N->getConstantOperandVal(1);
	if (isMask_64(TruncMask) &&
	N->getOperand(0).getOpcode() == ISD::SRL &&
	isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
	return false;
	}
	return true;
	}

	bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const {
	assert(Ty->isIntegerTy());

	unsigned BitSize = Ty->getPrimitiveSizeInBits();
	if (BitSize == 0)
	return false;

	int64_t Val = Imm.getSExtValue();
	if (Val == 0 \|\| AArch64_AM::isLogicalImmediate(Val, BitSize))
	return true;

	if ((int64_t)Val < 0)
	Val = ~Val;
	if (BitSize == 32)
	Val &= (1LL << 32) - 1;

	unsigned LZ = countLeadingZeros((uint64_t)Val);
	unsigned Shift = (63 - LZ) / 16;
	// MOVZ is free so return true for one or fewer MOVK.
	return Shift < 3;
	}

	/// An EXTRACT_SUBVECTOR is reported cheap only when the operation is legal or
	/// custom for \p ResVT and \p Index is either 0 or the element count of
	/// \p ResVT.
	bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                    unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  if (Index == 0)
	    return true;
	  return Index == ResVT.getVectorNumElements();
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   cmge X, X, #0
	///
	/// Returns an empty SDValue when NEON is unavailable, the result is not a
	/// vector, or the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const AArch64Subtarget *Subtarget) {
	  const EVT VT = N->getValueType(0);
	  if (!(Subtarget->hasNEON() && VT.isVector()))
	    return SDValue();

	  // The xor must be a 'not' (second operand all ones) of a single-use
	  // arithmetic shift right.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  const bool IsNotOfSingleUseSra =
	      Sra.getOpcode() == AArch64ISD::VASHR && Sra.hasOneUse() &&
	      ISD::isBuildVectorAllOnes(AllOnes.getNode());
	  if (!IsNotOfSingleUseSra)
	    return SDValue();

	  // The shift amount must be a constant equal to the element width minus
	  // one, i.e. the shift smears the sign bit across each element.
	  auto *Amount = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
	  if (!Amount)
	    return SDValue();
	  const EVT EltVT = Sra.getValueType().getVectorElementType();
	  if (Amount->getZExtValue() != EltVT.getSizeInBits() - 1)
	    return SDValue();

	  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Sra.getOperand(0));
	}

	// Generate SUBS and CSEL for integer abs.
	//
	// Matches XOR(ADD(X, Y), Y) where Y is SRA(X, size(X)-1) and rewrites it as
	// CSEL(X, 0 - X, PL, flags of SUBS(X, 0)). Returns an empty SDValue when the
	// pattern does not match.
	static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  SDValue Sum = N->getOperand(0);
	  SDValue SignMask = N->getOperand(1);
	  SDLoc DL(N);

	  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
	    return SDValue();
	  // First operand: ADD whose second operand is the xor's second operand.
	  if (Sum.getOpcode() != ISD::ADD || Sum.getOperand(1) != SignMask)
	    return SDValue();
	  // Second operand: SRA of the very value the ADD starts from.
	  if (SignMask.getOpcode() != ISD::SRA ||
	      SignMask.getOperand(0) != Sum.getOperand(0))
	    return SDValue();

	  // The SRA amount must be the constant size(X)-1.
	  ConstantSDNode *ShiftC = dyn_cast<ConstantSDNode>(SignMask.getOperand(1));
	  if (!ShiftC)
	    return SDValue();
	  if (!(ShiftC->getAPIntValue() == VT.getSizeInBits() - 1))
	    return SDValue();

	  SDValue X = Sum.getOperand(0);
	  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), X);
	  // Generate SUBS & CSEL.
	  SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
	                            X, DAG.getConstant(0, DL, VT));
	  return DAG.getNode(AArch64ISD::CSEL, DL, VT, X, Neg,
	                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
	                     SDValue(Cmp.getNode(), 1));
	}

	/// DAG combine for XOR nodes. Does nothing before operation legalization;
	/// afterwards tries the vector sign-bit compare fold first and falls back to
	/// the integer abs combine.
	static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const AArch64Subtarget *Subtarget) {
	  if (!DCI.isBeforeLegalizeOps()) {
	    SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	    if (Cmp)
	      return Cmp;
	    return performIntegerAbsCombine(N, DAG);
	  }
	  return SDValue();
	}

	/// Build a replacement sequence for a signed division of N's first operand
	/// by \p Divisor, where the divisor is a power of two or a negated power of
	/// two.
	///
	/// Returns SDValue(N, 0) when isIntDivCheap() says integer division is cheap
	/// for this type, an empty SDValue when the type is not i32/i64 or the
	/// divisor is not +/- a power of two, and the replacement value otherwise.
	/// The intermediate nodes built on the way (but not the returned node) are
	/// appended to \p Created.
	SDValue
	AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	SelectionDAG &DAG,
	SmallVectorImpl<SDNode *> &Created) const {
	AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	if (isIntDivCheap(N->getValueType(0), Attr))
	return SDValue(N,0); // Lower SDIV as SDIV

	// fold (sdiv X, pow2)
	EVT VT = N->getValueType(0);
	if ((VT != MVT::i32 && VT != MVT::i64) \|\|
	!(Divisor.isPowerOf2() \|\| (-Divisor).isPowerOf2()))
	return SDValue();

	SDLoc DL(N);
	SDValue N0 = N->getOperand(0);
	// For +/- 2^k the number of trailing zero bits is k.
	unsigned Lg2 = Divisor.countTrailingZeros();
	SDValue Zero = DAG.getConstant(0, DL, VT);
	SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);

	// Add (N0 < 0) ? Pow2 - 1 : 0;
	// The CSEL picks N0 + (Pow2 - 1) when the N0 < 0 comparison holds and N0
	// otherwise.
	SDValue CCVal;
	SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
	SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
	SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

	Created.push_back(Cmp.getNode());
	Created.push_back(Add.getNode());
	Created.push_back(CSel.getNode());

	// Divide by pow2.
	SDValue SRA =
	DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));

	// If we're dividing by a positive value, we're done. Otherwise, we must
	// negate the result.
	if (Divisor.isNonNegative())
	return SRA;

	// The shift is now an intermediate node as well; the result is 0 - SRA.
	Created.push_back(SRA.getNode());
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
	}

	/// DAG combine for a multiply by a constant: rewrite it as a shift combined
	/// with an add or sub (optionally followed by a second shift or a negation).
	///
	/// Returns an empty SDValue before operation legalization, when the second
	/// operand is not a constant, when the multiply might instead fold into
	/// smul/umul or madd/msub, or when the constant has none of the handled
	/// shapes. Otherwise returns the replacement value.
	static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// The below optimizations require a constant RHS.
	if (!isa<ConstantSDNode>(N->getOperand(1)))
	return SDValue();

	ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
	const APInt &ConstValue = C->getAPIntValue();

	// Multiplication of a power of two plus/minus one can be done more
	// cheaply as a shift+add/sub. For now, this is true unilaterally. If
	// future CPUs have a cheaper MADD instruction, this may need to be
	// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
	// 64-bit is 5 cycles, so this is always a win.
	// More aggressively, some multiplications N0 * C can be lowered to
	// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
	// e.g. 6=3*2=(2+1)*2.
	// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
	// which equals to (1+2)*16-(1+2).
	SDValue N0 = N->getOperand(0);
	// TrailingZeroes is used to test if the mul can be lowered to
	// shift+add+shift.
	unsigned TrailingZeroes = ConstValue.countTrailingZeros();
	if (TrailingZeroes) {
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into smul or umul.
	if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) \|\|
	isZeroExtended(N0.getNode(), DAG)))
	return SDValue();
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into madd or msub.
	if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD \|\|
	N->use_begin()->getOpcode() == ISD::SUB))
	return SDValue();
	}
	// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
	// and shift+add+shift.
	APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

	// Both are assigned on every path below that does not return early.
	unsigned ShiftAmt, AddSubOpc;
	// Is the shifted value the LHS operand of the add/sub?
	bool ShiftValUseIsN0 = true;
	// Do we need to negate the result?
	bool NegateResult = false;

	if (ConstValue.isNonNegative()) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
	// Note that the add form is tested on the constant with its trailing zeros
	// shifted out, while the sub form is tested on the original constant.
	APInt SCVMinus1 = ShiftedConstValue - 1;
	APInt CVPlus1 = ConstValue + 1;
	if (SCVMinus1.isPowerOf2()) {
	ShiftAmt = SCVMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	} else if (CVPlus1.isPowerOf2()) {
	ShiftAmt = CVPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	} else
	return SDValue();
	} else {
	// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
	// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
	APInt CVNegPlus1 = -ConstValue + 1;
	APInt CVNegMinus1 = -ConstValue - 1;
	if (CVNegPlus1.isPowerOf2()) {
	ShiftAmt = CVNegPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	ShiftValUseIsN0 = false;
	} else if (CVNegMinus1.isPowerOf2()) {
	ShiftAmt = CVNegMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	NegateResult = true;
	} else
	return SDValue();
	}

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
	DAG.getConstant(ShiftAmt, DL, MVT::i64));

	// Place the shifted value on the side of the add/sub chosen above.
	SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
	SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
	SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
	assert(!(NegateResult && TrailingZeroes) &&
	"NegateResult and TrailingZeroes cannot both be true for now.");
	// Negate the result.
	if (NegateResult)
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
	// Shift the result.
	if (TrailingZeroes)
	return DAG.getNode(ISD::SHL, DL, VT, Res,
	DAG.getConstant(TrailingZeroes, DL, MVT::i64));
	return Res;
	}

	/// Take advantage of vector comparisons producing 0 or -1 in each lane to
	/// optimize away a unary operation applied to a masked constant.
	///
	/// The general transformation is:
	///    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	///       AND(VECTOR_CMP(x,y), constant2)
	///    constant2 = UNARYOP(constant)
	///
	/// Returns an empty SDValue when the pattern does not match.
	static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
	                                                         SelectionDAG &DAG) {
	  // Bail out unless this is a vector operation whose operand is a bitwise AND
	  // of a SETCC, with matching total sizes on both sides of the unary op.
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();
	  if (N->getOperand(0)->getOpcode() != ISD::AND)
	    return SDValue();
	  if (N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
	    return SDValue();

	  // The other AND operand has to be a constant build-vector. Non-constant
	  // splats could be transformed too, but it's unclear that would be a
	  // benefit: no operation would be eliminated, one more step would merely run
	  // in scalar code before moving to the vector unit.
	  BuildVectorSDNode *BV =
	      dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1));
	  if (!BV)
	    return SDValue();
	  if (!BV->isConstant())
	    return SDValue();

	  // Everything checks out. Build up the new and improved node.
	  SDLoc DL(N);
	  EVT IntVT = BV->getValueType(0);
	  // Apply the unary operation to the constant itself.
	  SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
	  // The AND node needs bitcasts to/from an integer vector type around it.
	  SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
	  SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
	                               N->getOperand(0)->getOperand(0), MaskConst);
	  return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
	}

	/// DAG combine for integer-to-floating-point conversion nodes.
	///
	/// First tries performVectorCompareAndMaskUnaryOpCombine(). Failing that,
	/// for an f32/f64 result whose integer source has the same bit width and is
	/// a single-use, non-volatile normal load (with NEON available), the load is
	/// re-issued with the floating-point type and converted with
	/// AArch64ISD::SITOF or AArch64ISD::UITOF. Returns an empty SDValue when
	/// neither applies.
	static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
	const AArch64Subtarget *Subtarget) {
	// First try to optimize away the conversion when it's conditionally from
	// a constant. Vectors only.
	if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
	return Res;

	EVT VT = N->getValueType(0);
	if (VT != MVT::f32 && VT != MVT::f64)
	return SDValue();

	// Only optimize when the source and destination types have the same width.
	if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
	return SDValue();

	// If the result of an integer load is only used by an integer-to-float
	// conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
	// This eliminates an "integer-to-vector-move" UOP and improves throughput.
	SDValue N0 = N->getOperand(0);
	if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	// Do not change the width of a volatile load.
	!cast<LoadSDNode>(N0)->isVolatile()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	// Same chain, address, pointer info, alignment and flags as the original
	// load; only the loaded type changes to VT.
	SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
	LN0->getPointerInfo(), LN0->getAlignment(),
	LN0->getMemOperand()->getFlags());

	// Make sure successors of the original load stay after it by updating them
	// to use the new Chain.
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));

	// Any opcode other than SINT_TO_FP is handled as the unsigned conversion.
	unsigned Opcode =
	(N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
	return DAG.getNode(Opcode, SDLoc(N), VT, Load);
	}

	return SDValue();
	}

	/// Fold a floating-point multiply by power of two into floating-point to
	/// fixed-point conversion.
	///
	/// \p N is the fp-to-int conversion node; its operand must be a vector FMUL
	/// whose second operand is a build-vector. Returns the
	/// aarch64_neon_vcvtfp2fxs/vcvtfp2fxu intrinsic node (truncated when the
	/// integer lanes are narrower than the float lanes), or an empty SDValue
	/// when NEON is unavailable or the pattern is not handled.
	static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	if (!N->getValueType(0).isSimple())
	return SDValue();

	SDValue Op = N->getOperand(0);
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	Op.getOpcode() != ISD::FMUL)
	return SDValue();

	SDValue ConstVec = Op->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 32- and 64-bit float lanes are handled.
	MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
	uint32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Only 16-, 32- and 64-bit integer lanes are handled.
	MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
	uint32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., float -> i64).
	if (IntBits > FloatBits)
	return SDValue();

	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	// Upper bound on the accepted log2 value: 64 for i64 lanes, 32 otherwise.
	int32_t Bits = IntBits == 64 ? 64 : 32;
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
	// NOTE(review): -1 is presumably the helper's "not a power-of-two splat"
	// result; confirm against getConstantFPSplatPow2ToLog2Int. A log2 of 0 or
	// one above Bits is rejected as well.
	if (C == -1 \|\| C == 0 \|\| C > Bits)
	return SDValue();

	// Only 2- and 4-lane vectors are handled; the intrinsic's integer lanes are
	// as wide as the float lanes.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	// NOTE(review): because of the early return just above, this assertion can
	// only hold here when ResTy != MVT::v4i64; a v4i64 result reaching this
	// point after operation legalization would trip it.
	assert((ResTy != MVT::v4i64 \|\| DCI.isBeforeLegalizeOps()) &&
	"Illegal vector type after legalization");

	SDLoc DL(N);
	// Any opcode other than FP_TO_SINT selects the unsigned intrinsic.
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
	: Intrinsic::aarch64_neon_vcvtfp2fxu;
	// Operands: intrinsic id, the FMUL's first operand, and the log2 value C.
	SDValue FixConv =
	DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
	Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
	// We can handle smaller integers by generating an extra trunc.
	if (IntBits < FloatBits)
	FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

	return FixConv;
	}

	/// Fold a floating-point divide by power of two into fixed-point to
	/// floating-point conversion.
	///
	/// \p N is the FDIV node; its first operand must be a vector SINT_TO_FP or
	/// UINT_TO_FP and its second operand a build-vector. Returns the
	/// aarch64_neon_vcvtfxs2fp/vcvtfxu2fp intrinsic node (with the integer input
	/// extended first when its lanes are narrower than the float lanes), or an
	/// empty SDValue when NEON is unavailable or the pattern is not handled.
	static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	SDValue Op = N->getOperand(0);
	unsigned Opc = Op->getOpcode();
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	!Op.getOperand(0).getValueType().isSimple() \|\|
	(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
	return SDValue();

	SDValue ConstVec = N->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 16-, 32- and 64-bit integer lanes are handled.
	MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
	int32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Only 32- and 64-bit float lanes are handled.
	MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
	int32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
	if (IntBits > FloatBits)
	return SDValue();

	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
	// NOTE(review): -1 is presumably the helper's "not a power-of-two splat"
	// result; confirm against getConstantFPSplatPow2ToLog2Int. A log2 of 0 or
	// one above FloatBits is rejected as well.
	if (C == -1 \|\| C == 0 \|\| C > FloatBits)
	return SDValue();

	// Only 2- and 4-lane vectors are handled; the integer input type has lanes
	// as wide as the float lanes.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	SDLoc DL(N);
	SDValue ConvInput = Op.getOperand(0);
	bool IsSigned = Opc == ISD::SINT_TO_FP;
	// Widen narrower integer lanes to ResTy, matching the signedness of the
	// original conversion.
	if (IntBits < FloatBits)
	ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
	ResTy, ConvInput);

	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
	: Intrinsic::aarch64_neon_vcvtfxu2fp;
	// Operands: intrinsic id, the (possibly extended) integer input, and the
	// log2 value C.
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
	DAG.getConstant(C, DL, MVT::i32));
	}

	/// An EXTR instruction is made up of two shifts, ORed together. This helper
	/// searches for and classifies those shifts.
	///
	/// On success (\p N is a SHL or SRL by a constant) returns true and sets
	/// \p Src to the shifted value, \p ShiftAmount to the constant amount and
	/// \p FromHi to true for SRL, false for SHL. Returns false otherwise;
	/// \p FromHi may already have been written in that case.
	static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
	                         bool &FromHi) {
	  switch (N.getOpcode()) {
	  case ISD::SHL:
	    FromHi = false;
	    break;
	  case ISD::SRL:
	    FromHi = true;
	    break;
	  default:
	    return false;
	  }

	  // Only shifts by a constant amount qualify.
	  const bool HasConstantAmount = isa<ConstantSDNode>(N.getOperand(1));
	  if (!HasConstantAmount)
	    return false;

	  ShiftAmount = N->getConstantOperandVal(1);
	  Src = N->getOperand(0);
	  return true;
	}

	/// EXTR instruction extracts a contiguous chunk of bits from two existing
	/// registers viewed as a high/low pair. This function looks for the pattern:
	/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
	/// with an EXTR. Can't quite be done in TableGen because the two immediates
	/// aren't independent.
	///
	/// Returns an empty SDValue when the type is not i32/i64 or the operands do
	/// not form such a pair.
	static SDValue tryCombineToEXTR(SDNode *N,
	                                TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  assert(N->getOpcode() == ISD::OR && "Unexpected root");

	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  // Classify the first operand of the OR.
	  SDValue FirstSrc;
	  uint32_t FirstShift = 0;
	  bool FirstIsSrl = false;
	  if (!findEXTRHalf(N->getOperand(0), FirstSrc, FirstShift, FirstIsSrl))
	    return SDValue();

	  // Classify the second operand of the OR.
	  SDValue SecondSrc;
	  uint32_t SecondShift = 0;
	  bool SecondIsSrl = false;
	  if (!findEXTRHalf(N->getOperand(1), SecondSrc, SecondShift, SecondIsSrl))
	    return SDValue();

	  // Exactly one half must be the left shift and the other the right shift.
	  if (FirstIsSrl == SecondIsSrl)
	    return SDValue();

	  // The two shift amounts have to add up to the register width.
	  if (FirstShift + SecondShift != VT.getSizeInBits())
	    return SDValue();

	  // EXTR takes the left-shifted source first, then the right-shifted source
	  // and the right-shift amount.
	  SDValue ShlSrc = FirstIsSrl ? SecondSrc : FirstSrc;
	  SDValue SrlSrc = FirstIsSrl ? FirstSrc : SecondSrc;
	  uint32_t SrlAmount = FirstIsSrl ? FirstShift : SecondShift;

	  return DAG.getNode(AArch64ISD::EXTR, DL, VT, ShlSrc, SrlSrc,
	                     DAG.getConstant(SrlAmount, DL, MVT::i64));
	}

	/// Try to turn a vector (or (and A, C0), (and B, C1)) into AArch64ISD::BSL
	/// when C0 and C1 are build-vectors of constants that are lane-wise bitwise
	/// complements of each other within the element width.
	///
	/// Returns an empty SDValue when \p N is not a vector OR of two ANDs or no
	/// complementary pair of constant operands is found.
	static SDValue tryCombineToBSL(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI) {
	EVT VT = N->getValueType(0);
	SelectionDAG &DAG = DCI.DAG;
	SDLoc DL(N);

	if (!VT.isVector())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	if (N0.getOpcode() != ISD::AND)
	return SDValue();

	SDValue N1 = N->getOperand(1);
	if (N1.getOpcode() != ISD::AND)
	return SDValue();

	// We only have to look for constant vectors here since the general, variable
	// case can be handled in TableGen.
	unsigned Bits = VT.getScalarSizeInBits();
	// All-ones mask of the element width; the 64-bit case is special-cased
	// instead of shifting 1ULL by 64.
	uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
	// Try every pairing of one operand of N0 with one operand of N1, starting
	// with the second operand of each AND.
	for (int i = 1; i >= 0; --i)
	for (int j = 1; j >= 0; --j) {
	BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
	BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
	if (!BVN0 \|\| !BVN1)
	continue;

	// Every lane must be a constant, and the N0 lane must equal the
	// complement of the N1 lane within the element width.
	bool FoundMatch = true;
	for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
	ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
	ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
	if (!CN0 \|\| !CN1 \|\|
	CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
	FoundMatch = false;
	break;
	}
	}

	// BSL operands: the N0 constant mask, then the non-constant operand of N0,
	// then the non-constant operand of N1.
	if (FoundMatch)
	return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
	N0->getOperand(1 - i), N1->getOperand(1 - j));
	}

	return SDValue();
	}

	/// Target-specific combines for an ISD::OR node.
	///
	/// Nothing is attempted unless the result type is legal. Otherwise the node is
	/// first offered to tryCombineToEXTR and then to tryCombineToBSL.
	///
	/// \return The replacement value, or an empty SDValue if neither combine fires.
	static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                                const AArch64Subtarget *Subtarget) {
	  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(N->getValueType(0)))
	    return SDValue();

	  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)).
	  SDValue AsEXTR = tryCombineToEXTR(N, DCI);
	  if (AsEXTR.getNode())
	    return AsEXTR;

	  // Otherwise see whether this is a bit-select with constant masks.
	  SDValue AsBSL = tryCombineToBSL(N, DCI);
	  if (AsBSL.getNode())
	    return AsBSL;

	  return SDValue();
	}

	/// Try to rewrite a vector AND whose second operand is a BUILD_VECTOR as an
	/// AArch64ISD::BICi of the inverted constant.
	///
	/// Bails out with an empty SDValue when the result type is not a legal vector
	/// type, when operand 1 is not a BUILD_VECTOR, when resolveBuildVector fails,
	/// or when none of the tryAdvSIMDModImm helpers produce a node.
	static SDValue performANDCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI) {
	SelectionDAG &DAG = DCI.DAG;
	SDValue LHS = N->getOperand(0);
	EVT VT = N->getValueType(0);
	if (!VT.isVector() \|\| !DAG.getTargetLoweringInfo().isTypeLegal(VT))
	return SDValue();

	BuildVectorSDNode *BVN =
	dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
	if (!BVN)
	return SDValue();

	// AND does not accept an immediate, so check if we can use a BIC immediate
	// instruction instead. We do this here instead of using a (and x, (mvni imm))
	// pattern in isel, because some immediates may be lowered to the preferred
	// (and x, (movi imm)) form, even though an mvni representation also exists.
	APInt DefBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	if (resolveBuildVector(BVN, DefBits, UndefBits)) {
	SDValue NewOp;

	// BIC clears the bits set in its immediate, so the AND mask is inverted
	// before looking for an encoding. The 32-bit form is tried before the
	// 16-bit form.
	DefBits = ~DefBits;
	if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
	DefBits, &LHS)) \|\|
	(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
	DefBits, &LHS)))
	return NewOp;

	// Retry with the second bit pattern produced by resolveBuildVector.
	UndefBits = ~UndefBits;
	if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
	UndefBits, &LHS)) \|\|
	(NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
	UndefBits, &LHS)))
	return NewOp;
	}

	return SDValue();
	}

	/// Canonicalize a half-width logical right shift of a byte swap into a rotate.
	///
	/// (srl (bswap i32 x), 16) becomes (rotr (bswap i32 x), 16) if the high 16 bits
	/// of x are zero; likewise (srl (bswap i64 x), 32) becomes
	/// (rotr (bswap i64 x), 32) if the high 32 bits of x are zero.
	///
	/// \return The ROTR node, or an empty SDValue if the pattern does not match.
	static SDValue performSRLCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  SDValue Swapped = N->getOperand(0);
	  if (Swapped.getOpcode() != ISD::BSWAP)
	    return SDValue();

	  // The shift amount has to be a constant.
	  SDValue Amount = N->getOperand(1);
	  ConstantSDNode *AmountC = dyn_cast<ConstantSDNode>(Amount);
	  if (!AmountC)
	    return SDValue();

	  // VT is known to be i32 or i64 here, so the half width is 16 or 32.
	  const unsigned Width = (VT == MVT::i32) ? 32 : 64;
	  const unsigned HalfWidth = Width / 2;
	  if (AmountC->getZExtValue() != HalfWidth)
	    return SDValue();

	  // The upper half of the value being swapped must be known zero.
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue SwapInput = Swapped.getOperand(0);
	  if (!DAG.MaskedValueIsZero(SwapInput,
	                             APInt::getHighBitsSet(Width, HalfWidth)))
	    return SDValue();

	  return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Swapped, Amount);
	}

	/// Simplify a 64-bit vector bitcast of a half-vector extract whose input is
	/// itself a bitcast, by extracting directly from the original source vector.
	///
	/// Only runs once operation legalization has happened. Returns an empty SDValue
	/// whenever the node does not match the pattern described below.
	static SDValue performBitcastCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// Remove extraneous bitcasts around an extract_subvector.
	// For example,
	// (v4i16 (bitconvert
	// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
	// becomes
	// (extract_subvector ((v8i16 ...), (i64 4)))

	// Only interested in 64-bit vectors as the ultimate result.
	EVT VT = N->getValueType(0);
	if (!VT.isVector())
	return SDValue();
	if (VT.getSimpleVT().getSizeInBits() != 64)
	return SDValue();
	// Is the operand an extract_subvector starting at the beginning or halfway
	// point of the vector? A low half may also come through as an
	// EXTRACT_SUBREG, so look for that, too.
	SDValue Op0 = N->getOperand(0);
	if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
	!(Op0->isMachineOpcode() &&
	Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
	return SDValue();
	// For EXTRACT_SUBVECTOR this is the starting element index; for
	// EXTRACT_SUBREG it is the sub-register index.
	uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
	if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	// Accept only index 0 or an index equal to the extracted element count.
	if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
	return SDValue();
	} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
	if (idx != AArch64::dsub)
	return SDValue();
	// The dsub reference is equivalent to a lane zero subvector reference.
	idx = 0;
	}
	// Look through the bitcast of the input to the extract.
	if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue Source = Op0->getOperand(0)->getOperand(0);
	// If the source type has twice the number of elements as our destination
	// type, we know this is an extract of the high or low half of the vector.
	EVT SVT = Source->getValueType(0);
	if (!SVT.isVector() \|\|
	SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
	return SDValue();

	LLVM_DEBUG(
	dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");

	// Create the simplified form to just extract the low or high half of the
	// vector directly rather than bothering with the bitcasts.
	SDLoc dl(N);
	unsigned NumElements = VT.getVectorNumElements();
	if (idx) {
	// High half: extract starting at element NumElements of the source.
	SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
	} else {
	// Low half: emitted as an EXTRACT_SUBREG machine node using dsub.
	SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
	return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
	Source, SubReg),
	0);
	}
	}

	/// Target-specific combines for an ISD::CONCAT_VECTORS node.
	///
	/// Three rewrites are attempted, in this order:
	///  1. concat of two truncates -> truncate of a shuffle of the bitcast
	///     inputs (tried both before and after legalization);
	///  2. concat of a value with itself into a 2-element vector ->
	///     AArch64ISD::DUPLANE64 (only after operation legalization);
	///  3. concat whose second operand is a bitcast of a vector -> bitcast of a
	///     concat in the second operand's original type (only after operation
	///     legalization).
	///
	/// \return The replacement value, or an empty SDValue if nothing applies.
	static SDValue performConcatVectorsCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

	// Optimize concat_vectors of truncated vectors, where the intermediate
	// type is illegal, to avoid said illegality, e.g.,
	// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
	// (v2i16 (truncate (v2i64)))))
	// ->
	// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
	// (v4i32 (bitcast (v2i64))),
	// <0, 2, 4, 6>)))
	// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
	// on both input and result type, so we might generate worse code.
	// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
	if (N->getNumOperands() == 2 &&
	N0->getOpcode() == ISD::TRUNCATE &&
	N1->getOpcode() == ISD::TRUNCATE) {
	SDValue N00 = N0->getOperand(0);
	SDValue N10 = N1->getOperand(0);
	EVT N00VT = N00.getValueType();

	if (N00VT == N10.getValueType() &&
	(N00VT == MVT::v2i64 \|\| N00VT == MVT::v4i32) &&
	N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
	// MidVT has elements half as wide as the truncate inputs; the mask picks
	// every even-numbered element of the two bitcast inputs.
	MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
	SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
	for (size_t i = 0; i < Mask.size(); ++i)
	Mask[i] = i * 2;
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getVectorShuffle(
	MidVT, dl,
	DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
	DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
	}
	}

	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
	// splat. The indexed instructions are going to be expecting a DUPLANE64, so
	// canonicalise to that.
	if (N0 == N1 && VT.getVectorNumElements() == 2) {
	assert(VT.getScalarSizeInBits() == 64);
	return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
	DAG.getConstant(0, dl, MVT::i64));
	}

	// Canonicalise concat_vectors so that the right-hand vector has as few
	// bit-casts as possible before its real operation. The primary matching
	// destination for these operations will be the narrowing "2" instructions,
	// which depend on the operation being performed on this right-hand vector.
	// For example,
	// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
	// becomes
	// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

	if (N1->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue RHS = N1->getOperand(0);
	MVT RHSTy = RHS.getValueType().getSimpleVT();
	// If the RHS is not a vector, this is not the pattern we're looking for.
	if (!RHSTy.isVector())
	return SDValue();

	LLVM_DEBUG(
	dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

	// The new concat has the RHS element type and twice as many elements.
	MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
	RHSTy.getVectorNumElements() * 2);
	return DAG.getNode(ISD::BITCAST, dl, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
	DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
	RHS));
	}

	/// Rewrite a scalar fixed-point conversion of a lane extract as a lane extract
	/// of the corresponding vector conversion.
	///
	/// E.g., from foo1 to foo2:
	///   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
	///   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
	///
	/// The second form interacts better with instruction selection and the
	/// register allocator to avoid cross-class register copies that aren't
	/// coalescable due to a lane reference.
	///
	/// \param N   The intrinsic node: operand 0 is the intrinsic ID, operand 1 the
	///            value being converted and operand 2 the shift.
	/// \param DCI Combiner state; nothing is done before operation legalization.
	/// \param DAG The DAG in which new nodes are created.
	/// \return The rewritten value, or an empty SDValue when the input is not a
	///         lane extract from a v4i32 or v2i64 vector.
	static SDValue tryCombineFixedPointConvert(SDNode *N,
	                                            TargetLowering::DAGCombinerInfo &DCI,
	                                            SelectionDAG &DAG) {
	  // Wait until after everything is legalized to try this. That way we have
	  // legal vector types and such.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Check the operand and see if it originates from a lane extract.
	  SDValue Op1 = N->getOperand(1);
	  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue Vec = Op1.getOperand(0);

	  // The vector is normally 128 bits wide by the time we get here, even if it
	  // started as 64 bits. Only v4i32 and v2i64 (both 128-bit) have a matching
	  // result type below. Anything else is declined rather than asserted on: a
	  // combine must never crash on input it merely does not handle, and in
	  // builds without assertions the old llvm_unreachable was undefined
	  // behaviour.
	  EVT VecResTy;
	  if (Vec.getValueType() == MVT::v4i32)
	    VecResTy = MVT::v4f32;
	  else if (Vec.getValueType() == MVT::v2i64)
	    VecResTy = MVT::v2f64;
	  else
	    return SDValue();

	  // No additional predication needed. Perform the transform: convert the
	  // whole vector, then extract the original lane from the result.
	  SDValue IID = N->getOperand(0);
	  SDValue Shift = N->getOperand(2);
	  SDValue Lane = Op1.getOperand(1);
	  EVT ResTy = N->getValueType(0);
	  SDLoc DL(N);

	  SDValue Convert =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
	}

	// AArch64 high-vector "long" operations are formed by performing the non-high
	// version on an extract_subvector of each operand which gets the high half:
	//
	// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
	//
	// Some operands have no explicit extract_high but can be given one for free,
	// for example:
	//
	// (dupv64 scalar) --> (extract_high (dup128 scalar))
	//
	// This routine performs that conversion for DUP nodes and for the immediate
	// DUP-like nodes (MOVI/MVNI variants), once the caller has determined that
	// everything else is in order. It returns an empty SDValue if N is not one of
	// those nodes or is not a 64-bit vector.
	static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
	  const unsigned Opc = N.getOpcode();

	  bool IsDupLike = false;
	  switch (Opc) {
	  case AArch64ISD::DUP:
	  case AArch64ISD::DUPLANE8:
	  case AArch64ISD::DUPLANE16:
	  case AArch64ISD::DUPLANE32:
	  case AArch64ISD::DUPLANE64:
	  case AArch64ISD::MOVI:
	  case AArch64ISD::MOVIshift:
	  case AArch64ISD::MOVIedit:
	  case AArch64ISD::MOVImsl:
	  case AArch64ISD::MVNIshift:
	  case AArch64ISD::MVNImsl:
	    IsDupLike = true;
	    break;
	  default:
	    // FMOV could be supported, but isn't very useful, as it would only occur
	    // if you passed a bitcast' floating point immediate to an eligible long
	    // integer op (addl, smull, ...).
	    break;
	  }
	  if (!IsDupLike)
	    return SDValue();

	  MVT NarrowTy = N.getSimpleValueType();
	  if (!NarrowTy.is64BitVector())
	    return SDValue();

	  // Same element type, twice as many lanes.
	  const unsigned NumElems = NarrowTy.getVectorNumElements();
	  MVT WideTy = MVT::getVectorVT(NarrowTy.getVectorElementType(), NumElems * 2);

	  // Rebuild the node at the wide type with its original operands, then take
	  // the upper half back out.
	  SDLoc dl(N);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
	                     DAG.getNode(Opc, dl, WideTy, N->ops()),
	                     DAG.getConstant(NumElems, dl, MVT::i64));
	}

	/// Return true if \p N, after looking through at most one bitcast, is an
	/// EXTRACT_SUBVECTOR whose start index equals half the element count of the
	/// vector it extracts from.
	static bool isEssentiallyExtractHighSubvector(SDValue N) {
	  SDValue Extract = N.getOpcode() == ISD::BITCAST ? N.getOperand(0) : N;
	  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	    return false;

	  const APInt &StartIdx =
	      cast<ConstantSDNode>(Extract.getOperand(1))->getAPIntValue();
	  const unsigned HalfElts =
	      Extract.getOperand(0).getValueType().getVectorNumElements() / 2;
	  return StartIdx == HalfElts;
	}

	/// Helper structure to keep track of ISD::SET_CC operands.
	struct GenericSetCCInfo {
	// Pointer to the first compared operand.
	const SDValue *Opnd0;
	// Pointer to the second compared operand.
	const SDValue *Opnd1;
	// Generic condition code of the comparison.
	ISD::CondCode CC;
	};

	/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
	struct AArch64SetCCInfo {
	// Pointer to the comparison value feeding the lowered node.
	const SDValue *Cmp;
	// AArch64 condition code under which the lowered node yields 1.
	AArch64CC::CondCode CC;
	};

	/// Helper structure to keep track of SetCC information. Holds either the
	/// generic or the AArch64-lowered description; only one member is meaningful
	/// at a time.
	union SetCCInfo {
	GenericSetCCInfo Generic;
	AArch64SetCCInfo AArch64;
	};

	/// Helper structure to be able to read SetCC information. If the IsAArch64
	/// field is true, Info holds an AArch64SetCCInfo; otherwise Info holds a
	/// GenericSetCCInfo.
	struct SetCCInfoAndKind {
	SetCCInfo Info;
	// Discriminator telling which member of Info is valid.
	bool IsAArch64;
	};

	/// Check whether or not \p Op is a SET_CC operation, either a generic one or
	/// an AArch64 lowered one (a CSEL choosing between the constants 1 and 0).
	/// \p SetCCInfo is filled accordingly.
	/// \post SetCCInfo is meaningful only when this function returns true; it may
	/// have been partially written even when the function returns false.
	/// \return True when Op is a kind of SET_CC operation.
	static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
	  const unsigned Opc = Op.getOpcode();

	  // A plain setcc is straightforward: record its operands and predicate.
	  if (Opc == ISD::SETCC) {
	    GenericSetCCInfo &Generic = SetCCInfo.Info.Generic;
	    Generic.Opnd0 = &Op.getOperand(0);
	    Generic.Opnd1 = &Op.getOperand(1);
	    Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	    SetCCInfo.IsAArch64 = false;
	    return true;
	  }

	  // Otherwise, check if this is a matching csel instruction.
	  // In other words:
	  // - csel 1, 0, cc
	  // - csel 0, 1, !cc
	  if (Opc != AArch64ISD::CSEL)
	    return false;

	  // Set the information about the operands.
	  // TODO: we want the operands of the Cmp not the csel
	  AArch64SetCCInfo &Lowered = SetCCInfo.Info.AArch64;
	  Lowered.Cmp = &Op.getOperand(3);
	  SetCCInfo.IsAArch64 = true;
	  Lowered.CC = static_cast<AArch64CC::CondCode>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // Both selected values must be constants...
	  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(Op.getOperand(0));
	  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!TrueC)
	    return false;
	  if (!FalseC)
	    return false;

	  // ...and one must be 1 while the other is 0. When the 1 is not on the
	  // "true" side, view the node as selecting on the inverted condition.
	  if (!TrueC->isOne()) {
	    std::swap(TrueC, FalseC);
	    Lowered.CC = AArch64CC::getInvertedCondCode(Lowered.CC);
	  }
	  return TrueC->isOne() && FalseC->isNullValue();
	}

	// Returns true if Op is a setcc (as recognised by isSetCC) or a zero-extend
	// of one. Info is filled in by isSetCC.
	static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
	  if (isSetCC(Op, Info))
	    return true;

	  // Look through a single zero-extend.
	  if (Op.getOpcode() != ISD::ZERO_EXTEND)
	    return false;
	  return isSetCC(Op->getOperand(0), Info);
	}

	// The folding we want to perform is:
	// (add x, [zext] (setcc cc ...) )
	// -->
	// (csel x, (add x, 1), !cc ...)
	//
	// The latter will get matched to a CSINC instruction.
	//
	// Op must be an ISD::ADD node. Returns an empty SDValue if neither operand is
	// a (zero-extended) setcc or if the compared values are not i32/i64.
	static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
	assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
	SDValue LHS = Op->getOperand(0);
	SDValue RHS = Op->getOperand(1);
	SetCCInfoAndKind InfoAndKind;

	// If neither operand is a SET_CC, give up.
	// After this block LHS is the setcc side and RHS is the other addend.
	if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
	std::swap(LHS, RHS);
	if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
	return SDValue();
	}

	// FIXME: This could be generalized to work for FP comparisons.
	EVT CmpVT = InfoAndKind.IsAArch64
	? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
	: InfoAndKind.Info.Generic.Opnd0->getValueType();
	if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
	return SDValue();

	SDValue CCVal;
	SDValue Cmp;
	SDLoc dl(Op);
	if (InfoAndKind.IsAArch64) {
	// Already lowered: reuse the existing compare with the inverted condition.
	CCVal = DAG.getConstant(
	AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
	MVT::i32);
	Cmp = *InfoAndKind.Info.AArch64.Cmp;
	} else
	// Generic setcc: build the compare for the inverse predicate; CCVal is
	// filled in by getAArch64Cmp.
	Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
	*InfoAndKind.Info.Generic.Opnd1,
	ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
	CCVal, DAG, dl);

	EVT VT = Op->getValueType(0);
	// LHS is reused to hold (add x, 1); the CSEL picks x under the inverted
	// condition and x + 1 otherwise.
	LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
	return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
	}

	// The basic add/sub long vector instructions have variants with "2" on the end
	// which act on the high-half of their inputs. They are normally matched by
	// patterns like:
	//
	// (add (zeroext (extract_high LHS)),
	// (zeroext (extract_high RHS)))
	// -> uaddl2 vD, vN, vM
	//
	// However, if one of the extracts is something like a duplicate, this
	// instruction can still be used profitably. This function puts the DAG into a
	// more appropriate form for those patterns to trigger.
	//
	// Nothing is done before operation legalization. For result types that are
	// not 128-bit vectors, ADD nodes are forwarded to performSetccAddFolding and
	// everything else is left alone.
	static SDValue performAddSubLongCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT VT = N->getSimpleValueType(0);
	if (!VT.is128BitVector()) {
	if (N->getOpcode() == ISD::ADD)
	return performSetccAddFolding(N, DAG);
	return SDValue();
	}

	// Make sure both branches are extended in the same way.
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);
	if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
	LHS.getOpcode() != ISD::SIGN_EXTEND) \|\|
	LHS.getOpcode() != RHS.getOpcode())
	return SDValue();

	unsigned ExtType = LHS.getOpcode();

	// It's not worth doing if at least one of the inputs isn't already an
	// extract, but we don't know which it'll be so we have to try both.
	if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
	RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
	if (!RHS.getNode())
	return SDValue();

	// Re-apply the original extension to the rewritten operand.
	RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
	} else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
	LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
	if (!LHS.getNode())
	return SDValue();

	// Re-apply the original extension to the rewritten operand.
	LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
	}

	// NOTE(review): when neither operand is an extract_high, this requests a node
	// with the original opcode and operands.
	return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
	}

	// Massage DAGs which we can use the high-half "long" operations on into
	// something isel will recognize better. E.g.
	//
	// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
	// (aarch64_neon_umull (extract_high (v2i64 vec)))
	// (extract_high (v2i64 (dup128 scalar)))))
	//
	// N is the intrinsic node: operand 0 is the intrinsic ID and operands 1 and 2
	// are the two 64-bit vector inputs. The IID parameter is not referenced in the
	// body. Nothing is done before operation legalization.
	static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	SDValue LHS = N->getOperand(1);
	SDValue RHS = N->getOperand(2);
	assert(LHS.getValueType().is64BitVector() &&
	RHS.getValueType().is64BitVector() &&
	"unexpected shape for long operation");

	// Either node could be a DUP, but it's not worth doing both of them (you'd
	// just as well use the non-high version) so look for a corresponding extract
	// operation on the other "wing".
	if (isEssentiallyExtractHighSubvector(LHS)) {
	RHS = tryExtendDUPToExtractHigh(RHS, DAG);
	if (!RHS.getNode())
	return SDValue();
	} else if (isEssentiallyExtractHighSubvector(RHS)) {
	LHS = tryExtendDUPToExtractHigh(LHS, DAG);
	if (!LHS.getNode())
	return SDValue();
	}

	// Re-emit the same intrinsic with the (possibly rewritten) operands.
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
	N->getOperand(0), LHS, RHS);
	}

	/// Turn a NEON shift intrinsic with a constant shift amount into the matching
	/// immediate-form AArch64ISD node.
	///
	/// \param IID Intrinsic ID; must be one of sqshl, uqshl, sqshlu, srshl or
	///            urshl.
	/// \param N   The intrinsic node: operand 1 is the value being shifted and
	///            operand 2 the shift amount (a scalar constant or a constant
	///            splat whose splat size equals the element width).
	/// \return The immediate-form node, or an empty SDValue if the amount is not
	///         a suitable constant or is out of range for the chosen form.
	static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
	  const unsigned ElemBits =
	      N->getSimpleValueType(0).getScalarType().getSizeInBits();

	  // Recover the shift amount as a signed integer.
	  int64_t ShiftAmount;
	  if (BuildVectorSDNode *Splat =
	          dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
	    APInt SplatValue, SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    if (!Splat->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
	                                HasAnyUndefs, ElemBits))
	      return SDValue();
	    if (SplatBitSize != ElemBits)
	      return SDValue();
	    ShiftAmount = SplatValue.getSExtValue();
	  } else if (ConstantSDNode *Scalar =
	                 dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	    ShiftAmount = Scalar->getSExtValue();
	  } else {
	    return SDValue();
	  }

	  // Pick the immediate opcode. The rounding shifts map onto right-shift
	  // nodes; the saturating shifts map onto left-shift nodes.
	  unsigned Opcode;
	  bool IsRightShift = false;
	  switch (IID) {
	  case Intrinsic::aarch64_neon_sqshl:
	    Opcode = AArch64ISD::SQSHL_I;
	    break;
	  case Intrinsic::aarch64_neon_uqshl:
	    Opcode = AArch64ISD::UQSHL_I;
	    break;
	  case Intrinsic::aarch64_neon_sqshlu:
	    Opcode = AArch64ISD::SQSHLU_I;
	    break;
	  case Intrinsic::aarch64_neon_srshl:
	    Opcode = AArch64ISD::SRSHR_I;
	    IsRightShift = true;
	    break;
	  case Intrinsic::aarch64_neon_urshl:
	    Opcode = AArch64ISD::URSHR_I;
	    IsRightShift = true;
	    break;
	  default:
	    llvm_unreachable("Unknown shift intrinsic");
	  }

	  // Right-shift forms take a negative amount in [-ElemBits, -1] and encode
	  // its magnitude; left-shift forms take an amount in [0, ElemBits).
	  int64_t Imm;
	  if (IsRightShift) {
	    if (!(ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits))
	      return SDValue();
	    Imm = -ShiftAmount;
	  } else {
	    if (!(ShiftAmount >= 0 && ShiftAmount < ElemBits))
	      return SDValue();
	    Imm = ShiftAmount;
	  }

	  SDLoc dl(N);
	  return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	                     DAG.getConstant(Imm, dl, MVT::i32));
	}

	// The CRC32[BH] instructions ignore the high bits of their data operand. Since
	// the intrinsics must be legal and take an i32, there is almost certainly a
	// masking AND in the DAG which we can eliminate.
	//
	// Mask is the constant the data operand is expected to be ANDed with. Returns
	// the intrinsic re-emitted on the unmasked value, or an empty SDValue if the
	// data operand is not an AND with exactly that constant.
	static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
	  SDValue Data = N->getOperand(2);
	  if (Data.getOpcode() != ISD::AND)
	    return SDValue();

	  // The second operand of the AND has to be exactly the expected mask.
	  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Data.getOperand(1));
	  if (!MaskC)
	    return SDValue();
	  if (MaskC->getZExtValue() != Mask)
	    return SDValue();

	  SDValue Unmasked = Data.getOperand(0);
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
	                     N->getOperand(0), N->getOperand(1), Unmasked);
	}

	/// Replace an across-lanes intrinsic by the target node \p Opc applied to the
	/// intrinsic's vector operand (operand 1, producing a value of that same
	/// vector type), followed by an extract of element 0 as the original result
	/// type.
	static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
	                                           SelectionDAG &DAG) {
	  SDLoc dl(N);
	  SDValue Vec = N->getOperand(1);
	  SDValue Reduced = DAG.getNode(Opc, dl, Vec.getSimpleValueType(), Vec);
	  SDValue FirstLane = DAG.getConstant(0, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Reduced,
	                     FirstLane);
	}

	/// Dispatch target-specific combines for an intrinsic node according to its
	/// intrinsic ID. Intrinsics without a dedicated combine yield an empty SDValue.
	static SDValue performIntrinsicCombine(SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const AArch64Subtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;

	  // Re-emit a two-operand intrinsic (operands 1 and 2) as the ISD node Opc.
	  auto AsBinaryNode = [&](unsigned Opc) {
	    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(1),
	                       N->getOperand(2));
	  };

	  const unsigned IID = getIntrinsicID(N);
	  switch (IID) {
	  // Fixed-point conversions.
	  case Intrinsic::aarch64_neon_vcvtfxs2fp:
	  case Intrinsic::aarch64_neon_vcvtfxu2fp:
	    return tryCombineFixedPointConvert(N, DCI, DAG);

	  // Across-lanes reductions.
	  case Intrinsic::aarch64_neon_saddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
	  case Intrinsic::aarch64_neon_uaddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
	  case Intrinsic::aarch64_neon_sminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
	  case Intrinsic::aarch64_neon_uminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
	  case Intrinsic::aarch64_neon_smaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
	  case Intrinsic::aarch64_neon_umaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);

	  // Floating-point min/max map directly onto generic ISD nodes.
	  case Intrinsic::aarch64_neon_fmax:
	    return AsBinaryNode(ISD::FMAXIMUM);
	  case Intrinsic::aarch64_neon_fmin:
	    return AsBinaryNode(ISD::FMINIMUM);
	  case Intrinsic::aarch64_neon_fmaxnm:
	    return AsBinaryNode(ISD::FMAXNUM);
	  case Intrinsic::aarch64_neon_fminnm:
	    return AsBinaryNode(ISD::FMINNUM);

	  // Long multiplies that may take a DUP on one side.
	  case Intrinsic::aarch64_neon_smull:
	  case Intrinsic::aarch64_neon_umull:
	  case Intrinsic::aarch64_neon_pmull:
	  case Intrinsic::aarch64_neon_sqdmull:
	    return tryCombineLongOpWithDup(IID, N, DCI, DAG);

	  // Shifts by a constant amount.
	  case Intrinsic::aarch64_neon_sqshl:
	  case Intrinsic::aarch64_neon_uqshl:
	  case Intrinsic::aarch64_neon_sqshlu:
	  case Intrinsic::aarch64_neon_srshl:
	  case Intrinsic::aarch64_neon_urshl:
	    return tryCombineShiftImm(IID, N, DAG);

	  // CRC32 on bytes and half-words: drop a redundant mask of the data operand.
	  case Intrinsic::aarch64_crc32b:
	  case Intrinsic::aarch64_crc32cb:
	    return tryCombineCRC32(0xff, N, DAG);
	  case Intrinsic::aarch64_crc32h:
	  case Intrinsic::aarch64_crc32ch:
	    return tryCombineCRC32(0xffff, N, DAG);

	  default:
	    break;
	  }
	  return SDValue();
	}

	/// Target-specific combines for vector extend nodes.
	///
	/// After operation legalization, a zero-extend of a sabd/uabd intrinsic is
	/// offered to tryCombineLongOpWithDup. Before operation legalization, an
	/// extend from a 64-bit simple vector type to an illegal vector type is split
	/// by first extending the elements to twice their width and then extending
	/// each half separately.
	///
	/// \return The replacement value, or an empty SDValue if nothing applies.
	static SDValue performExtendCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
	// we can convert that DUP into another extract_high (of a bigger DUP), which
	// helps the backend to decide that an sabdl2 would be useful, saving a real
	// extract_high operation.
	if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
	N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
	SDNode *ABDNode = N->getOperand(0).getNode();
	unsigned IID = getIntrinsicID(ABDNode);
	if (IID == Intrinsic::aarch64_neon_sabd \|\|
	IID == Intrinsic::aarch64_neon_uabd) {
	SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
	if (!NewABD.getNode())
	return SDValue();

	return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
	NewABD);
	}
	}

	// This is effectively a custom type legalization for AArch64.
	//
	// Type legalization will split an extend of a small, legal, type to a larger
	// illegal type by first splitting the destination type, often creating
	// illegal source types, which then get legalized in isel-confusing ways,
	// leading to really terrible codegen. E.g.,
	// %result = v8i32 sext v8i8 %value
	// becomes
	// %losrc = extract_subreg %value, ...
	// %hisrc = extract_subreg %value, ...
	// %lo = v4i32 sext v4i8 %losrc
	// %hi = v4i32 sext v4i8 %hisrc
	// Things go rapidly downhill from there.
	//
	// For AArch64, the [sz]ext vector instructions can only go up one element
	// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
	// take two instructions.
	//
	// This implies that the most efficient way to do the extend from v8i8
	// to two v4i32 values is to first extend the v8i8 to v8i16, then do
	// the normal splitting to happen for the v8i16->v8i32.

	// This is pre-legalization to catch some cases where the default
	// type legalization will create ill-tempered code.
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();

	// We're only interested in cleaning things up for non-legal vector types
	// here. If both the source and destination are legal, things will just
	// work naturally without any fiddling.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT ResVT = N->getValueType(0);
	if (!ResVT.isVector() \|\| TLI.isTypeLegal(ResVT))
	return SDValue();
	// If the vector type isn't a simple VT, it's beyond the scope of what
	// we're worried about here. Let legalization do its thing and hope for
	// the best.
	SDValue Src = N->getOperand(0);
	EVT SrcVT = Src->getValueType(0);
	if (!ResVT.isSimple() \|\| !SrcVT.isSimple())
	return SDValue();

	// If the source VT is a 64-bit vector, we can play games and get the
	// better results we want.
	if (SrcVT.getSizeInBits() != 64)
	return SDValue();

	// First step: extend to elements twice as wide, keeping the element count.
	unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
	unsigned ElementCount = SrcVT.getVectorNumElements();
	SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
	SDLoc DL(N);
	Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);

	// Now split the rest of the operation into two halves, each with a 64
	// bit source.
	EVT LoVT, HiVT;
	SDValue Lo, Hi;
	unsigned NumElements = ResVT.getVectorNumElements();
	assert(!(NumElements & 1) && "Splitting vector, but not in half!");
	LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
	ResVT.getVectorElementType(), NumElements / 2);

	// InNVT is the type of each half of the widened source.
	EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
	LoVT.getVectorNumElements());
	Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	DAG.getConstant(0, DL, MVT::i64));
	Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
	Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
	Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);

	// Now combine the parts back together so we still have a single result
	// like the combiner expects.
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
	}

	/// Replace the vector store \p St by \p NumVecElts scalar stores of
	/// \p SplatVal at consecutive element-sized offsets.
	///
	/// The stores are chained one after another, starting from the original
	/// store's chain. \p St must not be a truncating store.
	///
	/// \return The last scalar store created.
	static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
	SDValue SplatVal, unsigned NumVecElts) {
	assert(!St.isTruncatingStore() && "cannot split truncating vector store");
	unsigned OrigAlignment = St.getAlignment();
	// Size of one scalar element in bytes.
	unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

	// Create scalar stores. This is at least as good as the code sequence for a
	// split unaligned store which is a dup.s, ext.b, and two stores.
	// Most of the time the three stores should be replaced by store pair
	// instructions (stp).
	SDLoc DL(&St);
	SDValue BasePtr = St.getBasePtr();
	uint64_t BaseOffset = 0;

	// The first store goes to the original address with the original alignment.
	const MachinePointerInfo &PtrInfo = St.getPointerInfo();
	SDValue NewST1 =
	DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
	OrigAlignment, St.getMemOperand()->getFlags());

	// As this is in ISel, the add would not be merged later, which may degrade
	// results; fold an existing constant offset into BaseOffset here instead.
	if (BasePtr->getOpcode() == ISD::ADD &&
	isa<ConstantSDNode>(BasePtr->getOperand(1))) {
	BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
	BasePtr = BasePtr->getOperand(0);
	}

	// Emit the remaining NumVecElts - 1 stores, each chained on the previous one.
	unsigned Offset = EltOffset;
	while (--NumVecElts) {
	unsigned Alignment = MinAlign(OrigAlignment, Offset);
	SDValue OffsetPtr =
	DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
	NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
	PtrInfo.getWithOffset(Offset), Alignment,
	St.getMemOperand()->getFlags());
	Offset += EltOffset;
	}
	return NewST1;
	}

	/// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
	/// load store optimizer pass will merge them to store pair stores. This should
	/// be better than a movi to create the vector zero followed by a vector store
	/// if the zero constant is not re-used, since one instruction and one register
	/// live range will be removed.
	///
	/// For example, the final generated code should be:
	///
	/// stp xzr, xzr, [x0]
	///
	/// instead of:
	///
	/// movi v0.2d, #0
	/// str q0, [x0]
	///
	/// \return The last scalar store created by splitStoreSplat, or an empty
	///         SDValue if the store is not a suitable zero-splat store.
	static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	SDValue StVal = St.getValue();
	EVT VT = StVal.getValueType();

	// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
	// 2, 3 or 4 i32 elements.
	int NumVecElts = VT.getVectorNumElements();
	if (!(((NumVecElts == 2 \|\| NumVecElts == 3) &&
	VT.getVectorElementType().getSizeInBits() == 64) \|\|
	((NumVecElts == 2 \|\| NumVecElts == 3 \|\| NumVecElts == 4) &&
	VT.getVectorElementType().getSizeInBits() == 32)))
	return SDValue();

	if (StVal.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	// If the zero constant has more than one use then the vector store could be
	// better since the constant mov will be amortized and stp q instructions
	// should be able to be formed.
	if (!StVal.hasOneUse())
	return SDValue();

	// If the store is truncating then it's going down to i16 or smaller, which
	// means it can be implemented in a single store anyway.
	if (St.isTruncatingStore())
	return SDValue();

	// If the immediate offset of the address operand is too large for the stp
	// instruction, then bail out.
	if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
	int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
	if (Offset < -512 \|\| Offset > 504)
	return SDValue();
	}

	// Every element must be an integer or floating-point zero constant.
	for (int I = 0; I < NumVecElts; ++I) {
	SDValue EltVal = StVal.getOperand(I);
	if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
	return SDValue();
	}

	// Use a CopyFromReg WZR/XZR here to prevent
	// DAGCombiner::MergeConsecutiveStores from undoing this transformation.
	SDLoc DL(&St);
	unsigned ZeroReg;
	EVT ZeroVT;
	// 32-bit elements use WZR; otherwise the elements are 64-bit and use XZR.
	if (VT.getVectorElementType().getSizeInBits() == 32) {
	ZeroReg = AArch64::WZR;
	ZeroVT = MVT::i32;
	} else {
	ZeroReg = AArch64::XZR;
	ZeroVT = MVT::i64;
	}
	SDValue SplatVal =
	DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
	return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	/// Rewrite a vector store of a scalar splat (built from a chain of
	/// INSERT_VECTOR_ELT nodes) as scalar stores of that scalar. The load/store
	/// optimizer is expected to merge them into store-pair instructions, which
	/// performs better than splatting the scalar and then splitting the vector
	/// store. Even if the stores are not merged it is four stores vs a dup,
	/// followed by an ext.b and two stores.
	///
	/// Returns an empty SDValue when the store is left untouched.
	static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue Chain = St.getValue();
	  EVT StoredVT = Chain.getValueType();

	  // Floating point stores are skipped: the store pair suppress pass may keep
	  // them from ever becoming an stp.
	  if (StoredVT.isFloatingPoint())
	    return SDValue();

	  // Store pair(s) can express a splat of exactly 2 or 4 elements.
	  const unsigned EltCount = StoredVT.getVectorNumElements();
	  if (EltCount != 2 && EltCount != 4)
	    return SDValue();

	  // A truncating store goes down to i16 or smaller, so a single store already
	  // covers it.
	  if (St.isTruncatingStore())
	    return SDValue();

	  // Walk EltCount INSERT_VECTOR_ELT nodes, checking that they all insert the
	  // same value and recording which lanes get written, i.e. 0 and 1 for v2i64
	  // and 0, 1, 2, 3 for v4i32.
	  std::bitset<4> LanesWritten;
	  SDValue Scalar;
	  for (unsigned Step = 0; Step != EltCount; ++Step) {
	    if (Chain.getOpcode() != ISD::INSERT_VECTOR_ELT)
	      return SDValue();

	    // The first insert defines the splat value; the rest must agree with it.
	    SDValue Inserted = Chain.getOperand(1);
	    if (Step == 0)
	      Scalar = Inserted;
	    else if (Inserted != Scalar)
	      return SDValue();

	    // The lane index must be a constant within the vector.
	    auto *LaneC = dyn_cast<ConstantSDNode>(Chain.getOperand(2));
	    if (!LaneC)
	      return SDValue();
	    const uint64_t Lane = LaneC->getZExtValue();
	    if (Lane >= EltCount)
	      return SDValue();
	    LanesWritten.set(Lane);

	    Chain = Chain.getOperand(0);
	  }

	  // Every lane of the vector must have been inserted to.
	  if (LanesWritten.count() != EltCount)
	    return SDValue();

	  return splitStoreSplat(DAG, St, Scalar, EltCount);
	}

	/// Try to replace the vector store \p N with cheaper, narrower stores.
	///
	/// Three rewrites are attempted, in this order:
	///  1. a store of an all-zero vector becomes scalar stores of WZR/XZR
	///     (replaceZeroVectorStore);
	///  2. on subtargets where misaligned 128-bit stores are slow, an
	///     under-aligned 128-bit store of a scalar splat becomes scalar stores
	///     (replaceSplatVectorStore);
	///  3. otherwise such a store is split into two half-vector stores.
	///
	/// Volatile, indexed and non-vector stores are never touched.
	///
	/// \returns the last store of the replacement chain, or an empty SDValue when
	/// the store is left alone. \p DCI is not used by this function.
	static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG,
	                           const AArch64Subtarget *Subtarget) {

	  StoreSDNode *S = cast<StoreSDNode>(N);
	  if (S->isVolatile() || S->isIndexed())
	    return SDValue();

	  SDValue StVal = S->getValue();
	  EVT VT = StVal.getValueType();
	  if (!VT.isVector())
	    return SDValue();

	  // If we get a splat of zeros, convert this vector store to a store of
	  // scalars. They will be merged into store pairs of xzr thereby removing one
	  // instruction and one register.
	  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
	    return ReplacedZeroSplat;

	  // FIXME: The logic for deciding if an unaligned store should be split should
	  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
	  // a call to that function here.

	  if (!Subtarget->isMisaligned128StoreSlow())
	    return SDValue();

	  // Don't split at -Oz.
	  if (DAG.getMachineFunction().getFunction().hasMinSize())
	    return SDValue();

	  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
	  // those up regresses performance on micro-benchmarks and olden/bh.
	  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
	    return SDValue();

	  // Split unaligned 16B stores. They are terrible for performance.
	  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
	  // extensions can use this to mark that it does not want splitting to happen
	  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
	  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
	  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
	      S->getAlignment() <= 2)
	    return SDValue();

	  // If we get a splat of a scalar convert this vector store to a store of
	  // scalars. They will be merged into store pairs thereby removing two
	  // instructions.
	  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
	    return ReplacedSplat;

	  SDLoc DL(S);
	  unsigned NumElts = VT.getVectorNumElements() / 2;
	  // Split VT into two.
	  EVT HalfVT =
	      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
	  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(0, DL, MVT::i64));
	  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(NumElts, DL, MVT::i64));

	  // VT is 128 bits wide here, so each half covers 8 bytes.
	  const uint64_t HalfSizeInBytes = 8;
	  SDValue BasePtr = S->getBasePtr();
	  SDValue NewST1 =
	      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
	                   S->getAlignment(), S->getMemOperand()->getFlags());
	  SDValue OffsetPtr =
	      DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	                  DAG.getConstant(HalfSizeInBytes, DL, MVT::i64));
	  // The upper half lives HalfSizeInBytes past the original location; describe
	  // it with a correspondingly offset pointer info so that the memory operand
	  // of the second store matches the address it actually writes.
	  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
	                      S->getPointerInfo().getWithOffset(HalfSizeInBytes),
	                      S->getAlignment(), S->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine function for post-increment LD1 (lane) and
	/// post-increment LD1R.
	///
	/// \p N consumes a scalar load (operand 1 when \p IsLaneOp is set, operand 0
	/// otherwise). If the load address is also incremented by an ADD, the load,
	/// \p N and the ADD are folded into one LD1LANEpost / LD1DUPpost node that
	/// produces the vector, the written-back address and the chain.
	///
	/// The replacement is installed through \p DCI; the function itself always
	/// returns an empty SDValue.
	static SDValue performPostLD1Combine(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     bool IsLaneOp) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  unsigned LoadIdx = IsLaneOp ? 1 : 0;
	  SDNode *LD = N->getOperand(LoadIdx).getNode();
	  // If it is not LOAD, can not do such combine.
	  if (LD->getOpcode() != ISD::LOAD)
	    return SDValue();

	  // The vector lane must be a constant in the LD1LANE opcode.
	  SDValue Lane;
	  if (IsLaneOp) {
	    Lane = N->getOperand(2);
	    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
	    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
	      return SDValue();
	  }

	  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
	  EVT MemVT = LoadSDN->getMemoryVT();
	  // Check if memory operand is the same type as the vector element.
	  if (MemVT != VT.getVectorElementType())
	    return SDValue();

	  // Check if there are other uses. If so, do not combine as it will introduce
	  // an extra load.
	  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
	       ++UI) {
	    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
	      continue;
	    if (*UI != N)
	      return SDValue();
	  }

	  SDValue Addr = LD->getOperand(1);
	  SDValue Vector = N->getOperand(0);
	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use iterator yields the using node.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      // Compare at full width: truncating the constant first would let an
	      // increment that only agrees in its low 32 bits slip through.
	      uint64_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
	      if (IncVal != NumBytes)
	        continue;
	      // XZR is passed as the increment operand for this immediate form.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }

	    // To avoid cycle construction make sure that neither the load nor the add
	    // are predecessors to each other or the Vector.
	    SmallPtrSet<const SDNode *, 32> Visited;
	    SmallVector<const SDNode *, 16> Worklist;
	    Visited.insert(N);
	    Worklist.push_back(User);
	    Worklist.push_back(LD);
	    Worklist.push_back(Vector.getNode());
	    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
	        SDNode::hasPredecessorHelper(User, Visited, Worklist))
	      continue;

	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(LD->getOperand(0)); // Chain
	    if (IsLaneOp) {
	      Ops.push_back(Vector); // The vector to be inserted
	      Ops.push_back(Lane);   // The lane to be inserted in the vector
	    }
	    Ops.push_back(Addr);
	    Ops.push_back(Inc);

	    // Results: the vector, the written-back address, the chain.
	    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
	    SDVTList SDTys = DAG.getVTList(Tys);
	    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
	                                           MemVT,
	                                           LoadSDN->getMemOperand());

	    // Update the uses.
	    SDValue NewResults[] = {
	        SDValue(LD, 0),            // The result of load
	        SDValue(UpdN.getNode(), 2) // Chain
	    };
	    DCI.CombineTo(LD, NewResults);
	    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

	    break;
	  }
	  return SDValue();
	}

	/// Simplify ``Addr`` under the assumption that hardware ignores its top byte
	/// during address translation: only the low 56 of the 64 address bits are
	/// treated as demanded.
	///
	/// \returns true if a simplification was found and committed through \p DCI.
	static bool performTBISimplification(SDValue Addr,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     SelectionDAG &DAG) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());

	  // Everything except the top byte is demanded.
	  const APInt AddressBits = APInt::getLowBitsSet(64, 56);
	  KnownBits Known;
	  if (!TLI.SimplifyDemandedBits(Addr, AddressBits, Known, TLO))
	    return false;

	  DCI.CommitTargetLoweringOpt(TLO);
	  return true;
	}

	/// Target-specific DAG combine for stores: first try to split the store into
	/// cheaper pieces; failing that, and if the subtarget ignores the top address
	/// byte, try to simplify the address operand (operand 2 of \p N).
	///
	/// \returns the replacement from splitStores, \p N itself when only the
	/// address was simplified, or an empty SDValue when nothing changed.
	static SDValue performSTORECombine(SDNode *N,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   SelectionDAG &DAG,
	                                   const AArch64Subtarget *Subtarget) {
	  SDValue Replacement = splitStores(N, DCI, DAG, Subtarget);
	  if (Replacement)
	    return Replacement;

	  if (Subtarget->supportsAddressTopByteIgnored()) {
	    SDValue Address = N->getOperand(2);
	    if (performTBISimplification(Address, DCI, DAG))
	      return SDValue(N, 0);
	  }

	  return SDValue();
	}


	/// Target-specific DAG combine function for NEON load/store intrinsics
	/// to merge base address updates.
	///
	/// \p N is a NEON structured load/store intrinsic whose last operand is the
	/// address. If that address is also incremented by an independent ADD, the
	/// intrinsic and the ADD are folded into the matching post-indexed
	/// AArch64ISD node, which additionally produces the written-back address.
	///
	/// The replacement is installed through \p DCI; the function itself always
	/// returns an empty SDValue.
	static SDValue performNEONPostLDSTCombine(SDNode *N,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  unsigned AddrOpIdx = N->getNumOperands() - 1;
	  SDValue Addr = N->getOperand(AddrOpIdx);

	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use iterator yields the using node.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load/store. Otherwise, folding
	    // it would create a cycle.
	    SmallPtrSet<const SDNode *, 32> Visited;
	    SmallVector<const SDNode *, 16> Worklist;
	    Visited.insert(Addr.getNode());
	    Worklist.push_back(N);
	    Worklist.push_back(User);
	    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
	        SDNode::hasPredecessorHelper(User, Visited, Worklist))
	      continue;

	    // Find the new opcode for the updating load/store.
	    bool IsStore = false;
	    bool IsLaneOp = false;
	    bool IsDupOp = false;
	    unsigned NewOpc = 0;
	    unsigned NumVecs = 0;
	    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IntNo) {
	    default: llvm_unreachable("unexpected intrinsic for Neon base update");
	    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
	      NumVecs = 2; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
	      NumVecs = 3; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
	      NumVecs = 4; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
	      NumVecs = 2; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
	      NumVecs = 3; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
	      NumVecs = 4; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
	      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
	      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
	      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
	    }

	    // Stores take the vector type from their first data operand, loads from
	    // their first result.
	    EVT VecTy;
	    if (IsStore)
	      VecTy = N->getOperand(2).getValueType();
	    else
	      VecTy = N->getValueType(0);

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      // Compare at full width: truncating the constant first would let an
	      // increment that only agrees in its low 32 bits slip through.
	      uint64_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
	      // Lane and dup forms only access one element per vector.
	      if (IsLaneOp || IsDupOp)
	        NumBytes /= VecTy.getVectorNumElements();
	      if (IncVal != NumBytes)
	        continue;
	      // XZR is passed as the increment operand for this immediate form.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }
	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(N->getOperand(0)); // Incoming chain
	    // Load lane and store have vector list as input.
	    if (IsLaneOp || IsStore)
	      for (unsigned i = 2; i < AddrOpIdx; ++i)
	        Ops.push_back(N->getOperand(i));
	    Ops.push_back(Addr); // Base register
	    Ops.push_back(Inc);

	    // Return Types.
	    EVT Tys[6];
	    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
	    unsigned n;
	    for (n = 0; n < NumResultVecs; ++n)
	      Tys[n] = VecTy;
	    Tys[n++] = MVT::i64;  // Type of write back register
	    Tys[n] = MVT::Other;  // Type of the chain
	    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

	    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
	                                           MemInt->getMemoryVT(),
	                                           MemInt->getMemOperand());

	    // Update the uses: the vector results map one-to-one, the chain of N maps
	    // to the last result of the new node, and the ADD is replaced by the
	    // written-back address.
	    std::vector<SDValue> NewResults;
	    for (unsigned i = 0; i < NumResultVecs; ++i) {
	      NewResults.push_back(SDValue(UpdN.getNode(), i));
	    }
	    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
	    DCI.CombineTo(N, NewResults);
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

	    break;
	  }
	  return SDValue();
	}

	// Checks to see if the value is the prescribed width and returns information
	// about its extension mode.
	//
	// V       - the value to inspect.
	// width   - the expected width in bits; only 8 and 16 are recognized for
	//           loads and AssertSext/AssertZext nodes.
	// ExtType - always written: the load's extension type for a matching LOAD,
	//           SEXTLOAD/ZEXTLOAD for a matching AssertSext/AssertZext, and
	//           NON_EXTLOAD in every other case (including constants).
	//
	// Returns true if V is a load or assert of exactly the requested width, or a
	// constant whose magnitude is below 2^(width-1); false otherwise.
	static
	bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
	  ExtType = ISD::NON_EXTLOAD;

	  // True if Ty is the integer type that corresponds to the requested width.
	  auto HasRequestedWidth = [width](EVT Ty) {
	    return (Ty == MVT::i8 && width == 8) || (Ty == MVT::i16 && width == 16);
	  };

	  switch (V.getNode()->getOpcode()) {
	  default:
	    return false;
	  case ISD::LOAD: {
	    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
	    if (!HasRequestedWidth(LoadNode->getMemoryVT()))
	      return false;
	    ExtType = LoadNode->getExtensionType();
	    return true;
	  }
	  case ISD::AssertSext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if (!HasRequestedWidth(TypeNode->getVT()))
	      return false;
	    ExtType = ISD::SEXTLOAD;
	    return true;
	  }
	  case ISD::AssertZext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if (!HasRequestedWidth(TypeNode->getVT()))
	      return false;
	    ExtType = ISD::ZEXTLOAD;
	    return true;
	  }
	  case ISD::Constant:
	  case ISD::TargetConstant: {
	    // Equivalent to |C| < 2^(width-1), but written as a two-sided range check
	    // so that the most negative 64-bit value is not negated (which would be
	    // undefined behaviour).
	    const int64_t C = cast<ConstantSDNode>(V.getNode())->getSExtValue();
	    const int64_t Limit = 1LL << (width - 1);
	    return C > -Limit && C < Limit;
	  }
	  }
	}

	// This function does a whole lot of voodoo to determine if the tests are
	// equivalent without and with a mask. Essentially what happens is that given a
	// DAG resembling:
	//
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//  |    Input    | | AddConstant | | CompConstant| |     CC      |
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//           |           |           |               |
	//           V           V           |    +----------+
	//          +-------------+  +----+  |    |
	//          |     ADD     |  |0xff|  |    |
	//          +-------------+  +----+  |    |
	//                  |           |    |    |
	//                  V           V    |    |
	//                 +-------------+   |    |
	//                 |     AND     |   |    |
	//                 +-------------+   |    |
	//                      |            |    |
	//                      +-----+      |    |
	//                            |      |    |
	//                            V      V    V
	//                           +-------------+
	//                           |     CMP     |
	//                           +-------------+
	//
	// The AND node may be safely removed for some combinations of inputs. In
	// particular we need to take into account the extension type of the Input,
	// the exact values of AddConstant, CompConstant, and CC, along with the nominal
	// width of the input (this can work for any width inputs, the above graph is
	// specific to 8 bits).
	//
	// The specific equations were worked out by generating output tables for each
	// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
	// problem was simplified by working with 4 bit inputs, which means we only
	// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
	// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
	// patterns present in both extensions (0,7). For every distinct set of
	// AddConstant and CompConstant bit patterns we can consider the masked and
	// unmasked versions to be equivalent if the result of this function is true for
	// all 16 distinct bit patterns of the current extension type of Input (w0).
	//
	//   sub  w8, w0, w1
	//   and  w10, w8, #0x0f
	//   cmp  w8, w2
	//   cset w9, AArch64CC
	//   cmp  w10, w2
	//   cset w11, AArch64CC
	//   cmp  w9, w11
	//   cset w0, eq
	//   ret
	//
	// Since the above function shows when the outputs are equivalent it defines
	// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
	// would be expensive to run during compiles. The equations below were written
	// in a test harness that confirmed they gave equivalent outputs to the above
	// function for all inputs, so they can be used to determine if the removal is
	// legal instead.
	//
	// isEquivalentMaskless() is the code for testing if the AND can be removed,
	// factored out of the DAG recognition as the DAG can take several forms.
	//
	// Parameters:
	//   CC           - the AArch64CC condition code the compare result feeds.
	//   width        - nominal width in bits of the masked value.
	//   ExtType      - how Input was extended; only SEXTLOAD is treated specially.
	//   AddConstant  - the constant operand of the ADD.
	//   CompConstant - the constant the result is compared against.
	//
	// Returns true if the comparison gives the same answer under CC with and
	// without the AND, false if that could not be established.

	static bool isEquivalentMaskless(unsigned CC, unsigned width,
	ISD::LoadExtType ExtType, int AddConstant,
	int CompConstant) {
	// By being careful about our equations and only writing them in terms of
	// symbolic values and well known constants (0, 1, -1, MaxUInt) we can
	// make them generally applicable to all bit widths.
	// Note: despite its name MaxUInt holds 2^width, i.e. one more than the
	// largest unsigned value representable in `width` bits.
	int MaxUInt = (1 << width);

	// For the purposes of these comparisons sign extending the type is
	// equivalent to zero extending the add and displacing it by half the integer
	// width. Provided we are careful and make sure our equations are valid over
	// the whole range we can just adjust the input and avoid writing equations
	// for sign extended inputs.
	if (ExtType == ISD::SEXTLOAD)
	AddConstant -= (1 << (width-1));

	// Condition codes are handled in pairs that share one set of equations.
	switch(CC) {
	case AArch64CC::LE:
	case AArch64CC::GT:
	if ((AddConstant == 0) \|\|
	(CompConstant == MaxUInt - 1 && AddConstant < 0) \|\|
	(AddConstant >= 0 && CompConstant < 0) \|\|
	(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
	return true;
	break;
	case AArch64CC::LT:
	case AArch64CC::GE:
	if ((AddConstant == 0) \|\|
	(AddConstant >= 0 && CompConstant <= 0) \|\|
	(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
	return true;
	break;
	case AArch64CC::HI:
	case AArch64CC::LS:
	if ((AddConstant >= 0 && CompConstant < 0) \|\|
	(AddConstant <= 0 && CompConstant >= -1 &&
	CompConstant < AddConstant + MaxUInt))
	return true;
	break;
	case AArch64CC::PL:
	case AArch64CC::MI:
	if ((AddConstant == 0) \|\|
	(AddConstant > 0 && CompConstant <= 0) \|\|
	(AddConstant < 0 && CompConstant <= AddConstant))
	return true;
	break;
	case AArch64CC::LO:
	case AArch64CC::HS:
	if ((AddConstant >= 0 && CompConstant <= 0) \|\|
	(AddConstant <= 0 && CompConstant >= 0 &&
	CompConstant <= AddConstant + MaxUInt))
	return true;
	break;
	case AArch64CC::EQ:
	case AArch64CC::NE:
	if ((AddConstant > 0 && CompConstant < 0) \|\|
	(AddConstant < 0 && CompConstant >= 0 &&
	CompConstant < AddConstant + MaxUInt) \|\|
	(AddConstant >= 0 && CompConstant >= 0 &&
	CompConstant >= AddConstant) \|\|
	(AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
	return true;
	break;
	// These four condition codes are accepted unconditionally.
	case AArch64CC::VS:
	case AArch64CC::VC:
	case AArch64CC::AL:
	case AArch64CC::NV:
	return true;
	case AArch64CC::Invalid:
	break;
	}

	// No equation matched: equivalence was not established.
	return false;
	}

	/// Remove a redundant 8- or 16-bit mask that feeds the compare of a
	/// conditional node.
	///
	/// Looks for the pattern
	///     SUBS (AND (ADD Input, AddConstant), 0xff|0xffff), CompConstant
	/// feeding operand \p CmpIndex of \p N, with the condition code in operand
	/// \p CCIndex. When isEquivalentMaskless() shows that the AND does not affect
	/// the outcome under that condition code, the SUBS is rebuilt directly on the
	/// ADD and all uses of the old SUBS are redirected to it.
	///
	/// \returns SDValue(N, 0) if the DAG was changed, otherwise an empty SDValue.
	/// \p DCI is not used by this function.
	static
	SDValue performCONDCombine(SDNode *N,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG, unsigned CCIndex,
	                           unsigned CmpIndex) {
	  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
	  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
	  unsigned CondOpcode = SubsNode->getOpcode();

	  if (CondOpcode != AArch64ISD::SUBS)
	    return SDValue();

	  // The rewrite below replaces every result of the SUBS, but it is only shown
	  // to be correct for the flags as observed under this one condition code.
	  // Dropping the AND changes the arithmetic result, and another flag consumer
	  // may test a different condition, so require that the value result is
	  // unused and that the flags have exactly one use.
	  if (!SubsNode->hasNUsesOfValue(0, 0) || !SubsNode->hasNUsesOfValue(1, 1))
	    return SDValue();

	  // There is a SUBS feeding this condition. Is it fed by a mask we can
	  // use?

	  SDNode *AndNode = SubsNode->getOperand(0).getNode();
	  unsigned MaskBits = 0;

	  if (AndNode->getOpcode() != ISD::AND)
	    return SDValue();

	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
	    // Inspect the full 64-bit constant: a wider mask whose low 32 bits happen
	    // to be 0xff or 0xffff is not an 8/16-bit mask.
	    uint64_t CNV = CN->getZExtValue();
	    if (CNV == 255)
	      MaskBits = 8;
	    else if (CNV == 65535)
	      MaskBits = 16;
	  }

	  if (!MaskBits)
	    return SDValue();

	  SDValue AddValue = AndNode->getOperand(0);

	  if (AddValue.getOpcode() != ISD::ADD)
	    return SDValue();

	  // The basic dag structure is correct, grab the inputs and validate them.

	  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
	  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
	  SDValue SubsInputValue = SubsNode->getOperand(1);

	  // The mask is present and the provenance of all the values is a smaller type,
	  // lets see if the mask is superfluous.

	  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
	      !isa<ConstantSDNode>(SubsInputValue.getNode()))
	    return SDValue();

	  ISD::LoadExtType ExtType;

	  // Each call overwrites ExtType; AddInputValue1 is checked last so that the
	  // extension type left behind is the one of the non-constant input.
	  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue1, MaskBits, ExtType))
	    return SDValue();

	  if (!isEquivalentMaskless(CC, MaskBits, ExtType,
	          cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
	          cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
	    return SDValue();

	  // The AND is not necessary, remove it.

	  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
	                               SubsNode->getValueType(1));
	  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

	  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
	  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

	  return SDValue(N, 0);
	}

	// Optimize compare with zero and branch.
	//
	// An EQ/NE conditional branch whose flags come from an ADDS/SUBS that compares
	// a 32- or 64-bit value against zero is turned into CBZ/CBNZ on that value.
	// The replacement is installed through DCI; the function always returns an
	// empty SDValue.
	static SDValue performBRCONDCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
	  // will not be produced, as they are conditional branch instructions that do
	  // not set flags.
	  const Function &Fn = DAG.getMachineFunction().getFunction();
	  if (Fn.hasFnAttribute(Attribute::SpeculativeLoadHardening))
	    return SDValue();

	  // Give the mask-removal combine a chance first (condition code is operand
	  // 2, compare is operand 3).
	  if (SDValue Simplified = performCONDCombine(N, DCI, DAG, 2, 3))
	    N = Simplified.getNode();

	  SDValue Chain = N->getOperand(0);
	  SDValue Target = N->getOperand(1);
	  SDValue CondCode = N->getOperand(2);
	  SDValue Compare = N->getOperand(3);

	  assert(isa<ConstantSDNode>(CondCode) && "Expected a ConstantSDNode here!");
	  const unsigned CC = cast<ConstantSDNode>(CondCode)->getZExtValue();
	  const bool BranchIfZero = CC == AArch64CC::EQ;
	  if (!BranchIfZero && CC != AArch64CC::NE)
	    return SDValue();

	  switch (Compare.getOpcode()) {
	  case AArch64ISD::ADDS:
	  case AArch64ISD::SUBS:
	    break;
	  default:
	    return SDValue();
	  }

	  // Only attempt folding if there is only one use of the flag and no use of the
	  // value.
	  if (!(Compare->hasNUsesOfValue(0, 0) && Compare->hasNUsesOfValue(1, 1)))
	    return SDValue();

	  SDValue Tested = Compare.getOperand(0);
	  SDValue Zero = Compare.getOperand(1);

	  assert(Tested.getValueType() == Zero.getValueType() &&
	         "Expected the value type to be the same for both operands!");
	  if (Tested.getValueType() != MVT::i32 && Tested.getValueType() != MVT::i64)
	    return SDValue();

	  // Put the zero, if any, on the right-hand side.
	  if (isNullConstant(Tested))
	    std::swap(Tested, Zero);

	  if (!isNullConstant(Zero))
	    return SDValue();

	  // Values produced by a shift are left alone.
	  switch (Tested.getOpcode()) {
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL:
	    return SDValue();
	  default:
	    break;
	  }

	  // Fold the compare into the branch instruction.
	  const unsigned BranchOpc = BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
	  SDValue Branch =
	      DAG.getNode(BranchOpc, SDLoc(N), MVT::Other, Chain, Tested, Target);

	  // Do not add new nodes to DAG combiner worklist.
	  DCI.CombineTo(N, Branch, false);

	  return SDValue();
	}

	// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
	// as well as whether the test should be inverted. This code is required to
	// catch these cases (as opposed to standard dag combines) because
	// AArch64ISD::TBZ is matched during legalization.
	//
	// Starting at Op, single-use nodes are peeled off one at a time for as long
	// as the tested bit can be traced through them; Bit and Invert are updated in
	// place and the innermost value reached is returned.
	static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
	                                 SelectionDAG &DAG) {
	  for (;;) {
	    if (!Op->hasOneUse())
	      return Op;

	    // We don't handle undef/constant-fold cases below, as they should have
	    // already been taken care of (e.g. and of 0, test of undefined shifted
	    // bits, etc.)

	    const unsigned Opcode = Op->getOpcode();

	    // (tbz (trunc x), b) -> (tbz x, b)
	    // This case is just here to enable more of the below cases to be caught.
	    if (Opcode == ISD::TRUNCATE &&
	        Bit < Op->getValueType(0).getSizeInBits()) {
	      Op = Op->getOperand(0);
	      continue;
	    }

	    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
	    if (Opcode == ISD::ANY_EXTEND &&
	        Bit < Op->getOperand(0).getValueSizeInBits()) {
	      Op = Op->getOperand(0);
	      continue;
	    }

	    // Everything below is a binary node with a constant right-hand side.
	    if (Op->getNumOperands() != 2)
	      return Op;

	    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	    if (!RHS)
	      return Op;

	    switch (Opcode) {
	    default:
	      return Op;

	    // (tbz (and x, m), b) -> (tbz x, b)
	    case ISD::AND:
	      if (((RHS->getZExtValue() >> Bit) & 1) == 0)
	        return Op;
	      break;

	    // (tbz (shl x, c), b) -> (tbz x, b-c)
	    case ISD::SHL:
	      if (!(RHS->getZExtValue() <= Bit &&
	            (Bit - RHS->getZExtValue()) <
	                Op->getValueType(0).getSizeInBits()))
	        return Op;
	      Bit = Bit - RHS->getZExtValue();
	      break;

	    // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
	    // in x
	    case ISD::SRA:
	      Bit = Bit + RHS->getZExtValue();
	      if (Bit >= Op->getValueType(0).getSizeInBits())
	        Bit = Op->getValueType(0).getSizeInBits() - 1;
	      break;

	    // (tbz (srl x, c), b) -> (tbz x, b+c)
	    case ISD::SRL:
	      if (!((Bit + RHS->getZExtValue()) < Op->getValueType(0).getSizeInBits()))
	        return Op;
	      Bit = Bit + RHS->getZExtValue();
	      break;

	    // (tbz (xor x, -1), b) -> (tbnz x, b)
	    case ISD::XOR:
	      if ((RHS->getZExtValue() >> Bit) & 1)
	        Invert = !Invert;
	      break;
	    }

	    // The node was looked through; carry on with its left-hand operand.
	    Op = Op->getOperand(0);
	  }
	}

	// Optimize test single bit zero/non-zero and branch.
	//
	// Asks getTestBitOperand() for a simpler value/bit to test. If it finds one,
	// the TBZ/TBNZ is rebuilt on that value, swapping TBZ and TBNZ when the test
	// has to be inverted. Returns an empty SDValue if nothing simpler was found.
	static SDValue performTBZCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 SelectionDAG &DAG) {
	  SDValue OldSrc = N->getOperand(1);
	  unsigned BitNo = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
	  bool Flip = false;

	  SDValue Src = getTestBitOperand(OldSrc, BitNo, Flip, DAG);
	  if (Src == OldSrc)
	    return SDValue();

	  unsigned Opcode = N->getOpcode();
	  if (Flip) {
	    if (Opcode != AArch64ISD::TBZ) {
	      assert(Opcode == AArch64ISD::TBNZ);
	      Opcode = AArch64ISD::TBZ;
	    } else {
	      Opcode = AArch64ISD::TBNZ;
	    }
	  }

	  SDLoc DL(N);
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(3);
	  SDValue BitOp = DAG.getConstant(BitNo, DL, MVT::i64);
	  return DAG.getNode(Opcode, DL, MVT::Other, Chain, Src, BitOp, Dest);
	}

	// vselect (v1i1 setcc) ->
	//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
	// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
	// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
	// such VSELECT.
	//
	// Returns the rebuilt VSELECT, or an empty SDValue if the pattern does not
	// match.
	static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  EVT CondVT = Cond.getValueType();

	  const bool IsV1I1SetCC = Cond.getOpcode() == ISD::SETCC &&
	                           CondVT.getVectorNumElements() == 1 &&
	                           CondVT.getVectorElementType() == MVT::i1;
	  if (!IsV1I1SetCC)
	    return SDValue();

	  SDValue CmpLHS = Cond.getOperand(0);
	  SDValue CmpRHS = Cond.getOperand(1);
	  EVT SelectVT = N->getValueType(0);
	  EVT OperandVT = CmpLHS.getValueType();
	  // Only combine when the result type is of the same size as the compared
	  // operands.
	  if (SelectVT.getSizeInBits() != OperandVT.getSizeInBits())
	    return SDValue();

	  // Redo the compare with an integer result as wide as its operands and
	  // select on that instead.
	  SDLoc DL(N);
	  SDValue WideSetCC =
	      DAG.getSetCC(DL, OperandVT.changeVectorElementTypeToInteger(), CmpLHS,
	                   CmpRHS, cast<CondCodeSDNode>(Cond.getOperand(2))->get());
	  return DAG.getNode(ISD::VSELECT, DL, SelectVT, WideSetCC, N->getOperand(1),
	                     N->getOperand(2));
	}

	/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
	/// the compare-mask instructions rather than going via NZCV, even if LHS and
	/// RHS are really scalar. This replaces any scalar setcc in the above pattern
	/// with a vector one followed by a DUP shuffle on the result.
	///
	/// \returns the new select, or an empty SDValue if the pattern does not apply.
	static SDValue performSelectCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Cond = N->getOperand(0);
	  EVT SelectVT = N->getValueType(0);

	  if (Cond.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
	  // scalar SetCCResultType. We also don't expect vectors, because we assume
	  // that selects fed by vector SETCCs are canonicalized to VSELECT.
	  assert((Cond.getValueType() == MVT::i1 || Cond.getValueType() == MVT::i32) &&
	         "Scalar-SETCC feeding SELECT has unexpected result type!");

	  EVT CmpVT = Cond.getOperand(0).getValueType();

	  // Don't try to do this optimization when the setcc itself has i1 operands.
	  // There are no legal vectors of i1, so this would be pointless.
	  if (CmpVT == MVT::i1)
	    return SDValue();

	  // If LaneCount == 0, the comparison is larger than select result. The
	  // largest real NEON comparison is 64-bits per lane, which means the result is
	  // at most 32-bits and an illegal vector. Just bail out for now.
	  const int LaneCount = SelectVT.getSizeInBits() / CmpVT.getSizeInBits();
	  if (LaneCount == 0 || !SelectVT.isVector())
	    return SDValue();

	  // Vector type for the compare operands, and the integer vector type of the
	  // resulting mask.
	  CmpVT = EVT::getVectorVT(*DAG.getContext(), CmpVT, LaneCount);
	  EVT MaskVT = CmpVT.changeVectorElementTypeToInteger();

	  // Also bail out if the vector mask type isn't the same size as the select
	  // type. This can happen if the SETCC operand size doesn't divide the select
	  // type size (e.g., f64 vs v3f32).
	  if (MaskVT.getSizeInBits() != SelectVT.getSizeInBits())
	    return SDValue();

	  // Make sure we didn't create illegal types, if we're not supposed to.
	  assert(DCI.isBeforeLegalize() ||
	         DAG.getTargetLoweringInfo().isTypeLegal(CmpVT));

	  // First perform a vector comparison, where lane 0 is the one we're interested
	  // in.
	  SDLoc DL(Cond);
	  SDValue VecLHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CmpVT, Cond.getOperand(0));
	  SDValue VecRHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, CmpVT, Cond.getOperand(1));
	  SDValue VecCmp =
	      DAG.getNode(ISD::SETCC, DL, MaskVT, VecLHS, VecRHS, Cond.getOperand(2));

	  // Now duplicate the comparison mask we want across all other lanes.
	  SmallVector<int, 8> Lane0Everywhere(MaskVT.getVectorNumElements(), 0);
	  SDValue Splat =
	      DAG.getVectorShuffle(MaskVT, DL, VecCmp, VecCmp, Lane0Everywhere);
	  SDValue SelectMask = DAG.getNode(
	      ISD::BITCAST, DL, SelectVT.changeVectorElementTypeToInteger(), Splat);

	  return DAG.getSelect(DL, SelectVT, SelectMask, N->getOperand(1),
	                       N->getOperand(2));
	}

	/// Fold away an NVCAST whose result type is identical to its operand type.
	///
	/// \returns the cast's operand when the cast is a no-op, otherwise an empty
	/// SDValue.
	static SDValue performNVCASTCombine(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  const bool IsNoOpCast = N->getValueType(0) == Src.getValueType();
	  return IsNoOpCast ? Src : SDValue();
	}

	// If all users of the globaladdr are of the form (globaladdr + constant), find
	// the smallest constant, fold it into the globaladdr's offset and rewrite the
	// globaladdr as (globaladdr + constant) - constant.
	//
	// Returns the replacement (sub (globaladdr + MinOffset), MinOffset), or an
	// empty SDValue when the global is not referenced with MO_NO_FLAG, when some
	// user is not an ADD with a constant operand, or when the folded offset fails
	// the checks below.
	static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
	                                           const AArch64Subtarget *Subtarget,
	                                           const TargetMachine &TM) {
	  auto *GN = cast<GlobalAddressSDNode>(N);
	  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
	      AArch64II::MO_NO_FLAG)
	    return SDValue();

	  // Find the smallest constant added to this address by any user. The loop
	  // variable is deliberately not called N, so it does not shadow the parameter.
	  uint64_t MinOffset = -1ull;
	  for (SDNode *User : GN->uses()) {
	    if (User->getOpcode() != ISD::ADD)
	      return SDValue();
	    auto *C = dyn_cast<ConstantSDNode>(User->getOperand(0));
	    if (!C)
	      C = dyn_cast<ConstantSDNode>(User->getOperand(1));
	    if (!C)
	      return SDValue();
	    MinOffset = std::min(MinOffset, C->getZExtValue());
	  }
	  uint64_t Offset = MinOffset + GN->getOffset();

	  // Require that the new offset is larger than the existing one. Otherwise, we
	  // can end up oscillating between two possible DAGs, for example,
	  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
	  if (Offset <= uint64_t(GN->getOffset()))
	    return SDValue();

	  // Check whether folding this offset is legal. It must not go out of bounds of
	  // the referenced object to avoid violating the code model, and must be
	  // smaller than 2^21 because this is the largest offset expressible in all
	  // object formats.
	  //
	  // This check also prevents us from folding negative offsets, which will end
	  // up being treated in the same way as large positive ones. They could also
	  // cause code model violations, and aren't really common enough to matter.
	  if (Offset >= (1 << 21))
	    return SDValue();

	  const GlobalValue *GV = GN->getGlobal();
	  Type *T = GV->getValueType();
	  if (!T->isSized() ||
	      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
	    return SDValue();

	  SDLoc DL(GN);
	  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
	  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
	                     DAG.getConstant(MinOffset, DL, MVT::i64));
	}

	// Target-specific DAG combine entry point: dispatch on the opcode of N to the
	// matching perform*Combine helper and return whatever that helper returns.
	// Opcodes without a case (and intrinsic IDs without a case in the nested
	// switch) fall out of the switch and yield an empty SDValue.
	SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default:
	LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
	break;
	case ISD::ADD:
	case ISD::SUB:
	return performAddSubLongCombine(N, DCI, DAG);
	case ISD::XOR:
	return performXorCombine(N, DAG, DCI, Subtarget);
	case ISD::MUL:
	return performMulCombine(N, DAG, DCI, Subtarget);
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	return performIntToFpCombine(N, DAG, Subtarget);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	return performFpToIntCombine(N, DAG, DCI, Subtarget);
	case ISD::FDIV:
	return performFDivCombine(N, DAG, DCI, Subtarget);
	case ISD::OR:
	return performORCombine(N, DCI, Subtarget);
	case ISD::AND:
	return performANDCombine(N, DCI);
	case ISD::SRL:
	return performSRLCombine(N, DCI);
	case ISD::INTRINSIC_WO_CHAIN:
	return performIntrinsicCombine(N, DCI, Subtarget);
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::SIGN_EXTEND:
	return performExtendCombine(N, DCI, DAG);
	case ISD::BITCAST:
	return performBitcastCombine(N, DCI, DAG);
	case ISD::CONCAT_VECTORS:
	return performConcatVectorsCombine(N, DCI, DAG);
	case ISD::SELECT:
	return performSelectCombine(N, DCI);
	case ISD::VSELECT:
	return performVSelectCombine(N, DCI.DAG);
	case ISD::LOAD:
	// Operand 1 of the load is handed to the TBI simplification; when that
	// helper reports success the node itself is returned as the result.
	if (performTBISimplification(N->getOperand(1), DCI, DAG))
	return SDValue(N, 0);
	break;
	case ISD::STORE:
	return performSTORECombine(N, DCI, DAG, Subtarget);
	case AArch64ISD::BRCOND:
	return performBRCONDCombine(N, DCI, DAG);
	case AArch64ISD::TBNZ:
	case AArch64ISD::TBZ:
	return performTBZCombine(N, DCI, DAG);
	case AArch64ISD::CSEL:
	return performCONDCombine(N, DCI, DAG, 2, 3);
	case AArch64ISD::DUP:
	return performPostLD1Combine(N, DCI, false);
	case AArch64ISD::NVCAST:
	return performNVCASTCombine(N);
	case ISD::INSERT_VECTOR_ELT:
	return performPostLD1Combine(N, DCI, true);
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN:
	// For chained/void intrinsics operand 1 is read as the intrinsic ID; only
	// the NEON structured load/store intrinsics listed here are combined.
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	case Intrinsic::aarch64_neon_ld2:
	case Intrinsic::aarch64_neon_ld3:
	case Intrinsic::aarch64_neon_ld4:
	case Intrinsic::aarch64_neon_ld1x2:
	case Intrinsic::aarch64_neon_ld1x3:
	case Intrinsic::aarch64_neon_ld1x4:
	case Intrinsic::aarch64_neon_ld2lane:
	case Intrinsic::aarch64_neon_ld3lane:
	case Intrinsic::aarch64_neon_ld4lane:
	case Intrinsic::aarch64_neon_ld2r:
	case Intrinsic::aarch64_neon_ld3r:
	case Intrinsic::aarch64_neon_ld4r:
	case Intrinsic::aarch64_neon_st2:
	case Intrinsic::aarch64_neon_st3:
	case Intrinsic::aarch64_neon_st4:
	case Intrinsic::aarch64_neon_st1x2:
	case Intrinsic::aarch64_neon_st1x3:
	case Intrinsic::aarch64_neon_st1x4:
	case Intrinsic::aarch64_neon_st2lane:
	case Intrinsic::aarch64_neon_st3lane:
	case Intrinsic::aarch64_neon_st4lane:
	return performNEONPostLDSTCombine(N, DCI, DAG);
	default:
	break;
	}
	break;
	case ISD::GlobalAddress:
	return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
	}
	// No combine applied.
	return SDValue();
	}

	// Check if the return value is used as only a return value, as otherwise
	// we can't perform a tail-call. In particular, we need to check for
	// target ISD nodes that are returns and any other "odd" constructs
	// that the generic analysis code won't necessarily catch.
	//
	// On success Chain is updated: to the CopyToReg's chain operand when the
	// single user is a CopyToReg, otherwise it keeps its incoming value.
	bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
	                                               SDValue &Chain) const {
	  if (N->getNumValues() != 1)
	    return false;
	  if (!N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // Exactly one use exists (checked above); look at that single user.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
	        MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  // Every user of the copy/extend must be a return, and there must be at
	  // least one.
	  bool HasRet = false;
	  for (SDNode *Node : Copy->uses()) {
	    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	// Return whether an instruction can potentially be optimized to a tail
	// call. This will cause the optimizers to attempt to move, or duplicate,
	// return instructions to help enable tail call optimizations for this
	// instruction. The answer is simply the call's own tail-call marker.
	bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  const bool MarkedAsTail = CI->isTailCall();
	  return MarkedAsTail;
	}

	// Decompose an address computation Op of the form (add/sub base, constant)
	// into Base and Offset for an indexed load/store. IsInc is set to true for
	// ADD and false for SUB. Returns false if Op is not an add/sub, if its second
	// operand is not a constant, or if the (negated, for SUB) constant does not
	// fit the signed 9-bit immediate. Base is written as soon as Op is known to
	// be an add/sub, even when the function goes on to return false. AM is not
	// touched here.
	bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
	                                                   SDValue &Offset,
	                                                   ISD::MemIndexedMode &AM,
	                                                   bool &IsInc,
	                                                   SelectionDAG &DAG) const {
	  const unsigned Opc = Op->getOpcode();
	  if (Opc != ISD::ADD && Opc != ISD::SUB)
	    return false;

	  Base = Op->getOperand(0);

	  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!RHS)
	    return false;

	  // All of the indexed addressing mode instructions take a signed
	  // 9 bit immediate offset.
	  int64_t RHSC = RHS->getSExtValue();
	  if (Opc == ISD::SUB)
	    RHSC = -(uint64_t)RHSC;
	  if (!isInt<9>(RHSC))
	    return false;

	  IsInc = (Opc == ISD::ADD);
	  Offset = Op->getOperand(1);
	  return true;
	}

	// Try to describe the address of load/store N as a pre-indexed access.
	//
	// On success Base and Offset are filled in by getIndexedAddressParts and AM
	// is set to ISD::PRE_INC or ISD::PRE_DEC. Returns false if N is neither a
	// load nor a store, or if getIndexedAddressParts rejects its base pointer.
	bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	                                                      SDValue &Offset,
	                                                      ISD::MemIndexedMode &AM,
	                                                      SelectionDAG &DAG) const {
	  // Only the base pointer is needed; the memory VT was previously fetched but
	  // never used.
	  SDValue Ptr;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
	    Ptr = LD->getBasePtr();
	  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
	    Ptr = ST->getBasePtr();
	  else
	    return false;

	  bool IsInc;
	  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
	    return false;
	  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
	  return true;
	}

	bool AArch64TargetLowering::getPostIndexedAddressParts(
	SDNode N, SDNode Op, SDValue &Base, SDValue &Offset,
	ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
	EVT VT;
	SDValue Ptr;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	Ptr = LD->getBasePtr();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	Ptr = ST->getBasePtr();
	} else
	return false;

	bool IsInc;
	if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
	return false;
	// Post-indexing updates the base, so it's not a valid transform
	// if that's not the same as the load's pointer.
	if (Ptr != Base)
	return false;
	AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
	return true;
	}

	// Custom result replacement for a BITCAST from f16 to i16. The f16 operand is
	// inserted into the hsub subregister of an undefined value via INSERT_SUBREG
	// (typed f32), bitcast to i32 and truncated to i16; that truncate is appended
	// to Results. Any other type combination leaves Results untouched.
	static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                  SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Op = N->getOperand(0);

	  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
	    return;

	  Op = SDValue(
	      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
	                         DAG.getUNDEF(MVT::i32), Op,
	                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
	      0);
	  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
	  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
	}

	// Replace an across-vector reduction N by splitting its vector operand in
	// two, combining the halves lane-wise with InterOp, and then applying
	// AcrossOp to the combined half. The single resulting value is appended to
	// Results.
	static void ReplaceReductionResults(SDNode *N,
	                                    SmallVectorImpl<SDValue> &Results,
	                                    SelectionDAG &DAG, unsigned InterOp,
	                                    unsigned AcrossOp) {
	  SDLoc Loc(N);
	  // Only the low half's type is needed for the nodes built below.
	  EVT HalfVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
	  auto Halves = DAG.SplitVectorOperand(N, 0);
	  SDValue Combined =
	      DAG.getNode(InterOp, Loc, HalfVT, Halves.first, Halves.second);
	  Results.push_back(DAG.getNode(AcrossOp, Loc, HalfVT, Combined));
	}

	// Split the i128 value N into two i64 values: first is N truncated to i64,
	// second is (N >> 64) truncated to i64.
	static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
	  SDLoc Loc(N);
	  SDValue LowHalf = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i64, N);
	  SDValue ShiftAmt = DAG.getConstant(64, Loc, MVT::i64);
	  SDValue Shifted = DAG.getNode(ISD::SRL, Loc, MVT::i128, N, ShiftAmt);
	  SDValue HighHalf = DAG.getNode(ISD::TRUNCATE, Loc, MVT::i64, Shifted);
	  return {LowHalf, HighHalf};
	}

	// Create an even/odd pair of X registers holding integer value V.
	//
	// V is split into a low half (V any-extended or truncated to i64) and a high
	// half (V >> 64, likewise); the halves are swapped on big-endian targets and
	// then packed into an untyped REG_SEQUENCE over the XSeqPairs register class
	// using the sube64/subo64 subregister indices.
	static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
	  SDLoc Loc(V.getNode());
	  SDValue Lo64 = DAG.getAnyExtOrTrunc(V, Loc, MVT::i64);
	  SDValue ShiftAmt = DAG.getConstant(64, Loc, MVT::i64);
	  SDValue Shifted = DAG.getNode(ISD::SRL, Loc, MVT::i128, V, ShiftAmt);
	  SDValue Hi64 = DAG.getAnyExtOrTrunc(Shifted, Loc, MVT::i64);
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(Lo64, Hi64);

	  SDValue PairClass =
	      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, Loc, MVT::i32);
	  SDValue EvenIdx = DAG.getTargetConstant(AArch64::sube64, Loc, MVT::i32);
	  SDValue OddIdx = DAG.getTargetConstant(AArch64::subo64, Loc, MVT::i32);
	  const SDValue SeqOps[] = {PairClass, Lo64, EvenIdx, Hi64, OddIdx};
	  MachineSDNode *Seq =
	      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, Loc, MVT::Untyped, SeqOps);
	  return SDValue(Seq, 0);
	}

	// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP.
	//
	// With LSE, the node becomes a CASP* machine node (variant chosen from the
	// memory operand's atomic ordering) whose compare/store values are X register
	// pairs; the two i64 halves are extracted from its untyped result. Without
	// LSE, the node becomes the CMP_SWAP_128 pseudo operating on split i64
	// halves. Either way Results receives: first half, second half, chain.
	static void ReplaceCMP_SWAP_128Results(SDNode *N,
	                                       SmallVectorImpl<SDValue> &Results,
	                                       SelectionDAG &DAG,
	                                       const AArch64Subtarget *Subtarget) {
	  assert(N->getValueType(0) == MVT::i128 &&
	         "AtomicCmpSwap on types less than 128 should be legal");

	  SDLoc Loc(N);

	  if (!Subtarget->hasLSE()) {
	    auto Expected = splitInt128(N->getOperand(2), DAG);
	    auto Replacement = splitInt128(N->getOperand(3), DAG);
	    SDValue PseudoOps[] = {N->getOperand(1),   Expected.first,
	                           Expected.second,    Replacement.first,
	                           Replacement.second, N->getOperand(0)};
	    MachineSDNode *Pseudo = DAG.getMachineNode(
	        AArch64::CMP_SWAP_128, Loc,
	        DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), PseudoOps);

	    MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
	    DAG.setNodeMemRefs(Pseudo, {MemOp});

	    // Results 0 and 1 are the two i64 halves; result 2 (the i32) is not
	    // forwarded; result 3 is the chain.
	    Results.push_back(SDValue(Pseudo, 0));
	    Results.push_back(SDValue(Pseudo, 1));
	    Results.push_back(SDValue(Pseudo, 3));
	    return;
	  }

	  // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
	  // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
	  SDValue CaspOps[] = {
	      createGPRPairNode(DAG, N->getOperand(2)), // Compare value
	      createGPRPairNode(DAG, N->getOperand(3)), // Store value
	      N->getOperand(1),                         // Ptr
	      N->getOperand(0),                         // Chain in
	  };

	  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();

	  // Pick the CASP variant matching the requested ordering.
	  auto CaspOpcodeFor = [](AtomicOrdering Ordering) -> unsigned {
	    switch (Ordering) {
	    case AtomicOrdering::Monotonic:
	      return AArch64::CASPX;
	    case AtomicOrdering::Acquire:
	      return AArch64::CASPAX;
	    case AtomicOrdering::Release:
	      return AArch64::CASPLX;
	    case AtomicOrdering::AcquireRelease:
	    case AtomicOrdering::SequentiallyConsistent:
	      return AArch64::CASPALX;
	    default:
	      llvm_unreachable("Unexpected ordering!");
	    }
	  };
	  const unsigned Opcode = CaspOpcodeFor(MemOp->getOrdering());

	  MachineSDNode *Casp = DAG.getMachineNode(
	      Opcode, Loc, DAG.getVTList(MVT::Untyped, MVT::Other), CaspOps);
	  DAG.setNodeMemRefs(Casp, {MemOp});

	  // The subregister holding each half flips on big-endian targets.
	  unsigned FirstIdx = AArch64::sube64, SecondIdx = AArch64::subo64;
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(FirstIdx, SecondIdx);
	  SDValue Pair(Casp, 0);
	  Results.push_back(DAG.getTargetExtractSubreg(FirstIdx, Loc, MVT::i64, Pair));
	  Results.push_back(DAG.getTargetExtractSubreg(SecondIdx, Loc, MVT::i64, Pair));
	  Results.push_back(SDValue(Casp, 1)); // Chain out
	}

	// Custom type-legalization hook: for a node N whose result needs replacing,
	// dispatch on its opcode and append the replacement value(s) to Results.
	// Opcodes without a case hit llvm_unreachable.
	void AArch64TargetLowering::ReplaceNodeResults(
	SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
	switch (N->getOpcode()) {
	default:
	llvm_unreachable("Don't know how to custom expand this");
	case ISD::BITCAST:
	ReplaceBITCASTResults(N, Results, DAG);
	return;
	case ISD::VECREDUCE_ADD:
	case ISD::VECREDUCE_SMAX:
	case ISD::VECREDUCE_SMIN:
	case ISD::VECREDUCE_UMAX:
	case ISD::VECREDUCE_UMIN:
	Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
	return;

	// Across-vector reductions: each passes the lane-wise opcode used to combine
	// the two split halves, followed by the across-vector opcode itself.
	case AArch64ISD::SADDV:
	ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
	return;
	case AArch64ISD::UADDV:
	ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
	return;
	case AArch64ISD::SMINV:
	ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
	return;
	case AArch64ISD::UMINV:
	ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
	return;
	case AArch64ISD::SMAXV:
	ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
	return;
	case AArch64ISD::UMAXV:
	ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
	return;
	case ISD::FP_TO_UINT:
	case ISD::FP_TO_SINT:
	assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
	// Let normal code take care of it by not adding anything to Results.
	return;
	case ISD::ATOMIC_CMP_SWAP:
	ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
	return;
	}
	}

	// Android and Fuchsia defer to the generic TargetLowering answer; every other
	// target reports true.
	bool AArch64TargetLowering::useLoadStackGuardNode() const {
	  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
	    return TargetLowering::useLoadStackGuardNode();
	  return true;
	}

	// Threshold for turning repeated FDIVs by the same divisor into FMULs by the
	// reciprocal.
	unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
	  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
	  // reciprocal if there are three or more FDIVs.
	  const unsigned MinRepeatedDivisions = 3;
	  return MinRepeatedDivisions;
	}

	// Choose how type legalization should treat vector type VT. The one-element
	// vectors listed below are widened; everything else uses the generic
	// TargetLoweringBase preference.
	TargetLoweringBase::LegalizeTypeAction
	AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
	  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
	  // v4i16, v2i32 instead of to promote. v1f32 is widened as well.
	  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
	      VT == MVT::v1f32)
	    return TypeWidenVector;

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	// Loads and stores less than 128-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Only stores of exactly 128 bits are expanded in IR.
	bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  const unsigned Bits = StoredTy->getPrimitiveSizeInBits();
	  return Bits == 128;
	}

	// Loads and stores less than 128-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Only loads of exactly 128 bits are expanded (as LL/SC).
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
	  if (Bits == 128)
	    return AtomicExpansionKind::LLSC;
	  return AtomicExpansionKind::None;
	}

	// For the real atomic operations, we have ldxr/stxr up to 128 bits.
	// Floating-point operations go through cmpxchg; anything wider than 128 bits
	// is left alone; otherwise LL/SC is used unless LSE can handle it directly
	// (sub-128-bit, non-NAND).
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  if (AI->isFloatingPointOperation())
	    return AtomicExpansionKind::CmpXChg;

	  const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
	  if (Bits > 128)
	    return AtomicExpansionKind::None;

	  // Nand not supported in LSE.
	  if (AI->getOperation() == AtomicRMWInst::Nand)
	    return AtomicExpansionKind::LLSC;

	  // Leave 128 bits to LLSC.
	  if (Subtarget->hasLSE() && Bits < 128)
	    return AtomicExpansionKind::None;
	  return AtomicExpansionKind::LLSC;
	}

	// Decide how a cmpxchg is expanded in IR: not at all when LSE is available or
	// when compiling at opt level 0, otherwise as an LL/SC loop.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
	    AtomicCmpXchgInst *AI) const {
	  // If subtarget has LSE, leave cmpxchg intact for codegen.
	  const bool KeepForLSE = Subtarget->hasLSE();

	  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
	  // implement cmpxchg without spilling. If the address being exchanged is also
	  // on the stack and close enough to the spill slot, this can lead to a
	  // situation where the monitor always gets cleared and the atomic operation
	  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
	  if (KeepForLSE || getTargetMachine().getOptLevel() == 0)
	    return AtomicExpansionKind::None;

	  return AtomicExpansionKind::LLSC;
	}

	// Emit a load-exclusive of the value Addr points to, using the acquire
	// variant of the intrinsic when Ord is acquire or stronger.
	//
	// For a 128-bit pointee the ldxp/ldaxp pair intrinsic is called on an i8*
	// view of Addr and the two halves are recombined; otherwise the ldxr/ldaxr
	// intrinsic overloaded on Addr's type is called and its result is truncated
	// and bitcast back to the pointee type. Returns the loaded value.
	Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                                             AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
	  bool IsAcquire = isAcquireOrStronger(Ord);

	  // Since i128 isn't legal and intrinsics don't get type-lowered, the
	  // ldxp/ldaxp intrinsic must return {i64, i64} and we have to recombine them
	  // into a single i128 here.
	  if (ValTy->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
	    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

	    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
	    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
	    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
	    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
	    return Builder.CreateOr(
	        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
	  }

	  Type *Tys[] = { Addr->getType() };
	  Intrinsic::ID Int =
	      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
	  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Addr is unchanged on this path, so ValTy is still its pointee type. The
	  // intrinsic's result is narrowed to an integer of the pointee's bit width
	  // and then bitcast to the pointee type.
	  const DataLayout &DL = M->getDataLayout();
	  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValTy));
	  Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);

	  return Builder.CreateBitCast(Trunc, ValTy);
	}

	// Emit a call to the aarch64_clrex intrinsic at the builder's current
	// insertion point.
	void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
	    IRBuilder<> &Builder) const {
	  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
	  Function *ClearExclusive =
	      Intrinsic::getDeclaration(Mod, Intrinsic::aarch64_clrex);
	  Builder.CreateCall(ClearExclusive);
	}

	// Emit a store-exclusive of Val to Addr, using the release variant of the
	// intrinsic when Ord is release or stronger.
	//
	// A 128-bit Val is split into two i64 halves for the stxp/stlxp intrinsic;
	// otherwise Val is bitcast to an integer of its own width and zero-extended
	// (or bitcast) to the stxr/stlxr intrinsic's first parameter type. Returns
	// the intrinsic call.
	Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
	                                                   Value *Val, Value *Addr,
	                                                   AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  bool IsRelease = isReleaseOrStronger(Ord);

	  // Since the intrinsics must have legal type, the i128 intrinsics take two
	  // parameters: "i64, i64". We must marshal Val into the appropriate form
	  // before the call.
	  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
	    Function *Stxr = Intrinsic::getDeclaration(M, Int);
	    Type *Int64Ty = Type::getInt64Ty(M->getContext());

	    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
	    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
	  }

	  Intrinsic::ID Int =
	      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
	  Type *Tys[] = { Addr->getType() };
	  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

	  const DataLayout &DL = M->getDataLayout();
	  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
	  Val = Builder.CreateBitCast(Val, IntValTy);

	  return Builder.CreateCall(Stxr,
	                            {Builder.CreateZExtOrBitCast(
	                                 Val, Stxr->getFunctionType()->getParamType(0)),
	                             Addr});
	}

	// Array-typed arguments are the only ones reported as needing consecutive
	// registers; the calling convention and vararg flag are not consulted.
	bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
	    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	  const bool IsArray = Ty->isArrayTy();
	  return IsArray;
	}

	// Always answers false, independent of the context and value type.
	bool AArch64TargetLowering::shouldNormalizeToSelectSequence(
	    LLVMContext & /*unused*/, EVT /*unused*/) const {
	  return false;
	}

	// Build IR that calls the thread_pointer intrinsic, offsets the result by
	// Offset bytes (an i8 GEP), and casts the resulting address to a pointer to
	// i8* in address space 0. Returns that pointer.
	static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
	  Module *Mod = IRB.GetInsertBlock()->getParent()->getParent();
	  Function *ThreadPointerFunc =
	      Intrinsic::getDeclaration(Mod, Intrinsic::thread_pointer);
	  Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
	  Value *SlotAddr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), ThreadPtr, Offset);
	  return IRB.CreatePointerCast(SlotAddr, IRB.getInt8PtrTy()->getPointerTo(0));
	}

	// Return the IR location of the stack guard: a fixed thread-pointer-relative
	// slot on Android and Fuchsia, the generic TargetLowering answer elsewhere.
	Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  const bool IsAndroid = Subtarget->isTargetAndroid();
	  if (!IsAndroid && !Subtarget->isTargetFuchsia())
	    return TargetLowering::getIRStackGuard(IRB);

	  // Android provides a fixed TLS slot for the stack cookie. See the definition
	  // of TLS_SLOT_STACK_GUARD in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (IsAndroid)
	    return UseTlsOffset(IRB, 0x28);

	  // Fuchsia is similar.
	  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
	  return UseTlsOffset(IRB, -0x10);
	}

	// Insert the declarations stack protection needs into M. For the MSVC
	// environment these are the CRT's __security_cookie global and
	// __security_check_cookie function; other environments use the generic
	// TargetLowering implementation.
	void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
	  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
	    TargetLowering::insertSSPDeclarations(M);
	    return;
	  }

	  // MSVC CRT provides functionalities for stack protection.
	  auto &Ctx = M.getContext();

	  // MSVC CRT has a global variable holding security cookie.
	  M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	  // MSVC CRT has a function to validate security cookie.
	  FunctionCallee CheckCookie = M.getOrInsertFunction(
	      "__security_check_cookie", Type::getVoidTy(Ctx),
	      Type::getInt8PtrTy(Ctx));
	  Function *CheckFn = dyn_cast<Function>(CheckCookie.getCallee());
	  if (!CheckFn)
	    return;
	  CheckFn->setCallingConv(CallingConv::Win64);
	  CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
	}

	// Return the stack guard value for M: the __security_cookie global in the
	// MSVC environment, the generic TargetLowering answer otherwise.
	Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
	  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
	  if (!IsMSVC)
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	// Return the function that validates the stack guard for M: the CRT's
	// __security_check_cookie in the MSVC environment, the generic TargetLowering
	// answer otherwise.
	Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  const bool IsMSVC = Subtarget->getTargetTriple().isWindowsMSVCEnvironment();
	  if (!IsMSVC)
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	// Return the IR location of the SafeStack pointer: a fixed
	// thread-pointer-relative slot on Android and Fuchsia, the generic
	// TargetLowering answer elsewhere.
	Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  const bool IsAndroid = Subtarget->isTargetAndroid();
	  if (!IsAndroid && !Subtarget->isTargetFuchsia())
	    return TargetLowering::getSafeStackPointerLocation(IRB);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (IsAndroid)
	    return UseTlsOffset(IRB, 0x48);

	  // Fuchsia is similar.
	  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
	  return UseTlsOffset(IRB, -0x8);
	}

	bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
	const Instruction &AndI) const {
	// Only sink 'and' mask to cmp use block if it is masking a single bit, since
	// this is likely to be fold the and/cmp/br into a single tbz instruction. It
	// may be beneficial to sink in other cases, but we would have to check that
	// the cmp would not get folded into the br to form a cbz for these to be
	// beneficial.
	ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
	if (!Mask)
	return false;
	return Mask->getValue().isPowerOf2();
	}

	// Mark the function containing Entry as using split callee-saved registers.
	void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  // Update IsSplitCSR in AArch64FunctionInfo.
	  MachineFunction *MF = Entry->getParent();
	  MF->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
	}

	// For every register in the zero-terminated list returned by
	// getCalleeSavedRegsViaCopy, copy it into a fresh virtual register at the
	// start of Entry and copy it back right before the terminator of every block
	// in Exits. Does nothing when that list is null.
	void AArch64TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list until its 0 terminator.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (AArch64::GPR64RegClass.contains(*I))
	      RC = &AArch64::GPR64RegClass;
	    else if (AArch64::FPR64RegClass.contains(*I))
	      RC = &AArch64::FPR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	// Integer division is reported as cheap only for scalar types in functions
	// carrying the MinSize attribute.
	bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // Integer division on AArch64 is expensive. However, when aggressively
	  // optimizing for code size, we prefer to use a div instruction, as it is
	  // usually smaller than the alternative sequence.
	  // The exception to this is vector division. Since AArch64 doesn't have vector
	  // integer division, leaving the division as-is is a loss even in terms of
	  // size, because it will have to be scalarized, while the alternative code
	  // sequence can be performed in vector form.
	  if (VT.isVector())
	    return false;
	  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	}

	// True exactly when VT is a scalar integer type.
	bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
	  // We want inc-of-add for scalars and sub-of-not for vectors.
	  const bool IsScalarInt = VT.isScalarInteger();
	  return IsScalarInt;
	}

	// Aggressive FMA fusion is enabled for floating-point types on subtargets
	// that report hasAggressiveFMA().
	bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
	  if (!Subtarget->hasAggressiveFMA())
	    return false;
	  return VT.isFloatingPoint();
	}

	// Size of va_list in bits: a single pointer on Darwin and Windows, otherwise
	// three pointers plus two 32-bit fields.
	unsigned
	AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
	  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
	    return getPointerTy(DL).getSizeInBits();

	  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
	}

	// Compute the function's maximum call frame size, then run the generic
	// TargetLoweringBase finalization.
	void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
	  auto &FrameInfo = MF.getFrameInfo();
	  FrameInfo.computeMaxCallFrameSize(MF);
	  TargetLoweringBase::finalizeLowering(MF);
	}

	// Unlike X86, we let frame lowering assign offsets to all catch objects, so
	// fixed catch objects are never required.
	bool AArch64TargetLowering::needsFixedCatchObjects() const {
	  const bool NeedsFixed = false;
	  return NeedsFixed;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64InstrInfo.td (revision 351303)
	@@ -1,6943 +1,6943 @@
	//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -- tablegen --=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// AArch64 Instruction definitions.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// ARM Instruction Predicate Definitions.
	//
	// Each def below declares a Predicate whose string is a C++ condition on
	// Subtarget. All but HasPerfMon also attach an AssemblerPredicate naming a
	// feature (first string) together with a short textual name (second string).
	def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
	AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
	def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
	AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
	def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
	AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
	def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
	AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
	def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
	AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
	def HasVH : Predicate<"Subtarget->hasVH()">,
	AssemblerPredicate<"FeatureVH", "vh">;

	def HasLOR : Predicate<"Subtarget->hasLOR()">,
	AssemblerPredicate<"FeatureLOR", "lor">;

	def HasPA : Predicate<"Subtarget->hasPA()">,
	AssemblerPredicate<"FeaturePA", "pa">;

	def HasJS : Predicate<"Subtarget->hasJS()">,
	AssemblerPredicate<"FeatureJS", "jsconv">;

	def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">,
	AssemblerPredicate<"FeatureCCIDX", "ccidx">;

	def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
	AssemblerPredicate<"FeatureComplxNum", "complxnum">;

	def HasNV : Predicate<"Subtarget->hasNV()">,
	AssemblerPredicate<"FeatureNV", "nv">;

	def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">,
	AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">;

	def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
	AssemblerPredicate<"FeatureMPAM", "mpam">;

	def HasDIT : Predicate<"Subtarget->hasDIT()">,
	AssemblerPredicate<"FeatureDIT", "dit">;

	def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">,
	AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">;

	def HasAM : Predicate<"Subtarget->hasAM()">,
	AssemblerPredicate<"FeatureAM", "am">;

	def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
	AssemblerPredicate<"FeatureSEL2", "sel2">;

	def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
	AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;

	def HasFMI : Predicate<"Subtarget->hasFMI()">,
	AssemblerPredicate<"FeatureFMI", "fmi">;

	def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
	AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">;

	def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
	AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
	def HasNEON : Predicate<"Subtarget->hasNEON()">,
	AssemblerPredicate<"FeatureNEON", "neon">;
	def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
	AssemblerPredicate<"FeatureCrypto", "crypto">;
	def HasSM4 : Predicate<"Subtarget->hasSM4()">,
	AssemblerPredicate<"FeatureSM4", "sm4">;
	def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
	AssemblerPredicate<"FeatureSHA3", "sha3">;
	def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
	AssemblerPredicate<"FeatureSHA2", "sha2">;
	def HasAES : Predicate<"Subtarget->hasAES()">,
	AssemblerPredicate<"FeatureAES", "aes">;
	def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
	AssemblerPredicate<"FeatureDotProd", "dotprod">;
	def HasCRC : Predicate<"Subtarget->hasCRC()">,
	AssemblerPredicate<"FeatureCRC", "crc">;
	def HasLSE : Predicate<"Subtarget->hasLSE()">,
	AssemblerPredicate<"FeatureLSE", "lse">;
	def HasRAS : Predicate<"Subtarget->hasRAS()">,
	AssemblerPredicate<"FeatureRAS", "ras">;
	def HasRDM : Predicate<"Subtarget->hasRDM()">,
	AssemblerPredicate<"FeatureRDM", "rdm">;
	// HasPerfMon is the only predicate here without an AssemblerPredicate.
	def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
	def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
	AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
	def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
	AssemblerPredicate<"FeatureFP16FML", "fp16fml">;
	def HasSPE : Predicate<"Subtarget->hasSPE()">,
	AssemblerPredicate<"FeatureSPE", "spe">;
	def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
	AssemblerPredicate<"FeatureFuseAES",
	"fuse-aes">;
	def HasSVE : Predicate<"Subtarget->hasSVE()">,
	AssemblerPredicate<"FeatureSVE", "sve">;
	def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
	AssemblerPredicate<"FeatureSVE2", "sve2">;
	def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
	AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">;
	def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
	AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">;
	def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
	AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
	def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
	- AssemblerPredicate<"FeatureSVE2BitPerm", "bitperm">;
	+ AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
	def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
	AssemblerPredicate<"FeatureRCPC", "rcpc">;
	def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
	AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">;
	def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">,
	AssemblerPredicate<"FeatureFRInt3264", "frint3264">;
	def HasSB : Predicate<"Subtarget->hasSB()">,
	AssemblerPredicate<"FeatureSB", "sb">;
	def HasPredRes : Predicate<"Subtarget->hasPredRes()">,
	AssemblerPredicate<"FeaturePredRes", "predres">;
	def HasCCDP : Predicate<"Subtarget->hasCCDP()">,
	AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">;
	def HasBTI : Predicate<"Subtarget->hasBTI()">,
	AssemblerPredicate<"FeatureBranchTargetId", "bti">;
	def HasMTE : Predicate<"Subtarget->hasMTE()">,
	AssemblerPredicate<"FeatureMTE", "mte">;
	def IsLE : Predicate<"Subtarget->isLittleEndian()">;
	def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
	def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
	def UseAlternateSExtLoadCVTF32
	: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;

	def UseNegativeImmediates
	: Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
	"NegativeImmediates">;

	// DAG node bound to the generic ISD::LOCAL_RECOVER opcode: one result and one
	// operand, constrained to the same integer type.
	def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
	SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
	SDTCisInt<1>]>>;


	//===----------------------------------------------------------------------===//
	// AArch64-specific DAG Nodes.
	//
	// In the type profiles below, SDTypeProfile<R, N, [...]> declares R results
	// and N operands; constraint indices number the results first, then the
	// operands.

	// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
	def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
	[SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>,
	SDTCisInt<0>, SDTCisVT<1, i32>]>;

	// SDTBinaryArithWithFlagsIn - RES1 = op LHS, RHS, FLAGS
	// (a single result; the i32 FLAGS value at index 3 is an input only)
	def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
	[SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>,
	SDTCisInt<0>,
	SDTCisVT<3, i32>]>;

	// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
	def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
	[SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>,
	SDTCisInt<0>,
	SDTCisVT<1, i32>,
	SDTCisVT<4, i32>]>;

	// Branch profiles: no results. Brcond takes an OtherVT operand followed by two
	// i32 operands; cbz takes an integer and an OtherVT operand; tbz takes two
	// integers and an OtherVT operand.
	def SDT_AArch64Brcond : SDTypeProfile<0, 3,
	[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
	SDTCisVT<2, i32>]>;
	def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
	def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
	SDTCisVT<2, OtherVT>]>;


	// Conditional select: the result has the type of the first two operands;
	// operand 3 is an integer and operand 4 is i32.
	def SDT_AArch64CSel : SDTypeProfile<1, 4,
	[SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>,
	SDTCisInt<3>,
	SDTCisVT<4, i32>]>;
	// Conditional compares: an i32 result, two same-typed operands to compare
	// (integer for CCMP, floating point for FCCMP), two integer operands, and a
	// final i32 operand.
	def SDT_AArch64CCMP : SDTypeProfile<1, 5,
	[SDTCisVT<0, i32>,
	SDTCisInt<1>,
	SDTCisSameAs<1, 2>,
	SDTCisInt<3>,
	SDTCisInt<4>,
	SDTCisVT<5, i32>]>;
	def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
	[SDTCisVT<0, i32>,
	SDTCisFP<1>,
	SDTCisSameAs<1, 2>,
	SDTCisInt<3>,
	SDTCisInt<4>,
	SDTCisVT<5, i32>]>;
	// Floating-point compare of two same-typed operands; declares no result.
	def SDT_AArch64FCmp : SDTypeProfile<0, 2,
	[SDTCisFP<0>,
	SDTCisSameAs<0, 1>]>;
	// Vector profiles (duplicate, permute, move-immediate, shift, compare).
	def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
	def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
	def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
	SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>]>;
	def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
	def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
	def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisInt<2>, SDTCisInt<3>]>;
	def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>, SDTCisInt<3>]>;
	def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;

	// Note: SDT_AArch64unvec has the same constraints as SDT_AArch64UnaryVec above.
	def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	// Compare-against-zero profile: one result, one operand, no type constraints.
	def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
	def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
	def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>]>;
	def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>,
	SDTCisSameAs<0,3>]>;
	// Only operand 0 (pointer-typed) of the two TCRET operands is constrained.
	def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
	def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;

	// Integer-to-FP conversion profile: the result is floating point and the
	// operand is constrained to the same type as the result.
	def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;

	// NOTE(review): the operand count here is -2; the meaning of a negative count
	// is defined by SDTypeProfile and is not visible in this file -- confirm.
	def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
	SDTCisPtrTy<1>]>;

	// Generates the general dynamic sequences, i.e.
	// adrp x0, :tlsdesc:var
	// ldr x1, [x0, #:tlsdesc_lo12:var]
	// add x0, x0, #:tlsdesc_lo12:var
	// .tlsdesccall var
	// blr x1

	// The profile declares no result (the TPIDR_EL0 offset is put directly in X0)
	// and a single pointer-typed operand (the variable).
	def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
	[SDTCisPtrTy<0>]>;

	// One i64 result built from four operands that all share the i32 type.
	def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
	[SDTCisVT<0, i64>, SDTCisVT<1, i32>,
	SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
	SDTCisSameAs<1, 4>]>;


	// Node definitions.
	// Each SDNode binds a TableGen name to an opcode enumerator string, a type
	// profile, and an optional list of node properties.
	def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
	def AArch64adr : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
	def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
	def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
	// Call-sequence markers map onto the generic ISD opcodes; both take two i32
	// operands.
	def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
	SDCallSeqStart<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOutGlue]>;
	def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
	SDCallSeqEnd<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
	// The call node is variadic (SDNPVariadic); only operand 0, the pointer-typed
	// one, is constrained.
	def AArch64call : SDNode<"AArch64ISD::CALL",
	SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	// Branch nodes: all carry a chain. CBZ/CBNZ share one profile, as do TBZ/TBNZ.
	def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
	[SDNPHasChain]>;
	def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
	[SDNPHasChain]>;
	def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
	[SDNPHasChain]>;
	def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
	[SDNPHasChain]>;
	def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
	[SDNPHasChain]>;


	// Conditional-select family; all four nodes share SDT_AArch64CSel.
	def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
	def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
	def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
	def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
	def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
	// Arithmetic with flags: ADC/SBC use the flags-in profile, ADDS/SUBS/ANDS the
	// flags-out profile, ADCS/SBCS the flags-in-and-out profile. Only ADDS and
	// ANDS are marked commutative.
	def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >;
	def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>;
	def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut,
	[SDNPCommutative]>;
	def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
	def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
	[SDNPCommutative]>;
	def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
	def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;

	// Conditional compares; CCMP and CCMN share the integer profile.
	def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
	def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
	def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;

	def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;

	def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;

	// Vector duplicate nodes; the DUPLANE<N> variants share one profile.
	def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
	def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
	def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
	def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
	def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;

	// Two-input vector permutes; all six share SDT_AArch64Zip.
	def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
	def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
	def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
	def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
	def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
	def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;

	// Vector move-immediate nodes. The "shift" and "msl" forms take two integer
	// operands; the remaining forms take one.
	def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
	def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
	def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
	def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
	def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
	def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
	def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;

	// Single-input vector reversals, plus EXT, which takes two vectors and an
	// integer operand.
	def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
	def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
	def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
	def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;

	// Shift nodes; all share SDT_AArch64vshift (a value plus an integer operand).
	def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
	def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
	def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
	def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
	def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
	def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
	def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
	def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;

	// Vector bitwise nodes: NOT is unary, BIT and BSL take three same-typed
	// vectors.
	def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
	def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
	def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;

	// Two-operand vector integer compares.
	def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
	def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
	def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
	def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
	def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;

	// Two-operand floating-point compares.
	def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
	def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
	def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;

	// Single-operand integer compares (the "z" forms).
	def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
	def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
	def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
	def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
	def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
	// cmtst is not a node of its own: it is a pattern fragment that expands to
	// NOT(CMEQz(LHS & RHS)).
	def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
	(AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;

	// Single-operand floating-point compares (the "z" forms); their shared
	// profile places no type constraints.
	def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
	def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
	def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
	def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
	def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;

	// Vector-with-immediate nodes: a vector operand plus two integer operands.
	def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
	def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;

	def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;

	// TC_RETURN node: chained, optional incoming glue, variadic.
	def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	// Prefetch is chained and marked as having side effects.
	def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
	[SDNPHasChain, SDNPSideEffect]>;

	def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
	def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;

	// TLS descriptor call sequence: requires incoming glue, produces glue, is
	// chained and variadic.
	def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
	SDT_AArch64TLSDescCallSeq,
	[SDNPInGlue, SDNPOutGlue, SDNPHasChain,
	SDNPVariadic]>;


	def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
	SDT_AArch64WrapperLarge>;

	def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;

	// Multiply-long nodes: integer result, two integer operands of the same type.
	def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
	SDTCisSameAs<1, 2>]>;
	def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
	def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;

	// Reciprocal and reciprocal-square-root nodes: the "e" forms are unary FP
	// ops, the "s" forms binary FP ops.
	def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
	def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
	def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
	def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;

	// Across-vector nodes; with SDT_AArch64UnaryVec the result has the same
	// vector type as the operand.
	def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
	def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
	def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
	def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
	def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
	def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;

	// Tag-store nodes: two pointer-typed operands, no result; each is chained,
	// may store, and carries a memory operand.
	def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
	def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
	def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//

	// AArch64 Instruction Predicate Definitions.
	// We could compute these on a per-module basis but doing so requires accessing
	// the Function object through the <Target>Subtarget and objections were raised
	// to that (see post-commit review comments for r301750).
	// Every predicate in this block has RecomputePerFunction set, and each
	// condition inspects the current function through MF.
	let RecomputePerFunction = 1 in {
	def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">;
	def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">;
	// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
	// The condition must use a plain C++ "||": "\|" is not a TableGen string
	// escape, and the text is emitted verbatim as a C++ expression.
	def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">;

	// UseBTI / NotUseBTI test for the "branch-target-enforcement" function
	// attribute and its absence, respectively.
	def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
	def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
	}

	include "AArch64InstrFormats.td"
	include "SVEInstrFormats.td"

	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Miscellaneous instructions.
	//===----------------------------------------------------------------------===//

	// Call-frame adjustment pseudos. Both implicitly use and define SP, are marked
	// as having side effects, and are code-generation only. They are selected from
	// the callseq_start / callseq_end nodes with two target-immediate amounts.
	let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
	// We set Sched to empty list because we expect these instructions to simply get
	// removed in most cases.
	def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
	Sched<[]>;
	def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
	Sched<[]>;
	} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1

	// Address-forming pseudos, all marked rematerializable and code-generation
	// only. Each MOVaddr* variant matches ADDlow(ADRP(hi), low) for one kind of
	// target symbol operand (global, jump table, constant pool, block address,
	// TLS global, external symbol).
	let isReMaterializable = 1, isCodeGenOnly = 1 in {
	// FIXME: The following pseudo instructions are only needed because remat
	// cannot handle multiple instructions. When that changes, they can be
	// removed, along with the AArch64Wrapper node.

	// AddedComplexity applies only to the LOADgot definition that follows.
	let AddedComplexity = 10 in
	def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
	[(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
	Sched<[WriteLDAdr]>;

	// The MOVaddr instruction should match only when the add is not folded
	// into a load or store address.
	def MOVaddr
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
	tglobaladdr:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrJT
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
	tjumptable:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrCP
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
	tconstpool:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrBA
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
	tblockaddress:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrTLS
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
	tglobaltlsaddr:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrEXT
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
	texternalsym:$low))]>,
	Sched<[WriteAdrAdr]>;
	// Normally AArch64addlow either gets folded into a following ldr/str,
	// or together with an adrp into MOVaddr above. For cases with TLS, it
	// might appear without either of them, so allow lowering it into a plain
	// add.
	def ADDlowTLS
	: Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow GPR64:$src,
	tglobaltlsaddr:$low))]>,
	Sched<[WriteAdr]>;

	} // isReMaterializable, isCodeGenOnly

	// Additional selection patterns mapping AArch64LOADgot onto the LOADgot
	// pseudo for TLS global, external symbol and constant pool operands (the
	// pseudo's own pattern covers tglobaladdr).
	def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
	(LOADgot tglobaltlsaddr:$addr)>;

	def : Pat<(AArch64LOADgot texternalsym:$addr),
	(LOADgot texternalsym:$addr)>;

	def : Pat<(AArch64LOADgot tconstpool:$addr),
	(LOADgot tconstpool:$addr)>;

	// 32-bit jump table destination is actually only 2 instructions since we can
	// use the table itself as a PC-relative base. But optimization occurs after
	// branch relaxation so be pessimistic.
	// All three pseudos have Size = 12, identical operands, no selection pattern,
	// and mark both outputs ($dst and $scratch) as early-clobber.
	let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
	def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
	(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
	Sched<[]>;
	def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
	(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
	Sched<[]>;
	def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
	(ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
	Sched<[]>;
	}

	// Space-consuming pseudo to aid testing of placement and reachability
	// algorithms. Immediate operand is the number of bytes this "instruction"
	// occupies; register operands can be used to enforce dependency and constrain
	// the scheduler.
	// Selected from the int_aarch64_space intrinsic; flagged as having side
	// effects and as possibly loading and storing.
	let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
	def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
	[(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
	Sched<[]>;

	// Speculation-safe-value pseudos in 64-bit (X) and 32-bit (W) register forms.
	// Neither has a selection pattern, so they are not chosen by pattern matching
	// from this file.
	let hasSideEffects = 1, isCodeGenOnly = 1 in {
	def SpeculationSafeValueX
	: Pseudo<(outs GPR64:$dst), (ins GPR64:$src), []>, Sched<[]>;
	def SpeculationSafeValueW
	: Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
	}


	//===----------------------------------------------------------------------===//
	// System instructions.
	//===----------------------------------------------------------------------===//

	// HINT and the mnemonics that alias fixed HINT immediates. "esb" additionally
	// requires HasRAS and the "bti" forms require HasBTI. Note that csdb and bti
	// give the immediate in decimal (20, 32) while the others use binary literals.
	def HINT : HintI<"hint">;
	def : InstAlias<"nop", (HINT 0b000)>;
	def : InstAlias<"yield",(HINT 0b001)>;
	def : InstAlias<"wfe", (HINT 0b010)>;
	def : InstAlias<"wfi", (HINT 0b011)>;
	def : InstAlias<"sev", (HINT 0b100)>;
	def : InstAlias<"sevl", (HINT 0b101)>;
	def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
	def : InstAlias<"csdb", (HINT 20)>;
	def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>;
	def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>;

	// v8.2a Statistical Profiling extension
	def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;

	// As far as LLVM is concerned this writes to the system's exclusive monitors.
	let mayLoad = 1, mayStore = 1 in
	def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;

	// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
	// model patterns with sufficiently fine granularity.
	// Both flags are therefore left unset ("?") for the barrier definitions below.
	let mayLoad = ?, mayStore = ? in {
	// DMB, DSB and ISB are each selected from the matching int_aarch64_*
	// intrinsic with an i32 immediate operand.
	def DMB : CRmSystemI<barrier_op, 0b101, "dmb",
	[(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;

	def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
	[(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;

	def ISB : CRmSystemI<barrier_op, 0b110, "isb",
	[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;

	// TSB has no selection pattern; it fixes CRm to 0b0010, clears encoding bit
	// 12, and is gated on HasTRACEV8_4.
	def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
	let CRm = 0b0010;
	let Inst{12} = 0;
	let Predicates = [HasTRACEV8_4];
	}
	}

	// ARMv8.2-A Dot Product
	// Vector and indexed ("lane") forms of sdot/udot, tied to the
	// int_aarch64_neon_sdot / int_aarch64_neon_udot intrinsics and gated on
	// HasDotProd. The first multiclass argument is 0 for sdot and 1 for udot.
	let Predicates = [HasDotProd] in {
	defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
	defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
	defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
	defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
	}

	// ARMv8.2-A FP16 Fused Multiply-Add Long
	// Vector and indexed ("lane") forms of fmlal/fmlsl/fmlal2/fmlsl2, each tied to
	// the intrinsic of the same name; all require both HasNEON and HasFP16FML.
	let Predicates = [HasNEON, HasFP16FML] in {
	defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
	defm FMLSL : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
	defm FMLAL2 : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
	defm FMLSL2 : SIMDThreeSameVectorFML<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
	defm FMLALlane : SIMDThreeSameVectorFMLIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
	defm FMLSLlane : SIMDThreeSameVectorFMLIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
	defm FMLAL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
	defm FMLSL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
	}

	// Armv8.2-A Crypto extensions
	// The first group (SHA512*, RAX1, EOR3, BCAX, XAR) is gated on HasSHA3.
	let Predicates = [HasSHA3] in {
	def SHA512H : CryptoRRRTied<0b0, 0b00, "sha512h">;
	def SHA512H2 : CryptoRRRTied<0b0, 0b01, "sha512h2">;
	def SHA512SU0 : CryptoRRTied_2D<0b0, 0b00, "sha512su0">;
	def SHA512SU1 : CryptoRRRTied_2D<0b0, 0b10, "sha512su1">;
	def RAX1 : CryptoRRR_2D<0b0,0b11, "rax1">;
	def EOR3 : CryptoRRRR_16B<0b00, "eor3">;
	def BCAX : CryptoRRRR_16B<0b01, "bcax">;
	def XAR : CryptoRRRi6<"xar">;
	} // HasSHA3

	// The second group (SM3*, SM4*) is gated on HasSM4. Note that the record named
	// SM4ENCKEY uses the mnemonic "sm4ekey".
	let Predicates = [HasSM4] in {
	def SM3TT1A : CryptoRRRi2Tied<0b0, 0b00, "sm3tt1a">;
	def SM3TT1B : CryptoRRRi2Tied<0b0, 0b01, "sm3tt1b">;
	def SM3TT2A : CryptoRRRi2Tied<0b0, 0b10, "sm3tt2a">;
	def SM3TT2B : CryptoRRRi2Tied<0b0, 0b11, "sm3tt2b">;
	def SM3SS1 : CryptoRRRR_4S<0b10, "sm3ss1">;
	def SM3PARTW1 : CryptoRRRTied_4S<0b1, 0b00, "sm3partw1">;
	def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">;
	def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">;
	def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">;
	} // HasSM4

	// RCPC loads, gated on HasRCPC. The byte, halfword and word forms use GPR32;
	// the last form uses GPR64. LDAPRW and LDAPRX share the mnemonic "ldapr".
	let Predicates = [HasRCPC] in {
	// v8.3 Release Consistent Processor Consistent support, optional in v8.2.
	def LDAPRB : RCPCLoad<0b00, "ldaprb", GPR32>;
	def LDAPRH : RCPCLoad<0b01, "ldaprh", GPR32>;
	def LDAPRW : RCPCLoad<0b10, "ldapr", GPR32>;
	def LDAPRX : RCPCLoad<0b11, "ldapr", GPR64>;
	}

	// v8.3a complex add and multiply-accumulate. No predicate here, that is done
	// inside the multiclass as the FP16 versions need different predicates.
	// FCMLA is instantiated twice under the same prefix: once from the vector
	// multiclass and once from the indexed multiclass. All three defm lines pass
	// null_frag as the operator.
	// NOTE(review): null_frag presumably means no selection pattern is attached --
	// confirm against the multiclass definitions.
	defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop,
	"fcmla", null_frag>;
	defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
	"fcadd", null_frag>;
	defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
	null_frag>;

	// v8.3a Pointer Authentication
	// These instructions inhabit part of the hint space and so can be used for
	// armv8 targets
	// The groups below differ in their implicit register lists: the *Z forms use
	// and define LR; the *SP forms additionally use SP; the *1716 forms use X16
	// and X17 and define X17.
	let Uses = [LR], Defs = [LR] in {
	def PACIAZ : SystemNoOperands<0b000, "paciaz">;
	def PACIBZ : SystemNoOperands<0b010, "pacibz">;
	def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
	def AUTIBZ : SystemNoOperands<0b110, "autibz">;
	}
	let Uses = [LR, SP], Defs = [LR] in {
	def PACIASP : SystemNoOperands<0b001, "paciasp">;
	def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
	def AUTIASP : SystemNoOperands<0b101, "autiasp">;
	def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
	}
	// The X16/X17 forms override CRm to 0b0001.
	let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
	def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
	def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
	def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
	def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
	}

	// XPACLRI uses and defines LR, with CRm set to 0b0000.
	let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
	def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
	}

	// These pointer authentication instructions require armv8.3a
	let Predicates = [HasPA] in {
	// SignAuth emits eight records per instantiation: four one-data forms
	// (IA, IB, DA, DB) encoded with `prefix`, and four zero forms
	// (IZA, DZA, IZB, DZB) encoded with `prefix_z`. Each mnemonic is `asm` with
	// the lower-case suffix appended.
	multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
	def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
	def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
	def DA : SignAuthOneData<prefix, 0b10, !strconcat(asm, "da")>;
	def DB : SignAuthOneData<prefix, 0b11, !strconcat(asm, "db")>;
	def IZA : SignAuthZero<prefix_z, 0b00, !strconcat(asm, "iza")>;
	def DZA : SignAuthZero<prefix_z, 0b10, !strconcat(asm, "dza")>;
	def IZB : SignAuthZero<prefix_z, 0b01, !strconcat(asm, "izb")>;
	def DZB : SignAuthZero<prefix_z, 0b11, !strconcat(asm, "dzb")>;
	}

	defm PAC : SignAuth<0b000, 0b010, "pac">;
	defm AUT : SignAuth<0b001, 0b011, "aut">;

	def XPACI : SignAuthZero<0b100, 0b00, "xpaci">;
	def XPACD : SignAuthZero<0b100, 0b01, "xpacd">;
	def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;

	// Combined Instructions
	def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
	def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
	def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
	def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;

	def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
	def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
	def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
	def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;

	// The authenticated returns are flagged as return, terminator and barrier.
	let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
	def RETAA : AuthReturn<0b010, 0, "retaa">;
	def RETAB : AuthReturn<0b010, 1, "retab">;
	def ERETAA : AuthReturn<0b100, 0, "eretaa">;
	def ERETAB : AuthReturn<0b100, 1, "eretab">;
	}

	defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>;
	defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>;

	}

	// v8.3a floating point conversion for javascript
	// Takes an FPR64 source and produces a GPR32 result, selected from the
	// int_aarch64_fjcvtzs intrinsic. Requires both HasJS and HasFPARMv8, and
	// forces encoding bit 31 to 0.
	let Predicates = [HasJS, HasFPARMv8] in
	def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32,
	"fjcvtzs",
	[(set GPR32:$Rd,
	(int_aarch64_fjcvtzs FPR64:$Rn))]> {
	let Inst{31} = 0;
	} // HasJS, HasFPARMv8

	// v8.4 Flag manipulation instructions
	// All four are gated on HasFMI. CFINV takes no operands and fixes encoding
	// bits 20-5; SETF8/SETF16 take one GPR32; RMIF takes a GPR64, a 6-bit
	// immediate and a mask immediate in the range 0-15.
	let Predicates = [HasFMI] in {
	def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> {
	let Inst{20-5} = 0b0000001000000000;
	}
	def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">;
	def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">;
	def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif",
	"{\t$Rn, $imm, $mask}">;
	} // HasFMI

	// v8.5 flag manipulation instructions
	// XAFLAG and AXFLAG both implicitly use and define NZCV and are gated on
	// HasAltNZCV. Their encodings differ only in bits 7-5 (0b001 vs 0b010); bits
	// 11-8 are zero and are listed in the Unpredictable mask.
	let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in {

	def XAFLAG : PstateWriteSimple<(ins), "xaflag", "">, Sched<[WriteSys]> {
	let Inst{18-16} = 0b000;
	let Inst{11-8} = 0b0000;
	let Unpredictable{11-8} = 0b1111;
	let Inst{7-5} = 0b001;
	}

	def AXFLAG : PstateWriteSimple<(ins), "axflag", "">, Sched<[WriteSys]> {
	let Inst{18-16} = 0b000;
	let Inst{11-8} = 0b0000;
	let Unpredictable{11-8} = 0b1111;
	let Inst{7-5} = 0b010;
	}
	} // HasAltNZCV


	// Armv8.5-A speculation barrier
	// No operands; encoding bits 20-5 are fixed, bits 11-8 are listed in the
	// Unpredictable mask, and the instruction is gated on HasSB and flagged as
	// having side effects.
	def SB : SimpleSystemI<0, (ins), "sb", "">, Sched<[]> {
	let Inst{20-5} = 0b0001100110000111;
	let Unpredictable{11-8} = 0b1111;
	let Predicates = [HasSB];
	let hasSideEffects = 1;
	}

	// Operand-less spellings: "clrex" and "isb" map to the instruction with
	// immediate 0xf; "ssbb" and "pssbb" map to DSB with immediates 0 and 4.
	def : InstAlias<"clrex", (CLREX 0xf)>;
	def : InstAlias<"isb", (ISB 0xf)>;
	def : InstAlias<"ssbb", (DSB 0)>;
	def : InstAlias<"pssbb", (DSB 4)>;

	// System register move instructions and the two PSTATE-immediate forms.
	def MRS : MRSI;
	def MSR : MSRI;
	def MSRpstateImm1 : MSRpstateImm0_1;
	def MSRpstateImm4 : MSRpstateImm0_15;

	// The thread pointer (on Linux, at least, where this has been implemented) is
	// TPIDR_EL0.
	// MOVbaseTLS is a pseudo with no inputs whose GPR64 result is selected from
	// the AArch64threadpointer node.
	def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
	[(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;

	// Pseudo selected from the int_hwasan_check_memaccess intrinsic. It implicitly
	// uses X9 (which appears as the intrinsic's first operand in the pattern) and
	// implicitly defines X16, X17, LR and NZCV. The pointer operand uses the
	// GPR64noip register class.
	let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in {
	def HWASAN_CHECK_MEMACCESS : Pseudo<
	(outs), (ins GPR64noip:$ptr, i32imm:$accessinfo),
	[(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 imm:$accessinfo))]>,
	Sched<[]>;
	}

	// The cycle counter PMC register is PMCCNTR_EL0.
	// readcyclecounter is selected as an MRS with immediate 0xdce8, only when
	// HasPerfMon holds.
	let Predicates = [HasPerfMon] in
	def : Pat<(readcyclecounter), (MRS 0xdce8)>;

	// FPCR register
	// int_aarch64_get_fpcr (i64) is selected as an MRS with immediate 0xda20.
	def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>;

	// Generic system instructions
	def SYSxt : SystemXtI<0, "sys">;
	def SYSLxt : SystemLXtI<1, "sysl">;

	// Four-operand "sys" spelling: the register operand of SYSxt is filled in
	// with XZR.
	def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
	(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
	sys_cr_op:$Cm, imm0_7:$op2, XZR)>;

	//===----------------------------------------------------------------------===//
	// Move immediate instructions.
	//===----------------------------------------------------------------------===//

	// MOVK is built from InsertImmediate; MOVN and MOVZ from MoveImmediate. MOVZ
	// additionally names "fixMOVZ" as its post-encoder method.
	defm MOVK : InsertImmediate<0b11, "movk">;
	defm MOVN : MoveImmediate<0b00, "movn">;

	let PostEncoderMethod = "fixMOVZ" in
	defm MOVZ : MoveImmediate<0b10, "movz">;

	// First group of aliases covers an implicit "lsl #0".
	// NOTE(review): the movk aliases in this file pass a trailing 0 to InstAlias
	// while the movn/movz ones do not; presumably this is the alias's emit/print
	// priority -- confirm against the InstAlias class.
	def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>;
	def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>;
	def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;

	// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
	// Each symbol operand class g3/g2/g1/g0 is paired with shift 48/32/16/0. The
	// 64-bit forms cover all four groups; the 32-bit forms only g1 and g0. The
	// movn aliases reuse the movz_symbol_* operand classes.
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>;

	def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>;
	def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>;

	// Final group of aliases covers true "mov $Rd, $imm" cases.
	//
	// movw_mov_alias generates one "mov $Rd, $imm" alias for a given move-wide
	// instruction, register width and shift. Parameters:
	//   basename - "MOVZ" or "MOVN"; spliced into the generated C++ method names.
	//   INST     - the instruction the alias expands to.
	//   GPR      - register class of the destination.
	//   width    - register width in bits, passed to the predicate template.
	//   shift    - shift amount, used both as a template argument of the
	//              predicate/render methods and as the instruction's shift
	//              operand.
	multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
	int width, int shift> {
	// Asm operand class whose predicate is is<basename>MovAlias<width, shift>
	// and whose renderer is add<basename>MovAliasOperands<shift>.
	def _asmoperand : AsmOperandClass {
	let Name = basename # width # "_lsl" # shift # "MovAlias";
	let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
	# shift # ">";
	let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
	}

	// Immediate operand that is parsed via the operand class defined above.
	def _movimm : Operand<i32> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
	}

	// The alias itself: the matched immediate plus the fixed shift.
	def : InstAlias<"mov $Rd, $imm",
	(INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
	}

	// Instantiate the "mov $Rd, $imm" aliases for every valid shift: 0 and 16
	// for the 32-bit instructions, and 0, 16, 32 and 48 for the 64-bit ones.
	// MOVZ-based aliases first, then MOVN-based ones.
	defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
	defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;

	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;

	defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
	defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;

	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;

	// Codegen-only pseudos that materialize an arbitrary 32- or 64-bit integer
	// immediate into a GPR. Both are flagged rematerializable, move-immediate
	// and as-cheap-as-a-move, and are scheduled as WriteImm.
	let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
	isAsCheapAsAMove = 1 in {
	// FIXME: The following pseudo instructions are only needed because remat
	// cannot handle multiple instructions. When that changes, we can select
	// directly to the real instructions and get rid of these pseudos.

	def MOVi32imm
	: Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
	[(set GPR32:$dst, imm:$src)]>,
	Sched<[WriteImm]>;
	def MOVi64imm
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
	[(set GPR64:$dst, imm:$src)]>,
	Sched<[WriteImm]>;
	} // isReMaterializable, isCodeGenOnly, isMoveImm, isAsCheapAsAMove

	// Prefer MOVi32imm over MOVi64imm whenever a 64-bit constant fits in 32
	// bits: the later expansion then has fewer bits to get right. The two
	// predicates below classify such constants so the types can be marshalled.

	// Matches an i64 immediate whose upper 32 bits are all zero, i.e. one that
	// is the zero-extension of a 32-bit value.
	def i64imm_32bit : ImmLeaf<i64, [{
	return (static_cast<uint64_t>(Imm) >> 32) == 0;
	}]>;

	// Matches an i64 immediate that lies within the signed 32-bit range, i.e.
	// one that is the sign-extension of a 32-bit value.
	def s64imm_32bit : ImmLeaf<i64, [{
	const int64_t SImm = static_cast<int64_t>(Imm);
	if (SImm < std::numeric_limits<int32_t>::min())
	return false;
	return SImm <= std::numeric_limits<int32_t>::max();
	}]>;

	// Re-emits an integer immediate as an i32 target constant, taking the
	// node's zero-extended value as the payload.
	// NOTE(review): the value is passed through unmasked; this relies on the
	// constant being narrowed to the i32 type downstream -- confirm.
	def trunc_imm : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
	}]>;

	// GlobalISel counterpart of trunc_imm, implemented by the custom renderer
	// "renderTruncImm".
	def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
	GISDNodeXFormEquiv<trunc_imm>;

	// A 64-bit constant with zero upper bits is materialized with the 32-bit
	// pseudo and then placed into the low half (sub_32) of a 64-bit register
	// via SUBREG_TO_REG.
	def : Pat<(i64 i64imm_32bit:$src),
	(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;

	// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).

	// Converts an FP immediate into an i32 target constant holding the raw bit
	// pattern of the value.
	def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
	}]>;

	// Same as above, but produces an i64 target constant.
	def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
	}]>;


	// f32/f64 constants: build the bit pattern in a GPR with the integer
	// move-immediate pseudo, then copy it into the FPR32/FPR64 register class.
	def : Pat<(f32 fpimm:$in),
	(COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
	def : Pat<(f64 fpimm:$in),
	(COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;


	// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
	// sequences.
	// Each pattern takes the four pieces $g3..$g0 of an AArch64WrapperLarge
	// node and builds the address with MOVZ of $g0 at shift 0, followed by MOVK
	// of $g1, $g2 and $g3 at shifts 16, 32 and 48. The same shape is repeated
	// for global addresses, block addresses, constant-pool entries and jump
	// tables.
	def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
	tglobaladdr:$g1, tglobaladdr:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
	tglobaladdr:$g1, 16),
	tglobaladdr:$g2, 32),
	tglobaladdr:$g3, 48)>;

	def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
	tblockaddress:$g1, tblockaddress:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
	tblockaddress:$g1, 16),
	tblockaddress:$g2, 32),
	tblockaddress:$g3, 48)>;

	def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
	tconstpool:$g1, tconstpool:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
	tconstpool:$g1, 16),
	tconstpool:$g2, 32),
	tconstpool:$g3, 48)>;

	def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
	tjumptable:$g1, tjumptable:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
	tjumptable:$g1, 16),
	tjumptable:$g2, 32),
	tjumptable:$g3, 48)>;


	//===----------------------------------------------------------------------===//
	// Arithmetic instructions.
	//===----------------------------------------------------------------------===//

	// Add/subtract with carry.
	// Each defm supplies the plain and flag-setting mnemonics together with the
	// matching selection nodes.
	defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
	defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;

	// "ngc"/"ngcs" are SBC/SBCS with the zero register as the first source.
	def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>;
	def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
	def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
	def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;

	// Add/subtract
	// ADD is given the `add` selection node; SUB is instantiated without one
	// (the `sub` node is instead matched by explicit patterns to SUBS).
	defm ADD : AddSub<0, "add", "sub", add>;
	defm SUB : AddSub<1, "sub", "add">;

	// "mov" to or from the stack pointer is an ADD-immediate of #0. One alias
	// per width requires the destination to be SP/WSP (the *sponly class), the
	// other requires the source to be.
	def : InstAlias<"mov $dst, $src",
	(ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;

	// Flag-setting add/subtract, together with their compare mnemonics
	// ("cmn"/"cmp") and the opposite-operation mnemonics.
	defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
	defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;

	// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
	// A plain `sub` node is therefore selected to the flag-setting instruction
	// for every operand form: shifted immediate, register-register, shifted
	// register and (below) extended register.
	def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
	(SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
	def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
	(SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
	def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
	(SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
	(SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
	def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
	(SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
	def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
	(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
	// The extended-register forms get AddedComplexity = 1, raising their
	// priority relative to the patterns above.
	let AddedComplexity = 1 in {
	def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
	(SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
	def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
	(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
	}

	// Because of the immediate format for add/sub-imm instructions, the
	// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
	// These patterns capture that transformation.
	// Note that the `add` forms select the flag-setting SUBS{W,X}ri, while the
	// `sub` forms select the plain ADD{W,X}ri. The neg_addsub_shifted_imm*
	// operand appears on both sides of each pattern.
	let AddedComplexity = 1 in {
	def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	}

	// Flag-setting variant of the negated-immediate transformation: because of
	// the immediate format for add/sub-imm instructions, the expression
	// (AArch64add_flag x, -1) must be transformed to (SUBS{W,X}ri x, 1), and
	// (AArch64sub_flag x, -1) to (ADDS{W,X}ri x, 1).
	// These patterns capture that transformation.
	let AddedComplexity = 1 in {
	def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	}

	// "neg" is SUB (shifted register) with the zero register as first source.
	// The unshifted spelling is given a larger trailing InstAlias argument (3)
	// than the spelling with an explicit shift (2).
	def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
	def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
	def : InstAlias<"neg $dst, $src$shift",
	(SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
	def : InstAlias<"neg $dst, $src$shift",
	(SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;

	// "negs" is the same construction on the flag-setting SUBS.
	def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
	def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
	def : InstAlias<"negs $dst, $src$shift",
	(SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
	def : InstAlias<"negs $dst, $src$shift",
	(SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;


	// Unsigned/Signed divide
	// The defms bind the generic udiv/sdiv nodes to the instructions.
	defm UDIV : Div<0, "udiv", udiv>;
	defm SDIV : Div<1, "sdiv", sdiv>;

	// The aarch64_udiv / aarch64_sdiv intrinsics select to the same
	// instructions, for both 32- and 64-bit operands.
	def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
	def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;

	// Variable shift
	// The instructions are named *V but are given the short mnemonics
	// ("asr", "lsl", "lsr", "ror") and bound to the sra/shl/srl/rotr nodes.
	defm ASRV : Shift<0b10, "asr", sra>;
	defm LSLV : Shift<0b00, "lsl", shl>;
	defm LSRV : Shift<0b01, "lsr", srl>;
	defm RORV : Shift<0b11, "ror", rotr>;

	// The long "*v" mnemonics are provided as aliases of the same
	// instructions.
	def : ShiftAlias<"asrv", ASRVWr, GPR32>;
	def : ShiftAlias<"asrv", ASRVXr, GPR64>;
	def : ShiftAlias<"lslv", LSLVWr, GPR32>;
	def : ShiftAlias<"lslv", LSLVXr, GPR64>;
	def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
	def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
	def : ShiftAlias<"rorv", RORVWr, GPR32>;
	def : ShiftAlias<"rorv", RORVXr, GPR64>;

	// Multiply-add
	// Everything in this scope is given AddedComplexity = 5.
	let AddedComplexity = 5 in {
	defm MADD : MulAccum<0, "madd", add>;
	defm MSUB : MulAccum<1, "msub", sub>;

	// A plain multiply is MADD with the zero register as the addend.
	def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
	(MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
	(MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;

	// A negated product is MSUB with the zero register as the minuend. Both
	// -(a * b) and (-a) * b are matched.
	def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
	(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
	(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
	def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
	(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
	(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
	} // AddedComplexity = 5

	// Widening (32x32 -> 64) multiply-accumulate. The signed forms are built on
	// sext of the operands and the unsigned forms on zext.
	let AddedComplexity = 5 in {
	def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
	def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
	def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
	def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;

	// Widening multiply without accumulation: XZR is the accumulator.
	def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
	(SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
	def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
	(UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;

	// Negated widening multiply.
	def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
	(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
	def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
	(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;

	// Extended operand times a 64-bit constant that fits in 32 bits: the
	// constant is truncated and materialized with MOVi32imm so the widening
	// instruction can be used. s64imm_32bit pairs with the signed forms and
	// i64imm_32bit with the unsigned ones. The sext_inreg variants take the
	// low 32 bits (sub_32) of the 64-bit source register.
	def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
	(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
	(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
	(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), XZR)>;

	// Negated product with a constant operand.
	def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
	(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
	(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
	(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), XZR)>;

	// Product with a constant operand added to an accumulator $Ra.
	def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
	(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
	(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
	GPR64:$Ra)),
	(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;

	// Product with a constant operand subtracted from an accumulator $Ra.
	def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
	(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
	(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
	(s64imm_32bit:$C)))),
	(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	} // AddedComplexity = 5

	// Assembly mnemonics for the multiply-accumulate instructions when used
	// without an accumulator: mul/mneg for the same-width forms and
	// smull/smnegl/umull/umnegl for the widening forms.
	def : MulAccumWAlias<"mul", MADDWrrr>;
	def : MulAccumXAlias<"mul", MADDXrrr>;
	def : MulAccumWAlias<"mneg", MSUBWrrr>;
	def : MulAccumXAlias<"mneg", MSUBXrrr>;
	def : WideMulAccumAlias<"smull", SMADDLrrr>;
	def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
	def : WideMulAccumAlias<"umull", UMADDLrrr>;
	def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;

	// Multiply-high
	// Bound to the signed (mulhs) and unsigned (mulhu) high-multiply nodes.
	def SMULHrr : MulHi<0b010, "smulh", mulhs>;
	def UMULHrr : MulHi<0b110, "umulh", mulhu>;

	// CRC32
	// BaseCRC32 arguments, in order: a bit that is 1 only for the X forms, a
	// 2-bit field that steps 00/01/10/11 through the B/H/W/X forms, a bit that
	// is 0 for CRC32* and 1 for CRC32C*, the register class of the data
	// operand, the intrinsic selected, and the mnemonic.
	def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
	def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
	def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
	def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;

	def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
	def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
	def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
	def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;

	// v8.1 atomic CAS
	// In each group below the two flag arguments line up with the mnemonic
	// suffix: the first is set for the "a" variants, the second for the "l"
	// variants, and both for "al" (presumably acquire / release ordering --
	// confirm against the multiclass definitions).
	defm CAS : CompareAndSwap<0, 0, "">;
	defm CASA : CompareAndSwap<1, 0, "a">;
	defm CASL : CompareAndSwap<0, 1, "l">;
	defm CASAL : CompareAndSwap<1, 1, "al">;

	// v8.1 atomic CASP
	defm CASP : CompareAndSwapPair<0, 0, "">;
	defm CASPA : CompareAndSwapPair<1, 0, "a">;
	defm CASPL : CompareAndSwapPair<0, 1, "l">;
	defm CASPAL : CompareAndSwapPair<1, 1, "al">;

	// v8.1 atomic SWP
	defm SWP : Swap<0, 0, "">;
	defm SWPA : Swap<1, 0, "a">;
	defm SWPL : Swap<0, 1, "l">;
	defm SWPAL : Swap<1, 1, "al">;

	// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
	// LDOPregister arguments, in order: the 3-bit operation selector (000 add,
	// 001 clr, 010 eor, 011 set, 100 smax, 101 smin, 110 umax, 111 umin), the
	// operation name, two flag bits that line up with the "a" and "l" suffixes,
	// and the suffix string itself.
	defm LDADD : LDOPregister<0b000, "add", 0, 0, "">;
	defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">;
	defm LDADDL : LDOPregister<0b000, "add", 0, 1, "l">;
	defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;

	defm LDCLR : LDOPregister<0b001, "clr", 0, 0, "">;
	defm LDCLRA : LDOPregister<0b001, "clr", 1, 0, "a">;
	defm LDCLRL : LDOPregister<0b001, "clr", 0, 1, "l">;
	defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;

	defm LDEOR : LDOPregister<0b010, "eor", 0, 0, "">;
	defm LDEORA : LDOPregister<0b010, "eor", 1, 0, "a">;
	defm LDEORL : LDOPregister<0b010, "eor", 0, 1, "l">;
	defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;

	defm LDSET : LDOPregister<0b011, "set", 0, 0, "">;
	defm LDSETA : LDOPregister<0b011, "set", 1, 0, "a">;
	defm LDSETL : LDOPregister<0b011, "set", 0, 1, "l">;
	defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;

	defm LDSMAX : LDOPregister<0b100, "smax", 0, 0, "">;
	defm LDSMAXA : LDOPregister<0b100, "smax", 1, 0, "a">;
	defm LDSMAXL : LDOPregister<0b100, "smax", 0, 1, "l">;
	defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;

	defm LDSMIN : LDOPregister<0b101, "smin", 0, 0, "">;
	defm LDSMINA : LDOPregister<0b101, "smin", 1, 0, "a">;
	defm LDSMINL : LDOPregister<0b101, "smin", 0, 1, "l">;
	defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;

	defm LDUMAX : LDOPregister<0b110, "umax", 0, 0, "">;
	defm LDUMAXA : LDOPregister<0b110, "umax", 1, 0, "a">;
	defm LDUMAXL : LDOPregister<0b110, "umax", 0, 1, "l">;
	defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;

	defm LDUMIN : LDOPregister<0b111, "umin", 0, 0, "">;
	defm LDUMINA : LDOPregister<0b111, "umin", 1, 0, "a">;
	defm LDUMINL : LDOPregister<0b111, "umin", 0, 1, "l">;
	defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;

	// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register)" when Rt=xZR
	// Each line passes the store mnemonic and the name stem of the LD<OP>
	// instruction family it aliases.
	defm : STOPregister<"stadd","LDADD">; // STADDx
	defm : STOPregister<"stclr","LDCLR">; // STCLRx
	defm : STOPregister<"steor","LDEOR">; // STEORx
	defm : STOPregister<"stset","LDSET">; // STSETx
	defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
	defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
	defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
	defm : STOPregister<"stumin","LDUMIN">;// STUMINx

	// v8.5 Memory Tagging Extension
	// Everything in this scope is only available under the HasMTE predicate.
	let Predicates = [HasMTE] in {

	// Tag generation and manipulation. IRG and GMI both force encoding bit 31
	// to 1; GMI is additionally marked as not duplicable.
	def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", int_aarch64_irg, GPR64sp, GPR64>,
	Sched<[]>{
	let Inst{31} = 1;
	}
	def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", int_aarch64_gmi, GPR64sp>, Sched<[]>{
	let Inst{31} = 1;
	let isNotDuplicable = 1;
	}
	// ADDG/SUBG carry no selection pattern of their own (null_frag); ADDG is
	// selected via the explicit int_aarch64_addg pattern further down.
	def ADDG : AddSubG<0, "addg", null_frag>;
	def SUBG : AddSubG<1, "subg", null_frag>;

	// Two-operand "irg": XZR is supplied as the third operand.
	def : InstAlias<"irg $dst, $src", (IRG GPR64sp:$dst, GPR64sp:$src, XZR), 1>;

	// SUBP selects the subp intrinsic. SUBPS has no pattern and additionally
	// defines NZCV.
	def SUBP : SUBP<0, "subp", int_aarch64_subp>, Sched<[]>;
	def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{
	let Defs = [NZCV];
	}

	// "cmpp" is SUBPS with the result written to XZR.
	def : InstAlias<"cmpp $lhs, $rhs", (SUBPS XZR, GPR64sp:$lhs, GPR64sp:$rhs), 0>;

	def LDG : MemTagLoad<"ldg", "\t$Rt, [$Rn, $offset]">;

	// Intrinsic selection for ADDG and LDG using the matching indexed
	// addressing-mode fragments.
	def : Pat<(int_aarch64_addg (am_indexedu6s128 GPR64sp:$Rn, uimm6s16:$imm6), imm0_15:$imm4),
	(ADDG GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4)>;
	def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
	(LDG GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;

	// "ldg" without an offset: offset 0 is supplied.
	def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>;

	// Bulk tag load/store. STZGM overrides encoding bit 23 to 0 relative to
	// STGM, which otherwise takes the same arguments.
	def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]",
	(outs GPR64:$Rt), (ins GPR64sp:$Rn)>;
	def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]",
	(outs), (ins GPR64:$Rt, GPR64sp:$Rn)>;
	def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]",
	(outs), (ins GPR64:$Rt, GPR64sp:$Rn)> {
	let Inst{23} = 0;
	}

	// Tag stores: the four mnemonics are distinguished by a 2-bit opcode field.
	defm STG : MemTagStore<0b00, "stg">;
	defm STZG : MemTagStore<0b01, "stzg">;
	defm ST2G : MemTagStore<0b10, "st2g">;
	defm STZ2G : MemTagStore<0b11, "stz2g">;

	// Selection of the AArch64st* nodes to the offset-addressed form of each
	// tag store.
	def : Pat<(AArch64stg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
	(STGOffset $Rn, $Rm, $imm)>;
	def : Pat<(AArch64stzg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
	(STZGOffset $Rn, $Rm, $imm)>;
	def : Pat<(AArch64st2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
	(ST2GOffset $Rn, $Rm, $imm)>;
	def : Pat<(AArch64stz2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)),
	(STZ2GOffset $Rn, $Rm, $imm)>;

	// STGP reuses the generic store-pair classes in offset, pre-indexed and
	// post-indexed form, with a 16-scaled 7-bit signed immediate.
	defm STGP : StorePairOffset <0b01, 0, GPR64z, simm7s16, "stgp">;
	def STGPpre : StorePairPreIdx <0b01, 0, GPR64z, simm7s16, "stgp">;
	def STGPpost : StorePairPostIdx<0b01, 0, GPR64z, simm7s16, "stgp">;

	// Intrinsic selection for stg and stgp.
	def : Pat<(int_aarch64_stg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)),
	(STGOffset GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>;

	def : Pat<(int_aarch64_stgp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$imm), GPR64:$Rt, GPR64:$Rt2),
	(STGPi $Rt, $Rt2, $Rn, $imm)>;

	// Stack-tagging pseudos. Neither has a selection pattern attached here;
	// IRGstack is produced by the irg_sp pattern below.
	def IRGstack
	: Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rsp, GPR64:$Rm), []>,
	Sched<[]>;
	def TAGPstack
	: Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, uimm6s16:$imm6, GPR64sp:$Rm, imm0_15:$imm4), []>,
	Sched<[]>;

	// Explicit SP in the first operand prevents ShrinkWrap optimization
	// from leaving this instruction out of the stack frame. When IRGstack
	// is transformed into IRG, this operand is replaced with the actual
	// register / expression for the tagged base pointer of the current function.
	def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;

	// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
	// $Rn_wback is one past the end of the range.
	// Both inputs are tied to early-clobber write-back outputs, so the pseudo
	// consumes and redefines the size and address registers.
	let isCodeGenOnly=1, mayStore=1 in {
	def STGloop
	: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
	[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
	Sched<[WriteAdr, WriteST]>;

	def STZGloop
	: Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
	[], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
	Sched<[WriteAdr, WriteST]>;
	}

	} // Predicates = [HasMTE]

	//===----------------------------------------------------------------------===//
	// Logical instructions.
	//===----------------------------------------------------------------------===//

	// (immediate)
	// Each defm names the opcode field, the mnemonic, the node it selects and
	// a second mnemonic for the complementary operation (bics/bic/eon/orn).
	defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
	defm AND : LogicalImm<0b00, "and", and, "bic">;
	defm EOR : LogicalImm<0b10, "eor", xor, "eon">;
	defm ORR : LogicalImm<0b01, "orr", or, "orn">;

	// FIXME: these aliases are canonical sometimes (when movz can't be
	// used). Actually, it seems to be working right now, but putting logical_immXX
	// here is a bit dodgy on the AsmParser side too.
	// "mov $dst, $imm" with a logical immediate is ORR of the zero register
	// with that immediate.
	def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
	logical_imm32:$imm), 0>;
	def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
	logical_imm64:$imm), 0>;


	// (register)
	// The second template argument is 1 for the forms whose pattern applies
	// `not`: to the second operand for BICS/BIC/ORN, and to the whole xor
	// result for EON. Those forms describe their operation with an inline
	// BinOpFrag rather than a plain node.
	defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
	defm BICS : LogicalRegS<0b11, 1, "bics",
	BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
	defm AND : LogicalReg<0b00, 0, "and", and>;
	defm BIC : LogicalReg<0b00, 1, "bic",
	BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
	defm EON : LogicalReg<0b10, 1, "eon",
	BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
	defm EOR : LogicalReg<0b10, 0, "eor", xor>;
	defm ORN : LogicalReg<0b01, 1, "orn",
	BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
	defm ORR : LogicalReg<0b01, 0, "orr", or>;

	// Register-to-register "mov" is ORR (shifted register) with the zero
	// register as first source and a shift of 0.
	def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
	def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;

	// "mvn" is ORN with the zero register as first source. The unshifted
	// spelling is given a larger trailing InstAlias argument (3) than the
	// shifted spelling (2).
	def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
	def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;

	def : InstAlias<"mvn $Wd, $Wm$sh",
	(ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
	def : InstAlias<"mvn $Xd, $Xm$sh",
	(ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;

	// "tst" is ANDS with the result written to the zero register, in
	// immediate, plain-register and shifted-register forms.
	def : InstAlias<"tst $src1, $src2",
	(ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
	def : InstAlias<"tst $src1, $src2",
	(ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;

	def : InstAlias<"tst $src1, $src2",
	(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
	def : InstAlias<"tst $src1, $src2",
	(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;

	def : InstAlias<"tst $src1, $src2$sh",
	(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
	def : InstAlias<"tst $src1, $src2$sh",
	(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;


	// Bitwise `not` of a register selects ORN with the zero register.
	def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
	def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;


	//===----------------------------------------------------------------------===//
	// One operand data processing instructions.
	//===----------------------------------------------------------------------===//

	// CLS is instantiated without a selection node (it is matched by the
	// explicit ctlz-based patterns below); CLZ and RBIT select ctlz and
	// bitreverse directly.
	defm CLS : OneOperandData<0b101, "cls">;
	defm CLZ : OneOperandData<0b100, "clz", ctlz>;
	defm RBIT : OneOperandData<0b000, "rbit", bitreverse>;

	// 32-bit REV16 matches a byte swap rotated by 16; the 64-bit form has no
	// pattern (null_frag).
	def REV16Wr : OneWRegData<0b001, "rev16",
	UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
	def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;

	// Count-trailing-zeros is bit-reverse followed by count-leading-zeros.
	def : Pat<(cttz GPR32:$Rn),
	(CLZWr (RBITWr GPR32:$Rn))>;
	def : Pat<(cttz GPR64:$Rn),
	(CLZXr (RBITXr GPR64:$Rn))>;
	// Recognize ctlz(((x >>s (width-1)) ^ x) << 1 | 1) and select CLS.
	def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
	(i32 1))),
	(CLSWr GPR32:$Rn)>;
	def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
	(i64 1))),
	(CLSXr GPR64:$Rn)>;

	// Unlike the other one operand instructions, the instructions with the "rev"
	// mnemonic do not just differ in the size bit, but actually use different
	// opcode bits for the different sizes.
	def REVWr : OneWRegData<0b010, "rev", bswap>;
	def REVXr : OneXRegData<0b011, "rev", bswap>;
	def REV32Xr : OneXRegData<0b010, "rev32",
	UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;

	// "rev64" is accepted as another spelling of the 64-bit REV.
	def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;

	// The bswap commutes with the rotr so we want a pattern for both possible
	// orders.
	def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
	def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;

	//===----------------------------------------------------------------------===//
	// Bitfield immediate extraction instruction.
	//===----------------------------------------------------------------------===//
	// EXTR is declared free of side effects.
	let hasSideEffects = 0 in
	defm EXTR : ExtractImm<"extr">;
	// Rotate-right by immediate is EXTR with the same register used for both
	// source operands; this holds for the assembly alias and for selection of
	// the rotr node alike.
	def : InstAlias<"ror $dst, $src, $shift",
	(EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
	def : InstAlias<"ror $dst, $src, $shift",
	(EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;

	def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
	(EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
	def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
	(EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;

	//===----------------------------------------------------------------------===//
	// Other bitfield immediate instructions.
	//===----------------------------------------------------------------------===//
	// The three bitfield-move instructions, all declared free of side effects.
	// BFM uses a distinct multiclass (BitfieldImmWith2RegArgs) from SBFM/UBFM.
	let hasSideEffects = 0 in {
	defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
	defm SBFM : BitfieldImm<0b00, "sbfm">;
	defm UBFM : BitfieldImm<0b10, "ubfm">;
	}

	// Immediate transforms that turn a 32-bit shift amount into bitfield-move
	// operands. Every transform yields an i64 target constant.

	// (32 - shift_amt) mod 32
	def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t ShiftAmt = N->getZExtValue();
	return CurDAG->getTargetConstant((32 - ShiftAmt) & 0x1f, SDLoc(N), MVT::i64);
	}]>;

	// 31 - shift_amt
	def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t ShiftAmt = N->getZExtValue();
	return CurDAG->getTargetConstant(31 - ShiftAmt, SDLoc(N), MVT::i64);
	}]>;

	// min(7, 31 - shift_amt)
	def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t Limit = 7;
	const uint64_t Remaining = 31 - N->getZExtValue();
	return CurDAG->getTargetConstant(Remaining < Limit ? Remaining : Limit,
	SDLoc(N), MVT::i64);
	}]>;

	// min(15, 31 - shift_amt)
	def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t Limit = 15;
	const uint64_t Remaining = 31 - N->getZExtValue();
	return CurDAG->getTargetConstant(Remaining < Limit ? Remaining : Limit,
	SDLoc(N), MVT::i64);
	}]>;

	// Immediate transforms that turn a 64-bit shift amount into bitfield-move
	// operands. Every transform yields an i64 target constant.

	// (64 - shift_amt) mod 64
	def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t ShiftAmt = N->getZExtValue();
	return CurDAG->getTargetConstant((64 - ShiftAmt) & 0x3f, SDLoc(N), MVT::i64);
	}]>;

	// 63 - shift_amt
	def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t ShiftAmt = N->getZExtValue();
	return CurDAG->getTargetConstant(63 - ShiftAmt, SDLoc(N), MVT::i64);
	}]>;

	// min(7, 63 - shift_amt)
	def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t Limit = 7;
	const uint64_t Remaining = 63 - N->getZExtValue();
	return CurDAG->getTargetConstant(Remaining < Limit ? Remaining : Limit,
	SDLoc(N), MVT::i64);
	}]>;

	// min(15, 63 - shift_amt)
	def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t Limit = 15;
	const uint64_t Remaining = 63 - N->getZExtValue();
	return CurDAG->getTargetConstant(Remaining < Limit ? Remaining : Limit,
	SDLoc(N), MVT::i64);
	}]>;

	// min(31, 63 - shift_amt)
	def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
	const uint64_t Limit = 31;
	const uint64_t Remaining = 63 - N->getZExtValue();
	return CurDAG->getTargetConstant(Remaining < Limit ? Remaining : Limit,
	SDLoc(N), MVT::i64);
	}]>;

	// Shift-left by immediate is UBFM, with the two bitfield operands derived
	// from the shift amount by the *shift_a and *shift_b transforms.
	def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
	(UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_b imm0_31:$imm)))>;
	def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
	(UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_b imm0_63:$imm)))>;

	// Arithmetic shift-right by immediate is SBFM from the shift amount up to
	// the top bit (31 or 63). These patterns get AddedComplexity = 10.
	let AddedComplexity = 10 in {
	def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
	def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
	}

	// SBFM aliases: "asr" fixes the second bitfield operand at the top bit,
	// while sxtb/sxth/sxtw fix the operands at (0, 7), (0, 15) and (0, 31).
	// sxtw exists only for the 64-bit instruction.
	def : InstAlias<"asr $dst, $src, $shift",
	(SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
	def : InstAlias<"asr $dst, $src, $shift",
	(SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
	def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
	def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
	def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
	def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
	def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;

	// Logical shift-right by immediate is UBFM from the shift amount up to the
	// top bit.
	def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
	(UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
	def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
	(UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;

	// UBFM aliases, mirroring the SBFM ones above: lsr and uxtb/uxth/uxtw.
	def : InstAlias<"lsr $dst, $src, $shift",
	(UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
	def : InstAlias<"lsr $dst, $src, $shift",
	(UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
	def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
	def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
	def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
	def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
	def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;

	//===----------------------------------------------------------------------===//
	// Conditional comparison instructions.
	//===----------------------------------------------------------------------===//
	// Conditional compare (negative) and conditional compare, bound to the
	// AArch64ccmn / AArch64ccmp nodes.
	defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
	defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;

	//===----------------------------------------------------------------------===//
	// Conditional select instructions.
	//===----------------------------------------------------------------------===//
	defm CSEL : CondSelect<0, 0b00, "csel">;

	// `inc` is (add x, 1). CSINC, CSINV and CSNEG pass inc, not and ineg
	// respectively as their operation fragment.
	def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
	defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
	defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
	defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;

	// Direct selection of the AArch64csinv / csneg / csinc nodes to the
	// corresponding instruction, for both register widths. NZCV appears as an
	// input of the node and is dropped from the instruction operand list.
	def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;

	// csel with a constant 1 on one side becomes CSINC against the zero
	// register. When the constant is the true value, the operands are swapped
	// and the condition is inverted with inv_cond_XFORM.
	def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
	(CSINCWr WZR, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
	(CSINCXr XZR, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	// The same scheme for a constant -1, using CSINV against the zero register.
	def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
	(CSINVWr WZR, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
	(CSINVXr XZR, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;

	// The inverse of the condition code from the alias instruction is what is used
	// in the aliased instruction. The parser already inverts the condition code
	// for these aliases.
	// "cset": CSINC with both sources tied to the zero register.
	def : InstAlias<"cset $dst, $cc",
	(CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
	def : InstAlias<"cset $dst, $cc",
	(CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

	// "csetm": CSINV with both sources tied to the zero register.
	def : InstAlias<"csetm $dst, $cc",
	(CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
	def : InstAlias<"csetm $dst, $cc",
	(CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

	// "cinc": CSINC with the same register used for both sources.
	def : InstAlias<"cinc $dst, $src, $cc",
	(CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cinc $dst, $src, $cc",
	(CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	// "cinv": CSINV with the same register used for both sources.
	def : InstAlias<"cinv $dst, $src, $cc",
	(CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cinv $dst, $src, $cc",
	(CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	// "cneg": CSNEG with the same register used for both sources.
	def : InstAlias<"cneg $dst, $src, $cc",
	(CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cneg $dst, $src, $cc",
	(CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	//===----------------------------------------------------------------------===//
	// PC-relative instructions.
	//===----------------------------------------------------------------------===//
	// ADR and ADRP are both rematerializable and both match a global address
	// wrapped in the corresponding AArch64adr / AArch64adrp node.
	// Only ADR sits inside the inner "let" that clears hasSideEffects, mayStore
	// and mayLoad; ADRP is defined outside it.
	let isReMaterializable = 1 in {
	let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
	def ADR : ADRI<0, "adr", adrlabel,
	[(set GPR64:$Xd, (AArch64adr tglobaladdr:$label))]>;
	} // hasSideEffects = 0

	def ADRP : ADRI<1, "adrp", adrplabel,
	[(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
	} // isReMaterializable = 1

	// ADR / ADRP for the remaining target address nodes (the instruction
	// definitions themselves already match tglobaladdr): constant-pool entries,
	// block addresses and external symbols for both, plus jump tables for ADR
	// only. The loops emit the patterns in the same order as a hand-written list.
	foreach AddrNode = [tconstpool, tblockaddress, texternalsym, tjumptable] in
	  def : Pat<(AArch64adr AddrNode:$addr), (ADR AddrNode:$addr)>;
	foreach AddrNode = [tconstpool, tblockaddress, texternalsym] in
	  def : Pat<(AArch64adrp AddrNode:$addr), (ADRP AddrNode:$addr)>;

	//===----------------------------------------------------------------------===//
	// Unconditional branch (register) instructions.
	//===----------------------------------------------------------------------===//

	// Return-like instructions: all three are flagged as return, terminator and
	// barrier. RET takes a register operand (see the alias below); DRPS and ERET
	// use the SpecialReturn class.
	let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
	def RET : BranchReg<0b0010, "ret", []>;
	def DRPS : SpecialReturn<0b0101, "drps">;
	def ERET : SpecialReturn<0b0100, "eret">;
	} // isReturn = 1, isTerminator = 1, isBarrier = 1

	// Default to the LR register.
	def : InstAlias<"ret", (RET LR)>;

	// Indirect call: declared as defining LR and using SP, and selected from
	// AArch64call on a 64-bit register.
	let isCall = 1, Defs = [LR], Uses = [SP] in {
	def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
	} // isCall

	// Indirect branch, selected from the generic brind node.
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
	} // isBranch, isTerminator, isBarrier, isIndirectBranch

	// Create a separate pseudo-instruction for codegen to use so that we don't
	// flag lr as used in every function. It'll be restored before the RET by the
	// epilogue if it's legitimately used.
	// The pseudo has no explicit operands, is selected from AArch64retflag, and
	// carries the same terminator/barrier/return flags as the real RET.
	def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
	Sched<[WriteBrReg]> {
	let isTerminator = 1;
	let isBarrier = 1;
	let isReturn = 1;
	}

	// This is a directive-like pseudo-instruction. The purpose is to insert an
	// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
	// (which in the usual case is a BLR).
	// It has no selection pattern and an empty scheduling list, prints as
	// ".tlsdesccall $sym", and is marked hasSideEffects.
	let hasSideEffects = 1 in
	def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
	let AsmString = ".tlsdesccall $sym";
	}

	// Pseudo instruction to tell the streamer to emit a 'B' character into the
	// augmentation string.
	// It has no operands, no selection pattern and an empty scheduling list.
	def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {}

	// Code-generation-only pseudo for a TLS descriptor call sequence. It is
	// treated as a call that defines LR, X0 and X1, and is selected from
	// AArch64tlsdesc_callseq on a TLS global address; the extra pattern below
	// covers the external-symbol form of the same node.
	// FIXME: maybe the scratch register used shouldn't be fixed to X1?
	// FIXME: can "hasSideEffects" be dropped?
	let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
	isCodeGenOnly = 1 in
	def TLSDESC_CALLSEQ
	: Pseudo<(outs), (ins i64imm:$sym),
	[(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
	Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
	def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
	(TLSDESC_CALLSEQ texternalsym:$sym)>;

	//===----------------------------------------------------------------------===//
	// Conditional branch (immediate) instruction.
	//===----------------------------------------------------------------------===//
	// The single conditional-branch record; everything about it comes from the
	// BranchCond class.
	def Bcc : BranchCond;

	//===----------------------------------------------------------------------===//
	// Compare-and-branch instructions.
	//===----------------------------------------------------------------------===//
	// CBZ / CBNZ share CmpBranch; the first template argument (0 vs. 1), the
	// mnemonic and the selection node distinguish them.
	defm CBZ : CmpBranch<0, "cbz", AArch64cbz>;
	defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;

	//===----------------------------------------------------------------------===//
	// Test-bit-and-branch instructions.
	//===----------------------------------------------------------------------===//
	// TBZ / TBNZ share TestBranch; the first template argument (0 vs. 1), the
	// mnemonic and the selection node distinguish them.
	defm TBZ : TestBranch<0, "tbz", AArch64tbz>;
	defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;

	//===----------------------------------------------------------------------===//
	// Unconditional branch (immediate) instructions.
	//===----------------------------------------------------------------------===//
	// Direct branch to a basic block, selected from the generic br node.
	let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
	def B : BranchImm<0, "b", [(br bb:$addr)]>;
	} // isBranch, isTerminator, isBarrier

	// Direct call: declared as defining LR and using SP. The instruction's own
	// pattern covers global addresses; the extra Pat covers external symbols.
	let isCall = 1, Defs = [LR], Uses = [SP] in {
	def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
	} // isCall
	def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;

	//===----------------------------------------------------------------------===//
	// Exception generation instructions.
	//===----------------------------------------------------------------------===//
	// BRK is the only exception-generating instruction flagged isTrap here.
	let isTrap = 1 in {
	def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
	}
	// The remaining records differ only in the two encoding fields and mnemonic.
	def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
	def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
	def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
	def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
	def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
	def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
	def SVC : ExceptionGeneration<0b000, 0b01, "svc">;

	// DCPSn defaults to an immediate operand of zero if unspecified.
	def : InstAlias<"dcps1", (DCPS1 0)>;
	def : InstAlias<"dcps2", (DCPS2 0)>;
	def : InstAlias<"dcps3", (DCPS3 0)>;

	// UDF uses its own class (UDFType) rather than ExceptionGeneration.
	def UDF : UDFType<0, "udf">;

	//===----------------------------------------------------------------------===//
	// Load instructions.
	//===----------------------------------------------------------------------===//

	// Load-pair families. Every addressing form is instantiated for the same five
	// register classes (W, X, S, D, Q); the signed 7-bit immediate operand is
	// scaled by the size of one transferred element (simm7s4 / simm7s8 /
	// simm7s16). The "ldpsw" variants pair 64-bit destination registers with the
	// 4-byte scale. LDNP (no-allocate) has no "ldpsw" counterpart here.

	// Pair (indexed, offset)
	defm LDPW : LoadPairOffset<0b00, 0, GPR32z, simm7s4, "ldp">;
	defm LDPX : LoadPairOffset<0b10, 0, GPR64z, simm7s8, "ldp">;
	defm LDPS : LoadPairOffset<0b00, 1, FPR32Op, simm7s4, "ldp">;
	defm LDPD : LoadPairOffset<0b01, 1, FPR64Op, simm7s8, "ldp">;
	defm LDPQ : LoadPairOffset<0b10, 1, FPR128Op, simm7s16, "ldp">;

	defm LDPSW : LoadPairOffset<0b01, 0, GPR64z, simm7s4, "ldpsw">;

	// Pair (pre-indexed)
	def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
	def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
	def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
	def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
	def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;

	def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;

	// Pair (post-indexed)
	def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32z, simm7s4, "ldp">;
	def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64z, simm7s8, "ldp">;
	def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32Op, simm7s4, "ldp">;
	def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64Op, simm7s8, "ldp">;
	def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128Op, simm7s16, "ldp">;

	def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64z, simm7s4, "ldpsw">;


	// Pair (no allocate)
	defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32z, simm7s4, "ldnp">;
	defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64z, simm7s8, "ldnp">;
	defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
	defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
	defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;

	//---
	// (register offset)
	//---

	// Register-offset loads. The Load<N>RO multiclass is chosen by access width;
	// the last two template arguments give the result value type and the load
	// operator the instruction is selected from. Other patterns in this file
	// refer to the expanded records with "roW" / "roX" suffixes (for example
	// LDRBBroW / LDRBBroX).

	// Integer
	defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
	defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
	defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
	defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;

	// Floating-point
	// (the 8-bit FPR form is declared with the "untyped" value type)
	defm LDRB : Load8RO<0b00, 1, 0b01, FPR8Op, "ldr", untyped, load>;
	defm LDRH : Load16RO<0b01, 1, 0b01, FPR16Op, "ldr", f16, load>;
	defm LDRS : Load32RO<0b10, 1, 0b01, FPR32Op, "ldr", f32, load>;
	defm LDRD : Load64RO<0b11, 1, 0b01, FPR64Op, "ldr", f64, load>;
	defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128Op, "ldr", f128, load>;

	// Load sign-extended half-word
	defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
	defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;

	// Load sign-extended byte
	defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
	defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;

	// Load sign-extended word
	defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;

	// Pre-fetch.
	defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;

	// For regular load, we do not have any alignment requirement.
	// Thus, it is safe to directly map the vector loads with interesting
	// addressing modes.
	// FIXME: We could do the same for bitconvert to floating point vectors.
	//
	// ScalToVecROLoadPat emits two patterns (W-register and X-register offset)
	// that turn scalar_to_vector of a register-offset load into the given load
	// instruction, inserted into sub-register "sub" of an IMPLICIT_DEF vector.
	//   ro     - addressing-mode bundle supplying Wpat/Wext and Xpat/Xext
	//   loadop - load operator to match
	//   ScalTy - scalar type produced by the load
	//   VecTy  - resulting vector type
	//   LOADW / LOADX - instructions for the W-offset and X-offset forms
	//   sub    - sub-register index the loaded value is inserted at
	multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
	ValueType ScalTy, ValueType VecTy,
	Instruction LOADW, Instruction LOADX,
	SubRegIndex sub> {
	def : Pat<(VecTy (scalar_to_vector (ScalTy
	(loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
	(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
	(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
	sub)>;

	def : Pat<(VecTy (scalar_to_vector (ScalTy
	(loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
	(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
	(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
	sub)>;
	}

	// Instantiate the scalar-to-vector load patterns for each element width, with
	// raised complexity (AddedComplexity = 10). The sub-register index follows
	// the element width: bsub / hsub / ssub / dsub.
	let AddedComplexity = 10 in {
	defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
	defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;

	defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
	defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;

	defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
	defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;

	defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
	defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;

	defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
	defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;

	defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;

	defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;


	// v1i64 is written out by hand: the D-register load is used directly as the
	// result, with no INSERT_SUBREG wrapper.
	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
	ro_Wextend64:$extend))))),
	(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;

	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
	ro_Xextend64:$extend))))),
	(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
	}

	// VecROLoadPat emits two patterns (W-register and X-register offset) that
	// select a plain "load" of vector type VecTy onto LOADW / LOADX. It is
	// instantiated for the vector types compatible with FPR64 and, further down,
	// for those compatible with FPR128.
	//   ro    - addressing-mode bundle supplying Wpat/Wext and Xpat/Xext
	//   VecTy - value type of the loaded result
	multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
	Instruction LOADW, Instruction LOADX> {

	def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	// Register-offset vector loads onto LDRD (64-bit) and LDRQ (128-bit).
	// Multi-element vector types are guarded by IsLE; the single-element v1i64 /
	// v1f64 types are instantiated outside that guard.
	let AddedComplexity = 10 in {
	let Predicates = [IsLE] in {
	// We must do vector loads with LD1 in big-endian.
	defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
	}

	defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>;

	// Match all load 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	// We must do vector loads with LD1 in big-endian.
	defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
	}
	} // AddedComplexity = 10

	// zextload -> i64
	// ExtLoadTo64ROPat selects an extending load with an i64 result onto a
	// 32-bit-result load instruction, wrapping it in SUBREG_TO_REG (i64 0) at
	// sub_32 to form the 64-bit value. One pattern is emitted for the W-register
	// offset form (INSTW) and one for the X-register offset form (INSTX).
	multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
	Instruction INSTW, Instruction INSTX> {
	def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(SUBREG_TO_REG (i64 0),
	(INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
	sub_32)>;

	def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(SUBREG_TO_REG (i64 0),
	(INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
	sub_32)>;
	}

	// i64-result extending loads via the register-offset LDRBB / LDRHH / LDRW
	// forms. Zero-extending and any-extending loads use the same instructions,
	// and 1-bit loads are handled with the byte load.
	let AddedComplexity = 10 in {
	defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>;

	// zextloadi1 -> zextloadi8
	defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;

	// extload -> zextload
	defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

	// extloadi1 -> zextloadi8
	defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>;
	}


	// extending load -> i32
	// ExtLoadTo32ROPat selects an extending load with an i32 result directly onto
	// the given load instruction (no SUBREG_TO_REG is needed for a 32-bit
	// result). One pattern is emitted for the W-register offset form (INSTW) and
	// one for the X-register offset form (INSTX).
	multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
	Instruction INSTW, Instruction INSTX> {
	def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;

	}

	// i32-result any-extending loads and the 1-bit zero-extending load, via the
	// register-offset LDRBB / LDRHH / LDRW forms.
	let AddedComplexity = 10 in {
	// extload -> zextload
	defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

	// zextloadi1 -> zextloadi8
	defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
	}

	//---
	// (unsigned immediate)
	//---
	// Scaled unsigned-immediate loads. Each instantiation pairs an am_indexed<N>
	// addressing pattern with the uimm12s<N/8> offset operand of matching scale.
	// Other patterns refer to the expanded records with a "ui" suffix (LDRXui,
	// LDRWui, ...).
	defm LDRX : LoadUI<0b11, 0, 0b01, GPR64z, uimm12s8, "ldr",
	[(set GPR64z:$Rt,
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
	defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
	[(set GPR32z:$Rt,
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
	defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
	[(set FPR8Op:$Rt,
	(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
	defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
	[(set (f16 FPR16Op:$Rt),
	(load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
	defm LDRS : LoadUI<0b10, 1, 0b01, FPR32Op, uimm12s4, "ldr",
	[(set (f32 FPR32Op:$Rt),
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
	defm LDRD : LoadUI<0b11, 1, 0b01, FPR64Op, uimm12s8, "ldr",
	[(set (f64 FPR64Op:$Rt),
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
	defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
	[(set (f128 FPR128Op:$Rt),
	(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;

	// For regular load, we do not have any alignment requirement.
	// Thus, it is safe to directly map the vector loads with interesting
	// addressing modes.
	// FIXME: We could do the same for bitconvert to floating point vectors.
	//
	// scalar_to_vector of a scaled-immediate load becomes the FPR load of the
	// element width (LDRBui / LDRHui / LDRSui / LDRDui) inserted into the
	// matching sub-register of an IMPLICIT_DEF vector. The v1i64 case uses the
	// LDRDui result directly, with no INSERT_SUBREG.
	def : Pat <(v8i8 (scalar_to_vector (i32
	(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
	def : Pat <(v16i8 (scalar_to_vector (i32
	(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
	def : Pat <(v4i16 (scalar_to_vector (i32
	(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
	def : Pat <(v8i16 (scalar_to_vector (i32
	(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
	def : Pat <(v2i32 (scalar_to_vector (i32
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
	def : Pat <(v4i32 (scalar_to_vector (i32
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat <(v2i64 (scalar_to_vector (i64
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;

	// 64-bit loads of any FPR64-compatible type go through the scaled
	// unsigned-immediate LDRDui. Multi-element vector types are little-endian
	// only (big-endian vector loads have to use LD1); the single-element v1f64
	// and v1i64 types carry no endianness restriction.
	let Predicates = [IsLE] in {
	  foreach VTy = [v2f32, v8i8, v4i16, v2i32, v4f16] in
	    def : Pat<(VTy (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	              (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	}
	foreach VTy = [v1f64, v1i64] in
	  def : Pat<(VTy (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;

	// 128-bit loads of any FPR128-compatible type go through the scaled
	// unsigned-immediate LDRQui. Vector types are little-endian only (big-endian
	// vector loads have to use LD1); the scalar f128 load is unrestricted.
	let Predicates = [IsLE] in {
	  foreach VTy = [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, v8f16] in
	    def : Pat<(VTy (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	              (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	}
	def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;

	// Zero-extending halfword and byte loads into a 32-bit register, scaled
	// unsigned-immediate form.
	defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
	[(set GPR32:$Rt,
	(zextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;
	defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
	[(set GPR32:$Rt,
	(zextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;
	// Additional extending-load patterns for the scaled-immediate form. i32
	// results use the load directly; i64 results wrap the 32-bit load in
	// SUBREG_TO_REG (i64 0) at sub_32.

	// zextload -> i64
	def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;

	// zextloadi1 -> zextloadi8
	def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;

	// extload -> zextload
	// (any-extending loads are served by the zero-extending instructions)
	def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
	def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
	def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
	def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;

	// Sign-extending loads, scaled unsigned-immediate form. Halfword and byte
	// loads exist with both 32-bit (W) and 64-bit (X) destinations; the word load
	// only has a 64-bit destination.

	// load sign-extended half-word
	defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
	[(set GPR32:$Rt,
	(sextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;
	defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
	[(set GPR64:$Rt,
	(sextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;

	// load sign-extended byte
	defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
	[(set GPR32:$Rt,
	(sextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;
	defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
	[(set GPR64:$Rt,
	(sextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;

	// load sign-extended word
	defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
	[(set GPR64:$Rt,
	(sextloadi32 (am_indexed32 GPR64sp:$Rn,
	uimm12s4:$offset)))]>;

	// load zero-extended word
	// (the 32-bit LDRWui result is placed in sub_32 of a zeroed 64-bit value)
	def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;

	// Pre-fetch.
	// Scaled-immediate prefetch, selected from AArch64Prefetch with an immediate
	// prefetch operation and a 64-bit indexed address.
	def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
	[(AArch64Prefetch imm:$Rt,
	(am_indexed64 GPR64sp:$Rn,
	uimm12s8:$offset))]>;

	// Accept "prfm <op>, [Xn]" with the offset omitted; it assembles with a zero
	// offset.
	def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;

	//---
	// (literal)

	// Leaf predicate used by the literal-load patterns: accepts a global address
	// or constant-pool node whose alignment is at least 4 and whose offset is a
	// multiple of 4. Every other node kind is rejected.
	def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
	  if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
	    // For a global, the alignment comes from the data layout.
	    unsigned PtrAlign =
	        GA->getGlobal()->getPointerAlignment(MF->getDataLayout());
	    return PtrAlign >= 4 && GA->getOffset() % 4 == 0;
	  }
	  auto *CP = dyn_cast<ConstantPoolSDNode>(N);
	  return CP && CP->getAlignment() >= 4 && CP->getOffset() % 4 == 0;
	}]>;

	// Literal loads. Each is selected from a load of an AArch64adr-wrapped label
	// that satisfies the alignedglobal predicate.
	def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr",
	[(set GPR32z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
	def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr",
	[(set GPR64z:$Rt, (load (AArch64adr alignedglobal:$label)))]>;
	def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr",
	[(set (f32 FPR32Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
	def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr",
	[(set (f64 FPR64Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;
	def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr",
	[(set (f128 FPR128Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>;

	// load sign-extended word
	def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw",
	[(set GPR64z:$Rt, (sextloadi32 (AArch64adr alignedglobal:$label)))]>;

	// Zero-extending word literal load with an i64 result: reuse LDRWl and place
	// its result in sub_32 of a zeroed 64-bit value.
	let AddedComplexity = 20 in {
	def : Pat<(i64 (zextloadi32 (AArch64adr alignedglobal:$label))),
	(SUBREG_TO_REG (i64 0), (LDRWl $label), sub_32)>;
	}

	// prefetch
	// (defined with an empty pattern list; the candidate pattern is left
	// commented out below)
	def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
	// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;

	//---
	// (unscaled immediate)
	// Unscaled-immediate loads. Each instantiation pairs an am_unscaled<N>
	// addressing pattern of the access width with the signed 9-bit simm9 offset.
	// Other patterns refer to the expanded records with an "i" suffix (LDURXi,
	// LDURWi, ...).
	defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64z, "ldur",
	[(set GPR64z:$Rt,
	(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
	[(set GPR32z:$Rt,
	(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
	[(set FPR8Op:$Rt,
	(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
	[(set FPR16Op:$Rt,
	(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
	[(set (f32 FPR32Op:$Rt),
	(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64Op, "ldur",
	[(set (f64 FPR64Op:$Rt),
	(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128Op, "ldur",
	[(set (f128 FPR128Op:$Rt),
	(load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;

	// Zero-extending halfword and byte loads into a 32-bit register,
	// unscaled-immediate form. The addressing pattern matches the access width:
	// am_unscaled16 for the halfword load and am_unscaled8 for the byte load,
	// in line with the other byte-sized unscaled patterns in this file.
	defm LDURHH
	: LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
	[(set GPR32:$Rt,
	(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURBB
	: LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
	[(set GPR32:$Rt,
	(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;

	// 64-bit loads of any FPR64-compatible type with an unscaled offset go
	// through LDURDi. Multi-element vector types are restricted to little-endian;
	// the single-element v1f64 and v1i64 types are not.
	let Predicates = [IsLE] in {
	  foreach VTy = [v2f32, v2i32, v4i16, v8i8, v4f16] in
	    def : Pat<(VTy (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	              (LDURDi GPR64sp:$Rn, simm9:$offset)>;
	}
	foreach VTy = [v1f64, v1i64] in
	  def : Pat<(VTy (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	            (LDURDi GPR64sp:$Rn, simm9:$offset)>;

	// 128-bit vector loads of any FPR128-compatible type with an unscaled offset
	// go through LDURQi; all of them are restricted to little-endian.
	let Predicates = [IsLE] in {
	  foreach VTy = [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8, v8f16] in
	    def : Pat<(VTy (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	              (LDURQi GPR64sp:$Rn, simm9:$offset)>;
	}

	// Extending-load patterns for the unscaled-immediate form. i32 results use
	// LDURHHi / LDURBBi directly; i64 results wrap the 32-bit load (LDURWi /
	// LDURHHi / LDURBBi) in SUBREG_TO_REG (i64 0) at sub_32. 1-bit loads are
	// handled with the byte load.

	// anyext -> zext
	def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	// unscaled zext
	def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;


	//---
	// LDR mnemonics fall back to LDUR for negative or unaligned offsets.

	// Define new assembler match classes as we want to only match these when
	// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
	// associate a DiagnosticType either, as we want the diagnostic for the
	// canonical form (the scaled operand) to take precedence.
	// Assembler operand class for a fallback signed 9-bit offset, parameterized
	// by access width in bits. The width is pasted into both the class name
	// ("SImm9OffsetFB<Width>") and the C++ predicate template argument
	// ("isSImm9OffsetFB<Width>"); rendering uses the generic addImmOperands.
	class SImm9OffsetOperand<int Width> : AsmOperandClass {
	let Name = "SImm9OffsetFB" # Width;
	let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
	let RenderMethod = "addImmOperands";
	}

	// One match class per access width (8 to 128 bits) ...
	def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
	def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
	def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
	def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
	def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;

	// ... and one i64 operand per width that binds to the matching class. These
	// operands are what the LDR -> LDUR fallback aliases use for $offset.
	def simm9_offset_fb8 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB8Operand;
	}
	def simm9_offset_fb16 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB16Operand;
	}
	def simm9_offset_fb32 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB32Operand;
	}
	def simm9_offset_fb64 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB64Operand;
	}
	def simm9_offset_fb128 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB128Operand;
	}

	// Assembler-only fallbacks: "ldr Rt, [Rn, #offset]" is accepted as the
	// corresponding LDUR when the offset matches the width-specific fallback
	// operand. The trailing 0 is the InstAlias emit argument, so these aliases
	// are not used when printing.
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;

	// zextload -> i64
	// Zero-extending unscaled byte / half-word loads producing an i64: select the
	// GPR32-result LDURBBi / LDURHHi and wrap it in SUBREG_TO_REG (sub_32).
	// NOTE(review): the zextloadi8 pattern below is textually identical to one
	// defined earlier in this file, so this copy looks redundant — confirm
	// before removing.
	def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;

	// Sign-extending unscaled loads. Each defm attaches the sextload pattern for
	// its memory width (am_unscaled16 / am_unscaled8 / am_unscaled32) and sets a
	// GPR32 (W variants) or GPR64 (X variants) destination.

	// load sign-extended half-word
	defm LDURSHW
	    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
	                   [(set GPR32:$Rt,
	                         (sextloadi16 (am_unscaled16 GPR64sp:$Rn,
	                                                     simm9:$offset)))]>;
	defm LDURSHX
	    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
	                   [(set GPR64:$Rt,
	                         (sextloadi16 (am_unscaled16 GPR64sp:$Rn,
	                                                     simm9:$offset)))]>;

	// load sign-extended byte
	defm LDURSBW
	    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
	                   [(set GPR32:$Rt,
	                         (sextloadi8 (am_unscaled8 GPR64sp:$Rn,
	                                                   simm9:$offset)))]>;
	defm LDURSBX
	    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
	                   [(set GPR64:$Rt,
	                         (sextloadi8 (am_unscaled8 GPR64sp:$Rn,
	                                                   simm9:$offset)))]>;

	// load sign-extended word
	defm LDURSW
	    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
	                   [(set GPR64:$Rt,
	                         (sextloadi32 (am_unscaled32 GPR64sp:$Rn,
	                                                     simm9:$offset)))]>;

	// Zero- and sign-extending aliases: the generic LDRB/LDRH/LDRSB/LDRSH/LDRSW
	// mnemonics are assembled as the matching LDUR* instruction when the offset
	// is accepted by the simm9_offset_fb<N> operand for that access width.
	def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
	                (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
	                (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
	                (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
	                (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
	                (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
	                (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
	                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;

	// Pre-fetch (unscaled immediate). Selected for an AArch64Prefetch node whose
	// first operand is the immediate $Rt and whose address matches am_unscaled64.
	defm PRFUM
	    : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
	                       [(AArch64Prefetch imm:$Rt,
	                                         (am_unscaled64 GPR64sp:$Rn,
	                                                        simm9:$offset))]>;

	//---
	// (unscaled immediate, unprivileged)
	// Each defm passes only encoding fields, a destination register class and a
	// mnemonic to LoadUnprivileged; no selection pattern is supplied here.
	defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
	defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;

	// half-word and byte loads into a 32-bit register
	defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
	defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;

	// load sign-extended half-word
	defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
	defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;

	// load sign-extended byte
	defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
	defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;

	// load sign-extended word
	defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;

	//---
	// (immediate pre-indexed)
	// Integer variants use the GPR32z/GPR64z register classes; the FPR*Op
	// variants share the "ldr" mnemonic and differ in the encoding fields.
	def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32z, "ldr">;
	def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64z, "ldr">;
	def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
	def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
	def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
	def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
	def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128Op, "ldr">;

	// load sign-extended half-word
	def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
	def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;

	// load sign-extended byte
	def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
	def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;

	// load zero-extended byte and half-word
	def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
	def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32z, "ldrh">;

	// load sign-extended word
	def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;

	//---
	// (immediate post-indexed)
	// Mirrors the pre-indexed definitions: same encoding fields, register
	// classes and mnemonics, instantiated from LoadPostIdx instead.
	def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32z, "ldr">;
	def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64z, "ldr">;
	def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8Op, "ldr">;
	def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16Op, "ldr">;
	def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32Op, "ldr">;
	def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64Op, "ldr">;
	def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128Op, "ldr">;

	// load sign-extended half-word
	def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32z, "ldrsh">;
	def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64z, "ldrsh">;

	// load sign-extended byte
	def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32z, "ldrsb">;
	def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64z, "ldrsb">;

	// load zero-extended byte and half-word
	def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32z, "ldrb">;
	def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32z, "ldrh">;

	// load sign-extended word
	def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64z, "ldrsw">;

	//===----------------------------------------------------------------------===//
	// Store instructions.
	//===----------------------------------------------------------------------===//

	// Pair (indexed, offset)
	// FIXME: Use dedicated range-checked addressing mode operand here.
	// The immediate operand type (simm7s4 / simm7s8 / simm7s16) is chosen per
	// register class: s4 with 32-bit, s8 with 64-bit, s16 with 128-bit registers.
	defm STPW : StorePairOffset<0b00, 0, GPR32z, simm7s4, "stp">;
	defm STPX : StorePairOffset<0b10, 0, GPR64z, simm7s8, "stp">;
	defm STPS : StorePairOffset<0b00, 1, FPR32Op, simm7s4, "stp">;
	defm STPD : StorePairOffset<0b01, 1, FPR64Op, simm7s8, "stp">;
	defm STPQ : StorePairOffset<0b10, 1, FPR128Op, simm7s16, "stp">;

	// Pair (pre-indexed)
	def STPWpre : StorePairPreIdx<0b00, 0, GPR32z, simm7s4, "stp">;
	def STPXpre : StorePairPreIdx<0b10, 0, GPR64z, simm7s8, "stp">;
	def STPSpre : StorePairPreIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
	def STPDpre : StorePairPreIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
	def STPQpre : StorePairPreIdx<0b10, 1, FPR128Op, simm7s16, "stp">;

	// Pair (post-indexed)
	def STPWpost : StorePairPostIdx<0b00, 0, GPR32z, simm7s4, "stp">;
	def STPXpost : StorePairPostIdx<0b10, 0, GPR64z, simm7s8, "stp">;
	def STPSpost : StorePairPostIdx<0b00, 1, FPR32Op, simm7s4, "stp">;
	def STPDpost : StorePairPostIdx<0b01, 1, FPR64Op, simm7s8, "stp">;
	def STPQpost : StorePairPostIdx<0b10, 1, FPR128Op, simm7s16, "stp">;

	// Pair (no allocate)
	defm STNPW : StorePairNoAlloc<0b00, 0, GPR32z, simm7s4, "stnp">;
	defm STNPX : StorePairNoAlloc<0b10, 0, GPR64z, simm7s8, "stnp">;
	defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
	defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
	defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;

	//---
	// (Register offset)
	// Each defm supplies the stored value type and the store operator used for
	// selection (truncstorei8 / truncstorei16 for the narrow integer forms,
	// plain store otherwise). Later patterns in this file refer to the generated
	// records by their roW / roX suffixes.

	// Integer
	defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
	defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
	defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
	defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;


	// Floating-point
	// The 8-bit FP register form is given the value type `untyped`.
	defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>;
	defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>;
	defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>;
	defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>;
	defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>;

	// f128 stores with a register offset, only when UseSTRQro holds. The first
	// pattern handles a 32-bit (W) index register, the second a 64-bit (X) one.
	// The extend operand in each result uses the same operand type that the
	// source pattern bound $extend with (ro_Wextend128 for W, ro_Xextend128 for
	// X), matching what VecROStorePat does through ro.Wext / ro.Xext.
	let Predicates = [UseSTRQro], AddedComplexity = 10 in {
	  def : Pat<(store (f128 FPR128:$Rt),
	                   (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
	                                   ro_Wextend128:$extend)),
	            (STRQroW FPR128:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend)>;
	  def : Pat<(store (f128 FPR128:$Rt),
	                   (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
	                                   ro_Xextend128:$extend)),
	            (STRQroX FPR128:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend)>;
	}

	// Truncating stores of a 64-bit GPR through a register-offset address.
	//   ro       - addressing-mode bundle providing Wpat/Xpat and Wext/Xext.
	//   storeop  - the truncating store operator to match.
	//   STRW     - instruction used when the index register is a GPR32.
	//   STRX     - instruction used when the index register is a GPR64.
	// In both cases the value actually stored is the sub_32 subregister of $Rt.
	multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
	                                 Instruction STRW, Instruction STRX> {

	  def : Pat<(storeop GPR64:$Rt,
	                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
	            (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	  def : Pat<(storeop GPR64:$Rt,
	                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
	            (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	// i64 truncating stores to 8, 16 and 32 bits with a register offset, each
	// mapped to the W/X-indexed form of the corresponding narrow store.
	let AddedComplexity = 10 in {
	  // truncstore i64
	  defm : TruncStoreFrom64ROPat<ro8,  truncstorei8,  STRBBroW, STRBBroX>;
	  defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
	  defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW,  STRWroX>;
	}

	// Plain `store` of a value of type VecTy held in register class FPR, through
	// a register-offset address.
	//   ro    - addressing-mode bundle providing Wpat/Xpat and Wext/Xext.
	//   STRW  - instruction used with a GPR32 index register.
	//   STRX  - instruction used with a GPR64 index register.
	multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
	                         Instruction STRW, Instruction STRX> {
	  def : Pat<(store (VecTy FPR:$Rt),
	                   (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
	            (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	  def : Pat<(store (VecTy FPR:$Rt),
	                   (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
	            (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	// Register-offset stores of 64-bit and 128-bit vector types. Multi-element
	// vector types are restricted to little-endian; the single-element v1i64 and
	// v1f64 types are not.
	let AddedComplexity = 10 in {
	  // Match all 64-bit-wide stores whose type is compatible with FPR64.
	  let Predicates = [IsLE] in {
	    // We must use ST1 to store vectors in big-endian.
	    defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
	    defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
	    defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
	    defm : VecROStorePat<ro64, v8i8,  FPR64, STRDroW, STRDroX>;
	    defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
	  }

	  defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
	  defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;

	  // Match all 128-bit-wide stores whose type is compatible with FPR128.
	  let Predicates = [IsLE, UseSTRQro] in {
	    // We must use ST1 to store vectors in big-endian.
	    defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
	    defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
	  }
	} // AddedComplexity = 10

	// Match stores from lane 0 to the appropriate subreg's store.
	//   ro         - addressing-mode bundle providing Wpat/Xpat and Wext/Xext.
	//   storeop    - store operator to match.
	//   VecTy/STy  - vector type being extracted from / scalar type stored.
	//   SubRegIdx  - subregister of the 128-bit vector passed to the store.
	//   STRW/STRX  - instructions for a GPR32 / GPR64 index register.
	multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
	                              ValueType VecTy, ValueType STy,
	                              SubRegIndex SubRegIdx,
	                              Instruction STRW, Instruction STRX> {

	  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
	                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
	            (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
	                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
	                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
	            (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
	                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	// Lane-0 register-offset stores for 16-, 32- and 64-bit elements, using the
	// hsub / ssub / dsub subregister and the FP-register STRH/STRS/STRD forms.
	// The v8i16 case matches truncstorei16 of an i32 extract; all others match a
	// plain store. These patterns carry AddedComplexity = 19.
	let AddedComplexity = 19 in {
	defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
	defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
	defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
	defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
	defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
	defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;
	}

	//---
	// (unsigned immediate)
	// Each defm attaches the store pattern for its width; the offset operand type
	// (uimm12s1 ... uimm12s16) and the am_indexed<N> addressing mode follow the
	// access size. STRQ is defined with an empty pattern list here.
	defm STRX : StoreUIz<0b11, 0, 0b00, GPR64z, uimm12s8, "str",
	                     [(store GPR64z:$Rt,
	                             (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
	defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
	                     [(store GPR32z:$Rt,
	                             (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
	defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
	                    [(store FPR8Op:$Rt,
	                            (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
	defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
	                    [(store (f16 FPR16Op:$Rt),
	                            (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
	defm STRS : StoreUI<0b10, 1, 0b00, FPR32Op, uimm12s4, "str",
	                    [(store (f32 FPR32Op:$Rt),
	                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
	defm STRD : StoreUI<0b11, 1, 0b00, FPR64Op, uimm12s8, "str",
	                    [(store (f64 FPR64Op:$Rt),
	                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
	defm STRQ : StoreUI<0b00, 1, 0b10, FPR128Op, uimm12s16, "str", []>;

	defm STRHH : StoreUIz<0b01, 0, 0b00, GPR32z, uimm12s2, "strh",
	                      [(truncstorei16 GPR32z:$Rt,
	                                      (am_indexed16 GPR64sp:$Rn,
	                                                    uimm12s2:$offset))]>;
	defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
	                      [(truncstorei8 GPR32z:$Rt,
	                                     (am_indexed8 GPR64sp:$Rn,
	                                                  uimm12s1:$offset))]>;

	// Scaled unsigned-immediate store patterns for vector, f128 and truncating
	// i64 stores. Multi-element vector types are restricted to little-endian.
	let AddedComplexity = 10 in {

	  // Match all 64-bit-wide stores whose type is compatible with FPR64.
	  def : Pat<(store (v1i64 FPR64:$Rt),
	                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	  def : Pat<(store (v1f64 FPR64:$Rt),
	                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;

	  let Predicates = [IsLE] in {
	    // We must use ST1 to store vectors in big-endian.
	    def : Pat<(store (v2f32 FPR64:$Rt),
	                     (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	              (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	    def : Pat<(store (v8i8 FPR64:$Rt),
	                     (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	              (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	    def : Pat<(store (v4i16 FPR64:$Rt),
	                     (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	              (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	    def : Pat<(store (v2i32 FPR64:$Rt),
	                     (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	              (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	    def : Pat<(store (v4f16 FPR64:$Rt),
	                     (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	              (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	  }

	  // Match all 128-bit-wide stores whose type is compatible with FPR128.
	  def : Pat<(store (f128 FPR128:$Rt),
	                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;

	  let Predicates = [IsLE] in {
	    // We must use ST1 to store vectors in big-endian.
	    def : Pat<(store (v4f32 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v2f64 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v16i8 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v8i16 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v4i32 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v2i64 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	    def : Pat<(store (v8f16 FPR128:$Rt),
	                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	              (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	  }

	  // truncstore i64: store the sub_32 subregister with the narrow store.
	  def : Pat<(truncstorei32 GPR64:$Rt,
	                           (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
	            (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                    GPR64sp:$Rn, uimm12s4:$offset)>;
	  def : Pat<(truncstorei16 GPR64:$Rt,
	                           (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
	            (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                     GPR64sp:$Rn, uimm12s2:$offset)>;
	  def : Pat<(truncstorei8 GPR64:$Rt,
	                          (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
	            (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                     GPR64sp:$Rn, uimm12s1:$offset)>;

	} // AddedComplexity = 10

	// Match stores from lane 0 to the appropriate subreg's store.
	//   UIAddrMode - addressing-mode operand applied to (base, offset).
	//   storeop    - store operator to match.
	//   VTy / STy  - vector type being extracted from / scalar type stored.
	//   SubRegIdx  - subregister of the 128-bit vector passed to the store.
	//   IndexType  - operand type of the immediate offset.
	//   STR        - store instruction to emit.
	multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
	                            ValueType VTy, ValueType STy,
	                            SubRegIndex SubRegIdx, Operand IndexType,
	                            Instruction STR> {
	  def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
	                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
	            (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
	                 GPR64sp:$Rn, IndexType:$offset)>;
	}

	// Lane-0 scaled-immediate stores for 16-, 32- and 64-bit elements. The
	// addressing mode and offset operand (am_indexed<N> with uimm12s<N/8>) match
	// the element width; the v8i16 case matches truncstorei16 of an i32 extract.
	let AddedComplexity = 19 in {
	defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
	defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
	defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
	defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
	defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
	defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;
	}

	//---
	// (unscaled immediate)
	// Each defm attaches the store pattern for its width, using the
	// am_unscaled<N> addressing mode with a simm9 offset.
	defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
	                           [(store GPR64z:$Rt,
	                                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
	                           [(store GPR32z:$Rt,
	                                   (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
	                           [(store FPR8Op:$Rt,
	                                   (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
	                           [(store (f16 FPR16Op:$Rt),
	                                   (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32Op, "stur",
	                           [(store (f32 FPR32Op:$Rt),
	                                   (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64Op, "stur",
	                           [(store (f64 FPR64Op:$Rt),
	                                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128Op, "stur",
	                           [(store (f128 FPR128Op:$Rt),
	                                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32z, "sturh",
	                            [(truncstorei16 GPR32z:$Rt,
	                                            (am_unscaled16 GPR64sp:$Rn,
	                                                           simm9:$offset))]>;
	defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb",
	                            [(truncstorei8 GPR32z:$Rt,
	                                           (am_unscaled8 GPR64sp:$Rn,
	                                                         simm9:$offset))]>;

	// Armv8.4 Weaker Release Consistency enhancements
	// LDAPR & STLR with Immediate Offset instructions
	// All definitions below are available only under the HasRCPC_IMMO predicate.
	let Predicates = [HasRCPC_IMMO] in {
	  // Stores.
	  defm STLURB    : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>;
	  defm STLURH    : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>;
	  defm STLURW    : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>;
	  defm STLURX    : BaseStoreUnscaleV84<"stlur", 0b11, 0b00, GPR64>;
	  // Loads.
	  defm LDAPURB   : BaseLoadUnscaleV84<"ldapurb", 0b00, 0b01, GPR32>;
	  defm LDAPURSBW : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b11, GPR32>;
	  defm LDAPURSBX : BaseLoadUnscaleV84<"ldapursb", 0b00, 0b10, GPR64>;
	  defm LDAPURH   : BaseLoadUnscaleV84<"ldapurh", 0b01, 0b01, GPR32>;
	  defm LDAPURSHW : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b11, GPR32>;
	  defm LDAPURSHX : BaseLoadUnscaleV84<"ldapursh", 0b01, 0b10, GPR64>;
	  defm LDAPUR    : BaseLoadUnscaleV84<"ldapur", 0b10, 0b01, GPR32>;
	  defm LDAPURSW  : BaseLoadUnscaleV84<"ldapursw", 0b10, 0b10, GPR64>;
	  defm LDAPURX   : BaseLoadUnscaleV84<"ldapur", 0b11, 0b01, GPR64>;
	}

	// Match all 64-bit-wide stores whose type is compatible with FPR64.
	// Single-element vector types (v1f64, v1i64) stored through an unscaled
	// address are selected as STURDi.
	def : Pat<(store (v1f64 FPR64:$Rt),
	                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v1i64 FPR64:$Rt),
	                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;

	// Unscaled-immediate store patterns for vector and f128 values. Multi-element
	// vector types are restricted to little-endian. Each value type appears
	// exactly once: a second, identical v2f64 -> STURQi pattern was dropped from
	// the 128-bit list.
	let AddedComplexity = 10 in {

	  let Predicates = [IsLE] in {
	    // We must use ST1 to store vectors in big-endian.
	    def : Pat<(store (v2f32 FPR64:$Rt),
	                     (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	              (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v8i8 FPR64:$Rt),
	                     (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	              (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v4i16 FPR64:$Rt),
	                     (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	              (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v2i32 FPR64:$Rt),
	                     (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	              (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v4f16 FPR64:$Rt),
	                     (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	              (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	  }

	  // Match all 128-bit-wide stores whose type is compatible with FPR128.
	  def : Pat<(store (f128 FPR128:$Rt),
	                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;

	  let Predicates = [IsLE] in {
	    // We must use ST1 to store vectors in big-endian.
	    def : Pat<(store (v4f32 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v2f64 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v16i8 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v8i16 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v4i32 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v2i64 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	    def : Pat<(store (v8f16 FPR128:$Rt),
	                     (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	              (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	  }

	} // AddedComplexity = 10

	// unscaled i64 truncating stores
	// The sub_32 subregister of the 64-bit source is stored with the narrow
	// STUR form for the target width.
	def : Pat<(truncstorei32 GPR64:$Rt,
	                         (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
	          (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                  GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(truncstorei16 GPR64:$Rt,
	                         (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
	          (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                   GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(truncstorei8 GPR64:$Rt,
	                        (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
	          (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                   GPR64sp:$Rn, simm9:$offset)>;

	// Match stores from lane 0 to the appropriate subreg's store.
	// Thin wrapper over VecStoreLane0Pat that fixes the addressing mode to
	// am_unscaled128 and the offset operand to simm9; the remaining arguments
	// are forwarded unchanged.
	// NOTE(review): am_unscaled128 is used for every element width, whereas the
	// other unscaled store patterns in this file use the am_unscaled<N> that
	// matches the access size — confirm this is intentional.
	multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
	ValueType VTy, ValueType STy,
	SubRegIndex SubRegIdx, Instruction STR> {
	defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
	}

	// Lane-0 unscaled-immediate stores for 16-, 32- and 64-bit elements, emitted
	// as STURHi / STURSi / STURDi of the hsub / ssub / dsub subregister. The
	// v8i16 case matches truncstorei16 of an i32 extract.
	let AddedComplexity = 19 in {
	defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
	defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
	defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
	defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
	defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
	defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
	}

	//---
	// STR mnemonics fall back to STUR for negative or unaligned offsets.
	// One alias per source register class; the simm9_offset_fb<N> operand width
	// follows the access size.
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURBi FPR8Op:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURHi FPR16Op:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURSi FPR32Op:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURDi FPR64Op:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	                (STURQi FPR128Op:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;

	// Truncating byte / half-word store mnemonics.
	def : InstAlias<"strb $Rt, [$Rn, $offset]",
	                (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"strh $Rt, [$Rn, $offset]",
	                (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;

	//---
	// (unscaled immediate, unprivileged)
	// Each defm passes only encoding fields, a source register class and a
	// mnemonic to StoreUnprivileged; no selection pattern is supplied here.
	defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
	defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;

	// half-word and byte stores from a 32-bit register
	defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
	defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;

	//---
	// (immediate pre-indexed)
	// Each def names the pre-indexed store operator it is selected for
	// (pre_store, or pre_truncsti8 / pre_truncsti16 for the narrow integer
	// forms) together with the stored value type.
	def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32z, "str", pre_store, i32>;
	def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64z, "str", pre_store, i64>;
	def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8Op, "str", pre_store, untyped>;
	def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16Op, "str", pre_store, f16>;
	def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32Op, "str", pre_store, f32>;
	def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64Op, "str", pre_store, f64>;
	def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;

	// truncating byte / half-word stores from a 32-bit register
	def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
	def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;

	// truncstore i64
	// Pre-indexed truncating stores of a 64-bit GPR: the sub_32 subregister is
	// stored with the 32-bit-source pre-indexed instruction of the target width.
	def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                   GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                    GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                    GPR64sp:$addr, simm9:$off)>;

	// Pre-indexed stores of 64-bit vector types held in FPR64 use STRDpre.
	def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

	// Pre-indexed stores of 128-bit vector types held in FPR128 use STRQpre.
	def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;

	//---
	// (immediate post-indexed)
	// Each def names the post-indexed store operator it is selected for
	// (post_store, or post_truncsti8 / post_truncsti16 for the narrow integer
	// forms) together with the stored value type.
	def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32z, "str", post_store, i32>;
	def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64z, "str", post_store, i64>;
	def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8Op, "str", post_store, untyped>;
	def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16Op, "str", post_store, f16>;
	def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32Op, "str", post_store, f32>;
	def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64Op, "str", post_store, f64>;
	def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128Op, "str", post_store, f128>;

	// truncating byte / half-word stores from a 32-bit register
	def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32z, "strb", post_truncsti8, i32>;
	def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32z, "strh", post_truncsti16, i32>;

	// truncstore i64
	// Post-indexed truncating stores of a 64-bit GPR: the sub_32 subregister is
	// stored with the 32-bit-source post-indexed instruction of the target width.
	def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                    GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                     GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	          (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	                     GPR64sp:$addr, simm9:$off)>;

	// Post-indexed stores of 64-bit vector types held in FPR64 use STRDpost.
	def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

	// Post-indexed stores of 128-bit vector types held in FPR128 use STRQpost.
	def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;

	//===----------------------------------------------------------------------===//
	// Load/store exclusive instructions.
	//===----------------------------------------------------------------------===//

	// Load-acquire. Each group below lists the W (32-bit), X (64-bit), byte and
	// half-word variants; the byte and half-word forms use GPR32.
	def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
	def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
	def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
	def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;

	// Load-acquire exclusive.
	def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
	def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
	def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
	def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;

	// Load exclusive.
	def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
	def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
	def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
	def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;

	// Store-release.
	def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
	def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
	def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
	def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;

	// Store-release exclusive.
	def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
	def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
	def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
	def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;

	// Store exclusive.
	def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
	def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
	def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
	def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;

	// Load-acquire exclusive pair.
	def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
	def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;

	// Load exclusive pair.
	def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
	def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;

	// Store-release exclusive pair.
	def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
	def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;

	// Store exclusive pair.
	def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
	def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;

	// Everything in this block is gated on the HasLOR predicate. The defs reuse
	// the LoadAcquire/StoreRelease classes with the same arguments as
	// LDAR*/STLR*, except that the last numeric argument is 0 instead of 1.
	let Predicates = [HasLOR] in {
	// v8.1a "Limited Order Region" extension load-acquire instructions
	def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
	def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
	def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">;
	def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">;

	// v8.1a "Limited Order Region" extension store-release instructions
	def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">;
	def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">;
	def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">;
	def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">;
	}

	//===----------------------------------------------------------------------===//
	// Scaled floating point to integer conversion instructions.
	//===----------------------------------------------------------------------===//

	// Scalar FP-to-integer conversions. The FCVT{A,M,N,P}{S,U} families select
	// from the matching AArch64 NEON intrinsics; FCVTZS/FCVTZU select from the
	// generic fp_to_sint/fp_to_uint nodes and are the only families that also
	// get FPToIntegerScaled variants (same defm prefix, same opcode arguments).
	defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
	defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
	defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
	defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
	defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
	defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
	defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
	defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
	defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
	defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
	defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
	defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;

	// Selection patterns for an FP-to-integer intrinsic.
	//   round - the intrinsic being matched.
	//   INST  - name prefix of the instruction family to select; the concrete
	//           instruction is looked up by appending a suffix to this string.
	multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
	// Plain conversions: f16/f32/f64 source to i32/i64 result, selecting the
	// INST # U{W,X}{H,S,D}r instructions.
	def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
	def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>;
	def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>;
	def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>;
	def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
	def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;

	// Conversions whose input is an fmul by a fixedpoint_* scale operand: the
	// multiply is folded away and the scale is passed to the
	// INST # S{W,X}{H,S,D}ri instructions as an extra operand.
	def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
	(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
	def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
	(!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
	def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
	(!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
	def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
	(!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
	def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
	(!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
	def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
	(!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
	}

	// Instantiated for the fcvtzs/fcvtzu intrinsics only.
	defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
	defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;

	// Folds "round to integral, then convert to integer" into one instruction.
	//   to_int - the conversion node (fp_to_sint or fp_to_uint at the use sites).
	//   round  - the rounding node applied to the source first.
	//   INST   - name prefix of the instruction family to select.
	// Only f32 and f64 sources are covered; there are no f16 patterns here.
	multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> {
	def : Pat<(i32 (to_int (round f32:$Rn))),
	(!cast<Instruction>(INST # UWSr) f32:$Rn)>;
	def : Pat<(i64 (to_int (round f32:$Rn))),
	(!cast<Instruction>(INST # UXSr) f32:$Rn)>;
	def : Pat<(i32 (to_int (round f64:$Rn))),
	(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
	def : Pat<(i64 (to_int (round f64:$Rn))),
	(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
	}

	// Rounding node to instruction family: fceil -> FCVTP*, ffloor -> FCVTM*,
	// ftrunc -> FCVTZ*, fround -> FCVTA*; the S/U suffix follows the signedness
	// of the conversion node.
	defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
	defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
	defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
	defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
	defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
	defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
	defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
	defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;

	// lround/llround select straight to the FCVTAS instructions. Only the f16
	// sources are gated on HasFullFP16, and llround is matched for i64 results
	// only. The instructions are referenced by name directly.
	let Predicates = [HasFullFP16] in {
	def : Pat<(i32 (lround f16:$Rn)), (FCVTASUWHr f16:$Rn)>;
	def : Pat<(i64 (lround f16:$Rn)), (FCVTASUXHr f16:$Rn)>;
	def : Pat<(i64 (llround f16:$Rn)), (FCVTASUXHr f16:$Rn)>;
	}
	def : Pat<(i32 (lround f32:$Rn)), (FCVTASUWSr f32:$Rn)>;
	def : Pat<(i32 (lround f64:$Rn)), (FCVTASUWDr f64:$Rn)>;
	def : Pat<(i64 (lround f32:$Rn)), (FCVTASUXSr f32:$Rn)>;
	def : Pat<(i64 (lround f64:$Rn)), (FCVTASUXDr f64:$Rn)>;
	def : Pat<(i64 (llround f32:$Rn)), (FCVTASUXSr f32:$Rn)>;
	def : Pat<(i64 (llround f64:$Rn)), (FCVTASUXDr f64:$Rn)>;

	//===----------------------------------------------------------------------===//
	// Scaled integer to floating point conversion instructions.
	//===----------------------------------------------------------------------===//

	// Integer-to-FP conversions: SCVTF selects from sint_to_fp and UCVTF from
	// uint_to_fp; the leading argument (0 vs 1) is the only other difference.
	defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
	defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;

	//===----------------------------------------------------------------------===//
	// Unscaled integer to floating point conversion instruction.
	//===----------------------------------------------------------------------===//

	// FMOV forms produced by the UnscaledConversion multiclass.
	defm FMOV : UnscaledConversion<"fmov">;

	// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable.
	// Each pseudo takes no inputs and matches fpimm0 for its type; only the f16
	// variant additionally requires HasFullFP16.
	let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
	def FMOVH0 : Pseudo<(outs FPR16:$Rd), (ins), [(set f16:$Rd, (fpimm0))]>,
	Sched<[WriteF]>, Requires<[HasFullFP16]>;
	def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
	Sched<[WriteF]>;
	def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
	Sched<[WriteF]>;
	}
	// Similarly add aliases: "fmov Rd, #0.0" is accepted as an FMOV from the
	// zero register (WZR for the 16- and 32-bit forms, XZR for the 64-bit form).
	def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
	Requires<[HasFullFP16]>;
	def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
	def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;

	//===----------------------------------------------------------------------===//
	// Floating point conversion instruction.
	//===----------------------------------------------------------------------===//

	// FP-to-FP conversions, all generated by the FPConversion multiclass.
	defm FCVT : FPConversion<"fcvt">;

	//===----------------------------------------------------------------------===//
	// Floating point single operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar single-operand FP instructions. Each defm pairs an opcode with the
	// node it is selected from; FMOV is the one entry that passes no node.
	defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
	defm FMOV : SingleOperandFPData<0b0000, "fmov">;
	defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
	defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
	defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
	defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
	defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
	defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;

	// Explicit v1f64 pattern for the frintn intrinsic, mapped onto the scalar
	// double-precision FRINTNDr.
	def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
	(FRINTNDr FPR64:$Rn)>;

	defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
	defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;

	// FSQRT is instantiated with SchedRW set to [WriteFDiv].
	let SchedRW = [WriteFDiv] in {
	defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
	}

	// Scalar FRINT32{Z,X}/FRINT64{Z,X}, available only with HasFRInt3264. No
	// selection node is passed to FRIntNNT here.
	let Predicates = [HasFRInt3264] in {
	defm FRINT32Z : FRIntNNT<0b00, "frint32z">;
	defm FRINT64Z : FRIntNNT<0b10, "frint64z">;
	defm FRINT32X : FRIntNNT<0b01, "frint32x">;
	defm FRINT64X : FRIntNNT<0b11, "frint64x">;
	} // HasFRInt3264

	// lrint/llrint become a two-instruction sequence: FRINTX on the source,
	// then FCVTZS on its result. Only the f16 sources are gated on HasFullFP16,
	// and llrint is matched for i64 results only. The instructions are
	// referenced by name directly.
	let Predicates = [HasFullFP16] in {
	def : Pat<(i32 (lrint f16:$Rn)), (FCVTZSUWHr (FRINTXHr f16:$Rn))>;
	def : Pat<(i64 (lrint f16:$Rn)), (FCVTZSUXHr (FRINTXHr f16:$Rn))>;
	def : Pat<(i64 (llrint f16:$Rn)), (FCVTZSUXHr (FRINTXHr f16:$Rn))>;
	}
	def : Pat<(i32 (lrint f32:$Rn)), (FCVTZSUWSr (FRINTXSr f32:$Rn))>;
	def : Pat<(i32 (lrint f64:$Rn)), (FCVTZSUWDr (FRINTXDr f64:$Rn))>;
	def : Pat<(i64 (lrint f32:$Rn)), (FCVTZSUXSr (FRINTXSr f32:$Rn))>;
	def : Pat<(i64 (lrint f64:$Rn)), (FCVTZSUXDr (FRINTXDr f64:$Rn))>;
	def : Pat<(i64 (llrint f32:$Rn)), (FCVTZSUXSr (FRINTXSr f32:$Rn))>;
	def : Pat<(i64 (llrint f64:$Rn)), (FCVTZSUXDr (FRINTXDr f64:$Rn))>;

	//===----------------------------------------------------------------------===//
	// Floating point two operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar two-operand FP instructions. FDIV is instantiated with SchedRW set
	// to [WriteFDiv]; FMUL and FNMUL with [WriteFMul]. FNMUL uses the separate
	// TwoOperandFPDataNeg multiclass but is still given the fmul node.
	defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
	let SchedRW = [WriteFDiv] in {
	defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
	}
	defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
	defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>;
	defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
	defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>;
	let SchedRW = [WriteFMul] in {
	defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
	defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
	}
	defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;

	// v1f64 min/max operations are selected onto the scalar double-precision
	// instructions.
	def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fminimum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMINDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;

	//===----------------------------------------------------------------------===//
	// Floating point three operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar fused multiply-add family. The fragment given to each defm is the
	// fma shape it is selected from:
	//   FMADD  : fma(LHS, MHS, RHS)
	//   FMSUB  : fma(LHS, -MHS, RHS)
	//   FNMADD : -fma(LHS, MHS, RHS)
	//   FNMSUB : fma(LHS, MHS, -RHS)
	defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
	defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
	TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
	defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
	TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
	defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
	TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;

	// The following def pats catch the case where the LHS of an FMA is negated.
	// The TriOpFrag above catches the case where the middle operand is negated.

	// N.b. FMSUB etc have the accumulator at the end of (ins), unlike
	// the NEON variants (FMLA/FMLS), which take it as the first operand.
	// fma with only the first multiplicand negated selects FMSUB (f32, then
	// f64); the operands are passed through in the same order.
	def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
	(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
	(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	// We handled -(a + bc) for FNMADD above, now it's time for "(-a) + (-b)c" and
	// "(-a) + b*(-c)".
	// In both shapes the addend and exactly one multiplicand are negated; all
	// four patterns select FNMADD with the un-negated operands.
	def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
	(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
	(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
	(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
	(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	//===----------------------------------------------------------------------===//
	// Floating point comparison instructions.
	//===----------------------------------------------------------------------===//

	// FCMPE is defined without a selection node; only FCMP is selected, from
	// AArch64fcmp.
	defm FCMPE : FPComparison<1, "fcmpe">;
	defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;

	//===----------------------------------------------------------------------===//
	// Floating point conditional comparison instructions.
	//===----------------------------------------------------------------------===//

	// FCCMPE is defined without a selection node; only FCCMP is selected, from
	// AArch64fccmp.
	defm FCCMPE : FPCondComparison<1, "fccmpe">;
	defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;

	//===----------------------------------------------------------------------===//
	// Floating point conditional select instruction.
	//===----------------------------------------------------------------------===//

	// FP conditional select forms, all generated by the FPCondSelect multiclass.
	defm FCSEL : FPCondSelect<"fcsel">;

	// An f128 conditional select is modelled as a pseudo-instruction: the code
	// eventually emitted for it has to introduce basic blocks and control flow,
	// so it is matched from AArch64csel here and flagged with
	// usesCustomInserter. It reads NZCV and carries no scheduling info.
	let Uses = [NZCV], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
	def F128CSEL : Pseudo<(outs FPR128:$Rd),
	(ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
	[(set (f128 FPR128:$Rd),
	(AArch64csel FPR128:$Rn, FPR128:$Rm,
	(i32 imm:$cond), NZCV))]>;

	//===----------------------------------------------------------------------===//
	// Instructions used for emitting unwind opcodes on ARM64 Windows.
	//===----------------------------------------------------------------------===//
	// None of these pseudos has outputs or a selection pattern, and all use an
	// empty Sched list. Their only operands are i32 immediates (a size,
	// register numbers, and/or an offset); SEH_SetFP, SEH_Nop and the
	// prolog/epilog markers take no operands at all.
	let isPseudo = 1 in {
	def SEH_StackAlloc : Pseudo<(outs), (ins i32imm:$size), []>, Sched<[]>;
	def SEH_SaveFPLR : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveFPLR_X : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveFReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveFReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveFRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
	def SEH_SaveFRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
	def SEH_SetFP : Pseudo<(outs), (ins), []>, Sched<[]>;
	def SEH_AddFP : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
	def SEH_Nop : Pseudo<(outs), (ins), []>, Sched<[]>;
	def SEH_PrologEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
	def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>;
	def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
	}

	//===----------------------------------------------------------------------===//
	// Pseudo instructions for Windows EH
	//===----------------------------------------------------------------------===//
	// CLEANUPRET and CATCHRET are matched from the cleanupret/catchret nodes and
	// are marked as terminators, barriers and (EH scope) returns. CATCHRET takes
	// two block operands ($dst, $src) and additionally uses a custom inserter.
	let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
	isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in {
	def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>;
	let usesCustomInserter = 1 in
	def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>,
	Sched<[]>;
	}

	// CATCHPAD is matched from the catchpad node. Unlike the two pseudos above
	// it is not marked as a terminator, barrier or return.
	let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
	usesCustomInserter = 1 in
	def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>;

	//===----------------------------------------------------------------------===//
	// Floating point immediate move.
	//===----------------------------------------------------------------------===//

	// FMOV immediate forms, generated by FPMoveImmediate and marked
	// rematerializable.
	let isReMaterializable = 1 in {
	defm FMOV : FPMoveImmediate<"fmov">;
	}

	//===----------------------------------------------------------------------===//
	// Advanced SIMD two vector instructions.
	//===----------------------------------------------------------------------===//

	// UABDL, selected from the uabd intrinsic. The extra patterns below also
	// select it for an absolute value of a difference of zero-extended inputs,
	// for both the 64-bit sources and the high halves of 128-bit sources.
	defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
	int_aarch64_neon_uabd>;
	// Match UABDL in log2-shuffle patterns.
	def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
	(zext (v8i8 V64:$opB))))),
	(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
	// NOTE(review): in the xor/add/ashr form, $src is bound only to the two
	// AArch64vashr operands; nothing in the pattern ties it to the sub of the
	// zero-extended inputs. Presumably the intent is the expanded abs idiom in
	// which $src is that difference -- confirm. Only the v8i16 result type has
	// this form.
	def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
	(v8i16 (add (sub (zext (v8i8 V64:$opA)),
	(zext (v8i8 V64:$opB))),
	(AArch64vashr v8i16:$src, (i32 15))))),
	(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
	def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
	(zext (extract_high_v16i8 V128:$opB))))),
	(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
	def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
	(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
	(zext (extract_high_v16i8 V128:$opB))),
	(AArch64vashr v8i16:$src, (i32 15))))),
	(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
	def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
	(zext (v4i16 V64:$opB))))),
	(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
	def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
	(zext (extract_high_v8i16 V128:$opB))))),
	(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
	def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
	(zext (v2i32 V64:$opB))))),
	(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
	def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
	(zext (extract_high_v4i32 V128:$opB))))),
	(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;

	// Two-vector integer ops. The CM* entries here are the single-source
	// compares, selected from the AArch64cm*z nodes.
	defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
	defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
	defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
	defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
	defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
	defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
	defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
	defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
	defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
	defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;

	// FP counterparts of the single-source compares, selected from the
	// AArch64fcm*z nodes.
	defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
	defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
	defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
	defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
	defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
	// Vector FP conversions. The rounding-mode FP-to-int families select from
	// the NEON intrinsics; FCVTZS/FCVTZU at the end select from
	// fp_to_sint/fp_to_uint. FCVTL and FCVTN are defined without a node and get
	// their patterns spelled out below them.
	defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
	defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
	defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
	// FCVTL from the vcvthf2fp intrinsic: the plain 64-bit source, and the
	// subvector at index 4 of a v8i16 (selected onto the 128-bit-source form).
	def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
	(FCVTLv4i16 V64:$Rn)>;
	def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
	(i64 4)))),
	(FCVTLv8i16 V128:$Rn)>;
	// FCVTL from fpextend, f32 -> f64, with the same low/high split (the high
	// form matches the subvector at index 2).
	def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
	def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
	(i64 2))))),
	(FCVTLv4i32 V128:$Rn)>;

	// FCVTL from fpextend, f16 -> f32.
	def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
	def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
	(i64 4))))),
	(FCVTLv8i16 V128:$Rn)>;

	defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
	defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
	defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
	defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
	defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
	// FCVTN from the vcvtfp2hf intrinsic and from fpround. When the narrowed
	// result is concatenated onto an existing 64-bit value $Rd, the 128-bit
	// destination form is used, with $Rd inserted into the dsub subregister of
	// an IMPLICIT_DEF as the first operand.
	def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
	(FCVTNv4i16 V128:$Rn)>;
	def : Pat<(concat_vectors V64:$Rd,
	(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
	(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
	def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
	def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
	def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
	(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
	defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
	defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
	defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
	int_aarch64_neon_fcvtxn>;
	defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
	defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;

	// The vector FCVTZS/FCVTZU defms are given fp_to_sint/fp_to_uint, so the
	// fcvtzs/fcvtzu intrinsics are mapped onto the same instructions here, one
	// pattern per vector type.
	def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
	def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
	def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
	def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
	def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;

	def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
	def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
	def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
	def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
	def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;

	// Vector single-source FP ops. The FRINT* entries are selected from the
	// same nodes as their scalar counterparts (fround, fnearbyint, ffloor, the
	// frintn intrinsic, fceil, frint, ftrunc).
	defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
	defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
	defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
	defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
	defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
	defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
	defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
	defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
	defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;

	// Vector FRINT32{Z,X}/FRINT64{Z,X}, available only with HasFRInt3264.
	let Predicates = [HasFRInt3264] in {
	defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z">;
	defm FRINT64Z : FRIntNNTVector<0, 1, "frint64z">;
	defm FRINT32X : FRIntNNTVector<1, 0, "frint32x">;
	defm FRINT64X : FRIntNNTVector<1, 1, "frint64x">;
	} // HasFRInt3264

	// NEG is selected from "0 - x" (sub of the all-zeros vector); NOT is
	// selected from vnot.
	defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
	defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
	defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
	UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
	defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
	// Aliases for MVN -> NOT.
	def : InstAlias<"mvn{ $Vd.8b, $Vn.8b\|.8b $Vd, $Vn}",
	(NOTv8i8 V64:$Vd, V64:$Vn)>;
	def : InstAlias<"mvn{ $Vd.16b, $Vn.16b\|.16b $Vd, $Vn}",
	(NOTv16i8 V128:$Vd, V128:$Vn)>;

	// AArch64neg selects the NEG instruction of the matching element type.
	def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
	def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
	def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
	def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
	def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
	def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
	def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;

	// AArch64not ignores the element type: every 64-bit vector type selects
	// NOTv8i8 and every 128-bit vector type selects NOTv16i8.
	def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;

	// vnot on the non-byte element types, mapped the same way. The byte types
	// are not listed (presumably covered by the NOT defm, which is given vnot
	// -- confirm). NOTE(review): there is no v1i64 entry here although
	// AArch64not has one.
	def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;

	// Remaining two-vector integer ops. SADALP/UADALP are selected from
	// "add LHS, (s|u)addlp RHS"; SHLL passes no arguments to its multiclass;
	// XTN is selected from trunc.
	defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
	defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
	defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
	defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
	defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
	BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
	defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
	defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
	defm SHLL : SIMDVectorLShiftLongBySizeBHS;
	defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
	defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
	defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
	defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
	defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
	defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
	BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
	defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
	int_aarch64_neon_uaddlp>;
	defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
	defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
	defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
	defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
	defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
	defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;

	// AArch64rev32/AArch64rev64 on FP vector types select the integer REV
	// instruction with the same element width and vector size.
	def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
	def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
	def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
	def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
	def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;

	// Patterns for vector long shift (by element width). These need to match all
	// three of zext, sext and anyext so it's easier to pull the patterns out of the
	// definition.
	//   ext - the extension operator to match (anyext, zext or sext below).
	// Each pattern matches an extend of the source followed by AArch64vshl by
	// the source element width (8, 16 or 32), and selects SHLL. The
	// extract_high_* variants select the SHLL forms that take a 128-bit source.
	multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
	def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
	(SHLLv8i8 V64:$Rn)>;
	def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
	(SHLLv16i8 V128:$Rn)>;
	def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
	(SHLLv4i16 V64:$Rn)>;
	def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
	(SHLLv8i16 V128:$Rn)>;
	def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
	(SHLLv2i32 V64:$Rn)>;
	def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
	(SHLLv4i32 V128:$Rn)>;
	}

	defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
	defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
	defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three vector instructions.
	//===----------------------------------------------------------------------===//

	// Three-same-vector integer add/compare ops followed by the FP ops. The
	// two-source compares are selected from the AArch64cm*/AArch64fcm* nodes.
	defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
	defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
	defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
	defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
	defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
	defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
	defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
	defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
	defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
	// fabs(fsub a, b) also selects FABD. The instruction is looked up by name,
	// concatenating "FABD" with the value type; the f16 vector types
	// additionally require HasFullFP16.
	let Predicates = [HasNEON] in {
	foreach VT = [ v2f32, v4f32, v2f64 ] in
	def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
	}
	let Predicates = [HasNEON, HasFullFP16] in {
	foreach VT = [ v4f16, v8f16 ] in
	def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast<Instruction>("FABD"#VT) VT:$Rn, VT:$Rm)>;
	}
	defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
	defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
	defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>;
	defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
	defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
	defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
	defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
	defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
	defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
	defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
	defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
	defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>;
	defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
	defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
	defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
	defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>;

	// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
	// instruction expects the addend first, while the fma intrinsic puts it last.
	// FMLA matches fma(RHS, MHS, LHS); FMLS matches fma(MHS, -RHS, LHS).
	defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
	defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
	TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;

	// The following def pats catch the case where the LHS of an FMA is negated.
	// The TriOpFrag above catches the case where the middle operand is negated.
	// The addend $Rd moves from last position in the fma to first position in
	// the FMLS operand list. Only v2f32, v4f32 and v2f64 are listed here.
	def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
	(FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;

	def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
	(FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;

	def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
	(FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;

	// Remaining three-same-vector ops. The tied accumulate forms are selected
	// from an add/sub of LHS with the product or absolute difference of
	// MHS and RHS (MLA, MLS, SABA, UABA). SQRDMLAH/SQRDMLSH are parameterised on
	// the sqadd/sqsub intrinsics respectively.
	defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
	defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
	defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
	defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
	defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
	defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
	TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
	defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
	TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
	defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
	defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
	defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
	defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
	defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
	defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
	defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
	defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
	defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
	defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
	defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
	defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
	defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
	defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
	defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
	defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
	defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
	defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
	defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
	defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
	defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
	defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
	defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
	defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
	defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
	defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
	defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
	defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
	defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
	defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
	defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
	defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
	defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
	defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
	defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
	int_aarch64_neon_sqadd>;
	defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
	int_aarch64_neon_sqsub>;

	// Vector bitwise logical instructions.
	defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
	// BIC is selected from and(LHS, not(RHS)).
	defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
	BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
	// BIF is instantiated without a selection operator argument.
	defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
	defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
	// BSL is selected from or(and(LHS, MHS), and(not(LHS), RHS)).
	defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
	TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
	defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
	// ORN is selected from or(LHS, not(RHS)).
	defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
	BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
	defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;


	// Select the AArch64bsl node. Every 64-bit integer vector type is mapped to
	// the byte-typed BSLv8i8 instruction, since the same instruction is used
	// regardless of the element type named in the pattern.
	def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;

	// Likewise, every 128-bit integer vector type is mapped to BSLv16i8.
	def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;

	// Vector register "mov" aliases: "mov dst, src" is an ORR of src with
	// itself. All 128-bit arrangements map onto ORRv16i8. Only the .16b form
	// passes 1 as the final InstAlias argument; the others pass 0 (presumably
	// so only the .16b spelling is used when printing -- confirm against the
	// InstAlias definition).
	def : InstAlias<"mov{\t$dst.16b, $src.16b\|.16b\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
	def : InstAlias<"mov{\t$dst.8h, $src.8h\|.8h\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
	def : InstAlias<"mov{\t$dst.4s, $src.4s\|.4s\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
	def : InstAlias<"mov{\t$dst.2d, $src.2d\|.2d\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;

	// 64-bit arrangements map onto ORRv8i8; only the .8b form passes 1.
	def : InstAlias<"mov{\t$dst.8b, $src.8b\|.8b\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
	def : InstAlias<"mov{\t$dst.4h, $src.4h\|.4h\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
	def : InstAlias<"mov{\t$dst.2s, $src.2s\|.2s\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
	def : InstAlias<"mov{\t$dst.1d, $src.1d\|.1d\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;

	// Assembler aliases for the "less than"-style vector integer compares.
	// Each alias maps onto the opposite-direction compare instruction with
	// $src1 and $src2 swapped in the result dag:
	//   cmls -> CMHS, cmlo -> CMHI, cmle -> CMGE, cmlt -> CMGT.
	// Every alias passes 0 as the final InstAlias argument.

	// cmls dst, src1, src2  ==>  CMHS dst, src2, src1
	def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmls.8b\t$dst, $src1, $src2}",
	(CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmls.16b\t$dst, $src1, $src2}",
	(CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmls.4h\t$dst, $src1, $src2}",
	(CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmls.8h\t$dst, $src1, $src2}",
	(CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmls.2s\t$dst, $src1, $src2}",
	(CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmls.4s\t$dst, $src1, $src2}",
	(CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmls.2d\t$dst, $src1, $src2}",
	(CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmlo dst, src1, src2  ==>  CMHI dst, src2, src1
	def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmlo.8b\t$dst, $src1, $src2}",
	(CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmlo.16b\t$dst, $src1, $src2}",
	(CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmlo.4h\t$dst, $src1, $src2}",
	(CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmlo.8h\t$dst, $src1, $src2}",
	(CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmlo.2s\t$dst, $src1, $src2}",
	(CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmlo.4s\t$dst, $src1, $src2}",
	(CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmlo.2d\t$dst, $src1, $src2}",
	(CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmle dst, src1, src2  ==>  CMGE dst, src2, src1
	def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmle.8b\t$dst, $src1, $src2}",
	(CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmle.16b\t$dst, $src1, $src2}",
	(CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmle.4h\t$dst, $src1, $src2}",
	(CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmle.8h\t$dst, $src1, $src2}",
	(CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmle.2s\t$dst, $src1, $src2}",
	(CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmle.4s\t$dst, $src1, $src2}",
	(CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmle.2d\t$dst, $src1, $src2}",
	(CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmlt dst, src1, src2  ==>  CMGT dst, src2, src1
	def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmlt.8b\t$dst, $src1, $src2}",
	(CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmlt.16b\t$dst, $src1, $src2}",
	(CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmlt.4h\t$dst, $src1, $src2}",
	(CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmlt.8h\t$dst, $src1, $src2}",
	(CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmlt.2s\t$dst, $src1, $src2}",
	(CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmlt.4s\t$dst, $src1, $src2}",
	(CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmlt.2d\t$dst, $src1, $src2}",
	(CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// Assembler aliases for the "less than"-style vector floating-point
	// compares. Each alias maps onto the opposite-direction instruction with
	// $src1 and $src2 swapped in the result dag:
	//   fcmle -> FCMGE, fcmlt -> FCMGT, facle -> FACGE, faclt -> FACGT.
	// The half-precision (.4h/.8h) forms are additionally predicated on
	// HasNEON and HasFullFP16.

	// fcmle dst, src1, src2  ==>  FCMGE dst, src2, src1
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|fcmle.4h\t$dst, $src1, $src2}",
	(FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|fcmle.8h\t$dst, $src1, $src2}",
	(FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|fcmle.2s\t$dst, $src1, $src2}",
	(FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|fcmle.4s\t$dst, $src1, $src2}",
	(FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|fcmle.2d\t$dst, $src1, $src2}",
	(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// fcmlt dst, src1, src2  ==>  FCMGT dst, src2, src1
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|fcmlt.4h\t$dst, $src1, $src2}",
	(FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|fcmlt.8h\t$dst, $src1, $src2}",
	(FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|fcmlt.2s\t$dst, $src1, $src2}",
	(FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|fcmlt.4s\t$dst, $src1, $src2}",
	(FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|fcmlt.2d\t$dst, $src1, $src2}",
	(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// facle dst, src1, src2  ==>  FACGE dst, src2, src1
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|facle.4h\t$dst, $src1, $src2}",
	(FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|facle.8h\t$dst, $src1, $src2}",
	(FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|facle.2s\t$dst, $src1, $src2}",
	(FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|facle.4s\t$dst, $src1, $src2}",
	(FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|facle.2d\t$dst, $src1, $src2}",
	(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// faclt dst, src1, src2  ==>  FACGT dst, src2, src1
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|faclt.4h\t$dst, $src1, $src2}",
	(FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|faclt.8h\t$dst, $src1, $src2}",
	(FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|faclt.2s\t$dst, $src1, $src2}",
	(FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|faclt.4s\t$dst, $src1, $src2}",
	(FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|faclt.2d\t$dst, $src1, $src2}",
	(FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three scalar instructions.
	//===----------------------------------------------------------------------===//

	// Scalar three-operand instruction instantiations. Each defm supplies the
	// leading encoding arguments, the mnemonic string, and the DAG operator or
	// intrinsic the instruction is selected from.
	defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
	defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
	defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
	defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
	defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
	defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
	defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
	defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
	// The v1f64 form of the neon fabd intrinsic also selects FABD64.
	def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FABD64 FPR64:$Rn, FPR64:$Rm)>;
	// fabs(fsub(a, b)) is selected directly as FABD; the f16 form requires
	// HasFullFP16.
	let Predicates = [HasFullFP16] in {
	def : Pat<(fabs (fsub f16:$Rn, f16:$Rm)), (FABD16 f16:$Rn, f16:$Rm)>;
	}
	def : Pat<(fabs (fsub f32:$Rn, f32:$Rm)), (FABD32 f32:$Rn, f32:$Rm)>;
	def : Pat<(fabs (fsub f64:$Rn, f64:$Rm)), (FABD64 f64:$Rn, f64:$Rm)>;
	defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
	int_aarch64_neon_facge>;
	defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
	int_aarch64_neon_facgt>;
	defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
	defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
	defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
	defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
	defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
	defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
	defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
	defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
	defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
	defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
	defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
	defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
	defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
	defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
	defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
	defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
	defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
	defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
	defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
	defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
	// Scalar SQRDMLAH / SQRDMLSH are only available with HasRDM. The i32
	// patterns fold sqadd/sqsub of an accumulator with a sqrdmulh result into
	// the single accumulating instruction.
	let Predicates = [HasRDM] in {
	defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
	defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
	def : Pat<(i32 (int_aarch64_neon_sqadd
	(i32 FPR32:$Rd),
	(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(i32 (int_aarch64_neon_sqsub
	(i32 FPR32:$Rd),
	(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	}

	// Scalar "less than"-style compare aliases. Each maps onto the
	// opposite-direction scalar compare with $src1 and $src2 swapped in the
	// result dag. The floating-point aliases are defined for the 32-bit and
	// 64-bit register forms.
	// NOTE(review): no FPR16 (half-precision) scalar aliases are defined here;
	// confirm whether that is intentional.
	def : InstAlias<"cmls $dst, $src1, $src2",
	(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmle $dst, $src1, $src2",
	(CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmlo $dst, $src1, $src2",
	(CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmlt $dst, $src1, $src2",
	(CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"fcmle $dst, $src1, $src2",
	(FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"fcmle $dst, $src1, $src2",
	(FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"fcmlt $dst, $src1, $src2",
	(FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"fcmlt $dst, $src1, $src2",
	(FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"facle $dst, $src1, $src2",
	(FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"facle $dst, $src1, $src2",
	(FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"faclt $dst, $src1, $src2",
	(FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"faclt $dst, $src1, $src2",
	(FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three scalar instructions (mixed operands).
	//===----------------------------------------------------------------------===//
	// SQDMULL is selected from the sqdmulls_scalar intrinsic; SQDMLAL and
	// SQDMLSL are instantiated without a selection operator and get explicit
	// patterns below.
	defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
	int_aarch64_neon_sqdmulls_scalar>;
	defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
	defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;

	// sqadd / sqsub of an i64 accumulator with an i32 x i32 sqdmulls_scalar
	// result folds into SQDMLALi32 / SQDMLSLi32 respectively.
	def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
	(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
	(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD two scalar instructions.
	//===----------------------------------------------------------------------===//

	// Scalar two-operand instruction instantiations. The compare entries use
	// the compare-against-zero nodes (AArch64cm*z / AArch64fcm*z). The FCVT*,
	// FRECPE, FRECPX and FRSQRTE entries pass no selection operator here.
	defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
	defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
	defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
	defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
	defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
	defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
	defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
	defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
	defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
	defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
	defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
	defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
	defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
	defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
	defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
	defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
	defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
	defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
	defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
	def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
	defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
	defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
	defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
	defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
	defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
	// NEG is selected from sub(all-zeros vector, LHS).
	defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
	UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
	defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
	defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
	defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
	defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
	defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
	defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
	int_aarch64_neon_suqadd>;
	defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
	defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
	defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
	int_aarch64_neon_usqadd>;

	// The AArch64neg node on v1i64 also selects the scalar NEG instruction.
	def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;

	// v1f64 -> v1i64 forms of the fcvt{a,m,n,p}{s,u} intrinsics are selected to
	// the corresponding scalar FCVT*v1i64 instruction on FPR64.
	def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
	(FCVTASv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
	(FCVTAUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
	(FCVTMSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
	(FCVTMUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
	(FCVTNSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
	(FCVTNUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
	(FCVTPSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
	(FCVTPUv1i64 FPR64:$Rn)>;

	// Selection patterns for the reciprocal / reciprocal-square-root estimate
	// and step operations, from both the int_aarch64_neon_* intrinsics and the
	// AArch64frecpe / AArch64frecps / AArch64frsqrte / AArch64frsqrts nodes.

	// frecpe intrinsic, scalar and v1f64 forms.
	// NOTE(review): the f16 patterns in this group are not wrapped in a
	// HasFullFP16 predicate at this level -- confirm the predicate comes from
	// the instruction definitions.
	def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))),
	(FRECPEv1f16 FPR16:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
	(FRECPEv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
	(FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
	(FRECPEv1i64 FPR64:$Rn)>;

	// AArch64frecpe node, scalar and vector forms.
	def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
	(FRECPEv1i32 FPR32:$Rn)>;
	def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
	(FRECPEv2f32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
	(FRECPEv4f32 FPR128:$Rn)>;
	def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
	(FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
	(FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
	(FRECPEv2f64 FPR128:$Rn)>;

	// AArch64frecps node, scalar and vector forms.
	def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
	(FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
	(FRECPSv2f32 V64:$Rn, V64:$Rm)>;
	def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
	(FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
	def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
	(FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
	(FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;

	// frecpx intrinsic, scalar forms only.
	def : Pat<(f16 (int_aarch64_neon_frecpx (f16 FPR16:$Rn))),
	(FRECPXv1f16 FPR16:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
	(FRECPXv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
	(FRECPXv1i64 FPR64:$Rn)>;

	// frsqrte intrinsic, scalar and v1f64 forms.
	def : Pat<(f16 (int_aarch64_neon_frsqrte (f16 FPR16:$Rn))),
	(FRSQRTEv1f16 FPR16:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
	(FRSQRTEv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
	(FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
	(FRSQRTEv1i64 FPR64:$Rn)>;

	// AArch64frsqrte node, scalar and vector forms.
	def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
	(FRSQRTEv1i32 FPR32:$Rn)>;
	def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
	(FRSQRTEv2f32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
	(FRSQRTEv4f32 FPR128:$Rn)>;
	def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
	(FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
	(FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
	(FRSQRTEv2f64 FPR128:$Rn)>;

	// AArch64frsqrts node, scalar and vector forms.
	def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
	(FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
	(FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
	def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
	(FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
	def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
	(FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
	(FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;

	// If an integer is about to be converted to a floating point value,
	// just load it on the floating point unit.
	// Here are the patterns for 8 and 16-bits to float.
	// 8-bits -> float.
	// Selects uint_to_fp of a register-offset load as a load directly into an
	// FP/SIMD register followed by UCVTF, so the integer never passes through a
	// general-purpose register.
	//
	//   DstTy     - floating-point result type; also the type of the
	//               IMPLICIT_DEF the loaded value is inserted into.
	//   SrcTy     - integer type produced by the load.
	//   loadop    - load operator to match.
	//   UCVTF     - conversion instruction applied to the loaded register.
	//   ro        - ROAddrMode bundle supplying the Wpat/Xpat addressing
	//               patterns and the matching Wext/Xext extend operands.
	//   LDRW/LDRX - load instructions for the 32-bit (W) and 64-bit (X)
	//               offset-register forms.
	//   sub       - subregister index at which the loaded value is inserted.
	multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
	                             SDPatternOperator loadop, Instruction UCVTF,
	                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
	                             SubRegIndex sub> {
	  // Base register + 32-bit offset register.
	  def : Pat<(DstTy (uint_to_fp (SrcTy
	                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
	                                      ro.Wext:$extend))))),
	            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
	                                  (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
	                                  sub))>;

	  // Base register + 64-bit offset register. The extend operand uses the
	  // X-register class (ro.Xext) on both the matched and the emitted side so
	  // that the source pattern agrees with the LDRX result operand.
	  def : Pat<(DstTy (uint_to_fp (SrcTy
	                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
	                                      ro.Xext:$extend))))),
	            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
	                                  (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
	                                  sub))>;
	}

	// 8-bit zero-extending load -> float: register-offset forms via the
	// multiclass, then the am_indexed8 (uimm12s1 offset) and am_unscaled8
	// (simm9 offset) forms. The byte is loaded into the bsub subregister.
	defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
	UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
	def : Pat <(f32 (uint_to_fp (i32
	(zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
	def : Pat <(f32 (uint_to_fp (i32
	(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
	(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	(LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
	// 16-bit zero-extending load -> float (hsub subregister).
	defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
	UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
	def : Pat <(f32 (uint_to_fp (i32
	(zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
	def : Pat <(f32 (uint_to_fp (i32
	(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
	(UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	(LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
	// 32-bit integer -> float is handled in the target-specific DAG combine
	// performIntToFpCombine.
	// 64-bit integer -> 32-bit float is not possible with UCVTF on
	// floating-point registers, because source and destination must have the
	// same size.

	// Patterns for 8-, 16- and 32-bit loads converted to double; the 64-bit
	// case is noted at the end of this group.
	// 8-bit zero-extending load -> double (bsub subregister).
	defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
	UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
	def : Pat <(f64 (uint_to_fp (i32
	(zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
	def : Pat <(f64 (uint_to_fp (i32
	(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
	// 16-bit zero-extending load -> double (hsub subregister).
	defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
	UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
	def : Pat <(f64 (uint_to_fp (i32
	(zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
	def : Pat <(f64 (uint_to_fp (i32
	(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
	// 32-bit load -> double (ssub subregister, using the S-register loads).
	defm : UIntToFPROLoadPat<f64, i32, load,
	UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
	def : Pat <(f64 (uint_to_fp (i32
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
	def : Pat <(f64 (uint_to_fp (i32
	(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
	(UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
	// 64-bit integer -> double is handled in the target-specific DAG combine
	// performIntToFpCombine.

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three different-sized vector instructions.
	//===----------------------------------------------------------------------===//

	// Narrowing, long and wide three-operand vector instruction
	// instantiations. Where an inline BinOpFrag / TriOpFrag is given, it spells
	// out the DAG shape the instruction is selected from.
	defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
	defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
	defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
	defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
	defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
	defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
	int_aarch64_neon_sabd>;
	defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
	int_aarch64_neon_sabd>;
	// SADDL: add(sext(LHS), sext(RHS)); SADDW: add(LHS, sext(RHS)).
	defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
	BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
	defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
	BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
	// SMLAL / SMLSL: add / sub of LHS with smull(MHS, RHS).
	defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
	// SQDMLAL / SQDMLSL pass the saturating add / sub intrinsic as the
	// accumulate operator.
	defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
	int_aarch64_neon_sqadd>;
	defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
	int_aarch64_neon_sqsub>;
	defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
	int_aarch64_neon_sqdmull>;
	// SSUBL: sub(sext(LHS), sext(RHS)); SSUBW: sub(LHS, sext(RHS)).
	defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
	BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
	defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
	BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
	defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
	int_aarch64_neon_uabd>;
	// UADDL: add(zext(LHS), zext(RHS)); UADDW: add(LHS, zext(RHS)).
	defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
	BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
	defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
	BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
	// UMLAL / UMLSL: add / sub of LHS with umull(MHS, RHS).
	defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
	// USUBL: sub(zext(LHS), zext(RHS)); USUBW: sub(LHS, zext(RHS)).
	defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
	BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
	defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
	BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;

	// Extra selection patterns for SMULL and UMULL: map a widening-multiply
	// DAG node over two 64-bit vectors onto the instruction for each element
	// size.
	//   WidenMul  - the widening-multiply node to match.
	//   MulB/H/S  - instructions for the v8i8, v4i16 and v2i32 source types.
	multiclass Neon_mul_widen_patterns<SDPatternOperator WidenMul,
	                                   Instruction MulB,
	                                   Instruction MulH,
	                                   Instruction MulS> {
	  // v8i8 x v8i8 -> v8i16
	  def : Pat<(v8i16 (WidenMul (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
	            (MulB V64:$Rn, V64:$Rm)>;
	  // v4i16 x v4i16 -> v4i32
	  def : Pat<(v4i32 (WidenMul (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
	            (MulH V64:$Rn, V64:$Rm)>;
	  // v2i32 x v2i32 -> v2i64
	  def : Pat<(v2i64 (WidenMul (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
	            (MulS V64:$Rn, V64:$Rm)>;
	}

	// Signed: AArch64smull -> SMULL.
	defm : Neon_mul_widen_patterns<AArch64smull,
	                               SMULLv8i8_v8i16,
	                               SMULLv4i16_v4i32,
	                               SMULLv2i32_v2i64>;
	// Unsigned: AArch64umull -> UMULL.
	defm : Neon_mul_widen_patterns<AArch64umull,
	                               UMULLv8i8_v8i16,
	                               UMULLv4i16_v4i32,
	                               UMULLv2i32_v2i64>;

	// Extra selection patterns for SMLAL/SMLSL and UMLAL/UMLSL: map a
	// three-operand accumulate fragment (128-bit accumulator, two 64-bit
	// sources) onto the instruction for each element size.
	//   AccOp     - the three-operand fragment to match.
	//   AccB/H/S  - instructions for the v8i8, v4i16 and v2i32 source types.
	multiclass Neon_mulacc_widen_patterns<SDPatternOperator AccOp,
	                                      Instruction AccB,
	                                      Instruction AccH,
	                                      Instruction AccS> {
	  // v8i16 accumulator, v8i8 sources.
	  def : Pat<(v8i16 (AccOp (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
	            (AccB V128:$Rd, V64:$Rn, V64:$Rm)>;
	  // v4i32 accumulator, v4i16 sources.
	  def : Pat<(v4i32 (AccOp (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
	            (AccH V128:$Rd, V64:$Rn, V64:$Rm)>;
	  // v2i64 accumulator, v2i32 sources.
	  def : Pat<(v2i64 (AccOp (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
	            (AccS V128:$Rd, V64:$Rn, V64:$Rm)>;
	}

	// SMLAL: acc + smull(a, b)
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
	         SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
	// UMLAL: acc + umull(a, b)
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
	         UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
	// SMLSL: acc - smull(a, b)
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
	         SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
	// UMLSL: acc - umull(a, b)
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
	         UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;

	// Patterns for 64-bit pmull.
	// Two 64-bit operands select PMULLv1i64. When both operands are element 1
	// extracted from v2i64 registers, PMULLv2i64 is selected on the full
	// 128-bit registers instead, so no separate extract is emitted.
	def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
	(PMULLv1i64 V64:$Rn, V64:$Rm)>;
	def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
	(extractelt (v2i64 V128:$Rm), (i64 1))),
	(PMULLv2i64 V128:$Rn, V128:$Rm)>;

	// CodeGen patterns for addhn and subhn instructions, which can actually be
	// written in LLVM IR without too much difficulty.

	// ADDHN
	// Shape matched below: (trunc (AArch64vlshr (add $Rn, $Rm), <shift>)), where
	// the shift amount is half the source element width (8 for v8i16, 16 for
	// v4i32, 32 for v2i64).
	//
	// Plain narrowing forms: the truncated value is the whole 64-bit result.
	def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
	(ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
	def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	(i32 16))))),
	(ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
	def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	(i32 32))))),
	(ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
	// concat_vectors forms: an existing 64-bit $Rd is the first concat operand
	// and the narrowed value is the second. $Rd is widened to 128 bits with
	// SUBREG_TO_REG and passed as the first operand of the 128-bit-result
	// instruction.
	def : Pat<(concat_vectors (v8i8 V64:$Rd),
	(trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	(i32 8))))),
	(ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors (v4i16 V64:$Rd),
	(trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	(i32 16))))),
	(ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors (v2i32 V64:$Rd),
	(trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	(i32 32))))),
	(ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;

	// SUBHN
	// Same structure as the ADDHN patterns, with `sub` in place of `add`:
	// (trunc (AArch64vlshr (sub $Rn, $Rm), <half the source element width>)).
	//
	// Plain narrowing forms: the truncated value is the whole 64-bit result.
	def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
	(SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
	def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	(i32 16))))),
	(SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
	def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	(i32 32))))),
	(SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
	// concat_vectors forms: the existing 64-bit $Rd is widened with
	// SUBREG_TO_REG and passed as the first operand of the 128-bit-result
	// instruction.
	def : Pat<(concat_vectors (v8i8 V64:$Rd),
	(trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	(i32 8))))),
	(SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors (v4i16 V64:$Rd),
	(trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	(i32 16))))),
	(SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors (v2i32 V64:$Rd),
	(trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	(i32 32))))),
	(SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	V128:$Rn, V128:$Rm)>;

	//----------------------------------------------------------------------------
	// AdvSIMD bitwise extract from vector instruction.
	//----------------------------------------------------------------------------

	// Instantiates SIMDBitwiseExtract (defined outside this excerpt) under the EXT
	// prefix with mnemonic "ext".
	defm EXT : SIMDBitwiseExtract<"ext">;

	// Immediate transform: returns the matched immediate plus 8 as an i32 target
	// constant.
	def AdjustExtImm : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(8 + N->getZExtValue(), SDLoc(N), MVT::i32);
	}]>;
	// Selection patterns mapping AArch64ext and upper-half extract_subvector onto
	// the byte-wise EXTv8i8 / EXTv16i8 instructions.
	//   VT64  - 64-bit vector type.
	//   VT128 - matching 128-bit vector type.
	//   N     - element index used in extract_subvector to name the upper half of
	//           a VT128 value (8 for v16i8, 4 for v8i16, ... in the uses below).
	multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
	// Direct 64-bit and 128-bit forms; the immediate is passed through unchanged.
	def : Pat<(VT64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
	(EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
	def : Pat<(VT128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	(EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	// We use EXT to handle extract_subvector to copy the upper 64-bits of a
	// 128-bit vector.
	def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))),
	(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	// A 64-bit EXT of two halves of the same 128-bit register can be done as a
	// single 128-bit EXT.
	def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)),
	(extract_subvector V128:$Rn, (i64 N)),
	(i32 imm:$imm))),
	(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, imm:$imm), dsub)>;
	// A 64-bit EXT of the high half of a 128-bit register can be done using a
	// 128-bit EXT of the whole register with an adjustment to the immediate. The
	// top half of the other operand will be unset, but that doesn't matter as it
	// will not be used.
	// The adjustment is AdjustExtImm, which adds 8 to the immediate.
	def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 N)),
	V64:$Rm,
	(i32 imm:$imm))),
	(EXTRACT_SUBREG (EXTv16i8 V128:$Rn,
	(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
	(AdjustExtImm imm:$imm)), dsub)>;
	}

	// One instantiation per 64/128-bit vector type pair; N is the element count
	// of the 64-bit type.
	defm : ExtPat<v8i8, v16i8, 8>;
	defm : ExtPat<v4i16, v8i16, 4>;
	defm : ExtPat<v4f16, v8f16, 4>;
	defm : ExtPat<v2i32, v4i32, 2>;
	defm : ExtPat<v2f32, v4f32, 2>;
	defm : ExtPat<v1i64, v2i64, 1>;
	defm : ExtPat<v1f64, v2f64, 1>;

	//----------------------------------------------------------------------------
	// AdvSIMD zip vector
	//----------------------------------------------------------------------------

	// Instantiate SIMDZipVector (defined outside this excerpt) for the six
	// permute instructions. Each use passes a 3-bit opcode field, the assembly
	// mnemonic, and the AArch64 DAG node associated with the instruction.
	defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
	defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
	defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
	defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
	defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
	defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;

	//----------------------------------------------------------------------------
	// AdvSIMD TBL/TBX instructions
	//----------------------------------------------------------------------------

	// Instruction definitions come from the SIMDTableLookup /
	// SIMDTableLookupTied multiclasses (defined outside this excerpt).
	defm TBL : SIMDTableLookup< 0, "tbl">;
	defm TBX : SIMDTableLookupTied<1, "tbx">;

	// Select the tbl1 intrinsic to the "One" instruction variants. Intrinsic
	// operands are forwarded in the same positional order.
	// NOTE(review): the v16i8 patterns name the operands $Ri/$Rn in the opposite
	// order from the v8i8 patterns; only the naming differs, the positions are
	// forwarded unchanged.
	def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
	(TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
	def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
	(TBLv16i8One V128:$Ri, V128:$Rn)>;

	// Same for tbx1, which carries an extra leading $Rd operand that is passed
	// as the first instruction operand.
	def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
	(v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
	(TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
	def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
	(v16i8 V128:$Ri), (v16i8 V128:$Rn))),
	(TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;


	//----------------------------------------------------------------------------
	// AdvSIMD scalar CPY instruction
	//----------------------------------------------------------------------------

	// Instantiates SIMDScalarCPY (defined outside this excerpt) under the CPY
	// prefix with assembly mnemonic "cpy".
	defm CPY : SIMDScalarCPY<"cpy">;

	//----------------------------------------------------------------------------
	// AdvSIMD scalar pairwise instructions
	//----------------------------------------------------------------------------

	// Scalar pairwise instruction definitions. The multiclasses are defined
	// outside this excerpt; each use passes a 1-bit field, a 5-bit opcode field
	// and the assembly mnemonic.
	defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
	defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
	defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
	defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
	defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
	defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
	// v2i64 saddv/uaddv both select ADDPv2i64p; its result is inserted into the
	// dsub subregister of an otherwise undefined v2i64.
	def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
	def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
	// Two-element FP reductions map directly onto the scalar pairwise forms.
	def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
	(FADDPv2i32p V64:$Rn)>;
	// v4f32 faddv takes two steps: FADDPv4f32 of $Rn with itself, then the scalar
	// pairwise add on the low 64 bits (dsub) of that result.
	def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
	(FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
	def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
	(FADDPv2i64p V128:$Rn)>;
	// Two-element max/min reductions (v2f32 and v2f64 only) select the matching
	// scalar pairwise instruction.
	def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
	(FMAXNMPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
	(FMAXNMPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
	(FMAXPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
	(FMAXPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
	(FMINNMPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
	(FMINNMPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
	(FMINPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
	(FMINPv2i64p V128:$Rn)>;

	//----------------------------------------------------------------------------
	// AdvSIMD INS/DUP instructions
	//----------------------------------------------------------------------------

	// DUP from a general-purpose register. Arguments, in order: a 0/1 flag (0 on
	// the V64 results and 1 on the V128 results below), a 5-bit field whose '?'
	// bits are left unset here, the assembly suffix, the result vector type, the
	// destination register class and the source GPR class (GPR64 only for .2d).
	def DUPv8i8gpr : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b", v8i8, V64, GPR32>;
	def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>;
	def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h", v4i16, V64, GPR32>;
	def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h", v8i16, V128, GPR32>;
	def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s", v2i32, V64, GPR32>;
	def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s", v4i32, V128, GPR32>;
	def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d", v2i64, V128, GPR64>;

	// DUP from a vector element. Arguments: the same 0/1 flag, assembly suffix,
	// result vector type and destination register class.
	def DUPv2i64lane : SIMDDup64FromElement;
	def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
	def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
	def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
	def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
	def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
	def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;

	// DUP from a 64-bit register to a 64-bit register is just a copy
	def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
	(COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
	def : Pat<(v1f64 (AArch64dup (f64 FPR64:$Rn))),
	(COPY_TO_REGCLASS FPR64:$Rn, FPR64)>;

	// DUP of a scalar FP register: place the scalar in the low subregister
	// (ssub/dsub/hsub) of an undefined 128-bit value, then duplicate lane 0 with
	// the integer lane-DUP of the same element width.
	def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
	(v2f32 (DUPv2i32lane
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
	(i64 0)))>;
	def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
	(v4f32 (DUPv4i32lane
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
	(i64 0)))>;
	// NOTE(review): the IMPLICIT_DEF below is typed v4i32 although the element
	// inserted is 64 bits wide via dsub - confirm the type annotation is
	// intentional.
	def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
	(v2f64 (DUPv2i64lane
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
	(i64 0)))>;
	def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
	(v4f16 (DUPv4i16lane
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
	(i64 0)))>;
	def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
	(v8f16 (DUPv8i16lane
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
	(i64 0)))>;

	// FP duplane nodes select the integer lane-DUP of the same element width;
	// the lane index is passed through unchanged.
	def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
	(DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
	def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
	(DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;

	def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
	(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
	def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
	(DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
	def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
	(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

	// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
	// instruction even if the types don't match: we just have to remap the lane
	// carefully. N.b. this trick only applies to truncations.
	// Immediate transforms that multiply a lane index by 2, 4 or 8 and return it
	// as an i64 target constant.
	def VecIndex_x2 : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(2 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;
	def VecIndex_x4 : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(4 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;
	def VecIndex_x8 : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;

	// Patterns for (AArch64dup (vector_extract ...)) where the result elements
	// are narrower than the source elements.
	//   ResVT     - result vector type.
	//   Src64VT   - 64-bit source vector type.
	//   Src128VT  - 128-bit source vector type.
	//   ScalVT    - scalar type of the extracted element.
	//   DUP       - lane-DUP instruction to select.
	//   IdxXFORM  - transform applied to the source lane index (one of the
	//               VecIndex_x* multipliers in the uses below).
	multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
	ValueType Src128VT, ValueType ScalVT,
	Instruction DUP, SDNodeXForm IdxXFORM> {
	// 128-bit source: use the register directly.
	def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
	imm:$idx)))),
	(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

	// 64-bit source: widen to 128 bits with SUBREG_TO_REG first.
	def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
	imm:$idx)))),
	(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
	}

	// 64-bit results. The multiplier equals source element width divided by
	// result element width (x2 for 16->8 and 32->16, x4 for 32->8).
	defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
	defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
	defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

	// 128-bit results, same multipliers.
	defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
	defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
	defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

	// Variant of the truncating DUP patterns for 64-bit source elements, where
	// the DAG contains an explicit (i32 (trunc (extractelt ...))).
	//   ResVT     - result vector type.
	//   DUP       - lane-DUP instruction to select.
	//   IdxXFORM  - transform applied to the source lane index.
	multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
	SDNodeXForm IdxXFORM> {
	// v2i64 source: use the 128-bit register directly.
	def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
	imm:$idx))))),
	(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

	// v1i64 source: widen to 128 bits with SUBREG_TO_REG first.
	def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
	imm:$idx))))),
	(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
	}

	// The multiplier is 64 divided by the result element width (x8 for bytes,
	// x4 for halfwords, x2 for words).
	defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
	defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
	defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;

	defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
	defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
	defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;

	// SMOV and UMOV definitions, with some extra patterns for convenience
	defm SMOV : SMov;
	defm UMOV : UMov;

	// A sign extension of an extracted i8/i16 lane is selected to the SMOV
	// variant for the wanted result width (i32 or i64). Each source/result
	// combination is listed exactly once.
	def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
	(i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
	(i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
	(i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
	(i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
	// A full sext of an extracted i32 lane to i64 selects SMOVvi32to64.
	def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
	(i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;

	// Same as above when the extracted i32 was first any-extended to i64 before
	// the in-register sign extension.
	def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
	VectorIndexB:$idx)))), i8),
	(i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
	VectorIndexH:$idx)))), i16),
	(i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;

	// Extracting i8 or i16 elements will have the zero-extend transformed to
	// an 'and' mask by type legalization since neither i8 nor i16 are legal types
	// for AArch64. Match these patterns here since UMOV already zeroes out the high
	// bits of the destination register.
	def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
	(i32 0xff)),
	(i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
	(i32 0xffff)),
	(i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;

	defm INS : SIMDIns;

	// scalar_to_vector from a GPR32 into byte/halfword vectors: copy the GPR into
	// the FPR32 register class and place it in the ssub subregister.
	def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
	(SUBREG_TO_REG (i32 0),
	(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
	def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
	(SUBREG_TO_REG (i32 0),
	(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;

	def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
	(SUBREG_TO_REG (i32 0),
	(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
	def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
	(SUBREG_TO_REG (i32 0),
	(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;

	// scalar_to_vector from a value already in an FP/SIMD register: insert it
	// into the matching subregister (hsub/ssub/dsub) of an undefined vector.
	// Each vector type is listed exactly once.
	def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
	(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
	def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
	(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;

	def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
	(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
	(i32 FPR32:$Rn), ssub))>;
	def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
	(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	(i32 FPR32:$Rn), ssub))>;

	def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
	(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	(i64 FPR64:$Rn), dsub))>;

	def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
	(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
	def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
	(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;

	def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
	(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;

	// Inserting an FP scalar into a vector lane. The scalar is placed in lane 0
	// of an undefined 128-bit vector (via hsub/ssub/dsub) and then copied into
	// the target lane with the INS element-to-element form. 64-bit vectors are
	// first widened through INSERT_SUBREG/dsub and the low half (dsub) is
	// extracted afterwards.

	// v4f16: the lane index is matched with VectorIndexS here.
	// NOTE(review): presumably because a v4f16 only has lanes 0..3 - confirm.
	def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
	(f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
	(EXTRACT_SUBREG
	(INSvi16lane
	(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
	VectorIndexS:$imm,
	(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
	(i64 0)),
	dsub)>;

	// v8f16: operate on the 128-bit register directly.
	def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
	(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
	(INSvi16lane
	V128:$Rn, VectorIndexH:$imm,
	(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
	(i64 0))>;

	// v2f32: widen, insert with INSvi32lane, take the low half.
	def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
	(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
	(EXTRACT_SUBREG
	(INSvi32lane
	(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
	VectorIndexS:$imm,
	(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
	(i64 0)),
	dsub)>;
	// v4f32 and v2f64: operate on the 128-bit register directly.
	def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
	(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
	(INSvi32lane
	V128:$Rn, VectorIndexS:$imm,
	(v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
	(i64 0))>;
	def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
	(f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
	(INSvi64lane
	V128:$Rn, VectorIndexD:$imm,
	(v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
	(i64 0))>;

	// Copy an element at a constant index in one vector into a constant indexed
	// element of another.
	// FIXME refactor to a shared class/def parameterized on vector type, vector
	// index type and INS extension
	// int_aarch64_neon_vcopy_lane(Vd, idx, Vs, idx2) selects the INS
	// element-to-element instruction of the matching element width. All four
	// operands are forwarded in the same order, and each element width uses its
	// own index operand class (VectorIndexB/H/S/D).
	def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
	(v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
	VectorIndexB:$idx2)),
	(v16i8 (INSvi8lane
	V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
	)>;
	def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
	(v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
	VectorIndexH:$idx2)),
	(v8i16 (INSvi16lane
	V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
	)>;
	def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
	(v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
	VectorIndexS:$idx2)),
	(v4i32 (INSvi32lane
	V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
	)>;
	def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
	(v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
	VectorIndexD:$idx2)),
	(v2i64 (INSvi64lane
	V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
	)>;

	// Patterns for (vector_insert dst, (vector_extract src, Immn), Immd): an
	// element moved from one vector lane to another selects a single INS
	// element-to-element instruction.
	//   VT128  - 128-bit vector type.
	//   VT64   - 64-bit vector type with the same element type.
	//   VTScal - scalar element type.
	//   INS    - INS lane instruction for that element width.
	// All four combinations of 64/128-bit destination and source are covered.
	// 64-bit operands are widened with SUBREG_TO_REG, and a 64-bit result is
	// taken from dsub of the 128-bit INS result.
	multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
	ValueType VTScal, Instruction INS> {
	// 128-bit destination, 128-bit source.
	def : Pat<(VT128 (vector_insert V128:$src,
	(VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
	imm:$Immd)),
	(INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;

	// 128-bit destination, 64-bit source.
	def : Pat<(VT128 (vector_insert V128:$src,
	(VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
	imm:$Immd)),
	(INS V128:$src, imm:$Immd,
	(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;

	// 64-bit destination, 128-bit source.
	def : Pat<(VT64 (vector_insert V64:$src,
	(VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
	imm:$Immd)),
	(EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
	imm:$Immd, V128:$Rn, imm:$Immn),
	dsub)>;

	// 64-bit destination, 64-bit source.
	def : Pat<(VT64 (vector_insert V64:$src,
	(VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
	imm:$Immd)),
	(EXTRACT_SUBREG
	(INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
	(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
	dsub)>;
	}

	// Instantiated for the floating-point element types only.
	defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
	defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
	defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;


	// Floating point vector extractions are codegen'd as either a sequence of
	// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
	// the lane number is anything other than zero.
	// Lane 0: the element is the low subregister (dsub/ssub/hsub) of the vector,
	// so only a subregister extraction is emitted.
	def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
	(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
	def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
	(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
	def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
	(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;

	// Any other constant lane: select the scalar CPY of the matching width.
	def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
	(f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
	def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
	(f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
	def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
	(f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;

	// All concat_vectors operations are canonicalised to act on i64 vectors for
	// AArch64. In the general case we need an instruction, which may just as well
	// be INS.
	// Concatenation of two 64-bit vectors into a 128-bit vector of type DstTy
	// (SrcTy is the type of each half). Both halves are placed in dsub of
	// undefined 128-bit values; INSvi64lane then copies element 0 of the second
	// into element 1 of the first.
	class ConcatPat<ValueType DstTy, ValueType SrcTy>
	: Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
	(INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
	(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;

	def : ConcatPat<v2i64, v1i64>;
	def : ConcatPat<v2f64, v1f64>;
	def : ConcatPat<v4i32, v2i32>;
	def : ConcatPat<v4f32, v2f32>;
	def : ConcatPat<v8i16, v4i16>;
	def : ConcatPat<v8f16, v4f16>;
	def : ConcatPat<v16i8, v8i8>;

	// If the high lanes are undef, though, we can just ignore them:
	// only the INSERT_SUBREG of the low half is emitted, with no INS.
	class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
	: Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
	(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;

	// NOTE(review): unlike ConcatPat above, there is no <v8f16, v4f16>
	// instantiation here - confirm whether that omission is intentional.
	def : ConcatUndefPat<v2i64, v1i64>;
	def : ConcatUndefPat<v2f64, v1f64>;
	def : ConcatUndefPat<v4i32, v2i32>;
	def : ConcatUndefPat<v4f32, v2f32>;
	def : ConcatUndefPat<v8i16, v4i16>;
	def : ConcatUndefPat<v16i8, v8i8>;

	//----------------------------------------------------------------------------
	// AdvSIMD across lanes instructions
	//----------------------------------------------------------------------------

	// Across-lanes instruction definitions; the multiclasses are defined outside
	// this excerpt. The integer forms pass a 1-bit field, a 5-bit opcode field
	// and the mnemonic. The FP forms pass the 5-bit field first, then a 1-bit
	// field, the mnemonic and the intrinsic they implement.
	defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
	defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
	defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
	defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
	defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
	defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
	defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
	defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
	defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
	defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
	defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;

	// Patterns for across-vector intrinsics, that have a node equivalent, that
	// returns a vector (with only the low lane defined) instead of a scalar.
	// In effect, opNode is the same as (scalar_to_vector (IntNode)).
	// Patterns shared by the signed and unsigned across-lanes reductions.
	//   baseOpc - instruction name prefix (e.g. "ADDV"); the concrete instruction
	//             is looked up as baseOpc # "v8i8v", "v16i8v", "v4i16v", "v8i16v"
	//             or "v4i32v".
	//   opNode  - DAG node that produces the reduction as a vector.
	multiclass SIMDAcrossLanesIntrinsic<string baseOpc,
	SDPatternOperator opNode> {
	// If a lane instruction caught the vector_extract around opNode, we can
	// directly match the latter to the instruction.
	// The scalar result is inserted into the element-sized subregister
	// (bsub/hsub/ssub) of an undefined vector of the result type.
	def : Pat<(v8i8 (opNode V64:$Rn)),
	(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>;
	def : Pat<(v16i8 (opNode V128:$Rn)),
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>;
	def : Pat<(v4i16 (opNode V64:$Rn)),
	(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>;
	def : Pat<(v8i16 (opNode V128:$Rn)),
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>;
	def : Pat<(v4i32 (opNode V128:$Rn)),
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>;


	// If none did, fallback to the explicit patterns, consuming the vector_extract.
	// For the 64-bit source types the DAG wraps opNode in
	// (insert_subvector undef, ..., 0) before element 0 is extracted. The i32
	// result is read through ssub.
	def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
	(i32 0)), (i64 0))),
	(EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn),
	bsub), ssub)>;
	def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))),
	(EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn),
	bsub), ssub)>;
	def : Pat<(i32 (vector_extract (insert_subvector undef,
	(v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
	(EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn),
	hsub), ssub)>;
	def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))),
	(EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn),
	hsub), ssub)>;
	def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))),
	(EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn),
	ssub), ssub)>;

	}

	// Signed across-lanes reductions: inherits every pattern from
	// SIMDAcrossLanesIntrinsic and adds patterns that fold a following
	// sext_inreg (from i8 or i16) into an SMOV of lane 0.
	//   baseOpc - instruction name prefix used to look up the instruction.
	//   opNode  - DAG node that produces the reduction as a vector.
	multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
	SDPatternOperator opNode>
	: SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
	// If there is a sign extension after this intrinsic, consume it as smov already
	// performed it
	// Byte-element sources: result placed in bsub, read back with SMOVvi8to32.
	def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
	(opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
	(i32 (SMOVvi8to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
	(i64 0)))>;
	def : Pat<(i32 (sext_inreg (i32 (vector_extract
	(opNode (v16i8 V128:$Rn)), (i64 0))), i8)),
	(i32 (SMOVvi8to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
	(i64 0)))>;
	// Halfword-element sources: result placed in hsub, read back with
	// SMOVvi16to32.
	def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
	(opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
	(i32 (SMOVvi16to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
	(i64 0)))>;
	def : Pat<(i32 (sext_inreg (i32 (vector_extract
	(opNode (v8i16 V128:$Rn)), (i64 0))), i16)),
	(i32 (SMOVvi16to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
	(i64 0)))>;
	}

	// Unsigned across-lanes reductions: inherits every pattern from
	// SIMDAcrossLanesIntrinsic and adds patterns that fold a following `and`
	// with a maski8_or_more / maski16_or_more mask (predicates defined outside
	// this excerpt) into the plain subregister extraction.
	//   baseOpc - instruction name prefix used to look up the instruction.
	//   opNode  - DAG node that produces the reduction as a vector.
	multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
	SDPatternOperator opNode>
	: SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
	// If there is a masking operation keeping only what has been actually
	// generated, consume it.
	// Byte-element sources (result in bsub).
	def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
	(opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
	ssub))>;
	def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))),
	maski8_or_more)),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
	ssub))>;
	// Halfword-element sources (result in hsub).
	def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
	(opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
	ssub))>;
	def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
	maski16_or_more)),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
	ssub))>;
	}

	// Instantiate the signed/unsigned pattern sets for each reduction. The
	// multiclasses do not cover v2i32, so each reduction adds a separate v2i32
	// pattern that uses the pairwise instruction with $Rn as both operands.
	defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>;
	// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
	def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))),
	(ADDPv2i32 V64:$Rn, V64:$Rn)>;

	defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>;
	// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
	def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))),
	(ADDPv2i32 V64:$Rn, V64:$Rn)>;

	defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>;
	def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))),
	(SMAXPv2i32 V64:$Rn, V64:$Rn)>;

	defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>;
	def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))),
	(SMINPv2i32 V64:$Rn, V64:$Rn)>;

	defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>;
	def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))),
	(UMAXPv2i32 V64:$Rn, V64:$Rn)>;

	defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>;
	def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))),
	(UMINPv2i32 V64:$Rn, V64:$Rn)>;

	// Patterns for a signed widening across-lanes intrinsic.
	//   baseOpc - instruction name prefix (e.g. "SADDLV").
	//   intOp   - intrinsic that returns the scalar result.
	multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
	// Byte sources: the instruction result is placed in hsub and read back as
	// i32 with SMOVvi16to32 on lane 0.
	def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
	(i32 (SMOVvi16to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
	(i64 0)))>;
	def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
	(i32 (SMOVvi16to32
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
	(i64 0)))>;

	// Halfword sources: the result is placed in ssub and read back through ssub.
	def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
	ssub))>;
	def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
	ssub))>;

	// Word source: the i64 result is placed in dsub and read back through dsub.
	def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
	dsub))>;
	}

	// Patterns for an unsigned widening across-lanes intrinsic.
	//   baseOpc - instruction name prefix (e.g. "UADDLV").
	//   intOp   - intrinsic that returns the scalar result.
	// Unlike the signed variant, the byte-source forms read the result back with
	// a plain EXTRACT_SUBREG through ssub rather than an SMOV.
	multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
	Intrinsic intOp> {
	// Byte sources: result placed in hsub, read back through ssub.
	def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
	ssub))>;
	def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
	ssub))>;

	// Halfword sources: result placed in ssub, read back through ssub.
	def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
	ssub))>;
	def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
	(i32 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
	ssub))>;

	// Word source: i64 result placed in dsub, read back through dsub.
	def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
	dsub))>;
	}

	// Instantiate the widening-reduction patterns for SADDLV and UADDLV.
	defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
	defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;

	// The v2i32 source is not covered by the multiclasses above. It selects the
	// pairwise-long instruction, whose result is placed in dsub and read back
	// through dsub as i64.
	// The vaddlv_s32 intrinsic gets mapped to SADDLP.
	def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(SADDLPv2i32_v1i64 V64:$Rn), dsub),
	dsub))>;
	// The vaddlv_u32 intrinsic gets mapped to UADDLP.
	def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(UADDLPv2i32_v1i64 V64:$Rn), dsub),
	dsub))>;

	//------------------------------------------------------------------------------
	// AdvSIMD modified immediate instructions
	//------------------------------------------------------------------------------

	// AdvSIMD BIC
	defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
	// AdvSIMD ORR
	defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;

	// Assembly aliases that take only the register and an imm0_255 immediate;
	// the third instruction operand is fixed to 0.
	// NOTE(review): that operand is presumably the shift amount - confirm
	// against SIMDModifiedImmVectorShiftTied.
	// Each alias is provided in two spellings: arrangement on the register
	// ("bic $Vd.4h, ...") and arrangement on the mnemonic ("bic.4h $Vd, ...").
	def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	// AdvSIMD FMOV
	// Vector FMOV-immediate definitions. Each one uses the fpimm8 operand, the
	// "fmov" mnemonic and a pattern that sets the destination from
	// (AArch64fmov imm0_255:$imm8). The leading three bit arguments and the
	// register class vary per arrangement.
	def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
	"fmov", ".2d",
	[(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
	"fmov", ".2s",
	[(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
	"fmov", ".4s",
	[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	// The half-precision arrangements are only available when both HasNEON and
	// HasFullFP16 hold.
	let Predicates = [HasNEON, HasFullFP16] in {
	def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
	"fmov", ".4h",
	[(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
	"fmov", ".8h",
	[(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	} // Predicates = [HasNEON, HasFullFP16]

	// AdvSIMD MOVI

	// EDIT byte mask: scalar
	// Scalar (FPR64) MOVI, flagged rematerializable and as cheap as a move.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
	[(set FPR64:$Rd, simdimmtype10:$imm8)]>;
	// The movi_edit node has the immediate value already encoded, so we use
	// a plain imm0_255 here.
	def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
	(MOVID imm0_255:$shift)>;

	// EDIT byte mask: 2d

	// The movi_edit node has the immediate value already encoded, so we use
	// a plain imm0_255 in the pattern
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
	simdimmtype10,
	"movi", ".2d",
	[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;

	def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;

	def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;

	// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the
	// extract is free and this gives better MachineCSE results.
	def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
	def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
	def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
	def : Pat<(v8i8 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;

	def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
	def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
	def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
	def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;

	// EDIT per word & halfword: 2s, 4h, 4s, & 8h
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;

	def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;

	let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
	// EDIT per word: 2s & 4s with MSL shifter
	def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
	[(set (v2i32 V64:$Rd),
	(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
	def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
	[(set (v4i32 V128:$Rd),
	(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;

	// Per byte: 8b & 16b
	def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
	"movi", ".8b",
	[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;

	def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
	"movi", ".16b",
	[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
	}

	// AdvSIMD MVNI

	// EDIT per word & halfword: 2s, 4h, 4s, & 8h
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;

	def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;

	// EDIT per word: 2s & 4s with MSL shifter
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
	def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
	[(set (v2i32 V64:$Rd),
	(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
	def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
	[(set (v4i32 V128:$Rd),
	(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
	}

	//----------------------------------------------------------------------------
	// AdvSIMD indexed element
	//----------------------------------------------------------------------------

	let hasSideEffects = 0 in {
	defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
	defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
	}

	// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
	// instruction expects the addend first, while the intrinsic expects it last.

	// On the other hand, there are quite a few valid combinatorial options due to
	// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
	defm : SIMDFPIndexedTiedPatterns<"FMLA",
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
	defm : SIMDFPIndexedTiedPatterns<"FMLA",
	TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;

	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;

	multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
	  // Patterns that select the indexed FMLS instructions when the fneg has been
	  // applied to the value feeding the lane duplicate / lane extract, i.e. the
	  // negation sits underneath the dup or extract. OpNode is always applied as
	  // (OpNode $Rd, $Rn, <dup-or-extract of the negated $Rm>).

	  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
	  // and DUP scalar.
	  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
	                                             VectorIndexS:$idx))),
	            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
	  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	                           (v2f32 (AArch64duplane32
	                                    (v4f32 (insert_subvector undef,
	                                                             (v2f32 (fneg V64:$Rm)),
	                                                             (i32 0))),
	                                    VectorIndexS:$idx)))),
	            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
	                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
	                               VectorIndexS:$idx)>;
	  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
	            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
	                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;

	  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
	  // and DUP scalar.
	  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
	                                             VectorIndexS:$idx))),
	            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
	                               VectorIndexS:$idx)>;
	  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	                           (v4f32 (AArch64duplane32
	                                    (v4f32 (insert_subvector undef,
	                                                             (v2f32 (fneg V64:$Rm)),
	                                                             (i32 0))),
	                                    VectorIndexS:$idx)))),
	            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
	                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
	                               VectorIndexS:$idx)>;
	  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
	            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
	                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;

	  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
	  // (DUPLANE from 64-bit would be trivial).
	  // The emitted index operand is VectorIndexD, the same operand the source
	  // pattern matches.
	  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
	                           (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
	                                             VectorIndexD:$idx))),
	            (FMLSv2i64_indexed
	                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
	  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
	                           (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
	            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
	                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;

	  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
	  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
	                         (vector_extract (v4f32 (fneg V128:$Rm)),
	                                         VectorIndexS:$idx))),
	            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
	                V128:$Rm, VectorIndexS:$idx)>;
	  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
	                         (vector_extract (v4f32 (insert_subvector undef,
	                                                                  (v2f32 (fneg V64:$Rm)),
	                                                                  (i32 0))),
	                                         VectorIndexS:$idx))),
	            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
	                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;

	  // 1 variant for 64-bit scalar version: extract from .2d. The lane index is
	  // a VectorIndexD, the same index operand the .2d DUPLANE variant above
	  // uses for a v2f64 source.
	  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
	                         (vector_extract (v2f64 (fneg V128:$Rm)),
	                                         VectorIndexD:$idx))),
	            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
	                V128:$Rm, VectorIndexD:$idx)>;
	}

	defm : FMLSIndexedAfterNegPatterns<
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
	defm : FMLSIndexedAfterNegPatterns<
	TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;

	defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
	defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;

	// fmul by a scalar that has been splatted with AArch64dup: select the indexed
	// FMUL with lane 0, placing the scalar in the ssub/dsub subregister of an
	// otherwise undefined vector value.
	// NOTE(review): the undefined container is typed v4i32 in all three patterns,
	// including the f64 one - presumably only its width matters; confirm.
	def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
	          (FMULv2i32_indexed
	              V64:$Rn,
	              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
	              (i64 0))>;
	def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
	          (FMULv4i32_indexed
	              V128:$Rn,
	              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
	              (i64 0))>;
	def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
	          (FMULv2i64_indexed
	              V128:$Rn,
	              (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
	              (i64 0))>;

	defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
	defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
	TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
	defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
	TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
	defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
	defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
	int_aarch64_neon_smull>;
	defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
	int_aarch64_neon_sqadd>;
	defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
	int_aarch64_neon_sqsub>;
	defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
	int_aarch64_neon_sqadd>;
	defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
	int_aarch64_neon_sqsub>;
	defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
	defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
	int_aarch64_neon_umull>;

	// A scalar sqdmull with the second operand being a vector lane can be
	// handled directly with the indexed instruction encoding.
	def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(vector_extract (v4i32 V128:$Vm),
	VectorIndexS:$idx)),
	(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;

	//----------------------------------------------------------------------------
	// AdvSIMD scalar shift instructions
	//----------------------------------------------------------------------------
	defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
	defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
	defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
	defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
	// Codegen patterns for the above. We don't put these directly on the
	// instructions because TableGen's type inference can't handle the truth.
	// Having the same base pattern for fp <--> int totally freaks it out.
	def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
	(FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
	(FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
	(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
	(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
	(UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
	(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
	(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
	(SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;

	// Patterns for FP16 intrinsics - these require a subregister copy to/from a
	// wider register, as i16 is not supported.

	def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
	(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
	(SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
	(SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
	(and FPR32:$Rn, (i32 65535)),
	vecshiftR16:$imm)),
	(UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
	(UCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
	(UCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>;
	def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
	(i32 (INSERT_SUBREG
	(i32 (IMPLICIT_DEF)),
	(FCVTZSh FPR16:$Rn, vecshiftR32:$imm),
	hsub))>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)),
	(i64 (INSERT_SUBREG
	(i64 (IMPLICIT_DEF)),
	(FCVTZSh FPR16:$Rn, vecshiftR64:$imm),
	hsub))>;
	def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR32:$imm)),
	(i32 (INSERT_SUBREG
	(i32 (IMPLICIT_DEF)),
	(FCVTZUh FPR16:$Rn, vecshiftR32:$imm),
	hsub))>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
	(i64 (INSERT_SUBREG
	(i64 (IMPLICIT_DEF)),
	(FCVTZUh FPR16:$Rn, vecshiftR64:$imm),
	hsub))>;
	def : Pat<(i32 (int_aarch64_neon_facge (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
	(i32 (INSERT_SUBREG
	(i32 (IMPLICIT_DEF)),
	(FACGE16 FPR16:$Rn, FPR16:$Rm),
	hsub))>;
	def : Pat<(i32 (int_aarch64_neon_facgt (f16 FPR16:$Rn), (f16 FPR16:$Rm))),
	(i32 (INSERT_SUBREG
	(i32 (IMPLICIT_DEF)),
	(FACGT16 FPR16:$Rn, FPR16:$Rm),
	hsub))>;

	defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
	defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
	defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
	int_aarch64_neon_sqrshrn>;
	defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
	int_aarch64_neon_sqrshrun>;
	defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
	defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
	defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
	int_aarch64_neon_sqshrn>;
	defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
	int_aarch64_neon_sqshrun>;
	defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">;
	defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>;
	defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
	TriOpFrag<(add node:$LHS,
	(AArch64srshri node:$MHS, node:$RHS))>>;
	defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>;
	defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
	TriOpFrag<(add node:$LHS,
	(AArch64vashr node:$MHS, node:$RHS))>>;
	defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
	int_aarch64_neon_uqrshrn>;
	defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
	defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
	int_aarch64_neon_uqshrn>;
	defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>;
	defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
	TriOpFrag<(add node:$LHS,
	(AArch64urshri node:$MHS, node:$RHS))>>;
	defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>;
	defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
	TriOpFrag<(add node:$LHS,
	(AArch64vlshr node:$MHS, node:$RHS))>>;

	//----------------------------------------------------------------------------
	// AdvSIMD vector shift instructions
	//----------------------------------------------------------------------------
	defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
	defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
	defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
	int_aarch64_neon_vcvtfxs2fp>;
	defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
	int_aarch64_neon_rshrn>;
	defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
	defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
	BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
	defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
	def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
	(i32 vecshiftL64:$imm))),
	(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
	defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
	int_aarch64_neon_sqrshrn>;
	defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
	int_aarch64_neon_sqrshrun>;
	defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
	defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
	defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
	int_aarch64_neon_sqshrn>;
	defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
	int_aarch64_neon_sqshrun>;
	defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
	def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
	(i32 vecshiftR64:$imm))),
	(SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
	defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
	defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
	TriOpFrag<(add node:$LHS,
	(AArch64srshri node:$MHS, node:$RHS))> >;
	defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
	BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;

	defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
	defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
	TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
	defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
	int_aarch64_neon_vcvtfxu2fp>;
	defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
	int_aarch64_neon_uqrshrn>;
	defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
	defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
	int_aarch64_neon_uqshrn>;
	defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
	defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
	TriOpFrag<(add node:$LHS,
	(AArch64urshri node:$MHS, node:$RHS))> >;
	defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
	BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
	defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
	defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
	TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;

	// SHRN patterns for when a logical right shift was used instead of arithmetic
	// (the immediate guarantees no sign bits actually end up in the result so it
	// doesn't matter).
	def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
	(SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
	def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
	(SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
	def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
	(SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;

	// Same logical-shift substitution for the forms whose narrowed result is
	// concatenated after an existing low half: $Rd is inserted into the dsub
	// subregister of an undefined value, which is passed as the instruction's
	// first operand. Each output reuses the shift operand its source pattern
	// matched.
	def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
	                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
	                                                      vecshiftR16Narrow:$imm)))),
	          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	                           V128:$Rn, vecshiftR16Narrow:$imm)>;
	def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
	                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
	                                                      vecshiftR32Narrow:$imm)))),
	          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	                           V128:$Rn, vecshiftR32Narrow:$imm)>;
	def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
	                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
	                                                      vecshiftR64Narrow:$imm)))),
	          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	                           V128:$Rn, vecshiftR64Narrow:$imm)>;

	// Vector sign and zero extensions are implemented with SSHLL and USHLL.
	// Anyexts are implemented as zexts.
	def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
	// Also match an extend from the upper half of a 128 bit source register.
	def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(USHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(USHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(SSHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(USHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(USHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(SSHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(USHLLv4i32_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(USHLLv4i32_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(SSHLLv4i32_shift V128:$Rn, (i32 0))>;

	// Vector shift sxtl aliases
	def : InstAlias<"sxtl.8h $dst, $src1",
	(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.8h, $src1.8b",
	(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl.4s $dst, $src1",
	(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.4s, $src1.4h",
	(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl.2d $dst, $src1",
	(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.2d, $src1.2s",
	(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;

	// Vector shift sxtl2 aliases
	def : InstAlias<"sxtl2.8h $dst, $src1",
	(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
	(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2.4s $dst, $src1",
	(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
	(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2.2d $dst, $src1",
	(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
	(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;

	// Vector shift uxtl aliases
	def : InstAlias<"uxtl.8h $dst, $src1",
	(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.8h, $src1.8b",
	(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl.4s $dst, $src1",
	(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.4s, $src1.4h",
	(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl.2d $dst, $src1",
	(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.2d, $src1.2s",
	(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;

	// Vector shift uxtl2 aliases
	def : InstAlias<"uxtl2.8h $dst, $src1",
	(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
	(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2.4s $dst, $src1",
	(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
	(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2.2d $dst, $src1",
	(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
	(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;

	// If an integer is about to be converted to a floating point value,
	// just load it on the floating point unit.
	// These patterns are more complex because floating point loads do not
	// support sign extension.
	// The sign extension has to be explicitly added and is only supported for
	// one step: byte-to-half, half-to-word, word-to-doubleword.
	// SCVTF GPR -> FPR is 9 cycles.
	// SCVTF FPR -> FPR is 4 cycles.
	// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
	// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
	// and still be faster.
	// However, this is not good for code size.
	// 8-bits -> float. 2 sizes step-up.
	// Matches an f32 sint_to_fp of an i32 sextloadi8 from addrmode.
	// Reading the result dag inside-out: INST is inserted into the bsub
	// subregister of an undefined f64, widened with SSHLLv8i8_shift #0, the dsub
	// of that is widened again with SSHLLv4i16_shift #0, and the ssub of the
	// result is converted by SCVTFv1i32.
	// Only enabled under NotForCodeSize and UseAlternateSExtLoadCVTF32.
	class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
	  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
	        (SCVTFv1i32
	          (f32 (EXTRACT_SUBREG
	            (SSHLLv4i16_shift
	              (f64 (EXTRACT_SUBREG
	                (SSHLLv8i8_shift
	                  (INSERT_SUBREG (f64 (IMPLICIT_DEF)), INST, bsub),
	                  0),
	                dsub)),
	              0),
	            ssub)))>,
	    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

	def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
	(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
	def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
	(LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
	def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
	(LDURBi GPR64sp:$Rn, simm9:$offset)>;

	// 16-bits -> float. 1 size step-up.
	// Matches an f32 sint_to_fp of an i32 sextloadi16 from addrmode.
	// INST is inserted into the hsub subregister of an undefined f64, widened
	// once with SSHLLv4i16_shift #0, and the ssub of the result is converted by
	// SCVTFv1i32. Only enabled under NotForCodeSize.
	class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
	  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
	        (SCVTFv1i32
	          (f32 (EXTRACT_SUBREG
	            (SSHLLv4i16_shift
	              (INSERT_SUBREG (f64 (IMPLICIT_DEF)), INST, hsub),
	              0),
	            ssub)))>,
	    Requires<[NotForCodeSize]>;

	def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
	(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
	def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
	(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
	def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
	(LDURHi GPR64sp:$Rn, simm9:$offset)>;

	// 32-bits to 32-bits are handled in target specific dag combine:
	// performIntToFpCombine.
	// 64-bits integer to 32-bits floating point, not possible with
	// SCVTF on floating point registers (both source and destination
	// must have the same size).

	// Here are the patterns for 8, 16, 32, and 64-bits to double.
	// 8-bits -> double. 3 size step-up: give up.
	// 16-bits -> double. 2 size step.
	// Matches an f64 sint_to_fp of an i32 sextloadi16 from addrmode.
	// INST is inserted into the hsub subregister of an undefined f64, widened
	// with SSHLLv4i16_shift #0, the dsub of that is widened again with
	// SSHLLv2i32_shift #0, and the dsub of the result is converted by SCVTFv1i64.
	// Only enabled under NotForCodeSize and UseAlternateSExtLoadCVTF32.
	class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
	  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
	         (SCVTFv1i64
	           (f64 (EXTRACT_SUBREG
	             (SSHLLv2i32_shift
	               (f64 (EXTRACT_SUBREG
	                 (SSHLLv4i16_shift
	                   (INSERT_SUBREG (f64 (IMPLICIT_DEF)), INST, hsub),
	                   0),
	                 dsub)),
	               0),
	             dsub)))>,
	    Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

	def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
	(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
	def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
	(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
	def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
	(LDURHi GPR64sp:$Rn, simm9:$offset)>;
	// 32-bits -> double. 1 size step-up.
	// Matches an f64 sint_to_fp of a plain i32 load from addrmode.
	// INST is inserted into the ssub subregister of an undefined f64, widened
	// once with SSHLLv2i32_shift #0, and the dsub of the result is converted by
	// SCVTFv1i64. Only enabled under NotForCodeSize.
	class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
	  : Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
	         (SCVTFv1i64
	           (f64 (EXTRACT_SUBREG
	             (SSHLLv2i32_shift
	               (INSERT_SUBREG (f64 (IMPLICIT_DEF)), INST, ssub),
	               0),
	             dsub)))>,
	    Requires<[NotForCodeSize]>;

	def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
	(LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
	def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
	(LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
	def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
	def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
	(LDURSi GPR64sp:$Rn, simm9:$offset)>;

	// 64-bits -> double are handled in target specific dag combine:
	// performIntToFpCombine.


	//----------------------------------------------------------------------------
	// AdvSIMD Load-Store Structure
	//----------------------------------------------------------------------------
	defm LD1 : SIMDLd1Multiple<"ld1">;
	defm LD2 : SIMDLd2Multiple<"ld2">;
	defm LD3 : SIMDLd3Multiple<"ld3">;
	defm LD4 : SIMDLd4Multiple<"ld4">;

	defm ST1 : SIMDSt1Multiple<"st1">;
	defm ST2 : SIMDSt2Multiple<"st2">;
	defm ST3 : SIMDSt3Multiple<"st3">;
	defm ST4 : SIMDSt4Multiple<"st4">;

	// Selects INST for a plain load of a value of type ty whose address is the
	// bare base register $Rn.
	class Ld1Pat<ValueType ty, Instruction INST>
	  : Pat<(ty (load GPR64sp:$Rn)),
	        (INST GPR64sp:$Rn)>;

	def : Ld1Pat<v16i8, LD1Onev16b>;
	def : Ld1Pat<v8i16, LD1Onev8h>;
	def : Ld1Pat<v4i32, LD1Onev4s>;
	def : Ld1Pat<v2i64, LD1Onev2d>;
	def : Ld1Pat<v8i8, LD1Onev8b>;
	def : Ld1Pat<v4i16, LD1Onev4h>;
	def : Ld1Pat<v2i32, LD1Onev2s>;
	def : Ld1Pat<v1i64, LD1Onev1d>;

	// Selects a plain `store` of a vector of type `ty` to an address held in a
	// bare GPR64sp register to INST, passing the value and then the address.
	//   ty   - vector value type being stored.
	//   INST - instruction to select.
	class St1Pat<ValueType ty, Instruction INST>
	: Pat<(store ty:$Vt, GPR64sp:$Rn),
	(INST ty:$Vt, GPR64sp:$Rn)>;

	// 128-bit vector types.
	def : St1Pat<v16i8, ST1Onev16b>;
	def : St1Pat<v8i16, ST1Onev8h>;
	def : St1Pat<v4i32, ST1Onev4s>;
	def : St1Pat<v2i64, ST1Onev2d>;
	// 64-bit vector types.
	def : St1Pat<v8i8, ST1Onev8b>;
	def : St1Pat<v4i16, ST1Onev4h>;
	def : St1Pat<v2i32, ST1Onev2s>;
	def : St1Pat<v1i64, ST1Onev1d>;

	//---
	// Single-element
	//---

	// LD1R..LD4R, expanded from SIMDLdR. The four trailing integers grow with
	// the register count: they are 1x, 2x, 3x and 4x of (1, 2, 4, 8).
	// NOTE(review): these look like per-element-size byte counts for the
	// post-indexed forms - confirm against the SIMDLdR definition.
	defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
	defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
	defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
	defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
	// LD1..LD4 records from the SIMDLdSingle{B,H,S,D}Tied multiclasses, one per
	// element size. The `let` applies mayLoad = 1 and hasSideEffects = 0 to
	// every record defined inside the braces. The GPR64pi<N> operand suffix
	// equals (register count) x (element size in bytes) on every line.
	let mayLoad = 1, hasSideEffects = 0 in {
	defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
	defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
	defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
	defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>;
	defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>;
	defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>;
	defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>;
	defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>;
	defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>;
	defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>;
	defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
	defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
	defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>;
	defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>;
	defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>;
	defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>;
	}

	// An AArch64dup of a scalar loaded from a bare GPR64sp address selects the
	// LD1R variant whose arrangement matches the result vector type. The 8- and
	// 16-bit integer cases match an extloadi8/extloadi16 that yields an i32;
	// the 32- and 64-bit cases match a plain load of i32/i64.
	def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
	(LD1Rv8b GPR64sp:$Rn)>;
	def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
	(LD1Rv16b GPR64sp:$Rn)>;
	def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
	(LD1Rv4h GPR64sp:$Rn)>;
	def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
	(LD1Rv8h GPR64sp:$Rn)>;
	def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
	(LD1Rv2s GPR64sp:$Rn)>;
	def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
	(LD1Rv4s GPR64sp:$Rn)>;
	def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
	(LD1Rv2d GPR64sp:$Rn)>;
	def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
	(LD1Rv1d GPR64sp:$Rn)>;
	// Grab the floating point version too
	// (f32/f64/f16 element types reuse the LD1R instruction of the integer
	// type with the same element width.)
	def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
	(LD1Rv2s GPR64sp:$Rn)>;
	def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
	(LD1Rv4s GPR64sp:$Rn)>;
	def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
	(LD1Rv2d GPR64sp:$Rn)>;
	def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
	(LD1Rv1d GPR64sp:$Rn)>;
	def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
	(LD1Rv4h GPR64sp:$Rn)>;
	def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
	(LD1Rv8h GPR64sp:$Rn)>;

	// Selects a vector_insert of a freshly loaded scalar into lane $idx of a
	// 128-bit vector to a single LD1 instruction taking ($Rd, $idx, $Rn).
	//   scalar_load - load operator to match (load / extloadi8 / extloadi16).
	//   VecIndex    - operand class for the lane index.
	//   VTy         - 128-bit vector type being inserted into.
	//   STy         - scalar type produced by scalar_load.
	//   LD1         - instruction to select.
	class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction LD1>
	: Pat<(vector_insert (VTy VecListOne128:$Rd),
	(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
	(LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;

	// Floating-point vector types use the LD1 instruction of the integer type
	// with the same element width (f32 -> LD1i32, f64 -> LD1i64, f16 -> LD1i16).
	def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
	def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
	def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
	def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
	def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
	def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
	def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;

	// 64-bit counterpart of the lane-insert-from-load pattern. The 64-bit
	// source vector is wrapped with SUBREG_TO_REG (as dsub), the LD1 lane
	// instruction is applied to that, and the dsub subregister of the result
	// is extracted again. Template parameters have the same meaning as in
	// Ld1Lane128Pat, except that VTy is a 64-bit vector type.
	class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction LD1>
	: Pat<(vector_insert (VTy VecListOne64:$Rd),
	(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
	(EXTRACT_SUBREG
	(LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
	VecIndex:$idx, GPR64sp:$Rn),
	dsub)>;

	def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
	def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
	def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
	def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
	def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;


	// Additional LD1..LD4 records expanded from the SIMDLdSt<N>SingleAliases
	// multiclasses (defined outside this section).
	defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
	defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
	defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
	defm LD4 : SIMDLdSt4SingleAliases<"ld4">;

	// Stores
	// ST1 records for B/H/S/D element sizes. The template arguments parallel
	// those of the LD1 SIMDLdSingle*Tied lines earlier in this section.
	defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>;
	defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
	defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
	defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;

	// Selects a store of one lane extracted from a 128-bit vector to a single
	// ST1 instruction taking ($Vt, $idx, $Rn). The pattern carries
	// AddedComplexity = 19.
	//   scalar_store - store operator to match (store / truncstorei8 / ...).
	//   VecIndex     - operand class for the lane index.
	//   VTy          - 128-bit vector type the lane is extracted from.
	//   STy          - scalar type produced by vector_extract.
	//   ST1          - instruction to select.
	let AddedComplexity = 19 in
	class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1>
	: Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;

	def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, ST1i8>;
	def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
	def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
	def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
	def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
	def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
	def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;

	// 64-bit counterpart of the lane-store pattern: the 64-bit source vector is
	// wrapped with SUBREG_TO_REG (as dsub) before being handed to the ST1 lane
	// instruction. Also carries AddedComplexity = 19. Template parameters:
	// store operator, lane-index operand class, 64-bit vector type, extracted
	// scalar type, and the instruction to select.
	let AddedComplexity = 19 in
	class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1>
	: Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn)>;

	def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
	def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
	def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
	def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
	def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;

	// Post-indexed store of one lane extracted from a 64-bit vector. Emits two
	// anonymous patterns per instantiation:
	//   1. the store's third operand is the literal `offset`: the selected
	//      instruction receives XZR as its final operand;
	//   2. the store's third operand is a GPR64 register: that register ($Rm)
	//      is passed through as the final operand.
	// In both cases the source vector is wrapped with SUBREG_TO_REG (as dsub).
	// NOTE(review): XZR presumably selects the immediate post-increment
	// encoding of the *_POST instruction - confirm against its definition.
	//   scalar_store - post-indexed store operator to match.
	//   VecIndex     - operand class for the lane index.
	//   VTy / STy    - 64-bit vector type and extracted scalar type.
	//   ST1          - instruction to select.
	//   offset       - literal increment matched by the first pattern.
	multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1,
	int offset> {
	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, offset),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn, XZR)>;

	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, GPR64:$Rm),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
	}

	// The final argument on each line equals the element size in bytes.
	defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
	defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
	2>;
	defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
	defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
	defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
	defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
	defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;

	// Post-indexed store of one lane extracted from a 128-bit vector. Emits two
	// anonymous patterns per instantiation: one matching the literal `offset`
	// as the store's third operand (the instruction then receives XZR as its
	// final operand), and one matching a GPR64 register there (passed through
	// as $Rm). The 128-bit source vector is used directly.
	// NOTE(review): XZR presumably selects the immediate post-increment
	// encoding of the *_POST instruction - confirm against its definition.
	//   scalar_store - post-indexed store operator to match.
	//   VecIndex     - operand class for the lane index.
	//   VTy / STy    - 128-bit vector type and extracted scalar type.
	//   ST1          - instruction to select.
	//   offset       - literal increment matched by the first pattern.
	multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1,
	int offset> {
	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, offset),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;

	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, GPR64:$Rm),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
	}

	// The final argument on each line equals the element size in bytes.
	defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
	1>;
	defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
	2>;
	defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
	defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
	defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
	defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
	defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;

	// ST2..ST4 records for B/H/S/D element sizes. The `let` applies
	// mayStore = 1 and hasSideEffects = 0 to every record inside the braces.
	// As with the loads, the GPR64pi<N> suffix equals (register count) x
	// (element size in bytes).
	let mayStore = 1, hasSideEffects = 0 in {
	defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
	defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
	defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
	defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
	defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
	defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
	defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
	defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
	defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
	defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
	defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
	defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
	}

	// Additional ST1..ST4 records expanded from the same
	// SIMDLdSt<N>SingleAliases multiclasses that the loads use.
	defm ST1 : SIMDLdSt1SingleAliases<"st1">;
	defm ST2 : SIMDLdSt2SingleAliases<"st2">;
	defm ST3 : SIMDLdSt3SingleAliases<"st3">;
	defm ST4 : SIMDLdSt4SingleAliases<"st4">;

	//----------------------------------------------------------------------------
	// Crypto extensions
	//----------------------------------------------------------------------------

	// AES instructions, all gated on the HasAES predicate. AESE/AESD are built
	// from AESTiedInst and AESMC/AESIMC from AESInst; each is bound to the
	// int_aarch64_crypto_* intrinsic of the same name.
	let Predicates = [HasAES] in {
	def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
	def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
	def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
	def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
	}

	// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
	// for AES fusion on some CPUs.
	// Both pseudos take one V128 input and produce one V128 output, carry the
	// constraint "$Rn = $Rd", have an empty selection pattern list, and use the
	// WriteV scheduling class. They are marked as having no side effects and
	// neither loading nor storing.
	let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
	def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
	Sched<[WriteV]>;
	def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
	Sched<[WriteV]>;
	}

	// Only use constrained versions of AES(I)MC instructions if they are paired with
	// AESE/AESD.
	// Both patterns below require the HasFuseAES predicate.
	// aesmc(aese(src1, src2)) -> AESMCrrTied(AESErr(src1, src2))
	def : Pat<(v16i8 (int_aarch64_crypto_aesmc
	(v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
	(v16i8 V128:$src2))))),
	(v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
	(v16i8 V128:$src2)))))>,
	Requires<[HasFuseAES]>;

	// aesimc(aesd(src1, src2)) -> AESIMCrrTied(AESDrr(src1, src2))
	def : Pat<(v16i8 (int_aarch64_crypto_aesimc
	(v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
	(v16i8 V128:$src2))))),
	(v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
	(v16i8 V128:$src2)))))>,
	Requires<[HasFuseAES]>;

	// SHA1 and SHA256 instructions, all gated on the HasSHA2 predicate. Each
	// record is bound to the int_aarch64_crypto_* intrinsic of the same name.
	let Predicates = [HasSHA2] in {
	// Records taking a 3-bit first template argument (SHATiedInstQSV/VVV/QQV).
	def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
	def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
	def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
	def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
	def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
	def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
	def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;

	// Records taking a 4-bit first template argument (SHAInstSS/SHATiedInstVV).
	def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>;
	def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>;
	def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
	}

	//----------------------------------------------------------------------------
	// Compiler-pseudos
	//----------------------------------------------------------------------------
	// FIXME: Like for X86, these should go in their own separate .td file.

	// Pattern leaf matching an i32 GPR32 value for which the C++ predicate
	// isDef32(*N) returns true (isDef32 is defined outside this file section).
	def def32 : PatLeaf<(i32 GPR32:$src), [{
	return isDef32(*N);
	}]>;

	// In the case of a 32-bit def that is known to implicitly zero-extend,
	// we can use a SUBREG_TO_REG.
	// No extra instruction is emitted: the value is just placed in sub_32.
	def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;

	// For an anyext, we don't care what the high bits are, so we can perform an
	// INSERT_SUBREG into an IMPLICIT_DEF.
	def : Pat<(i64 (anyext GPR32:$src)),
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;

	// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
	// then assert the extension has happened.
	// The MOV is spelled here as ORRWrs with WZR as the first source operand.
	def : Pat<(i64 (zext GPR32:$src)),
	(SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;

	// To sign extend, we use a signed bitfield move instruction (SBFM) on the
	// containing super-reg.
	// In every pattern below the last SBFM immediate is (source width - 1):
	// 31 for i32, 15 for i16, 7 for i8 and 0 for i1.
	def : Pat<(i64 (sext GPR32:$src)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;

	// A left shift by a constant of a sign-extended value is folded into one
	// SBFM. Both SBFM immediates are derived from the shift amount through the
	// i32shift_* / i64shift_* transforms (defined outside this section); the
	// *_sext_i8 / *_sext_i16 / *_sext_i32 variant is chosen to match the width
	// of the value being sign-extended.

	// Sign-extended from i8.
	def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_sext_i8 imm0_31:$imm)))>;
	def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i8 imm0_63:$imm)))>;

	// Sign-extended from i16.
	def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_sext_i16 imm0_31:$imm)))>;
	def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i16 imm0_63:$imm)))>;

	// Sign-extended from a 32-bit register to i64; the source is first placed
	// in sub_32 of an undefined 64-bit register.
	def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
	(i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i32 imm0_63:$imm)))>;

	// sra patterns have an AddedComplexity of 10, so make sure we have a higher
	// AddedComplexity for the following patterns since we want to match sext + sra
	// patterns before we attempt to match a single sra node.
	let AddedComplexity = 20 in {
	// We support all sext + sra combinations which preserve at least one bit of the
	// original value which is to be sign extended. E.g. we support shifts up to
	// bitwidth-1 bits.
	// In each pattern the shift amount is passed straight through as the first
	// SBFM immediate and the second immediate is (narrow width - 1).

	// Sign-extended from i8: shift amounts 0..7.
	def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
	(SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
	def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
	(SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;

	// Sign-extended from i16: shift amounts 0..15.
	def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
	(SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
	def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
	(SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;

	// Sign-extended from a 32-bit register: shift amounts 0..31.
	def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
	(i64 imm0_31:$imm), 31)>;
	} // AddedComplexity = 20

	// To truncate, we can simply extract from a subregister.
	// (i64 -> i32 truncation reads the sub_32 subregister of the source.)
	def : Pat<(i32 (trunc GPR64sp:$src)),
	(i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;

	// __builtin_trap() uses the BRK instruction on AArch64.
	// trap selects BRK with immediate 1. debugtrap selects BRK with immediate
	// 0xF000, and that pattern is only enabled under the IsWindows predicate.
	def : Pat<(trap), (BRK 1)>;
	def : Pat<(debugtrap), (BRK 0xF000)>, Requires<[IsWindows]>;

	// Multiply high patterns which multiply the lower subvector using smull/umull
	// and the upper subvector with smull2/umull2. Then shuffle the high
	// part of both results together.
	// In each pattern the first multiply operates on the dsub (low 64-bit)
	// subregisters of the inputs, the second on the full 128-bit inputs, and
	// UZP2 combines the two widened products.

	// Signed multiply-high.
	def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)),
	(UZP2v16i8
	(SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
	def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)),
	(UZP2v8i16
	(SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
	def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)),
	(UZP2v4i32
	(SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;

	// Unsigned multiply-high.
	def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)),
	(UZP2v16i8
	(UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>;
	def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)),
	(UZP2v8i16
	(UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>;
	def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
	(UZP2v4i32
	(UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub),
	(EXTRACT_SUBREG V128:$Rm, dsub)),
	(UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;

	// Conversions within AdvSIMD types in the same register size are free.
	// But because we need a consistent lane ordering, in big endian many
	// conversions require one or more REV instructions.
	//
	// Consider a simple memory load followed by a bitconvert then a store.
	// v0 = load v2i32
	// v1 = BITCAST v2i32 v0 to v4i16
	// store v4i16 v1
	//
	// In big endian mode every memory access has an implicit byte swap. LDR and
	// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
	// is, they treat the vector as a sequence of elements to be byte-swapped.
	// The two pairs of instructions are fundamentally incompatible. We've decided
	// to use LD1/ST1 only to simplify compiler implementation.
	//
	// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
	// the original code sequence:
	// v0 = load v2i32
	// v1 = REV v2i32 (implicit)
	// v2 = BITCAST v2i32 v1 to v4i16
	// v3 = REV v4i16 v2 (implicit)
	// store v4i16 v3
	//
	// But this is now broken - the value stored is different to the value loaded
	// due to lane reordering. To fix this, on every BITCAST we must perform two
	// other REVs:
	// v0 = load v2i32
	// v1 = REV v2i32 (implicit)
	// v2 = REV v2i32
	// v3 = BITCAST v2i32 v2 to v4i16
	// v4 = REV v4i16
	// v5 = REV v4i16 v4 (implicit)
	// store v4i16 v5
	//
	// This means an extra two instructions, but actually in most cases the two REV
	// instructions can be combined into one. For example:
	// (REV64_2s (REV64_4h X)) === (REV32_4h X)
	//
	// There is also no 128-bit REV instruction. This must be synthesized with an
	// EXT instruction.
	//
	// Most bitconverts require some sort of conversion. The only exceptions are:
	// a) Identity conversions - vNfX <-> vNiX
	// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
	//

	// Natural vector casts (64 bit)
	// Every AArch64NvCast pattern here selects to the source FPR64 register
	// itself, merely re-typed as the destination type. The patterns are grouped
	// by source type.

	// Source type v2i32.
	def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Source type v4i16.
	def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Source type v8i8.
	def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Source type f64 (scalar).
	def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

	// Source type v2f32.
	def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;

	// Natural vector casts (128 bit)
	// As for the 64-bit casts, every pattern selects to the source FPR128
	// register re-typed as the destination type. Each source type below has
	// patterns for the same seven destination types: v16i8, v8i16, v8f16,
	// v4i32, v4f32, v2i64 and v2f64.

	// Source type v4i32.
	def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type v8i16.
	def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type v16i8.
	def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type v2i64.
	def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type v4f32.
	def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type v2f64.
	def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;

	// Little-endian only: bitconverts between a 64-bit GPR and a 64-bit vector
	// are a bare COPY_TO_REGCLASS in either direction.
	let Predicates = [IsLE] in {
	// GPR64 -> vector (copy into FPR64).
	def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;

	// Vector -> i64 (copy into GPR64).
	def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	}
	// Big-endian only: the same conversions additionally apply a REV64 whose
	// arrangement matches the vector element size (v4f16 uses the v4i16 form
	// and v2f32 the v2i32 form).
	let Predicates = [IsBE] in {
	// GPR64 -> vector: copy into FPR64, then REV64.
	def : Pat<(v8i8 (bitconvert GPR64:$Xn)),
	(REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
	(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
	(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
	(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
	(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;

	// Vector -> i64.
	// NOTE(review): here the REV64 is written around the result of the
	// COPY_TO_REGCLASS into GPR64, i.e. after the copy rather than before it
	// - confirm this operand ordering is intended.
	def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
	(REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
	(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
	(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
	(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
	(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	}
	// Endianness-independent conversions (no predicate): single-lane 64-bit
	// vectors and scalars only need a register-class copy, with no REV.
	def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	// scalar_to_vector into a single-lane vector is likewise just a copy (or,
	// for an f64 source already in FPR64, the register itself).
	def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;

	// Scalar integer <-> floating-point bitconverts of equal width.
	def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
	(COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
	def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
	(COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
	def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
	(COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
	// NOTE(review): an identical v1f64 -> i64 pattern also appears inside the
	// IsLE block earlier in this section; this copy applies without a
	// predicate.
	def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;

	// Bitconverts producing v1i64 from other 64-bit types held in FPR64.
	// Little-endian: the register is simply re-typed.
	let Predicates = [IsLE] in {
	def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
	}
	// Big-endian: a REV64 with the arrangement of the source element size is
	// applied (v4f16 uses the v4i16 form, v2f32 the v2i32 form).
	let Predicates = [IsBE] in {
	def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
	(v1i64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
	(v1i64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
	(v1i64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
	(v1i64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
	(v1i64 (REV64v2i32 FPR64:$src))>;
	}
	// From v1f64 / f64 no REV is used on either endianness.
	def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Bitconverts producing v2i32 from other 64-bit types held in FPR64.
	// Little-endian: the register is simply re-typed.
	let Predicates = [IsLE] in {
	def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
	}
	// Big-endian: one REV is applied. In each pattern the REV container size
	// is the wider of the two element sizes and the arrangement is the
	// narrower (e.g. 64-bit source -> REV64v2i32, v8i8 source -> REV32v8i8).
	let Predicates = [IsBE] in {
	def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
	(v2i32 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))),
	(v2i32 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
	(v2i32 (REV32v4i16 FPR64:$src))>;
	}
	// Same lane count and element width: no REV on either endianness.
	def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;

	// Bitconverts producing v4i16 from other 64-bit types held in FPR64.
	// Little-endian: the register is simply re-typed.
	let Predicates = [IsLE] in {
	def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	}
	// Big-endian: one REV is applied; its container size is the wider of the
	// two element sizes and its arrangement the narrower (64-bit source ->
	// REV64v4i16, 32-bit elements -> REV32v4i16, v8i8 -> REV16v8i8).
	let Predicates = [IsBE] in {
	def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
	(v4i16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
	(v4i16 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
	(v4i16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	}
	// Same lane count and element width: no REV on either endianness.
	def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;

	// Bitconverts producing v4f16 from other 64-bit types held in FPR64.
	// Little-endian: the register is simply re-typed.
	let Predicates = [IsLE] in {
	def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	}
	// Big-endian: the REV instruction chosen for each source type is the same
	// one used by the v4i16 destination patterns (REV64v4i16 / REV32v4i16 /
	// REV16v8i8).
	let Predicates = [IsBE] in {
	def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
	(v4f16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
	(v4f16 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
	(v4f16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	}
	// Same lane count and element width: no REV on either endianness.
	def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;

	// Bitconverts producing v8i8 from other 64-bit types held in FPR64.
	// Little-endian: the register is simply re-typed.
	let Predicates = [IsLE] in {
	def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
	}
	// Big-endian: a byte-arrangement REV is applied whose container size is
	// the source element size (REV64v8i8 for 64-bit, REV32v8i8 for 32-bit,
	// REV16v8i8 for 16-bit elements). Every source type needs one, so there
	// is no unpredicated pattern in this group.
	let Predicates = [IsBE] in {
	def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))),
	(v8i8 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))),
	(v8i8 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
	(v8i8 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
	(v8i8 (REV16v8i8 FPR64:$src))>;
	}

	// bitconvert to scalar f64 from the multi-element 64-bit vector types.
	// Little-endian: plain re-typing of the FPR64 register.
	let Predicates = [IsLE] in {
	def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
	}
	// Big-endian: always a REV64, arranged by the source element layout. The
	// floating-point sources reuse the integer-named forms (v2f32 -> v2i32,
	// v4f16 -> v4i16).
	let Predicates = [IsBE] in {
	def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
	(f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))),
	(f64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
	(f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
	(f64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
	(f64 (REV64v4i16 FPR64:$src))>;
	}
	// Single-element 64-bit sources are unpredicated: no instruction on either
	// endianness.
	def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;

	// bitconvert to v1f64 from the multi-element 64-bit vector types.
	// Little-endian: plain re-typing of the FPR64 register.
	let Predicates = [IsLE] in {
	def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
	}
	// Big-endian: always a REV64, arranged by the source element layout
	// (v2f32 uses the v2i32 form, v4f16 uses the v4i16 form).
	let Predicates = [IsBE] in {
	def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
	(v1f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
	(v1f64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
	(v1f64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
	(v1f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
	(v1f64 (REV64v4i16 FPR64:$src))>;
	}
	// Single-element 64-bit sources are unpredicated: no instruction on either
	// endianness.
	def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

	// bitconvert to v2f32 from the other 64-bit types held in FPR64.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
	}
	// Big-endian: 64-bit-element sources use REV64v2i32; narrower sources use a
	// REV32 arranged by the source element layout (v4i16 form for v4i16/v4f16,
	// v8i8 form for v8i8).
	let Predicates = [IsBE] in {
	def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
	(v2f32 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))),
	(v2f32 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
	(v2f32 (REV32v4i16 FPR64:$src))>;
	}
	// v2f32 <- v2i32 is unpredicated: no instruction for either endianness.
	def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;

	// bitconvert to f128 from the 128-bit vector types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
	}
	// Big-endian: every conversion ends in EXTv16i8 of a value with itself at
	// immediate 8. For 64-bit-element sources (v2i64, v2f64) the EXT operand is
	// the source register directly; for narrower elements it is first passed
	// through REV64 in the source arrangement (v4f32 -> v4i32 form, v8f16 ->
	// v8i16 form).
	// NOTE(review): EXT #8 on identical operands presumably swaps the two 64-bit
	// halves; confirm against the EXT instruction description.
	let Predicates = [IsBE] in {
	def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
	(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
	def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
	(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
	def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
	(REV64v16i8 FPR128:$src), (i32 8)))>;
	}

	// bitconvert to v2f64 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
	}
	// Big-endian: f128 goes through EXTv16i8 of the source with itself at
	// immediate 8; every vector source goes through a REV64 in the source
	// arrangement (v8f16 -> v8i16 form, v4f32 -> v4i32 form).
	let Predicates = [IsBE] in {
	def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
	(v2f64 (EXTv16i8 FPR128:$src,
	FPR128:$src, (i32 8)))>;
	def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
	(v2f64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
	(v2f64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
	(v2f64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
	(v2f64 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
	(v2f64 (REV64v4i32 FPR128:$src))>;
	}
	// v2f64 <- v2i64 is unpredicated: no instruction for either endianness.
	def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;

	// bitconvert to v4f32 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
	}
	// Big-endian:
	//   f128          -> REV64v4i32 followed by EXTv16i8 with immediate 8
	//   v8i16, v8f16  -> REV32v8i16
	//   v16i8         -> REV32v16i8
	//   v2i64, v2f64  -> REV64v4i32
	let Predicates = [IsBE] in {
	def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
	(v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
	(v4f32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
	(v4f32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
	(v4f32 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
	(v4f32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
	(v4f32 (REV64v4i32 FPR128:$src))>;
	}
	// v4f32 <- v4i32 is unpredicated: no instruction for either endianness.
	def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;

	// bitconvert to v2i64 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
	}
	// Big-endian: f128 goes through EXTv16i8 of the source with itself at
	// immediate 8; every vector source goes through a REV64 in the source
	// arrangement (v4f32 -> v4i32 form, v8f16 -> v8i16 form).
	let Predicates = [IsBE] in {
	def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
	(v2i64 (EXTv16i8 FPR128:$src,
	FPR128:$src, (i32 8)))>;
	def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
	(v2i64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
	(v2i64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
	(v2i64 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
	(v2i64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
	(v2i64 (REV64v8i16 FPR128:$src))>;
	}
	// v2i64 <- v2f64 is unpredicated: no instruction for either endianness.
	def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;

	// bitconvert to v4i32 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
	}
	// Big-endian:
	//   f128          -> REV64v4i32 followed by EXTv16i8 with immediate 8
	//   v2i64, v2f64  -> REV64v4i32
	//   v8i16, v8f16  -> REV32v8i16
	//   v16i8         -> REV32v16i8
	let Predicates = [IsBE] in {
	def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
	(v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
	(v4i32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
	(v4i32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
	(v4i32 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
	(v4i32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
	(v4i32 (REV32v8i16 FPR128:$src))>;
	}
	// v4i32 <- v4f32 is unpredicated: no instruction for either endianness.
	def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;

	// bitconvert to v8i16 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
	}
	// Big-endian:
	//   f128          -> REV64v8i16 followed by EXTv16i8 with immediate 8
	//   v2i64, v2f64  -> REV64v8i16
	//   v4i32, v4f32  -> REV32v8i16
	//   v16i8         -> REV16v16i8
	let Predicates = [IsBE] in {
	def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
	(v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
	(v8i16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
	(v8i16 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
	(v8i16 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
	(v8i16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
	(v8i16 (REV32v8i16 FPR128:$src))>;
	}
	// v8i16 <- v8f16 is unpredicated: no instruction for either endianness.
	def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;

	// bitconvert to v8f16 from the other 128-bit types held in FPR128.
	// Little-endian: plain re-typing of the register.
	let Predicates = [IsLE] in {
	def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
	}
	// Big-endian (the integer-named v8i16 forms are used for the f16 result):
	//   f128          -> REV64v8i16 followed by EXTv16i8 with immediate 8
	//   v2i64, v2f64  -> REV64v8i16
	//   v4i32, v4f32  -> REV32v8i16
	//   v16i8         -> REV16v16i8
	let Predicates = [IsBE] in {
	def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
	(v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
	(v8f16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
	(v8f16 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
	(v8f16 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
	(v8f16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
	(v8f16 (REV32v8i16 FPR128:$src))>;
	}
	// v8f16 <- v8i16 is unpredicated: no instruction for either endianness.
	def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;

	// bitconvert to v16i8 from the other 128-bit types held in FPR128.
	// Little-endian: all seven conversions re-type the register in place.
	let Predicates = [IsLE] in {
	def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
	}
	// Big-endian: every conversion uses a v16i8-arranged REV whose width equals
	// the source element size; f128 additionally feeds the REV64 result to
	// EXTv16i8 with immediate 8. There is no unpredicated pattern for this
	// result type.
	let Predicates = [IsBE] in {
	def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
	(v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
	(REV64v16i8 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
	(v16i8 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
	(v16i8 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
	(v16i8 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
	(v16i8 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
	(v16i8 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
	(v16i8 (REV16v16i8 FPR128:$src))>;
	}

	// extract_subvector at index 0 of a V128 value, for each 64-bit result type:
	// selected to EXTRACT_SUBREG of the dsub subregister, i.e. no instruction of
	// its own is named in the output pattern.
	def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;

	// extract_subvector at index 1 of a 128-bit integer vector: lane 1 is
	// duplicated with DUPv2i64lane and the result's dsub subregister is taken.
	// Only the four integer element types are covered here; no floating-point
	// variants are defined in this group.
	def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;

	// A 64-bit subvector insert to the first 128-bit vector position
	// is a subregister copy that needs no instruction.
	//
	// Ty is the value type of the constant-zero index operand. For each 64-bit
	// source type, inserting into an undef vector at index 0 becomes an
	// INSERT_SUBREG of the source into the dsub subregister of an IMPLICIT_DEF
	// of the 128-bit type with the same element type and twice the lane count.
	multiclass InsertSubvectorUndef<ValueType Ty> {
	def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	}

	// Instantiate the patterns for both i32 and i64 index operands.
	defm : InsertSubvectorUndef<i32>;
	defm : InsertSubvectorUndef<i64>;

	// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
	// or v2f32.
	// Each pattern matches an add/fadd of lane 0 and lane 1 extracted from the
	// same register $Rn.
	def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
	(vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
	(i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
	def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
	(vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
	(f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
	// vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
	// so we match on v4f32 here, not v2f32. This will also catch adding
	// the low two lanes of a true v4f32 vector.
	// The instruction operand is the dsub subregister of the matched register.
	def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
	(vector_extract (v4f32 FPR128:$Rn), (i64 1))),
	(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;

	// Scalar 64-bit shifts in FPR64 registers.
	// Each i64 NEON shift intrinsic with both operands in FPR64 is selected to
	// the v1i64 instruction of the same name (sshl, ushl, srshl, urshl).
	def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;

	// Patterns for nontemporal/no-allocate stores.
	// We have to resort to tricks to turn a single-input store into a store pair,
	// because there is no single-input nontemporal store, only STNP.
	// All of these patterns are restricted to little-endian targets.
	let Predicates = [IsLE] in {
	let AddedComplexity = 15 in {
	// 128-bit vector store: STNPDi of the low half (the dsub subregister) and of
	// lane 1 copied out with CPYi64.
	class NTStore128Pat<ValueType VT> :
	Pat<(nontemporalstore (VT FPR128:$Rt),
	(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
	(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
	(CPYi64 FPR128:$Rt, (i64 1)),
	GPR64sp:$Rn, simm7s8:$offset)>;

	def : NTStore128Pat<v2i64>;
	def : NTStore128Pat<v4i32>;
	def : NTStore128Pat<v8i16>;
	def : NTStore128Pat<v16i8>;

	// 64-bit vector store: STNPSi of the ssub subregister and of 32-bit lane 1,
	// which CPYi32 reads after SUBREG_TO_REG places the value in dsub of a wider
	// register.
	class NTStore64Pat<ValueType VT> :
	Pat<(nontemporalstore (VT FPR64:$Rt),
	(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
	(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
	(CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
	GPR64sp:$Rn, simm7s4:$offset)>;

	// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
	def : NTStore64Pat<v1f64>;
	def : NTStore64Pat<v1i64>;
	def : NTStore64Pat<v2i32>;
	def : NTStore64Pat<v4i16>;
	def : NTStore64Pat<v8i8>;

	// 64-bit GPR store: STNPWi of the sub_32 subregister of $Rt and the sub_32
	// subregister of UBFMXri $Rt, 32, 63.
	// NOTE(review): UBFMXri with immediates 32, 63 presumably moves the upper
	// 32 bits down; confirm against the UBFM description.
	def : Pat<(nontemporalstore GPR64:$Rt,
	(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
	(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	(EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
	GPR64sp:$Rn, simm7s4:$offset)>;
	} // AddedComplexity = 15
	} // Predicates = [IsLE]

	// Tail call return handling. These are all compiler pseudo-instructions,
	// so no encoding information or anything like that.
	// Every pseudo below is flagged as a call, terminator, return and barrier,
	// uses SP, has an empty pattern list, and takes a destination operand plus an
	// i32 $FPDiff immediate. All are scheduled as WriteBrReg.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
	// Tail call whose destination is an i64 immediate operand.
	def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	// Tail call whose destination is a register from the tcGPR64 class.
	def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	// Indirect tail-call with any register allowed, used by MachineOutliner when
	// this is proven safe.
	// FIXME: If we have to add any more hacks like this, we should instead relax
	// some verifier checks for outlined functions.
	def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	// Indirect tail-call limited to only use registers (x16 and x17) which are
	// allowed to tail-call a "BTI c" instruction.
	def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	}

	// Selection of the AArch64tcret node onto the TCRETURN* pseudos.
	// Register destinations: tcGPR64 -> TCRETURNri when NotUseBTI holds,
	// rtcGPR64 -> TCRETURNriBTI when UseBTI holds.
	def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
	(TCRETURNri tcGPR64:$dst, imm:$FPDiff)>,
	Requires<[NotUseBTI]>;
	def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)),
	(TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>,
	Requires<[UseBTI]>;
	// Symbolic destinations (global address or external symbol) select
	// TCRETURNdi and carry no Requires<> clause of their own.
	// NOTE(review): the tglobaladdr pattern spells its output operand as
	// texternalsym; $dst is presumably bound by name so this is harmless, but
	// confirm it is intentional.
	def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
	(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
	def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
	(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;

	// Pseudo that produces a GPR64 result from an i64 immediate symbol operand.
	// It has no selection pattern of its own and an empty scheduling list; the
	// Pat below selects it for AArch64LocalRecover of an mcsym.
	def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
	def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;

	include "AArch64InstrAtomics.td"
	include "AArch64SVEInstrInfo.td"
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64SVEInstrInfo.td (revision 351303)
	@@ -1,1426 +1,1430 @@
	//=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -- tablegen ------=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// AArch64 Scalable Vector Extension (SVE) Instruction definitions.
	//
	//===----------------------------------------------------------------------===//

	let Predicates = [HasSVE] in {

	def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
	def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
	def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
	def SETFFR : sve_int_setffr<"setffr">;
	def WRFFR : sve_int_wrffr<"wrffr">;

	defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
	defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
	defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">;
	defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">;
	defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
	defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;

	defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
	defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
	defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
	defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;

	defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
	defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
	defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">;

	defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">;
	defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">;
	defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">;
	defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">;

	defm ADD_ZI : sve_int_arith_imm0<0b000, "add">;
	defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">;
	defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">;
	defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">;
	defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">;
	defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">;
	defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">;

	defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">;
	defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">;
	defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">;
	defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">;

	// SVE predicated integer reductions.
	defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
	defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
	defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
	defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
	defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
	defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
	defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">;
	defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">;
	defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">;

	defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
	defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
	defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;

	defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
	defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
	defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
	defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;

	defm MUL_ZI : sve_int_arith_imm2<"mul">;
	defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">;
	defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
	defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;

	defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
	defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
	defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
	defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;

	defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
	defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;

	defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
	defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;

	defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
	defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
	defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
	defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
	defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
	defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
	defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs">;
	defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg">;

	defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">;
	defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">;
	defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">;
	defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
	defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">;
	defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
	defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;

	defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
	defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
	defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
	defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
	defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
	defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;

	defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">;
	defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;

	defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
	defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
	defm FMUL_ZPmI : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
	defm FSUBR_ZPmI : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
	defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
	defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
	defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
	defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;

	defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">;
	defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">;
	defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">;
	defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">;
	defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
	defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
	defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">;
	defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">;
	defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">;
	defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
	defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">;
	defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
	defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;

	defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">;
	defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">;
	defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">;
	defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">;
	defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">;
	defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;

	defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;

	defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">;
	defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;

	defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">;
	defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">;
	defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
	defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;

	defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">;
	defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">;
	defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
	defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;

	defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;

	defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
	defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;

	defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
	defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">;

	// SVE floating point reductions.
	defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">;
	defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">;
	defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">;
	defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">;
	defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">;
	defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">;

	// Splat immediate (unpredicated)
	defm DUP_ZI : sve_int_dup_imm<"dup">;
	defm FDUP_ZI : sve_int_dup_fpimm<"fdup">;
	defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">;

	// Splat immediate (predicated)
	defm CPY_ZPmI : sve_int_dup_imm_pred_merge<"cpy">;
	defm CPY_ZPzI : sve_int_dup_imm_pred_zero<"cpy">;
	defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">;

	// Splat scalar register (unpredicated, GPR or vector + element index)
	defm DUP_ZR : sve_int_perm_dup_r<"dup">;
	defm DUP_ZZI : sve_int_perm_dup_i<"dup">;

	// Splat scalar register (predicated)
	defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
	defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;

	// Select elements from either vector (predicated)
	defm SEL_ZPZZ : sve_int_sel_vvv<"sel">;

	defm SPLICE_ZPZ : sve_int_perm_splice<"splice">;
	defm COMPACT_ZPZ : sve_int_perm_compact<"compact">;
	defm INSR_ZR : sve_int_perm_insrs<"insr">;
	defm INSR_ZV : sve_int_perm_insrv<"insr">;
	def EXT_ZZI : sve_int_perm_extract_i<"ext">;

	defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">;
	defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">;
	defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">;
	defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">;

	defm REV_PP : sve_int_perm_reverse_p<"rev">;
	defm REV_ZZ : sve_int_perm_reverse_z<"rev">;

	defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">;
	defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">;
	defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">;
	defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">;

	def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">;
	def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">;

	defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
	defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
	def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
	def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>;
	def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>;
	def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>;

	def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">;
	def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">;
	def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">;
	def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">;

	def BRKN_PPzP : sve_int_brkn<0b0, "brkn">;
	def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">;

	defm BRKA_PPzP : sve_int_break_z<0b000, "brka">;
	defm BRKA_PPmP : sve_int_break_m<0b001, "brka">;
	defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">;
	defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">;
	defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">;
	defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">;

	def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
	def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
	defm PFIRST : sve_int_pfirst<0b00000, "pfirst">;
	defm PNEXT : sve_int_pnext<0b00110, "pnext">;

	// Records built on sve_int_pred_log, one per mnemonic, keyed by a 4-bit
	// first argument. In this list every mnemonic ending in `s` uses the value
	// of its base mnemonic with bit 2 set (e.g. `and` 0b0000 / `ands` 0b0100).
	// `sel` (0b0011) has no such counterpart here, and 0b0111 is not used.
	def AND_PPzPP : sve_int_pred_log<0b0000, "and">;
	def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">;
	def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">;
	def SEL_PPPP : sve_int_pred_log<0b0011, "sel">;
	def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">;
	def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">;
	def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">;
	def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">;
	def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">;
	def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">;
	def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">;
	def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">;
	def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">;
	def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">;
	def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">;

	// `clasta` / `clastb` records. The first template argument selects the
	// mnemonic (0 = `clasta`, 1 = `clastb`); the _RPZ, _VPZ and _ZPZ names map
	// to the sve_int_perm_clast_rz, _vz and _zz multiclasses respectively.
	defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">;
	defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">;
	defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">;
	defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">;
	defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">;
	defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">;

	// `lasta` / `lastb` records. The first template argument selects the
	// mnemonic (0 = `lasta`, 1 = `lastb`); the _RPZ names expand
	// sve_int_perm_last_r and the _VPZ names expand sve_int_perm_last_v.
	defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">;
	defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">;
	defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">;
	defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">;

	// Contiguous loads with reg+immediate addressing.
	// Every record expands sve_mem_cld_si with a 4-bit selector (all sixteen
	// values are used), the mnemonic, and a Z_* / ZPR* operand pair. The _H, _S
	// and _D name suffixes track that operand pair (Z_h/ZPR16, Z_s/ZPR32,
	// Z_d/ZPR64).
	defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
	defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
	defm LD1B_S_IMM : sve_mem_cld_si<0b0010, "ld1b", Z_s, ZPR32>;
	defm LD1B_D_IMM : sve_mem_cld_si<0b0011, "ld1b", Z_d, ZPR64>;
	defm LD1SW_D_IMM : sve_mem_cld_si<0b0100, "ld1sw", Z_d, ZPR64>;
	defm LD1H_IMM : sve_mem_cld_si<0b0101, "ld1h", Z_h, ZPR16>;
	defm LD1H_S_IMM : sve_mem_cld_si<0b0110, "ld1h", Z_s, ZPR32>;
	defm LD1H_D_IMM : sve_mem_cld_si<0b0111, "ld1h", Z_d, ZPR64>;
	defm LD1SH_D_IMM : sve_mem_cld_si<0b1000, "ld1sh", Z_d, ZPR64>;
	defm LD1SH_S_IMM : sve_mem_cld_si<0b1001, "ld1sh", Z_s, ZPR32>;
	defm LD1W_IMM : sve_mem_cld_si<0b1010, "ld1w", Z_s, ZPR32>;
	defm LD1W_D_IMM : sve_mem_cld_si<0b1011, "ld1w", Z_d, ZPR64>;
	defm LD1SB_D_IMM : sve_mem_cld_si<0b1100, "ld1sb", Z_d, ZPR64>;
	defm LD1SB_S_IMM : sve_mem_cld_si<0b1101, "ld1sb", Z_s, ZPR32>;
	defm LD1SB_H_IMM : sve_mem_cld_si<0b1110, "ld1sb", Z_h, ZPR16>;
	defm LD1D_IMM : sve_mem_cld_si<0b1111, "ld1d", Z_d, ZPR64>;

	// LD1R loads (splat scalar to vector)
	// Each record passes two 2-bit fields, the mnemonic, a Z_* / ZPR* operand
	// pair and an immediate operand class. Within this list uimm6s1 goes with
	// `ld1rb` / `ld1rsb`, uimm6s2 with `ld1rh` / `ld1rsh`, uimm6s4 with
	// `ld1rw` / `ld1rsw`, and uimm6s8 with `ld1rd`.
	defm LD1RB_IMM : sve_mem_ld_dup<0b00, 0b00, "ld1rb", Z_b, ZPR8, uimm6s1>;
	defm LD1RB_H_IMM : sve_mem_ld_dup<0b00, 0b01, "ld1rb", Z_h, ZPR16, uimm6s1>;
	defm LD1RB_S_IMM : sve_mem_ld_dup<0b00, 0b10, "ld1rb", Z_s, ZPR32, uimm6s1>;
	defm LD1RB_D_IMM : sve_mem_ld_dup<0b00, 0b11, "ld1rb", Z_d, ZPR64, uimm6s1>;
	defm LD1RSW_IMM : sve_mem_ld_dup<0b01, 0b00, "ld1rsw", Z_d, ZPR64, uimm6s4>;
	defm LD1RH_IMM : sve_mem_ld_dup<0b01, 0b01, "ld1rh", Z_h, ZPR16, uimm6s2>;
	defm LD1RH_S_IMM : sve_mem_ld_dup<0b01, 0b10, "ld1rh", Z_s, ZPR32, uimm6s2>;
	defm LD1RH_D_IMM : sve_mem_ld_dup<0b01, 0b11, "ld1rh", Z_d, ZPR64, uimm6s2>;
	defm LD1RSH_D_IMM : sve_mem_ld_dup<0b10, 0b00, "ld1rsh", Z_d, ZPR64, uimm6s2>;
	defm LD1RSH_S_IMM : sve_mem_ld_dup<0b10, 0b01, "ld1rsh", Z_s, ZPR32, uimm6s2>;
	defm LD1RW_IMM : sve_mem_ld_dup<0b10, 0b10, "ld1rw", Z_s, ZPR32, uimm6s4>;
	defm LD1RW_D_IMM : sve_mem_ld_dup<0b10, 0b11, "ld1rw", Z_d, ZPR64, uimm6s4>;
	defm LD1RSB_D_IMM : sve_mem_ld_dup<0b11, 0b00, "ld1rsb", Z_d, ZPR64, uimm6s1>;
	defm LD1RSB_S_IMM : sve_mem_ld_dup<0b11, 0b01, "ld1rsb", Z_s, ZPR32, uimm6s1>;
	defm LD1RSB_H_IMM : sve_mem_ld_dup<0b11, 0b10, "ld1rsb", Z_h, ZPR16, uimm6s1>;
	defm LD1RD_IMM : sve_mem_ld_dup<0b11, 0b11, "ld1rd", Z_d, ZPR64, uimm6s8>;

	// LD1RQ loads (load quadword-vector and splat to scalable vector)
	// The _IMM names expand sve_mem_ldqr_si; the unsuffixed names expand
	// sve_mem_ldqr_ss and additionally pass a GPR64NoXZRshiftedN operand class
	// (N = 8, 16, 32, 64 for b, h, w, d).
	defm LD1RQ_B_IMM : sve_mem_ldqr_si<0b00, "ld1rqb", Z_b, ZPR8>;
	defm LD1RQ_H_IMM : sve_mem_ldqr_si<0b01, "ld1rqh", Z_h, ZPR16>;
	defm LD1RQ_W_IMM : sve_mem_ldqr_si<0b10, "ld1rqw", Z_s, ZPR32>;
	defm LD1RQ_D_IMM : sve_mem_ldqr_si<0b11, "ld1rqd", Z_d, ZPR64>;
	defm LD1RQ_B : sve_mem_ldqr_ss<0b00, "ld1rqb", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LD1RQ_H : sve_mem_ldqr_ss<0b01, "ld1rqh", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Contiguous loads with reg+reg addressing.
	// The final GPR64NoXZRshiftedN operand follows the mnemonic's size letter
	// (b -> 8, h -> 16, w -> 32, d -> 64), not the Z_* / ZPR* operand pair;
	// e.g. LD1B_H uses Z_h/ZPR16 but GPR64NoXZRshifted8.
	defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm LD1B_S : sve_mem_cld_ss<0b0010, "ld1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm LD1B_D : sve_mem_cld_ss<0b0011, "ld1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm LD1SW_D : sve_mem_cld_ss<0b0100, "ld1sw", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm LD1H : sve_mem_cld_ss<0b0101, "ld1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LD1H_S : sve_mem_cld_ss<0b0110, "ld1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm LD1H_D : sve_mem_cld_ss<0b0111, "ld1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm LD1SH_D : sve_mem_cld_ss<0b1000, "ld1sh", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm LD1SH_S : sve_mem_cld_ss<0b1001, "ld1sh", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm LD1W : sve_mem_cld_ss<0b1010, "ld1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LD1W_D : sve_mem_cld_ss<0b1011, "ld1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm LD1SB_D : sve_mem_cld_ss<0b1100, "ld1sb", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Non-faulting contiguous loads with reg+immediate addressing.
	// The 4-bit selectors and Z_* / ZPR* operand pairs run parallel to the
	// LD1*_IMM records, with sve_mem_cldnf_si in place of sve_mem_cld_si and an
	// `ldnf1` mnemonic prefix.
	defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>;
	defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>;
	defm LDNF1B_S_IMM : sve_mem_cldnf_si<0b0010, "ldnf1b", Z_s, ZPR32>;
	defm LDNF1B_D_IMM : sve_mem_cldnf_si<0b0011, "ldnf1b", Z_d, ZPR64>;
	defm LDNF1SW_D_IMM : sve_mem_cldnf_si<0b0100, "ldnf1sw", Z_d, ZPR64>;
	defm LDNF1H_IMM : sve_mem_cldnf_si<0b0101, "ldnf1h", Z_h, ZPR16>;
	defm LDNF1H_S_IMM : sve_mem_cldnf_si<0b0110, "ldnf1h", Z_s, ZPR32>;
	defm LDNF1H_D_IMM : sve_mem_cldnf_si<0b0111, "ldnf1h", Z_d, ZPR64>;
	defm LDNF1SH_D_IMM : sve_mem_cldnf_si<0b1000, "ldnf1sh", Z_d, ZPR64>;
	defm LDNF1SH_S_IMM : sve_mem_cldnf_si<0b1001, "ldnf1sh", Z_s, ZPR32>;
	defm LDNF1W_IMM : sve_mem_cldnf_si<0b1010, "ldnf1w", Z_s, ZPR32>;
	defm LDNF1W_D_IMM : sve_mem_cldnf_si<0b1011, "ldnf1w", Z_d, ZPR64>;
	defm LDNF1SB_D_IMM : sve_mem_cldnf_si<0b1100, "ldnf1sb", Z_d, ZPR64>;
	defm LDNF1SB_S_IMM : sve_mem_cldnf_si<0b1101, "ldnf1sb", Z_s, ZPR32>;
	defm LDNF1SB_H_IMM : sve_mem_cldnf_si<0b1110, "ldnf1sb", Z_h, ZPR16>;
	defm LDNF1D_IMM : sve_mem_cldnf_si<0b1111, "ldnf1d", Z_d, ZPR64>;

	// First-faulting loads with reg+reg addressing.
	// Note that the index operand classes here are GPR64shiftedN, not the
	// GPR64NoXZRshiftedN classes used by the LD1* reg+reg records; N still
	// follows the mnemonic's size letter (b -> 8, h -> 16, w -> 32, d -> 64).
	defm LDFF1B : sve_mem_cldff_ss<0b0000, "ldff1b", Z_b, ZPR8, GPR64shifted8>;
	defm LDFF1B_H : sve_mem_cldff_ss<0b0001, "ldff1b", Z_h, ZPR16, GPR64shifted8>;
	defm LDFF1B_S : sve_mem_cldff_ss<0b0010, "ldff1b", Z_s, ZPR32, GPR64shifted8>;
	defm LDFF1B_D : sve_mem_cldff_ss<0b0011, "ldff1b", Z_d, ZPR64, GPR64shifted8>;
	defm LDFF1SW_D : sve_mem_cldff_ss<0b0100, "ldff1sw", Z_d, ZPR64, GPR64shifted32>;
	defm LDFF1H : sve_mem_cldff_ss<0b0101, "ldff1h", Z_h, ZPR16, GPR64shifted16>;
	defm LDFF1H_S : sve_mem_cldff_ss<0b0110, "ldff1h", Z_s, ZPR32, GPR64shifted16>;
	defm LDFF1H_D : sve_mem_cldff_ss<0b0111, "ldff1h", Z_d, ZPR64, GPR64shifted16>;
	defm LDFF1SH_D : sve_mem_cldff_ss<0b1000, "ldff1sh", Z_d, ZPR64, GPR64shifted16>;
	defm LDFF1SH_S : sve_mem_cldff_ss<0b1001, "ldff1sh", Z_s, ZPR32, GPR64shifted16>;
	defm LDFF1W : sve_mem_cldff_ss<0b1010, "ldff1w", Z_s, ZPR32, GPR64shifted32>;
	defm LDFF1W_D : sve_mem_cldff_ss<0b1011, "ldff1w", Z_d, ZPR64, GPR64shifted32>;
	defm LDFF1SB_D : sve_mem_cldff_ss<0b1100, "ldff1sb", Z_d, ZPR64, GPR64shifted8>;
	defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>;
	defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>;
	defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>;

	// LD(2|3|4) structured loads with reg+immediate
	// The first 2-bit argument follows the mnemonic's size letter (b, h, w, d);
	// the second 2-bit argument, the ZZ/ZZZ/ZZZZ operand and the simm4s2/3/4
	// immediate class all follow the digit in the mnemonic (2, 3 or 4).
	defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>;
	defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>;
	defm LD4B_IMM : sve_mem_eld_si<0b00, 0b11, ZZZZ_b, "ld4b", simm4s4>;
	defm LD2H_IMM : sve_mem_eld_si<0b01, 0b01, ZZ_h, "ld2h", simm4s2>;
	defm LD3H_IMM : sve_mem_eld_si<0b01, 0b10, ZZZ_h, "ld3h", simm4s3>;
	defm LD4H_IMM : sve_mem_eld_si<0b01, 0b11, ZZZZ_h, "ld4h", simm4s4>;
	defm LD2W_IMM : sve_mem_eld_si<0b10, 0b01, ZZ_s, "ld2w", simm4s2>;
	defm LD3W_IMM : sve_mem_eld_si<0b10, 0b10, ZZZ_s, "ld3w", simm4s3>;
	defm LD4W_IMM : sve_mem_eld_si<0b10, 0b11, ZZZZ_s, "ld4w", simm4s4>;
	defm LD2D_IMM : sve_mem_eld_si<0b11, 0b01, ZZ_d, "ld2d", simm4s2>;
	defm LD3D_IMM : sve_mem_eld_si<0b11, 0b10, ZZZ_d, "ld3d", simm4s3>;
	defm LD4D_IMM : sve_mem_eld_si<0b11, 0b11, ZZZZ_d, "ld4d", simm4s4>;

	// LD(2|3|4) structured loads (register + register)
	// Plain defs (not defm) of sve_mem_eld_ss. The two 2-bit arguments and the
	// ZZ/ZZZ/ZZZZ operand match the corresponding *_IMM records; the last
	// operand is GPR64NoXZRshiftedN chosen by the size letter (8/16/32/64).
	def LD2B : sve_mem_eld_ss<0b00, 0b01, ZZ_b, "ld2b", GPR64NoXZRshifted8>;
	def LD3B : sve_mem_eld_ss<0b00, 0b10, ZZZ_b, "ld3b", GPR64NoXZRshifted8>;
	def LD4B : sve_mem_eld_ss<0b00, 0b11, ZZZZ_b, "ld4b", GPR64NoXZRshifted8>;
	def LD2H : sve_mem_eld_ss<0b01, 0b01, ZZ_h, "ld2h", GPR64NoXZRshifted16>;
	def LD3H : sve_mem_eld_ss<0b01, 0b10, ZZZ_h, "ld3h", GPR64NoXZRshifted16>;
	def LD4H : sve_mem_eld_ss<0b01, 0b11, ZZZZ_h, "ld4h", GPR64NoXZRshifted16>;
	def LD2W : sve_mem_eld_ss<0b10, 0b01, ZZ_s, "ld2w", GPR64NoXZRshifted32>;
	def LD3W : sve_mem_eld_ss<0b10, 0b10, ZZZ_s, "ld3w", GPR64NoXZRshifted32>;
	def LD4W : sve_mem_eld_ss<0b10, 0b11, ZZZZ_s, "ld4w", GPR64NoXZRshifted32>;
	def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>;
	def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>;
	def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>;

	// Gathers using unscaled 32-bit offsets, e.g.
	// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
	// Byte mnemonics use the ZPR32Ext{S,U}XTW8Only operand classes; halfword
	// and word mnemonics use ZPR32Ext{S,U}XTW8. Selector values 0b1000 and
	// 0b1001 are not used in this list. The GLD* defm prefixes are reused by
	// other gather lists, so the multiclasses presumably append distinct
	// suffixes to the generated record names (confirm in the class definitions).
	defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;

	// Gathers using scaled 32-bit offsets, e.g.
	// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
	// There are no byte mnemonics in this list. The operand classes follow the
	// mnemonic: ZPR32Ext{S,U}XTW16 for the halfword forms and
	// ZPR32Ext{S,U}XTW32 for the word forms.
	defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
	defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;

	// Gathers using scaled 32-bit pointers with offset, e.g.
	// ld1h z0.s, p0/z, [z0.s, #16]
	// The immediate operand class follows the mnemonic: imm0_31 for the byte
	// forms, uimm5s2 for the halfword forms and uimm5s4 for the word forms.
	defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>;
	defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>;
	defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>;
	defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>;
	defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>;
	defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>;
	defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>;
	defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>;
	defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>;
	defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>;

	// Gathers using scaled 64-bit pointers with offset, e.g.
	// ld1h z0.d, p0/z, [z0.d, #16]
	// The immediate operand class follows the mnemonic: imm0_31 (byte),
	// uimm5s2 (halfword), uimm5s4 (word) and uimm5s8 (doubleword). Selector
	// values 0b1100 and 0b1101 are not used in this list.
	defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>;
	defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>;
	defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>;
	defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>;
	defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>;
	defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>;
	defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>;
	defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>;
	defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>;
	defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>;
	defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>;
	defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>;
	defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>;
	defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>;

	// Gathers using unscaled 64-bit offsets, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d]
	// Only the 4-bit selector and the mnemonic are passed here; no operand
	// classes are supplied at this level.
	defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
	defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
	defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
	defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
	defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
	defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
	defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
	defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
	defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
	defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
	defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
	defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
	defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
	defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;

	// Gathers using scaled 64-bit offsets, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
	// There are no byte mnemonics in this list. The operand class follows the
	// mnemonic: ZPR64ExtLSL16 (halfword), ZPR64ExtLSL32 (word) and
	// ZPR64ExtLSL64 (doubleword).
	defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
	defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
	defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
	defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
	defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
	defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
	defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
	defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;

	// Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
	// Byte mnemonics use the ZPR64Ext{S,U}XTW8Only operand classes; every other
	// mnemonic in this list uses ZPR64Ext{S,U}XTW8.
	defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;

	// Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
	// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
	// There are no byte mnemonics in this list. The operand classes follow the
	// mnemonic: ZPR64Ext{S,U}XTW16 (halfword), ZPR64Ext{S,U}XTW32 (word) and
	// ZPR64Ext{S,U}XTW64 (doubleword).
	defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
	defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;

	// Non-temporal contiguous loads (register + immediate)
	// One record per size letter; the 2-bit selector, the Z_* operand and the
	// ZPR* class all step together (b/00/ZPR8 ... d/11/ZPR64).
	defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
	defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>;
	defm LDNT1W_ZRI : sve_mem_cldnt_si<0b10, "ldnt1w", Z_s, ZPR32>;
	defm LDNT1D_ZRI : sve_mem_cldnt_si<0b11, "ldnt1d", Z_d, ZPR64>;

	// Non-temporal contiguous loads (register + register)
	// Same selectors and operand pairs as the _ZRI records, plus a
	// GPR64NoXZRshiftedN operand class chosen by the size letter.
	defm LDNT1B_ZRR : sve_mem_cldnt_ss<0b00, "ldnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm LDNT1H_ZRR : sve_mem_cldnt_ss<0b01, "ldnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm LDNT1W_ZRR : sve_mem_cldnt_ss<0b10, "ldnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm LDNT1D_ZRR : sve_mem_cldnt_ss<0b11, "ldnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// contiguous store with immediates
	// Two 2-bit arguments: the first follows the mnemonic's size letter
	// (b=00, h=01, w=10, d=11) and the second follows the Z_* / ZPR* operand
	// pair (b=00, h=01, s=10, d=11). Only combinations where the second value
	// is greater than or equal to the first appear in this list.
	defm ST1B_IMM : sve_mem_cst_si<0b00, 0b00, "st1b", Z_b, ZPR8>;
	defm ST1B_H_IMM : sve_mem_cst_si<0b00, 0b01, "st1b", Z_h, ZPR16>;
	defm ST1B_S_IMM : sve_mem_cst_si<0b00, 0b10, "st1b", Z_s, ZPR32>;
	defm ST1B_D_IMM : sve_mem_cst_si<0b00, 0b11, "st1b", Z_d, ZPR64>;
	defm ST1H_IMM : sve_mem_cst_si<0b01, 0b01, "st1h", Z_h, ZPR16>;
	defm ST1H_S_IMM : sve_mem_cst_si<0b01, 0b10, "st1h", Z_s, ZPR32>;
	defm ST1H_D_IMM : sve_mem_cst_si<0b01, 0b11, "st1h", Z_d, ZPR64>;
	defm ST1W_IMM : sve_mem_cst_si<0b10, 0b10, "st1w", Z_s, ZPR32>;
	defm ST1W_D_IMM : sve_mem_cst_si<0b10, 0b11, "st1w", Z_d, ZPR64>;
	defm ST1D_IMM : sve_mem_cst_si<0b11, 0b11, "st1d", Z_d, ZPR64>;

	// contiguous store with reg+reg addressing.
	// The 4-bit selector of each record equals the two 2-bit arguments of the
	// matching *_IMM record concatenated (e.g. ST1H_S: 0b01, 0b10 -> 0b0110).
	// The last operand is GPR64NoXZRshiftedN chosen by the mnemonic's size
	// letter.
	defm ST1B : sve_mem_cst_ss<0b0000, "st1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm ST1B_H : sve_mem_cst_ss<0b0001, "st1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
	defm ST1B_S : sve_mem_cst_ss<0b0010, "st1b", Z_s, ZPR32, GPR64NoXZRshifted8>;
	defm ST1B_D : sve_mem_cst_ss<0b0011, "st1b", Z_d, ZPR64, GPR64NoXZRshifted8>;
	defm ST1H : sve_mem_cst_ss<0b0101, "st1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm ST1H_S : sve_mem_cst_ss<0b0110, "st1h", Z_s, ZPR32, GPR64NoXZRshifted16>;
	defm ST1H_D : sve_mem_cst_ss<0b0111, "st1h", Z_d, ZPR64, GPR64NoXZRshifted16>;
	defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
	defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Scatters using unscaled 32-bit offsets, e.g.
	// st1h z0.s, p0, [x0, z0.s, uxtw]
	// and unpacked:
	// st1h z0.d, p0, [x0, z0.d, uxtw]
	// The _D records pass Z_d/ZPR64 with ZPR64Ext* operand classes and the _S
	// (and unsuffixed SST1W) records pass Z_s/ZPR32 with ZPR32Ext* classes.
	// Byte mnemonics use the ...8Only classes; the rest use ...XTW8.
	defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
	defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
	defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;

	// Scatters using scaled 32-bit offsets, e.g.
	// st1h z0.s, p0, [x0, z0.s, uxtw #1]
	// and unpacked:
	// st1h z0.d, p0, [x0, z0.d, uxtw #1]
	// There are no byte mnemonics in this list. The 3-bit selectors match the
	// unscaled records of the same name; the operand classes end in 16, 32 or
	// 64 according to the mnemonic (h, w, d).
	defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
	defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;

	// Scatters using 32/64-bit pointers with offset, e.g.
	// st1h z0.s, p0, [z0.s, #16]
	// st1h z0.d, p0, [z0.d, #16]
	// The immediate operand class follows the mnemonic: imm0_31 (byte),
	// uimm5s2 (halfword), uimm5s4 (word) and uimm5s8 (doubleword).
	defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
	defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
	defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
	defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
	defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
	defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
	defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;

	// Scatters using unscaled 64-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d]
	// Only a 2-bit selector (b=00, h=01, w=10, d=11) and the mnemonic are
	// passed; no operand classes are supplied at this level.
	defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
	defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
	defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
	defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;

	// Scatters using scaled 64-bit offsets, e.g.
	// st1h z0.d, p0, [x0, z0.d, lsl #1]
	// Unlike the other scatter lists, these defm prefixes carry an explicit
	// _SCALED suffix. There is no byte record; the operand class is
	// ZPR64ExtLSL16/32/64 according to the mnemonic.
	defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
	defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
	defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;

	// ST(2|3|4) structured stores (register + immediate)
	// The first 2-bit argument follows the mnemonic's size letter (b, h, w, d);
	// the second 2-bit argument, the ZZ/ZZZ/ZZZZ operand and the simm4s2/3/4
	// immediate class all follow the digit in the mnemonic (2, 3 or 4).
	defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
	defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>;
	defm ST4B_IMM : sve_mem_est_si<0b00, 0b11, ZZZZ_b, "st4b", simm4s4>;
	defm ST2H_IMM : sve_mem_est_si<0b01, 0b01, ZZ_h, "st2h", simm4s2>;
	defm ST3H_IMM : sve_mem_est_si<0b01, 0b10, ZZZ_h, "st3h", simm4s3>;
	defm ST4H_IMM : sve_mem_est_si<0b01, 0b11, ZZZZ_h, "st4h", simm4s4>;
	defm ST2W_IMM : sve_mem_est_si<0b10, 0b01, ZZ_s, "st2w", simm4s2>;
	defm ST3W_IMM : sve_mem_est_si<0b10, 0b10, ZZZ_s, "st3w", simm4s3>;
	defm ST4W_IMM : sve_mem_est_si<0b10, 0b11, ZZZZ_s, "st4w", simm4s4>;
	defm ST2D_IMM : sve_mem_est_si<0b11, 0b01, ZZ_d, "st2d", simm4s2>;
	defm ST3D_IMM : sve_mem_est_si<0b11, 0b10, ZZZ_d, "st3d", simm4s3>;
	defm ST4D_IMM : sve_mem_est_si<0b11, 0b11, ZZZZ_d, "st4d", simm4s4>;

	// ST(2|3|4) structured stores (register + register)
	// Plain defs (not defm) of sve_mem_est_ss. The two 2-bit arguments and the
	// ZZ/ZZZ/ZZZZ operand match the corresponding *_IMM records; the last
	// operand is GPR64NoXZRshiftedN chosen by the size letter (8/16/32/64).
	def ST2B : sve_mem_est_ss<0b00, 0b01, ZZ_b, "st2b", GPR64NoXZRshifted8>;
	def ST3B : sve_mem_est_ss<0b00, 0b10, ZZZ_b, "st3b", GPR64NoXZRshifted8>;
	def ST4B : sve_mem_est_ss<0b00, 0b11, ZZZZ_b, "st4b", GPR64NoXZRshifted8>;
	def ST2H : sve_mem_est_ss<0b01, 0b01, ZZ_h, "st2h", GPR64NoXZRshifted16>;
	def ST3H : sve_mem_est_ss<0b01, 0b10, ZZZ_h, "st3h", GPR64NoXZRshifted16>;
	def ST4H : sve_mem_est_ss<0b01, 0b11, ZZZZ_h, "st4h", GPR64NoXZRshifted16>;
	def ST2W : sve_mem_est_ss<0b10, 0b01, ZZ_s, "st2w", GPR64NoXZRshifted32>;
	def ST3W : sve_mem_est_ss<0b10, 0b10, ZZZ_s, "st3w", GPR64NoXZRshifted32>;
	def ST4W : sve_mem_est_ss<0b10, 0b11, ZZZZ_s, "st4w", GPR64NoXZRshifted32>;
	def ST2D : sve_mem_est_ss<0b11, 0b01, ZZ_d, "st2d", GPR64NoXZRshifted64>;
	def ST3D : sve_mem_est_ss<0b11, 0b10, ZZZ_d, "st3d", GPR64NoXZRshifted64>;
	def ST4D : sve_mem_est_ss<0b11, 0b11, ZZZZ_d, "st4d", GPR64NoXZRshifted64>;

	// Non-temporal contiguous stores (register + immediate)
	// One record per size letter; the 2-bit selector, the Z_* operand and the
	// ZPR* class all step together (b/00/ZPR8 ... d/11/ZPR64).
	defm STNT1B_ZRI : sve_mem_cstnt_si<0b00, "stnt1b", Z_b, ZPR8>;
	defm STNT1H_ZRI : sve_mem_cstnt_si<0b01, "stnt1h", Z_h, ZPR16>;
	defm STNT1W_ZRI : sve_mem_cstnt_si<0b10, "stnt1w", Z_s, ZPR32>;
	defm STNT1D_ZRI : sve_mem_cstnt_si<0b11, "stnt1d", Z_d, ZPR64>;

	// Non-temporal contiguous stores (register + register)
	// Same selectors and operand pairs as the _ZRI records, plus a
	// GPR64NoXZRshiftedN operand class chosen by the size letter.
	defm STNT1B_ZRR : sve_mem_cstnt_ss<0b00, "stnt1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
	defm STNT1H_ZRR : sve_mem_cstnt_ss<0b01, "stnt1h", Z_h, ZPR16, GPR64NoXZRshifted16>;
	defm STNT1W_ZRR : sve_mem_cstnt_ss<0b10, "stnt1w", Z_s, ZPR32, GPR64NoXZRshifted32>;
	defm STNT1D_ZRR : sve_mem_cstnt_ss<0b11, "stnt1d", Z_d, ZPR64, GPR64NoXZRshifted64>;

	// Fill/Spill
	// `ldr` records expand the *_fill multiclasses and `str` records expand the
	// *_spill multiclasses, each in a z_ and a p_ flavour. Only the mnemonic is
	// passed; any encoding details live in the multiclasses.
	defm LDR_ZXI : sve_mem_z_fill<"ldr">;
	defm LDR_PXI : sve_mem_p_fill<"ldr">;
	defm STR_ZXI : sve_mem_z_spill<"str">;
	defm STR_PXI : sve_mem_p_spill<"str">;

	// Contiguous prefetch (register + immediate)
	// One record per size letter, keyed by a 2-bit selector
	// (b=00, h=01, w=10, d=11).
	defm PRFB_PRI : sve_mem_prfm_si<0b00, "prfb">;
	defm PRFH_PRI : sve_mem_prfm_si<0b01, "prfh">;
	defm PRFW_PRI : sve_mem_prfm_si<0b10, "prfw">;
	defm PRFD_PRI : sve_mem_prfm_si<0b11, "prfd">;

	// Contiguous prefetch (register + register)
	// Plain defs of sve_mem_prfm_ss with a 3-bit selector (low bit set in all
	// four) and a GPR64NoXZRshiftedN operand class chosen by the size letter.
	// NOTE(review): the `prfw` record is named PRFS_PRR, whereas the
	// register + immediate sibling is PRFW_PRI. This looks like a naming
	// inconsistency; it is left untouched here because the def name may be
	// referenced elsewhere.
	def PRFB_PRR : sve_mem_prfm_ss<0b001, "prfb", GPR64NoXZRshifted8>;
	def PRFH_PRR : sve_mem_prfm_ss<0b011, "prfh", GPR64NoXZRshifted16>;
	def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
	def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;

	// Gather prefetch using scaled 32-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
	// The `prfb` record uses the ZPR32Ext{S,U}XTW8Only operand classes; the
	// others use ZPR32Ext{S,U}XTW16/32/64 according to the mnemonic (h, w, d).
	defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
	defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
	defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
	defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;

	// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
	// Same layout as the PRF*_S records, with ZPR64Ext* operand classes and the
	// sve_mem_64b_prfm_sv_ext_scaled multiclass.
	defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
	defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
	defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
	defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;

	// Gather prefetch using scaled 64-bit offsets, e.g.
	// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
	// A single ZPR64ExtLSLn operand class is passed, with n = 8, 16, 32, 64
	// according to the mnemonic (b, h, w, d).
	defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
	defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
	defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
	defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;

	// Gather prefetch using 32/64-bit pointers with offset, e.g.
	// prfh pldl1keep, p0, [z0.s, #16]
	// prfh pldl1keep, p0, [z0.d, #16]
	// The _S_PZI records expand sve_mem_32b_prfm_vi and the _D_PZI records
	// expand sve_mem_64b_prfm_vi; both use the same selectors and immediate
	// operand classes (imm0_31, uimm5s2, uimm5s4, uimm5s8 for b, h, w, d).
	defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
	defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
	defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
	defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;

	defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
	defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
	defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
	defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;

	// Four `adr` variants. Each has its own multiclass (…_a_sxtw, …_a_uxtw,
	// …_a_32_lsl, …_a_64_lsl) and a distinct 2-bit selector.
	defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
	defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
	defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
	defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;

	// `tbl`: the multiclass takes only the mnemonic.
	defm TBL_ZZZ : sve_int_perm_tbl<"tbl">;

	// `zip1/2`, `uzp1/2` and `trn1/2` in their _ZZZ form, expanded from
	// sve_int_perm_bin_perm_zz. The low selector bit distinguishes the 1 and 2
	// variants; 0b110 and 0b111 are not used.
	defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">;
	defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">;
	defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">;
	defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">;
	defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">;
	defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">;

	// The same six mnemonics in their _PPP form, expanded from
	// sve_int_perm_bin_perm_pp with selectors identical to the _ZZZ list.
	defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">;
	defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">;
	defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">;
	defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">;
	defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">;
	defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">;

	// Integer compare records in the _PPzZZ form, all expanded from
	// sve_int_cmp_0. Selector values 0b010 and 0b011 are not used in this list.
	defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">;
	defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">;
	defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">;
	defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">;
	defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">;
	defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">;

	// _WIDE compare records. `cmpeq` / `cmpne` expand sve_int_cmp_0_wide with
	// selectors 0b010 and 0b011; the remaining eight mnemonics expand
	// sve_int_cmp_1_wide and use every 3-bit selector value once.
	defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">;
	defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">;
	defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">;
	defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">;
	defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">;
	defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">;
	defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">;
	defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">;
	defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">;
	defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">;

	// Compare records in the _PPzZI form. `cmpge/gt/lt/le/eq/ne` expand
	// sve_int_scmp_vi with a 3-bit selector; `cmphs/hi/lo/ls` expand
	// sve_int_ucmp_vi with a 2-bit selector.
	defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">;
	defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">;
	defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">;
	defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">;
	defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">;
	defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">;
	defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">;
	defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">;
	defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">;
	defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">;

	// `fcm*` / `fac*` records in the _PPzZZ form, expanded from
	// sve_fp_3op_p_pd. Selector value 0b110 is not used in this list.
	defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">;
	defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">;
	defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">;
	defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">;
	defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">;
	defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">;
	defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">;

	// `fcm*` records in the _PPzZ0 form, expanded from sve_fp_2op_p_pd
	// (presumably the compare-with-zero forms, going by the Z0 suffix; confirm
	// in the class definition). Selector values 0b101 and 0b111 are not used.
	defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
	defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
	defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">;
	defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">;
	defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
	defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;

	// `whilelt/le/lo/ls` in the _PWW form, expanded from sve_int_while4_rr.
	defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">;
	defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">;
	defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">;
	defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">;

	// The same four mnemonics in the _PXX form, expanded from
	// sve_int_while8_rr with identical selectors.
	defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">;
	defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">;
	defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">;
	defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">;

	// `ctermeq` / `ctermne` records. In this list the first single-bit argument
	// goes with the register class (0 with GPR32, 1 with GPR64) and the second
	// with the mnemonic (0 = `ctermeq`, 1 = `ctermne`).
	def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
	def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
	def CTERMEQ_XX : sve_int_cterm<0b1, 0b0, "ctermeq", GPR64>;
	def CTERMNE_XX : sve_int_cterm<0b1, 0b1, "ctermne", GPR64>;

	// `rdvl`, `addvl` and `addpl`. `rdvl` has its own class
	// (sve_int_read_vl_a); `addvl` / `addpl` share sve_int_arith_vl and differ
	// only in the single-bit argument.
	def RDVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdvl">;
	def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
	def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;

	// `cntb/h/w/d` expand sve_int_count using the even 3-bit selector values;
	// `cntp` is separate and expands sve_int_pcount_pred with a 4-bit argument.
	defm CNTB_XPiI : sve_int_count<0b000, "cntb">;
	defm CNTH_XPiI : sve_int_count<0b010, "cnth">;
	defm CNTW_XPiI : sve_int_count<0b100, "cntw">;
	defm CNTD_XPiI : sve_int_count<0b110, "cntd">;
	defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">;

	defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
	defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
	defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">;
	defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">;
	defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">;
	defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">;
	defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
	defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;

	defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">;
	defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">;
	defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">;
	defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">;
	defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">;
	defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">;
	defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">;
	defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">;

	defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">;
	defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">;
	defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">;
	defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">;
	defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">;
	defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">;
	defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">;
	defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">;

	defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">;
	defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">;
	defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">;
	defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">;
	defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">;
	defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">;
	defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">;
	defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">;

	defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">;
	defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">;
	defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">;
	defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">;
	defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">;
	defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">;
	defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">;
	defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">;

	defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>;
	defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>;
	defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>;
	defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>;
	defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
	defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
	defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>;
	defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>;
	defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>;
	defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>;
	defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
	defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
	defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>;
	defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>;
	defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>;
	defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>;
	defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
	defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;

	defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">;
	defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">;
	defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">;
	defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">;
	defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">;
	defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">;
	defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">;
	defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">;
	defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
	defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;

	defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">;
	defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">;
	defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">;
	defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">;
	defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
	defm DECP_ZP : sve_int_count_v<0b10100, "decp">;

	defm INDEX_RR : sve_int_index_rr<"index">;
	defm INDEX_IR : sve_int_index_ir<"index">;
	defm INDEX_RI : sve_int_index_ri<"index">;
	defm INDEX_II : sve_int_index_ii<"index">;

	// Unpredicated shifts
	defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
	defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
	defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;

	defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
	defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
	defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;

	// Predicated shifts
	defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
	defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
	defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
	defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">;

	defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
	defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
	defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">;
	defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">;
	defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">;
	defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">;

	defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">;
	defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">;
	defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">;

	def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>;
	def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>;
	def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>;
	def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>;
	def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>;
	def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>;
	def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>;
	def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>;
	def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>;
	def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>;
	def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>;
	def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>;
	def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>;
	def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>;
	def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>;
	def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>;
	def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>;
	def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>;
	def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>;
	def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>;
	def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>;
	def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>;
	def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>;
	def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>;
	def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>;
	def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>;
	def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>;
	def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>;
	def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>;
	def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>;
	def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>;
	def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>;
	def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>;
	def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>;

	defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">;
	defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">;
	defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">;
	defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">;
	defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">;
	defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">;
	defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">;
	defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">;
	defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">;

	// InstAliases
	def : InstAlias<"mov $Zd, $Zn",
	(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
	def : InstAlias<"mov $Pd, $Pg/m, $Pn",
	(SEL_PPPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pd), 1>;
	def : InstAlias<"mov $Pd, $Pn",
	(ORR_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
	def : InstAlias<"mov $Pd, $Pg/z, $Pn",
	(AND_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;

	def : InstAlias<"movs $Pd, $Pn",
	(ORRS_PPzPP PPR8:$Pd, PPR8:$Pn, PPR8:$Pn, PPR8:$Pn), 1>;
	def : InstAlias<"movs $Pd, $Pg/z, $Pn",
	(ANDS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPR8:$Pn), 1>;

	def : InstAlias<"not $Pd, $Pg/z, $Pn",
	(EOR_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;

	def : InstAlias<"nots $Pd, $Pg/z, $Pn",
	(EORS_PPzPP PPR8:$Pd, PPRAny:$Pg, PPR8:$Pn, PPRAny:$Pg), 1>;

	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmple $Zd, $Pg/z, $Zm, $Zn",
	(CMPGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmplo $Zd, $Pg/z, $Zm, $Zn",
	(CMPHI_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmpls $Zd, $Pg/z, $Zm, $Zn",
	(CMPHS_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_B PPR8:$Zd, PPR3bAny:$Pg, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"cmplt $Zd, $Pg/z, $Zm, $Zn",
	(CMPGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"facle $Zd, $Pg/z, $Zm, $Zn",
	(FACGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"faclt $Zd, $Pg/z, $Zm, $Zn",
	(FACGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"fcmle $Zd, $Pg/z, $Zm, $Zn",
	(FCMGE_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;

	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_H PPR16:$Zd, PPR3bAny:$Pg, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
	(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
	}

	let Predicates = [HasSVE2] in {
	// SVE2 integer multiply-add (indexed)
	defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">;
	defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">;

	// SVE2 saturating multiply-add high (indexed)
	defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">;
	defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">;

	// SVE2 saturating multiply-add high (vectors, unpredicated)
	defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">;
	defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">;

	// SVE2 integer multiply (indexed)
	defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">;

	// SVE2 saturating multiply high (indexed)
	defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">;
	defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">;

	// SVE2 signed saturating doubling multiply high (unpredicated)
	defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">;
	defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">;

	// SVE2 integer multiply vectors (unpredicated)
	defm MUL_ZZZ : sve2_int_mul<0b000, "mul">;
	defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">;
	defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">;
	def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>;

	// SVE2 complex integer dot product (indexed)
	defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">;

	// SVE2 complex integer dot product
	defm CDOT_ZZZ : sve2_cintx_dot<"cdot">;

	// SVE2 complex integer multiply-add (indexed)
	defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">;
	// SVE2 complex saturating multiply-add (indexed)
	defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">;

	// SVE2 complex integer multiply-add
	defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">;
	defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">;

	// SVE2 integer multiply long (indexed)
	defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
	defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
	defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
	defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;

	// SVE2 saturating multiply (indexed)
	defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
	defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;

	// SVE2 integer multiply-add long (indexed)
	defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">;
	defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">;
	defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">;
	defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">;
	defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">;
	defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">;
	defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">;
	defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">;

	// SVE2 integer multiply-add long (vectors, unpredicated)
	defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">;
	defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">;
	defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">;
	defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">;
	defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">;
	defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">;
	defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">;
	defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">;

	// SVE2 saturating multiply-add long (indexed)
	defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">;
	defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">;
	defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">;
	defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">;

	// SVE2 saturating multiply-add long (vectors, unpredicated)
	defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">;
	defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">;
	defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">;
	defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">;

	// SVE2 saturating multiply-add interleaved long
	defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">;
	defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;

	// SVE2 integer halving add/subtract (predicated)
	defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">;
	defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">;
	defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">;
	defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">;
	defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">;
	defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">;
	defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">;
	defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">;

	// SVE2 integer pairwise add and accumulate long
	defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">;
	defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">;

	// SVE2 integer pairwise arithmetic
	defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">;
	defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">;
	defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">;
	defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">;
	defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">;

	// SVE2 integer unary operations (predicated)
	defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
	defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">;
	defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">;
	defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;

	// SVE2 saturating add/subtract
	defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">;
	defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">;
	defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">;
	defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">;
	defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">;
	defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">;
	defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">;
	defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">;

	// SVE2 saturating/rounding bitwise shift left (predicated)
	defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">;
	defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">;
	defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">;
	defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">;
	defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">;
	defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">;
	defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">;
	defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">;
	defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">;
	defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">;
	defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
	defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;

	+ // SVE2 predicated shifts
	+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
	+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
	+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
	+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
	+ defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
	+
	// SVE2 integer add/subtract long
	defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
	defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
	defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">;
	defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">;
	defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">;
	defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">;
	defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">;
	defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">;
	defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">;
	defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">;
	defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">;
	defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">;

	// SVE2 integer add/subtract wide
	defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
	defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
	defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
	defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
	defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
	defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
	defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
	defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;

	// SVE2 integer multiply long
	defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">;
	defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">;
	defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">;
	defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">;
	defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">;
	defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">;
	defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">;
	defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;

	// SVE2 bitwise shift and insert
	- defm SRI_ZZI : sve2_int_bin_cons_shift_imm_right<0b0, "sri">;
	- defm SLI_ZZI : sve2_int_bin_cons_shift_imm_left< 0b1, "sli">;
	+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
	+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;

	// SVE2 bitwise shift right and accumulate
	- defm SSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b00, "ssra">;
	- defm USRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b01, "usra">;
	- defm SRSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b10, "srsra">;
	- defm URSRA_ZZI : sve2_int_bin_accum_cons_shift_imm_right<0b11, "ursra">;
	+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
	+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
	+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
	+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;

	// SVE2 complex integer add
	defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
	defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">;

	// SVE2 integer absolute difference and accumulate
	defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">;
	defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">;

	// SVE2 integer absolute difference and accumulate long
	defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">;
	defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">;
	defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">;
	defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">;

	// SVE2 integer add/subtract long with carry
	defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">;
	defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">;
	defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
	defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;

	- // SVE2 bitwise shift right narrow
	- defm SQSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0000, "sqshrunb">;
	- defm SQSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0001, "sqshrunt">;
	- defm SQRSHRUNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0010, "sqrshrunb">;
	- defm SQRSHRUNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0011, "sqrshrunt">;
	- defm SHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0100, "shrnb">;
	- defm SHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0101, "shrnt">;
	- defm RSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0110, "rshrnb">;
	- defm RSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b0111, "rshrnt">;
	- defm SQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1000, "sqshrnb">;
	- defm SQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1001, "sqshrnt">;
	- defm SQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1010, "sqrshrnb">;
	- defm SQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1011, "sqrshrnt">;
	- defm UQSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1100, "uqshrnb">;
	- defm UQSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1101, "uqshrnt">;
	- defm UQRSHRNB_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1110, "uqrshrnb">;
	- defm UQRSHRNT_ZZI : sve2_int_bin_cons_shift_imm_right_narrow<0b1111, "uqrshrnt">;
	+ // SVE2 bitwise shift right narrow (bottom)
	+ defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
	+ defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
	+ defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
	+ defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
	+ defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
	+ defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
	+ defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
	+ defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;

	- // SVE2 integer add/subtract narrow high part
	- defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b000, "addhnb">;
	- defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b001, "addhnt">;
	- defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high<0b010, "raddhnb">;
	- defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high<0b011, "raddhnt">;
	- defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b100, "subhnb">;
	- defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b101, "subhnt">;
	- defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high<0b110, "rsubhnb">;
	- defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high<0b111, "rsubhnt">;
	+ // SVE2 bitwise shift right narrow (top)
	+ defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
	+ defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
	+ defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
	+ defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
	+ defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
	+ defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
	+ defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
	+ defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;

	- // SVE2 saturating extract narrow
	- defm SQXTNB_ZZ : sve2_int_sat_extract_narrow<0b000, "sqxtnb">;
	- defm SQXTNT_ZZ : sve2_int_sat_extract_narrow<0b001, "sqxtnt">;
	- defm UQXTNB_ZZ : sve2_int_sat_extract_narrow<0b010, "uqxtnb">;
	- defm UQXTNT_ZZ : sve2_int_sat_extract_narrow<0b011, "uqxtnt">;
	- defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow<0b100, "sqxtunb">;
	- defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow<0b101, "sqxtunt">;
	+ // SVE2 integer add/subtract narrow high part (bottom)
	+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
	+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
	+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
	+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;

	+ // SVE2 integer add/subtract narrow high part (top)
	+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
	+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
	+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
	+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
	+
	+ // SVE2 saturating extract narrow (bottom)
	+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
	+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
	+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
	+
	+ // SVE2 saturating extract narrow (top)
	+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
	+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
	+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;
	+
	// SVE2 character match
	defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
	defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">;

	// SVE2 bitwise exclusive-or interleaved
	defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
	defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;

	// SVE2 bitwise shift left long
	defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
	defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
	defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
	defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;

	// SVE2 integer add/subtract interleaved long
	defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">;
	defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">;
	defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">;

	// SVE2 histogram generation (segment)
	def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">;

	// SVE2 histogram generation (vector)
	defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;

	+ // SVE2 floating-point base 2 logarithm as integer
	+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
	+
	// SVE2 floating-point convert precision
	defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
	defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
	defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
	+ def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;

	// SVE2 floating-point pairwise operations
	defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
	defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">;
	defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">;
	defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">;
	defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">;

	// SVE2 floating-point multiply-add long (indexed)
	def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">;
	def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">;
	def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">;
	def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">;

	// SVE2 floating-point multiply-add long
	def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">;
	def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">;
	def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">;
	def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">;

	// SVE2 bitwise ternary operations
	defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
	defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">;
	def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">;
	def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">;
	def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
	def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;

	- // sve_int_rotate_imm
	+ // SVE2 bitwise xor and rotate right by immediate
	defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;

	// SVE2 extract vector (immediate offset, constructive)
	def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;

	- // SVE floating-point convert precision
	- def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
	+ // SVE2 non-temporal gather loads
	+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
	+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
	+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
	+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
	+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;

	- // SVE floating-point convert to integer
	- defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
	+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
	+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
	+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
	+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
	+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
	+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
	+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;

	- // Non-temporal contiguous loads (vector + register)
	- defm LDNT1SB_ZZR_S : sve2_mem_cldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
	- defm LDNT1B_ZZR_S : sve2_mem_cldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
	- defm LDNT1SH_ZZR_S : sve2_mem_cldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
	- defm LDNT1H_ZZR_S : sve2_mem_cldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
	- defm LDNT1W_ZZR_S : sve2_mem_cldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
	-
	- defm LDNT1SB_ZZR_D : sve2_mem_cldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
	- defm LDNT1B_ZZR_D : sve2_mem_cldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
	- defm LDNT1SH_ZZR_D : sve2_mem_cldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
	- defm LDNT1H_ZZR_D : sve2_mem_cldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
	- defm LDNT1SW_ZZR_D : sve2_mem_cldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
	- defm LDNT1W_ZZR_D : sve2_mem_cldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
	- defm LDNT1D_ZZR_D : sve2_mem_cldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
	-
	// SVE2 vector splice (constructive)
	defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;

	- // Predicated shifts
	- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
	- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
	- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
	- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
	- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
	+ // SVE2 non-temporal scatter stores
	+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
	+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
	+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;

	- // Non-temporal contiguous stores (vector + register)
	- defm STNT1B_ZZR_S : sve2_mem_cstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
	- defm STNT1H_ZZR_S : sve2_mem_cstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
	- defm STNT1W_ZZR_S : sve2_mem_cstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
	+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
	+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
	+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
	+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;

	- defm STNT1B_ZZR_D : sve2_mem_cstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
	- defm STNT1H_ZZR_D : sve2_mem_cstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
	- defm STNT1W_ZZR_D : sve2_mem_cstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
	- defm STNT1D_ZZR_D : sve2_mem_cstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
	-
	- // SVE table lookup (three sources)
	+ // SVE2 table lookup (three sources)
	defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
	defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;

	- // SVE integer compare scalar count and limit
	+ // SVE2 integer compare scalar count and limit
	defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
	defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
	defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
	defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">;

	defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">;
	defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">;
	defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
	defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;

	- // SVE pointer conflict compare
	+ // SVE2 pointer conflict compare
	defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
	defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
	}

	// Every record defined inside this block carries the HasSVE2AES predicate.
	let Predicates = [HasSVE2AES] in {
	// SVE2 crypto destructive binary operations
	def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>;
	def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>;

	// SVE2 crypto unary operations
	def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">;
	def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">;

	// PMULLB and PMULLT instructions which operate with 64-bit source and
	// 128-bit destination elements are enabled with crypto extensions, similar
	// to NEON PMULL2 instruction.
	def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb",
	ZPR128, ZPR64, ZPR64>;
	def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt",
	ZPR128, ZPR64, ZPR64>;
	}

	// Every record defined inside this block carries the HasSVE2SM4 predicate.
	let Predicates = [HasSVE2SM4] in {
	// SVE2 crypto constructive binary operations
	def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>;
	// SVE2 crypto destructive binary operations
	def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>;
	}

	// Every record defined inside this block carries the HasSVE2SHA3 predicate.
	let Predicates = [HasSVE2SHA3] in {
	// SVE2 crypto constructive binary operations
	def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>;
	}

	// Every record defined inside this block carries the HasSVE2BitPerm
	// predicate.
	let Predicates = [HasSVE2BitPerm] in {
	// SVE2 bitwise permute
	defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">;
	defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">;
	defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (revision 351303)
	@@ -1,1002 +1,1015 @@
	//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64ExpandImm.h"
	#include "AArch64TargetTransformInfo.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/CodeGen/BasicTTIImpl.h"
	#include "llvm/CodeGen/CostTable.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/Support/Debug.h"
	#include <algorithm>
	using namespace llvm;

	#define DEBUG_TYPE "aarch64tti"

	// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix";
	// initialized to true.
	static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
	cl::init(true), cl::Hidden);

	bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
	const Function *Callee) const {
	const TargetMachine &TM = getTLI()->getTargetMachine();

	const FeatureBitset &CallerBits =
	TM.getSubtargetImpl(*Caller)->getFeatureBits();
	const FeatureBitset &CalleeBits =
	TM.getSubtargetImpl(*Callee)->getFeatureBits();

	// Inline a callee if its target-features are a subset of the callers
	// target-features.
	return (CallerBits & CalleeBits) == CalleeBits;
	}

	/// Calculate the cost of materializing a 64-bit value. This helper
	/// method might only calculate a fraction of a larger immediate. Therefore it
	/// is valid to return a cost of ZERO.
	///
	/// \param Val The 64-bit chunk to cost.
	/// \returns 0 when \p Val is zero or accepted by
	/// AArch64_AM::isLogicalImmediate; otherwise the number of entries that
	/// AArch64_IMM::expandMOVImm produces for it.
	int AArch64TTIImpl::getIntImmCost(int64_t Val) {
	  // Check if the immediate can be encoded within an instruction.
	  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
	    return 0;

	  // Negative values are costed through their bitwise complement.
	  if (Val < 0)
	    Val = ~Val;

	  // Calculate how many moves we will need to materialize this constant.
	  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
	  AArch64_IMM::expandMOVImm(Val, 64, Insn);
	  return Insn.size();
	}

	/// Calculate the cost of materializing the given constant.
	int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
	assert(Ty->isIntegerTy());

	unsigned BitSize = Ty->getPrimitiveSizeInBits();
	if (BitSize == 0)
	return ~0U;

	// Sign-extend all constants to a multiple of 64-bit.
	APInt ImmVal = Imm;
	if (BitSize & 0x3f)
	ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

	// Split the constant into 64-bit chunks and calculate the cost for each
	// chunk.
	int Cost = 0;
	for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
	APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
	int64_t Val = Tmp.getSExtValue();
	Cost += getIntImmCost(Val);
	}
	// We need at least one instruction to materialze the constant.
	return std::max(1, Cost);
	}

	/// Cost of the immediate \p Imm when it appears as operand number \p Idx of
	/// an instruction with opcode \p Opcode.
	///
	/// Opcodes not listed in the switch are always reported as TCC_Free. For the
	/// listed ones, an immediate sitting in the opcode's designated operand slot
	/// is free when its materialization cost does not exceed one basic cost per
	/// 64-bit piece; any other immediate gets its full materialization cost.
	int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
	                                  const APInt &Imm, Type *Ty) {
	  assert(Ty->isIntegerTy());

	  const unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  // There is no cost model for zero-width constants; reporting TCC_Free makes
	  // constant hoisting ignore them.
	  if (BitSize == 0)
	    return TTI::TCC_Free;

	  // Operand slot in which a cheap immediate is treated as free; ~0U means
	  // the opcode has no such slot.
	  unsigned ImmIdx = ~0U;
	  switch (Opcode) {
	  case Instruction::GetElementPtr:
	    // Always hoist the base address of a GetElementPtr.
	    return Idx == 0 ? 2 * TTI::TCC_Basic : static_cast<int>(TTI::TCC_Free);
	  case Instruction::Store:
	    ImmIdx = 0;
	    break;
	  case Instruction::Add:
	  case Instruction::Sub:
	  case Instruction::Mul:
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor:
	  case Instruction::ICmp:
	    ImmIdx = 1;
	    break;
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	    // The shift amount of a shift instruction is always free.
	    if (Idx == 1)
	      return TTI::TCC_Free;
	    break;
	  case Instruction::Trunc:
	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::IntToPtr:
	  case Instruction::PtrToInt:
	  case Instruction::BitCast:
	  case Instruction::PHI:
	  case Instruction::Call:
	  case Instruction::Select:
	  case Instruction::Ret:
	  case Instruction::Load:
	    break;
	  default:
	    return TTI::TCC_Free;
	  }

	  const int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
	  if (Idx != ImmIdx)
	    return Cost;

	  // Immediate is in the designated slot: free unless it is more expensive
	  // than one basic instruction per 64-bit piece.
	  const int NumConstants = (BitSize + 63) / 64;
	  if (Cost <= NumConstants * TTI::TCC_Basic)
	    return static_cast<int>(TTI::TCC_Free);
	  return Cost;
	}

	/// Cost of the immediate \p Imm when it appears as argument number \p Idx of
	/// a call to intrinsic \p IID.
	///
	/// Intrinsics not listed in the switch always report TCC_Free. For the
	/// overflow-checking arithmetic intrinsics, an immediate at index 1 is free
	/// when its materialization cost does not exceed one basic cost per 64-bit
	/// piece. For stackmap/patchpoint, the leading arguments (index below 2 or
	/// 4 respectively) and immediates of at most 64 bits are free. Everything
	/// else gets the full materialization cost.
	int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
	                                  const APInt &Imm, Type *Ty) {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  // There is no cost model for constants with a bit size of 0. Return TCC_Free
	  // here, so that constant hoisting will ignore this constant.
	  if (BitSize == 0)
	    return TTI::TCC_Free;

	  switch (IID) {
	  default:
	    return TTI::TCC_Free;
	  case Intrinsic::sadd_with_overflow:
	  case Intrinsic::uadd_with_overflow:
	  case Intrinsic::ssub_with_overflow:
	  case Intrinsic::usub_with_overflow:
	  case Intrinsic::smul_with_overflow:
	  case Intrinsic::umul_with_overflow:
	    if (Idx == 1) {
	      int NumConstants = (BitSize + 63) / 64;
	      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
	      return (Cost <= NumConstants * TTI::TCC_Basic)
	                 ? static_cast<int>(TTI::TCC_Free)
	                 : Cost;
	    }
	    break;
	  case Intrinsic::experimental_stackmap:
	    if ((Idx < 2) ||
	        (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
	      return TTI::TCC_Free;
	    break;
	  case Intrinsic::experimental_patchpoint_void:
	  case Intrinsic::experimental_patchpoint_i64:
	    if ((Idx < 4) ||
	        (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
	      return TTI::TCC_Free;
	    break;
	  }
	  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
	}

	/// Report the kind of population-count support for integers of
	/// \p TyWidth bits.
	///
	/// \param TyWidth Bit width of the type; asserted to be a power of two.
	/// \returns TTI::PSK_FastHardware for widths 32 and 64, TTI::PSK_Software
	/// otherwise.
	TargetTransformInfo::PopcntSupportKind
	AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
	  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
	  if (TyWidth == 32 || TyWidth == 64)
	    return TTI::PSK_FastHardware;
	  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
	  return TTI::PSK_Software;
	}

	/// Determine whether an instruction with opcode \p Opcode, result type
	/// \p DstTy and operands \p Args can be treated as a widening instruction,
	/// i.e. one whose second operand is a single-use sign- or zero-extend that
	/// doubles the element width.
	///
	/// \returns true only for Add/Sub on vectors with elements of at least 16
	/// bits, where both the destination and the extend's source legalize to
	/// vectors of unchanged element size with the same total element count, and
	/// the destination elements are exactly twice as wide as the source ones.
	bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
	                                           ArrayRef<const Value *> Args) {

	  // A helper that returns a vector type with the scalar type of ArgTy and as
	  // many elements as DstTy. Only called after DstTy is known to be a vector.
	  auto toVectorTy = [&](Type *ArgTy) {
	    return VectorType::get(ArgTy->getScalarType(),
	                           DstTy->getVectorNumElements());
	  };

	  // Exit early if DstTy is not a vector type whose elements are at least
	  // 16-bits wide.
	  if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
	    return false;

	  // Determine if the operation has a widening variant. We consider both the
	  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
	  // instructions.
	  //
	  // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
	  // verify that their extending operands are eliminated during code
	  // generation.
	  switch (Opcode) {
	  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
	  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
	    break;
	  default:
	    return false;
	  }

	  // To be a widening instruction (either the "wide" or "long" versions), the
	  // second operand must be a sign- or zero extend having a single user. We
	  // only consider extends having a single user because they may otherwise not
	  // be eliminated.
	  if (Args.size() != 2 ||
	      (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
	      !Args[1]->hasOneUse())
	    return false;
	  auto *Extend = cast<CastInst>(Args[1]);

	  // Legalize the destination type and ensure it can be used in a widening
	  // operation.
	  auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
	  unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
	  if (!DstTyL.second.isVector() ||
	      DstElTySize != DstTy->getScalarSizeInBits())
	    return false;

	  // Legalize the source type and ensure it can be used in a widening
	  // operation.
	  Type *SrcTy = toVectorTy(Extend->getSrcTy());
	  auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
	  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
	  if (!SrcTyL.second.isVector() ||
	      SrcElTySize != SrcTy->getScalarSizeInBits())
	    return false;

	  // Get the total number of vector elements in the legalized types.
	  unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
	  unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();

	  // Return true if the legalized types have the same number of vector elements
	  // and the destination element type size is twice that of the source type.
	  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
	}

	/// Estimate the cost of a cast with opcode \p Opcode from \p Src to \p Dst.
	///
	/// \param Opcode The cast instruction opcode.
	/// \param Dst    Destination type.
	/// \param Src    Source type.
	/// \param I      The cast instruction itself when available, or null.
	/// \returns 0 when the cast feeds a widening instruction that absorbs it,
	/// the matching entry of the conversion table when both types are simple
	/// and listed, and the BaseT estimate otherwise.
	int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
	                                     const Instruction *I) {
	  int ISD = TLI->InstructionOpcodeToISD(Opcode);
	  assert(ISD && "Invalid opcode");

	  // If the cast is observable, and it is used by a widening instruction (e.g.,
	  // uaddl, saddw, etc.), it may be free.
	  if (I && I->hasOneUse()) {
	    auto *SingleUser = cast<Instruction>(*I->user_begin());
	    SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
	    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
	      // If the cast is the second operand, it is free. We will generate either
	      // a "wide" or "long" version of the widening instruction.
	      if (I == SingleUser->getOperand(1))
	        return 0;
	      // If the cast is not the second operand, it will be free if it looks the
	      // same as the second operand. In this case, we will generate a "long"
	      // version of the widening instruction.
	      if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
	        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
	            cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
	          return 0;
	    }
	  }

	  EVT SrcTy = TLI->getValueType(DL, Src);
	  EVT DstTy = TLI->getValueType(DL, Dst);

	  // The table below is keyed on simple value types only.
	  if (!SrcTy.isSimple() || !DstTy.isSimple())
	    return BaseT::getCastInstrCost(Opcode, Dst, Src);

	  static const TypeConversionCostTblEntry
	  ConversionTbl[] = {
	    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
	    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
	    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
	    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },

	    // The number of shll instructions for the extension.
	    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
	    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
	    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
	    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
	    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
	    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
	    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
	    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
	    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
	    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
	    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
	    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
	    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
	    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
	    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },

	    // LowerVectorINT_TO_FP:
	    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
	    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
	    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
	    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
	    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
	    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

	    // Complex: to v2f32
	    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
	    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
	    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
	    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
	    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
	    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },

	    // Complex: to v4f32
	    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
	    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
	    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
	    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

	    // Complex: to v8f32
	    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
	    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
	    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
	    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },

	    // Complex: to v16f32
	    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
	    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },

	    // Complex: to v2f64
	    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
	    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
	    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
	    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
	    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
	    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },


	    // LowerVectorFP_TO_INT
	    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
	    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
	    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
	    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
	    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
	    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

	    // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
	    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
	    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
	    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
	    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
	    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
	    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },

	    // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
	    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
	    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
	    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
	    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },

	    // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
	    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
	    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
	    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
	    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
	    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
	    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
	  };

	  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
	                                                 DstTy.getSimpleVT(),
	                                                 SrcTy.getSimpleVT()))
	    return Entry->Cost;

	  return BaseT::getCastInstrCost(Opcode, Dst, Src);
	}

	/// Estimate the combined cost of extracting element \p Index from a vector
	/// of type \p VecTy and sign- or zero-extending it to \p Dst.
	///
	/// \param Opcode Instruction::SExt or Instruction::ZExt (asserted).
	/// \param Dst    Integer destination type of the extend.
	/// \param VecTy  Vector type the element is extracted from.
	/// \param Index  Index of the extracted element.
	/// \returns the extract cost alone when the extend is considered free, and
	/// the extract cost plus the cast cost otherwise.
	int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
	                                             VectorType *VecTy,
	                                             unsigned Index) {

	  // Make sure we were given a valid extend opcode.
	  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
	         "Invalid opcode");

	  // We are extending an element we extract from a vector, so the source type
	  // of the extend is the element type of the vector.
	  auto *Src = VecTy->getElementType();

	  // Sign- and zero-extends are for integer types only.
	  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

	  // Get the cost for the extract. We compute the cost (if any) for the extend
	  // below.
	  auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);

	  // Legalize the types.
	  auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
	  auto DstVT = TLI->getValueType(DL, Dst);
	  auto SrcVT = TLI->getValueType(DL, Src);

	  // If the resulting type is still a vector and the destination type is legal,
	  // we may get the extension for free. If not, get the default cost for the
	  // extend.
	  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
	    return Cost + getCastInstrCost(Opcode, Dst, Src);

	  // The destination type should be larger than the element type. If not, get
	  // the default cost for the extend.
	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
	    return Cost + getCastInstrCost(Opcode, Dst, Src);

	  switch (Opcode) {
	  default:
	    llvm_unreachable("Opcode should be either SExt or ZExt");

	  // For sign-extends, we only need a smov, which performs the extension
	  // automatically.
	  case Instruction::SExt:
	    return Cost;

	  // For zero-extends, the extend is performed automatically by a umov unless
	  // the destination type is i64 and the element type is i8 or i16.
	  case Instruction::ZExt:
	    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
	      return Cost;
	  }

	  // If we are unable to perform the extend for free, get the default cost.
	  return Cost + getCastInstrCost(Opcode, Dst, Src);
	}

	int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
	unsigned Index) {
	assert(Val->isVectorTy() && "This must be a vector type");

	if (Index != -1U) {
	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);

	// This type is legalized to a scalar type.
	if (!LT.second.isVector())
	return 0;

	// The type may be split. Normalize the index to the new type.
	unsigned Width = LT.second.getVectorNumElements();
	Index = Index % Width;

	// The element at index zero is already inside the vector.
	if (Index == 0)
	return 0;
	}

	// All other insert/extracts cost this much.
	return ST->getVectorInsertExtractBaseCost();
	}

	/// Estimate the cost of an arithmetic instruction with opcode \p Opcode on
	/// type \p Ty.
	///
	/// The subtarget's widening base cost is added when isWideningInstruction
	/// accepts the instruction. SDIV and UDIV get dedicated handling (including
	/// division by a uniform constant), ADD/MUL/XOR/OR/AND are charged per
	/// legalized part, and every other ISD opcode defers to BaseT.
	///
	/// \param Opd1Info, Opd2Info         Value kinds of the two operands.
	/// \param Opd1PropInfo, Opd2PropInfo Value properties of the two operands.
	/// \param Args                       Operands, used only for the widening
	///                                   check.
	int AArch64TTIImpl::getArithmeticInstrCost(
	unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
	TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
	TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
	// Legalize the type.
	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

	// If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
	// add in the widening overhead specified by the sub-target. Since the
	// extends feeding widening instructions are performed automatically, they
	// aren't present in the generated code and have a zero cost. By adding a
	// widening overhead here, we attach the total cost of the combined operation
	// to the widening instruction.
	int Cost = 0;
	if (isWideningInstruction(Ty, Opcode, Args))
	Cost += ST->getWideningBaseCost();

	int ISD = TLI->InstructionOpcodeToISD(Opcode);

	switch (ISD) {
	default:
	return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
	Opd1PropInfo, Opd2PropInfo);
	case ISD::SDIV:
	if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
	Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
	// On AArch64, scalar signed division by a power-of-two constant is
	// normally expanded to the sequence ADD + CMP + SELECT + SRA.
	// The OperandValue properties may not be the same as those of the
	// previous operation; conservatively assume OP_None.
	Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	return Cost;
	}
	// Any other SDIV shares the UDIV handling below.
	LLVM_FALLTHROUGH;
	case ISD::UDIV:
	if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
	auto VT = TLI->getValueType(DL, Ty);
	if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
	// Vector signed division by constant are expanded to the
	// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
	// to MULHS + SUB + SRL + ADD + SRL.
	int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
	Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
	Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
	Opd2Info,
	TargetTransformInfo::OP_None,
	TargetTransformInfo::OP_None);
	return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
	}
	}

	Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
	Opd1PropInfo, Opd2PropInfo);
	if (Ty->isVectorTy()) {
	// On AArch64, vector divisions are not supported natively and are
	// expanded into scalar divisions of each pair of elements.
	Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
	Opd2Info, Opd1PropInfo, Opd2PropInfo);
	Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
	Opd2Info, Opd1PropInfo, Opd2PropInfo);
	// TODO: if one of the arguments is scalar, then it's not necessary to
	// double the cost of handling the vector elements.
	Cost += Cost;
	}
	return Cost;

	case ISD::ADD:
	case ISD::MUL:
	case ISD::XOR:
	case ISD::OR:
	case ISD::AND:
	// These nodes are marked as 'custom' for combining purposes only.
	// We know that they are legal. See LowerAdd in ISelLowering.
	// One unit (plus any widening overhead) per legalized part.
	return (Cost + 1) * LT.first;
	}
	}

	int AArch64TTIImpl::getAddressComputationCost(Type Ty, ScalarEvolution SE,
	const SCEV *Ptr) {
	// Address computations in vectorized code with non-consecutive addresses will
	// likely result in more instructions compared to scalar code where the
	// computation can more often be merged into the index mode. The resulting
	// extra micro-ops can significantly decrease throughput.
	unsigned NumVectorInstToHideOverhead = 10;
	int MaxMergeDistance = 64;

	if (Ty->isVectorTy() && SE &&
	!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
	return NumVectorInstToHideOverhead;

	// In many cases the address computation is not merged into the instruction
	// addressing mode.
	return 1;
	}

	/// Estimate the cost of a compare or select instruction.
	///
	/// \param Opcode The instruction opcode.
	/// \param ValTy  Type of the compared/selected values.
	/// \param CondTy Type of the condition.
	/// \param I      The instruction itself, forwarded to BaseT.
	/// \returns the matching entry of the vector-select table when \p ValTy is
	/// a vector, the opcode maps to ISD::SELECT and both types are simple and
	/// listed; otherwise the BaseT estimate.
	int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
	                                       Type *CondTy, const Instruction *I) {

	  int ISD = TLI->InstructionOpcodeToISD(Opcode);
	  // We don't lower some vector selects well that are wider than the register
	  // width.
	  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
	    // We would need this many instructions to hide the scalarization happening.
	    const int AmortizationCost = 20;
	    static const TypeConversionCostTblEntry
	    VectorSelectTbl[] = {
	      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
	      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
	      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
	      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
	      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
	      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
	    };

	    EVT SelCondTy = TLI->getValueType(DL, CondTy);
	    EVT SelValTy = TLI->getValueType(DL, ValTy);
	    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
	      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
	                                                     SelCondTy.getSimpleVT(),
	                                                     SelValTy.getSimpleVT()))
	        return Entry->Cost;
	    }
	  }
	  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
	}

	+/// Build the options used for expanding memcmp calls.
	+///
	+/// Overlapping loads are allowed unless the subtarget requires strict
	+/// alignment; the load budget comes from TLI->getMaxExpandSizeMemcmp for
	+/// \p OptSize and is also used as the per-block load count. \p IsZeroCmp is
	+/// not consulted here.
	+AArch64TTIImpl::TTI::MemCmpExpansionOptions
	+AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
	+ TTI::MemCmpExpansionOptions Options;
	+ Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
	+ Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
	+ Options.NumLoadsPerBlock = Options.MaxNumLoads;
	+ // TODO: Though vector loads usually perform well on AArch64, in some targets
	+ // they may wake up the FP unit, which raises the power consumption. Perhaps
	+ // they could be used with no holds barred (-O3).
	+ // Scalar load sizes only (presumably in bytes; confirm against the
	+ // MemCmpExpansionOptions declaration).
	+ Options.LoadSizes = {8, 4, 2, 1};
	+ return Options;
	+}
	+
	/// Estimate the cost of a load or store of type \p Ty.
	///
	/// \param Opcode       Instruction::Load or Instruction::Store.
	/// \param Ty           Type being accessed.
	/// \param Alignment    Alignment of the access.
	/// \param AddressSpace Not inspected here.
	/// \param I            Not inspected here.
	/// \returns a penalised cost for under-aligned 128-bit vector stores on
	/// subtargets where those are slow, a penalised cost for short i8 vectors,
	/// and the number of legalized parts otherwise.
	int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
	                                    unsigned Alignment, unsigned AddressSpace,
	                                    const Instruction *I) {
	  auto Legalized = TLI->getTypeLegalizationCost(DL, Ty);
	  const bool IsStore = Opcode == Instruction::Store;

	  if (ST->isMisaligned128StoreSlow() && IsStore &&
	      Legalized.second.is128BitVector() && Alignment < 16) {
	    // Unaligned stores are extremely inefficient. Not all unaligned 128-bit
	    // stores are split, because that has shown a negative impact in practice
	    // on inlined block copy code. Instead such stores are made expensive, so
	    // that vectorization only happens when 6 other instructions are
	    // vectorized along with them.
	    const int AmortizationCost = 6;
	    return Legalized.first * 2 * AmortizationCost;
	  }

	  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
	    // Stores use a custom trunc store lowering, so v.4b should be profitable.
	    // Loads are scalarized because there is no v.4b register and the
	    // elements have to be promoted to v.2.
	    const unsigned ProfitableNumElements = IsStore ? 4 : 8;

	    const unsigned NumVecElts = Ty->getVectorNumElements();
	    if (NumVecElts < ProfitableNumElements) {
	      const unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
	      // We generate 2 instructions per vector element.
	      return NumVectorizableInstsToAmortize * NumVecElts * 2;
	    }
	  }

	  return Legalized.first;
	}

	/// Estimate the cost of an interleaved load or store of \p VecTy with
	/// interleave factor \p Factor.
	///
	/// \returns Factor times the number of interleaved accesses when no masking
	/// is requested, the factor is supported by the target, the vector splits
	/// evenly into \p Factor sub-vectors and the sub-vector type is legal for
	/// interleaved access; otherwise the BaseT estimate.
	int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
	                                               unsigned Factor,
	                                               ArrayRef<unsigned> Indices,
	                                               unsigned Alignment,
	                                               unsigned AddressSpace,
	                                               bool UseMaskForCond,
	                                               bool UseMaskForGaps) {
	  assert(Factor >= 2 && "Invalid interleave factor");
	  assert(isa<VectorType>(VecTy) && "Expect a vector type");

	  const bool Unmasked = !(UseMaskForCond || UseMaskForGaps);
	  if (Unmasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
	    const unsigned NumElts = VecTy->getVectorNumElements();
	    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);

	    // ldN/stN only support legal vector types of size 64 or 128 in bits.
	    // Accesses having vector types that are a multiple of 128 bits can be
	    // matched to more than one ldN/stN instruction.
	    const bool SplitsEvenly = NumElts % Factor == 0;
	    if (SplitsEvenly && TLI->isLegalInterleavedAccessType(SubVecTy, DL))
	      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
	  }

	  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
	                                           Alignment, AddressSpace,
	                                           UseMaskForCond, UseMaskForGaps);
	}

	/// Sum, over every vector type in \p Tys whose total width is exactly 128
	/// bits, the cost of one store plus one load of that type (both queried with
	/// alignment 128 in address space 0). All other types contribute nothing.
	int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
	  int Total = 0;
	  for (Type *Ty : Tys) {
	    const bool Is128BitVector =
	        Ty->isVectorTy() &&
	        Ty->getScalarSizeInBits() * Ty->getVectorNumElements() == 128;
	    if (!Is128BitVector)
	      continue;

	    const int StoreCost = getMemoryOpCost(Instruction::Store, Ty, 128, 0);
	    const int LoadCost = getMemoryOpCost(Instruction::Load, Ty, 128, 0);
	    Total += StoreCost + LoadCost;
	  }
	  return Total;
	}

	/// Report the subtarget's maximum interleave factor. \p VF is not consulted.
	unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
	  const unsigned SubtargetFactor = ST->getMaxInterleaveFactor();
	  return SubtargetFactor;
	}

	// For Falkor, we want to avoid having too many strided loads in a loop since
	// that can exhaust the HW prefetcher resources. We adjust the unroller
	// MaxCount preference below to attempt to ensure unrolling doesn't create too
	// many strided loads.
	//
	// A load counts as "strided" here when its pointer operand is not loop
	// invariant and its SCEV is an affine add-recurrence. If at least one such
	// load is found, UP.MaxCount is set to the largest power of two that does not
	// exceed MaxStridedLoads divided by the number of strided loads; otherwise UP
	// is left untouched.
	static void
	getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
	                              TargetTransformInfo::UnrollingPreferences &UP) {
	  enum { MaxStridedLoads = 7 };
	  auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
	    int StridedLoads = 0;
	    // FIXME? We could make this more precise by looking at the CFG and
	    // e.g. not counting loads in each side of an if-then-else diamond.
	    for (const auto BB : L->blocks()) {
	      for (auto &I : *BB) {
	        LoadInst *LMemI = dyn_cast<LoadInst>(&I);
	        if (!LMemI)
	          continue;

	        Value *PtrValue = LMemI->getPointerOperand();
	        if (L->isLoopInvariant(PtrValue))
	          continue;

	        const SCEV *LSCEV = SE.getSCEV(PtrValue);
	        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
	        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
	          continue;

	        // FIXME? We could take pairing of unrolled load copies into account
	        // by looking at the AddRec, but we would probably have to limit this
	        // to loops with no stores or other memory optimization barriers.
	        ++StridedLoads;
	        // We've seen enough strided loads that seeing more won't make a
	        // difference: once the count exceeds MaxStridedLoads / 2 the quotient
	        // computed below is already 1.
	        if (StridedLoads > MaxStridedLoads / 2)
	          return StridedLoads;
	      }
	    }
	    return StridedLoads;
	  };

	  int StridedLoads = countStridedLoads(L, SE);
	  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
	                    << " strided loads\n");
	  // Pick the largest power of 2 unroll count that won't result in too many
	  // strided loads.
	  if (StridedLoads) {
	    UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
	    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
	                      << UP.MaxCount << '\n');
	  }
	}

	/// Fill \p UP with the AArch64 unrolling preferences for loop \p L: start
	/// from the generic defaults, double the partial-unroll threshold for nested
	/// loops, zero the opt-size partial threshold, and apply the Falkor
	/// strided-load limit when that subtarget and its option are both active.
	void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
	                                             TTI::UnrollingPreferences &UP) {
	  // Enable partial unrolling and runtime unrolling.
	  BaseT::getUnrollingPreferences(L, SE, UP);

	  // For inner loop, it is more likely to be a hot one, and the runtime check
	  // can be promoted out from LICM pass, so the overhead is less, let's try
	  // a larger threshold to unroll more loops.
	  const bool IsNestedLoop = L->getLoopDepth() > 1;
	  if (IsNestedLoop)
	    UP.PartialThreshold *= 2;

	  // Disable partial & runtime unrolling on -Os.
	  UP.PartialOptSizeThreshold = 0;

	  const bool ApplyFalkorLimit =
	      ST->getProcFamily() == AArch64Subtarget::Falkor &&
	      EnableFalkorHWPFUnrollFix;
	  if (!ApplyFalkorLimit)
	    return;
	  getFalkorUnrollingPreferences(L, SE, UP);
	}

	/// Produce a value of type \p ExpectedType that corresponds to the data
	/// moved by the NEON structured load/store intrinsic \p Inst.
	///
	/// - st2/st3/st4: if \p ExpectedType is a struct whose element count and
	///   element types match the stored operands (all arguments but the last),
	///   build that struct with insertvalue instructions placed before \p Inst
	///   and return it.
	/// - ld2/ld3/ld4: return \p Inst itself when its type is \p ExpectedType.
	///
	/// \returns nullptr for any other intrinsic or on a type mismatch.
	Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
	                                                         Type *ExpectedType) {
	  switch (Inst->getIntrinsicID()) {
	  default:
	    return nullptr;
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4: {
	    // Create a struct type
	    // (named StructTy so it does not shadow the subtarget member ST).
	    StructType *StructTy = dyn_cast<StructType>(ExpectedType);
	    if (!StructTy)
	      return nullptr;
	    // The last argument is used as the pointer operand elsewhere in this
	    // file, so it is excluded from the stored values.
	    unsigned NumElts = Inst->getNumArgOperands() - 1;
	    if (StructTy->getNumElements() != NumElts)
	      return nullptr;
	    for (unsigned i = 0, e = NumElts; i != e; ++i) {
	      if (Inst->getArgOperand(i)->getType() != StructTy->getElementType(i))
	        return nullptr;
	    }
	    Value *Res = UndefValue::get(ExpectedType);
	    IRBuilder<> Builder(Inst);
	    for (unsigned i = 0, e = NumElts; i != e; ++i) {
	      Value *L = Inst->getArgOperand(i);
	      Res = Builder.CreateInsertValue(Res, L, i);
	    }
	    return Res;
	  }
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	    if (Inst->getType() == ExpectedType)
	      return Inst;
	    return nullptr;
	  }
	}

	/// Describe the memory behaviour of a NEON ld2/ld3/ld4 or st2/st3/st4
	/// intrinsic in \p Info.
	///
	/// Loads are marked read-only with the first argument as pointer; stores are
	/// marked write-only with the last argument as pointer. MatchingId is set
	/// from the number of vectors (two, three or four).
	///
	/// \returns true if \p Inst is one of these intrinsics; otherwise false with
	/// \p Info left unmodified.
	bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
	                                        MemIntrinsicInfo &Info) {
	  const auto IID = Inst->getIntrinsicID();

	  bool IsLoad = false;
	  switch (IID) {
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	    IsLoad = true;
	    break;
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4:
	    IsLoad = false;
	    break;
	  default:
	    return false;
	  }

	  Info.ReadMem = IsLoad;
	  Info.WriteMem = !IsLoad;
	  Info.PtrVal = IsLoad
	                    ? Inst->getArgOperand(0)
	                    : Inst->getArgOperand(Inst->getNumArgOperands() - 1);

	  if (IID == Intrinsic::aarch64_neon_ld2 ||
	      IID == Intrinsic::aarch64_neon_st2)
	    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
	  else if (IID == Intrinsic::aarch64_neon_ld3 ||
	           IID == Intrinsic::aarch64_neon_st3)
	    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
	  else
	    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
	  return true;
	}

	/// See if \p I should be considered for address type promotion. We check if \p
	/// I is a sext with right type and used in memory accesses. If it used in a
	/// "complex" getelementptr, we allow it to be promoted without finding other
	/// sext instructions that sign extended the same initial value. A getelementptr
	/// is considered as "complex" if it has more than 2 operands.
	bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
	const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
	bool Considerable = false;
	AllowPromotionWithoutCommonHeader = false;
	if (!isa<SExtInst>(&I))
	return false;
	Type *ConsideredSExtType =
	Type::getInt64Ty(I.getParent()->getParent()->getContext());
	if (I.getType() != ConsideredSExtType)
	return false;
	// See if the sext is the one with the right type and used in at least one
	// GetElementPtrInst.
	for (const User *U : I.users()) {
	if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
	Considerable = true;
	// A getelementptr is considered as "complex" if it has more than 2
	// operands. We will promote a SExt used in such complex GEP as we
	// expect some computation to be merged if they are done on 64 bits.
	if (GEPInst->getNumOperands() > 2) {
	AllowPromotionWithoutCommonHeader = true;
	break;
	}
	}
	}
	return Considerable;
	}

	/// Forward the cache line size reported by the subtarget.
	unsigned AArch64TTIImpl::getCacheLineSize() {
	  const unsigned LineSize = ST->getCacheLineSize();
	  return LineSize;
	}

	/// Forward the prefetch distance reported by the subtarget.
	unsigned AArch64TTIImpl::getPrefetchDistance() {
	  const unsigned Distance = ST->getPrefetchDistance();
	  return Distance;
	}

	/// Forward the minimum prefetch stride reported by the subtarget.
	unsigned AArch64TTIImpl::getMinPrefetchStride() {
	  const unsigned Stride = ST->getMinPrefetchStride();
	  return Stride;
	}

	/// Forward the maximum prefetch look-ahead (in iterations) reported by the
	/// subtarget.
	unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
	  const unsigned Iterations = ST->getMaxPrefetchIterationsAhead();
	  return Iterations;
	}

	/// Tell whether a reduction of kind \p Opcode over vector type \p Ty should
	/// use a reduction intrinsic.
	///
	/// - Add: yes when the vector is at least 128 bits wide.
	/// - ICmp: yes when elements are narrower than 64 bits and the vector is at
	///   least 128 bits wide.
	/// - FCmp: yes only when \p Flags has NoNaN set.
	/// - FAdd, FMul, And, Or, Xor, Mul: no.
	/// Any other opcode is treated as unreachable.
	bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
	                                           TTI::ReductionFlags Flags) const {
	  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
	  const unsigned EltBits = Ty->getScalarSizeInBits();
	  // Evaluated on demand so the element count is only queried for the opcodes
	  // that depend on the total vector width.
	  auto IsAtLeast128Bits = [&]() {
	    return EltBits * Ty->getVectorNumElements() >= 128;
	  };

	  switch (Opcode) {
	  case Instruction::Add:
	    return IsAtLeast128Bits();
	  case Instruction::ICmp:
	    return (EltBits < 64) && IsAtLeast128Bits();
	  case Instruction::FCmp:
	    return Flags.NoNaN;
	  case Instruction::FAdd:
	  case Instruction::FMul:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor:
	  case Instruction::Mul:
	    return false;
	  default:
	    llvm_unreachable("Unhandled reduction opcode");
	  }
	  return false;
	}

	int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
	bool IsPairwiseForm) {

	if (IsPairwiseForm)
	return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);

	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
	MVT MTy = LT.second;
	int ISD = TLI->InstructionOpcodeToISD(Opcode);
	assert(ISD && "Invalid opcode");

	// Horizontal adds can use the 'addv' instruction. We model the cost of these
	// instructions as normal vector adds. This is the only arithmetic vector
	// reduction operation for which we have an instruction.
	static const CostTblEntry CostTblNoPairwise[]{
	{ISD::ADD, MVT::v8i8, 1},
	{ISD::ADD, MVT::v16i8, 1},
	{ISD::ADD, MVT::v4i16, 1},
	{ISD::ADD, MVT::v8i16, 1},
	{ISD::ADD, MVT::v4i32, 1},
	};

	if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
	return LT.first * Entry->Cost;

	return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
	}

	/// Cost of a vector shuffle of kind \p Kind on type \p Tp.
	///
	/// Broadcast, transpose, select and single-source permute shuffles are
	/// looked up in a per-type table keyed on the legalized type; a hit costs
	/// the legalization factor times the table entry. Every other kind, and any
	/// type missing from the table, is forwarded to the generic implementation
	/// together with \p Index and \p SubTp.
	int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
	                                   Type *SubTp) {
	  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
	      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
	    static const CostTblEntry ShuffleTbl[] = {
	      // Broadcast shuffle kinds can be performed with 'dup'.
	      { TTI::SK_Broadcast, MVT::v8i8, 1 },
	      { TTI::SK_Broadcast, MVT::v16i8, 1 },
	      { TTI::SK_Broadcast, MVT::v4i16, 1 },
	      { TTI::SK_Broadcast, MVT::v8i16, 1 },
	      { TTI::SK_Broadcast, MVT::v2i32, 1 },
	      { TTI::SK_Broadcast, MVT::v4i32, 1 },
	      { TTI::SK_Broadcast, MVT::v2i64, 1 },
	      { TTI::SK_Broadcast, MVT::v2f32, 1 },
	      { TTI::SK_Broadcast, MVT::v4f32, 1 },
	      { TTI::SK_Broadcast, MVT::v2f64, 1 },
	      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
	      // 'zip1/zip2' instructions.
	      { TTI::SK_Transpose, MVT::v8i8, 1 },
	      { TTI::SK_Transpose, MVT::v16i8, 1 },
	      { TTI::SK_Transpose, MVT::v4i16, 1 },
	      { TTI::SK_Transpose, MVT::v8i16, 1 },
	      { TTI::SK_Transpose, MVT::v2i32, 1 },
	      { TTI::SK_Transpose, MVT::v4i32, 1 },
	      { TTI::SK_Transpose, MVT::v2i64, 1 },
	      { TTI::SK_Transpose, MVT::v2f32, 1 },
	      { TTI::SK_Transpose, MVT::v4f32, 1 },
	      { TTI::SK_Transpose, MVT::v2f64, 1 },
	      // Select shuffle kinds.
	      // TODO: handle vXi8/vXi16.
	      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
	      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
	      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
	      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
	      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
	      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
	      // PermuteSingleSrc shuffle kinds.
	      // TODO: handle vXi8/vXi16.
	      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
	      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
	      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
	      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
	      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
	      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
	    };
	    // The table is keyed on the legalized type; LT.first scales the cost by
	    // the legalization factor.
	    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
	    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
	      return LT.first * Entry->Cost;
	  }

	  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
	}
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AArch64TargetTransformInfo.h (revision 351303)
	@@ -1,184 +1,187 @@
	//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	/// \file
	/// This file defines a TargetTransformInfo::Concept conforming object specific to the
	/// AArch64 target machine. It uses the target's detailed information to
	/// provide more precise answers to certain TTI queries, while letting the
	/// target independent and default TTI implementations handle the rest.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
	#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H

	#include "AArch64.h"
	#include "AArch64Subtarget.h"
	#include "AArch64TargetMachine.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/CodeGen/BasicTTIImpl.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Intrinsics.h"
	#include <cstdint>

	namespace llvm {

	class APInt;
	class Instruction;
	class IntrinsicInst;
	class Loop;
	class SCEV;
	class ScalarEvolution;
	class Type;
	class Value;
	class VectorType;

	/// AArch64 implementation of the TargetTransformInfo hooks, layered on
	/// BasicTTIImplBase through CRTP. Queries are answered from the subtarget
	/// selected for the function passed to the constructor and from that
	/// subtarget's target lowering.
	class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
	  using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
	  using TTI = TargetTransformInfo;

	  friend BaseT;

	  // Subtarget and target lowering of the function this object was built for.
	  const AArch64Subtarget *ST;
	  const AArch64TargetLowering *TLI;

	  const AArch64Subtarget *getST() const { return ST; }
	  const AArch64TargetLowering *getTLI() const { return TLI; }

	  // Identifiers for the structured NEON load/store intrinsics by number of
	  // vectors transferred.
	  enum MemIntrinsicType {
	    VECTOR_LDST_TWO_ELEMENTS,
	    VECTOR_LDST_THREE_ELEMENTS,
	    VECTOR_LDST_FOUR_ELEMENTS
	  };

	  bool isWideningInstruction(Type *Ty, unsigned Opcode,
	                             ArrayRef<const Value *> Args);

	public:
	  /// Bind to the data layout of \p F's module and to the subtarget that
	  /// \p TM selects for \p F.
	  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
	      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
	        TLI(ST->getTargetLowering()) {}

	  bool areInlineCompatible(const Function *Caller,
	                           const Function *Callee) const;

	  /// \name Scalar TTI Implementations
	  /// @{

	  using BaseT::getIntImmCost;
	  int getIntImmCost(int64_t Val);
	  int getIntImmCost(const APInt &Imm, Type *Ty);
	  int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
	  int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
	                    Type *Ty);
	  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);

	  /// @}

	  /// \name Vector TTI Implementations
	  /// @{

	  bool enableInterleavedAccessVectorization() { return true; }

	  /// Number of registers: 32 vector registers when NEON is available (0
	  /// otherwise), 31 for the non-vector case.
	  unsigned getNumberOfRegisters(bool Vector) {
	    if (Vector) {
	      if (ST->hasNEON())
	        return 32;
	      return 0;
	    }
	    return 31;
	  }

	  /// Register width in bits: 128 for vectors when NEON is available (0
	  /// otherwise), 64 for the non-vector case.
	  unsigned getRegisterBitWidth(bool Vector) const {
	    if (Vector) {
	      if (ST->hasNEON())
	        return 128;
	      return 0;
	    }
	    return 64;
	  }

	  unsigned getMinVectorRegisterBitWidth() {
	    return ST->getMinVectorRegisterBitWidth();
	  }

	  unsigned getMaxInterleaveFactor(unsigned VF);

	  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
	                       const Instruction *I = nullptr);

	  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
	                               unsigned Index);

	  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

	  int getArithmeticInstrCost(
	      unsigned Opcode, Type *Ty,
	      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
	      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
	      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
	      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
	      ArrayRef<const Value *> Args = ArrayRef<const Value *>());

	  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);

	  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
	                         const Instruction *I = nullptr);

	+ TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
	+ bool IsZeroCmp) const;
	+
	  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
	                      unsigned AddressSpace, const Instruction *I = nullptr);

	  int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);

	  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
	                               TTI::UnrollingPreferences &UP);

	  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
	                                           Type *ExpectedType);

	  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);

	  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
	                                 ArrayRef<unsigned> Indices, unsigned Alignment,
	                                 unsigned AddressSpace,
	                                 bool UseMaskForCond = false,
	                                 bool UseMaskForGaps = false);

	  bool
	  shouldConsiderAddressTypePromotion(const Instruction &I,
	                                     bool &AllowPromotionWithoutCommonHeader);

	  unsigned getCacheLineSize();

	  unsigned getPrefetchDistance();

	  unsigned getMinPrefetchStride();

	  unsigned getMaxPrefetchIterationsAhead();

	  bool shouldExpandReduction(const IntrinsicInst *II) const {
	    return false;
	  }

	  unsigned getGISelRematGlobalCost() const {
	    return 2;
	  }

	  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
	                             TTI::ReductionFlags Flags) const;

	  int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
	                                 bool IsPairwiseForm);

	  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
	  /// @}
	};

	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp (revision 351303)
	@@ -1,5762 +1,5762 @@
	//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "MCTargetDesc/AArch64MCExpr.h"
	#include "MCTargetDesc/AArch64MCTargetDesc.h"
	#include "MCTargetDesc/AArch64TargetStreamer.h"
	#include "TargetInfo/AArch64TargetInfo.h"
	#include "AArch64InstrInfo.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCLinkerOptimizationHint.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCParser/MCAsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmParser.h"
	#include "llvm/MC/MCParser/MCAsmParserExtension.h"
	#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCTargetOptions.h"
	#include "llvm/MC/SubtargetFeature.h"
	#include "llvm/MC/MCValue.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/SMLoc.h"
	#include "llvm/Support/TargetParser.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <cstdio>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	namespace {

	// Kinds of register the parser distinguishes. Stored in register operands,
	// in vector-list operands and alongside each .req alias.
	enum class RegKind {
	Scalar,
	NeonVector,
	SVEDataVector,
	SVEPredicateVector
	};

	// How a register operand is compared against a tied register: the same
	// register, its super-register, or its sub-register. Presumably consumed by
	// regsEqual(); its definition is not shown here. TODO confirm.
	enum RegConstraintEqualityTy {
	EqualsReg,
	EqualsSuperReg,
	EqualsSubReg
	};

	/// Target assembly parser for AArch64. Declares the operand/directive
	/// parsing helpers, the tablegen-generated matcher interface, and the
	/// MCTargetAsmParser overrides.
	class AArch64AsmParser : public MCTargetAsmParser {
	private:
	  StringRef Mnemonic; ///< Instruction mnemonic.

	  // Map of register aliases registered via the .req directive.
	  StringMap<std::pair<RegKind, unsigned>> RegisterReqs;

	  /// Summary of a MOVPRFX instruction: whether one is active, whether it is
	  /// predicated, and its destination / governing predicate registers and
	  /// element size.
	  class PrefixInfo {
	  public:
	    /// Build the prefix description for \p Inst. Any opcode other than the
	    /// MOVPRFX variants yields an inactive prefix.
	    static PrefixInfo CreateFromInst(const MCInst &Inst, uint64_t TSFlags) {
	      PrefixInfo Prefix;
	      switch (Inst.getOpcode()) {
	      case AArch64::MOVPRFX_ZZ:
	        Prefix.Active = true;
	        Prefix.Dst = Inst.getOperand(0).getReg();
	        break;
	      case AArch64::MOVPRFX_ZPmZ_B:
	      case AArch64::MOVPRFX_ZPmZ_H:
	      case AArch64::MOVPRFX_ZPmZ_S:
	      case AArch64::MOVPRFX_ZPmZ_D:
	        Prefix.Active = true;
	        Prefix.Predicated = true;
	        Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask;
	        assert(Prefix.ElementSize != AArch64::ElementSizeNone &&
	               "No destructive element size set for movprfx");
	        Prefix.Dst = Inst.getOperand(0).getReg();
	        // The predicate is operand 2 in this form.
	        Prefix.Pg = Inst.getOperand(2).getReg();
	        break;
	      case AArch64::MOVPRFX_ZPzZ_B:
	      case AArch64::MOVPRFX_ZPzZ_H:
	      case AArch64::MOVPRFX_ZPzZ_S:
	      case AArch64::MOVPRFX_ZPzZ_D:
	        Prefix.Active = true;
	        Prefix.Predicated = true;
	        Prefix.ElementSize = TSFlags & AArch64::ElementSizeMask;
	        assert(Prefix.ElementSize != AArch64::ElementSizeNone &&
	               "No destructive element size set for movprfx");
	        Prefix.Dst = Inst.getOperand(0).getReg();
	        // The predicate is operand 1 in this form.
	        Prefix.Pg = Inst.getOperand(1).getReg();
	        break;
	      default:
	        break;
	      }

	      return Prefix;
	    }

	    // Zero-initialize every field so that copying an inactive prefix, or
	    // calling getDstReg() on one, never reads an indeterminate value.
	    PrefixInfo()
	        : Active(false), Predicated(false), ElementSize(0), Dst(0), Pg(0) {}
	    bool isActive() const { return Active; }
	    bool isPredicated() const { return Predicated; }
	    unsigned getElementSize() const {
	      assert(Predicated);
	      return ElementSize;
	    }
	    unsigned getDstReg() const { return Dst; }
	    unsigned getPgReg() const {
	      assert(Predicated);
	      return Pg;
	    }

	  private:
	    bool Active;
	    bool Predicated;
	    unsigned ElementSize;
	    unsigned Dst;
	    unsigned Pg;
	  } NextPrefix;

	  AArch64TargetStreamer &getTargetStreamer() {
	    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
	    return static_cast<AArch64TargetStreamer &>(TS);
	  }

	  SMLoc getLoc() const { return getParser().getTok().getLoc(); }

	  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
	  void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
	  AArch64CC::CondCode parseCondCodeString(StringRef Cond);
	  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
	  unsigned matchRegisterNameAlias(StringRef Name, RegKind Kind);
	  bool parseRegister(OperandVector &Operands);
	  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
	  bool parseNeonVectorList(OperandVector &Operands);
	  bool parseOptionalMulOperand(OperandVector &Operands);
	  bool parseOperand(OperandVector &Operands, bool isCondCode,
	                    bool invertCondCode);

	  bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo,
	                      OperandVector &Operands);

	  bool parseDirectiveArch(SMLoc L);
	  bool parseDirectiveArchExtension(SMLoc L);
	  bool parseDirectiveCPU(SMLoc L);
	  bool parseDirectiveInst(SMLoc L);

	  bool parseDirectiveTLSDescCall(SMLoc L);

	  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
	  bool parseDirectiveLtorg(SMLoc L);

	  bool parseDirectiveReq(StringRef Name, SMLoc L);
	  bool parseDirectiveUnreq(SMLoc L);
	  bool parseDirectiveCFINegateRAState();
	  bool parseDirectiveCFIBKeyFrame();

	  bool validateInstruction(MCInst &Inst, SMLoc &IDLoc,
	                           SmallVectorImpl<SMLoc> &Loc);
	  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	                               OperandVector &Operands, MCStreamer &Out,
	                               uint64_t &ErrorInfo,
	                               bool MatchingInlineAsm) override;
	  /// @name Auto-generated Match Functions
	  /// {

	#define GET_ASSEMBLER_HEADER
	#include "AArch64GenAsmMatcher.inc"

	  /// }

	  OperandMatchResultTy tryParseScalarRegister(unsigned &Reg);
	  OperandMatchResultTy tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
	                                              RegKind MatchKind);
	  OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
	  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
	  OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
	  OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
	  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
	  template <bool IsSVEPrefetch = false>
	  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
	  OperandMatchResultTy tryParsePSBHint(OperandVector &Operands);
	  OperandMatchResultTy tryParseBTIHint(OperandVector &Operands);
	  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
	  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
	  template<bool AddFPZeroAsLiteral>
	  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
	  OperandMatchResultTy tryParseImmWithOptionalShift(OperandVector &Operands);
	  OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
	  bool tryParseNeonVectorRegister(OperandVector &Operands);
	  OperandMatchResultTy tryParseVectorIndex(OperandVector &Operands);
	  OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands);
	  template <bool ParseShiftExtend,
	            RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg>
	  OperandMatchResultTy tryParseGPROperand(OperandVector &Operands);
	  template <bool ParseShiftExtend, bool ParseSuffix>
	  OperandMatchResultTy tryParseSVEDataVector(OperandVector &Operands);
	  OperandMatchResultTy tryParseSVEPredicateVector(OperandVector &Operands);
	  template <RegKind VectorKind>
	  OperandMatchResultTy tryParseVectorList(OperandVector &Operands,
	                                          bool ExpectMatch = false);
	  OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands);

	public:
	  enum AArch64MatchResultTy {
	    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
	#define GET_OPERAND_DIAGNOSTIC_TYPES
	#include "AArch64GenAsmMatcher.inc"
	  };
	  // True when the ABI name in the target options is "ilp32".
	  bool IsILP32;

	  /// Attach to \p Parser, create an AArch64TargetStreamer if the streamer
	  /// has none yet, register the data directive aliases and compute the
	  /// available feature set from \p STI.
	  AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
	                   const MCInstrInfo &MII, const MCTargetOptions &Options)
	      : MCTargetAsmParser(Options, STI, MII) {
	    IsILP32 = Options.getABIName() == "ilp32";
	    MCAsmParserExtension::Initialize(Parser);
	    MCStreamer &S = getParser().getStreamer();
	    if (S.getTargetStreamer() == nullptr)
	      new AArch64TargetStreamer(S);

	    // Alias .hword/.word/.[dx]word to the target-independent
	    // .2byte/.4byte/.8byte directives as they have the same form and
	    // semantics:
	    ///  ::= (.hword | .word | .dword | .xword ) [ expression (, expression)* ]
	    Parser.addAliasForDirective(".hword", ".2byte");
	    Parser.addAliasForDirective(".word", ".4byte");
	    Parser.addAliasForDirective(".dword", ".8byte");
	    Parser.addAliasForDirective(".xword", ".8byte");

	    // Initialize the set of available features.
	    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
	  }

	  bool regsEqual(const MCParsedAsmOperand &Op1,
	                 const MCParsedAsmOperand &Op2) const override;
	  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
	                        SMLoc NameLoc, OperandVector &Operands) override;
	  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
	  bool ParseDirective(AsmToken DirectiveID) override;
	  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
	                                      unsigned Kind) override;

	  static bool classifySymbolRef(const MCExpr *Expr,
	                                AArch64MCExpr::VariantKind &ELFRefKind,
	                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
	                                int64_t &Addend);
	};

	/// AArch64Operand - Instances of this class represent a parsed operand of an
	/// AArch64 machine instruction.
	class AArch64Operand : public MCParsedAsmOperand {
	private:
	// Discriminator for the operand payload: each enumerator selects which
	// member of the anonymous union below holds this operand's data.
	enum KindTy {
	k_Immediate,
	k_ShiftedImm,
	k_CondCode,
	k_Register,
	k_VectorList,
	k_VectorIndex,
	k_Token,
	k_SysReg,
	k_SysCR,
	k_Prefetch,
	k_ShiftExtend,
	k_FPImm,
	k_Barrier,
	k_PSBHint,
	k_BTIHint,
	} Kind;

	SMLoc StartLoc, EndLoc;

	// Payload for k_Token operands: the token text as a (pointer, length) pair
	// that is not owned by this struct.
	struct TokOp {
	const char *Data;
	unsigned Length;
	bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
	};

	// Separate shift/extend operand.
	// Used both as the k_ShiftExtend payload and embedded in RegOp for
	// registers parsed together with their shift/extend.
	struct ShiftExtendOp {
	AArch64_AM::ShiftExtendType Type;
	unsigned Amount;
	bool HasExplicitAmount; // False when the amount was defaulted, not written.
	};

	// Payload for k_Register operands.
	struct RegOp {
	unsigned RegNum;
	RegKind Kind;
	int ElementWidth;

	// The register may be allowed as a different register class,
	// e.g. for GPR64as32 or GPR32as64.
	RegConstraintEqualityTy EqualityTy;

	// In some cases the shift/extend needs to be explicitly parsed together
	// with the register, rather than as a separate operand. This is needed
	// for addressing modes where the instruction as a whole dictates the
	// scaling/extend, rather than specific bits in the instruction.
	// By parsing them as a single operand, we avoid the need to pass an
	// extra operand in all CodeGen patterns (because all operands need to
	// have an associated value), and we avoid the need to update TableGen to
	// accept operands that have no associated bits in the instruction.
	//
	// An added benefit of parsing them together is that the assembler
	// can give a sensible diagnostic if the scaling is not correct.
	//
	// The default is 'lsl #0' (HasExplicitAmount = false) if no
	// ShiftExtend is specified.
	ShiftExtendOp ShiftExtend;
	};

	// Payload for k_VectorList operands. RegNum is exposed as the list start
	// and Count as the list length by the accessors of this class.
	struct VectorListOp {
	unsigned RegNum;
	unsigned Count;
	unsigned NumElements;
	unsigned ElementWidth;
	RegKind RegisterKind;
	};

	// Payload for k_VectorIndex operands.
	struct VectorIndexOp {
	unsigned Val;
	};

	// Payload for k_Immediate operands: the immediate as an MC expression.
	struct ImmOp {
	const MCExpr *Val;
	};

	// Payload for k_ShiftedImm operands: an immediate expression plus the
	// shift amount that accompanies it.
	struct ShiftedImmOp {
	const MCExpr *Val;
	unsigned ShiftAmount;
	};

	// Payload for k_CondCode operands.
	struct CondCodeOp {
	AArch64CC::CondCode Code;
	};

	// Payload for k_FPImm operands. getFPImm() rebuilds an IEEE double APFloat
	// from the stored 64-bit pattern.
	struct FPImmOp {
	uint64_t Val; // APFloat value bitcasted to uint64_t.
	bool IsExact; // describes whether parsed value was exact.
	};

	// Payload for k_Barrier operands: the barrier name as a non-owned
	// (pointer, length) pair plus its numeric value.
	struct BarrierOp {
	const char *Data;
	unsigned Length;
	unsigned Val; // Not the enum since not all values have names.
	};

	// Payload for k_SysReg operands: the register name as a non-owned
	// (pointer, length) pair, plus three numeric fields.
	// NOTE(review): MRSReg / MSRReg / PStateField presumably hold the encodings
	// for MRS, MSR and PSTATE uses of the name; confirm where they are set.
	struct SysRegOp {
	const char *Data;
	unsigned Length;
	uint32_t MRSReg;
	uint32_t MSRReg;
	uint32_t PStateField;
	};

	// Payload for k_SysCR operands.
	struct SysCRImmOp {
	unsigned Val;
	};

	// Payload for k_Prefetch operands: the prefetch operation name as a
	// non-owned (pointer, length) pair plus its numeric value.
	struct PrefetchOp {
	const char *Data;
	unsigned Length;
	unsigned Val;
	};

	// Payload for k_PSBHint operands: the hint name as a non-owned
	// (pointer, length) pair plus its numeric value.
	struct PSBHintOp {
	const char *Data;
	unsigned Length;
	unsigned Val;
	};

	// Payload for k_BTIHint operands: the hint name as a non-owned
	// (pointer, length) pair plus its numeric value.
	struct BTIHintOp {
	const char *Data;
	unsigned Length;
	unsigned Val;
	};

	// NOTE(review): this struct is not a member of the payload union below and
	// has no matching KindTy enumerator; check whether it is still used.
	struct ExtendOp {
	unsigned Val;
	};

	// Operand payload. Exactly one member is meaningful at a time; which one is
	// determined by Kind (see the copy constructor and the accessors).
	union {
	struct TokOp Tok;
	struct RegOp Reg;
	struct VectorListOp VectorList;
	struct VectorIndexOp VectorIndex;
	struct ImmOp Imm;
	struct ShiftedImmOp ShiftedImm;
	struct CondCodeOp CondCode;
	struct FPImmOp FPImm;
	struct BarrierOp Barrier;
	struct SysRegOp SysReg;
	struct SysCRImmOp SysCRImm;
	struct PrefetchOp Prefetch;
	struct PSBHintOp PSBHint;
	struct BTIHintOp BTIHint;
	struct ShiftExtendOp ShiftExtend;
	};

	// Keep the MCContext around as the MCExprs may need to be manipulated
	// during the add<>Operands() calls.
	MCContext &Ctx;

	public:
	/// Create an operand of kind \p K bound to \p Ctx. The payload for that
	/// kind is left for the caller to fill in.
	AArch64Operand(KindTy K, MCContext &Ctx)
	    : Kind(K), Ctx(Ctx) {
	}

	/// Copy constructor: copies the kind, the source locations and the context
	/// reference, then copies only the union member selected by the kind.
	AArch64Operand(const AArch64Operand &o)
	    : MCParsedAsmOperand(), Kind(o.Kind), StartLoc(o.StartLoc),
	      EndLoc(o.EndLoc), Ctx(o.Ctx) {
	  // Cases are listed in KindTy declaration order.
	  switch (Kind) {
	  case k_Immediate:
	    Imm = o.Imm;
	    break;
	  case k_ShiftedImm:
	    ShiftedImm = o.ShiftedImm;
	    break;
	  case k_CondCode:
	    CondCode = o.CondCode;
	    break;
	  case k_Register:
	    Reg = o.Reg;
	    break;
	  case k_VectorList:
	    VectorList = o.VectorList;
	    break;
	  case k_VectorIndex:
	    VectorIndex = o.VectorIndex;
	    break;
	  case k_Token:
	    Tok = o.Tok;
	    break;
	  case k_SysReg:
	    SysReg = o.SysReg;
	    break;
	  case k_SysCR:
	    SysCRImm = o.SysCRImm;
	    break;
	  case k_Prefetch:
	    Prefetch = o.Prefetch;
	    break;
	  case k_ShiftExtend:
	    ShiftExtend = o.ShiftExtend;
	    break;
	  case k_FPImm:
	    FPImm = o.FPImm;
	    break;
	  case k_Barrier:
	    Barrier = o.Barrier;
	    break;
	  case k_PSBHint:
	    PSBHint = o.PSBHint;
	    break;
	  case k_BTIHint:
	    BTIHint = o.BTIHint;
	    break;
	  }
	}

	/// getStartLoc - Location of the first token of this operand.
	SMLoc getStartLoc() const override {
	  return StartLoc;
	}
	/// getEndLoc - Location of the last token of this operand.
	SMLoc getEndLoc() const override {
	  return EndLoc;
	}

	/// Text of a token operand. Asserts that this is a k_Token operand.
	StringRef getToken() const {
	  assert(Kind == k_Token && "Invalid access!");
	  const TokOp &T = Tok;
	  return StringRef(T.Data, T.Length);
	}

	/// Whether a token operand is a suffix on the mnemonic. Asserts that this
	/// is a k_Token operand.
	bool isTokenSuffix() const {
	  assert(Kind == k_Token && "Invalid access!");
	  const bool Suffix = Tok.IsSuffix;
	  return Suffix;
	}

	/// Expression of an immediate operand. Asserts that this is a k_Immediate
	/// operand.
	const MCExpr *getImm() const {
	  assert(Kind == k_Immediate && "Invalid access!");
	  const MCExpr *Expr = Imm.Val;
	  return Expr;
	}

	/// Expression part of a shifted immediate. Asserts that this is a
	/// k_ShiftedImm operand.
	const MCExpr *getShiftedImmVal() const {
	  assert(Kind == k_ShiftedImm && "Invalid access!");
	  const MCExpr *Expr = ShiftedImm.Val;
	  return Expr;
	}

	/// Shift amount of a shifted immediate. Asserts that this is a
	/// k_ShiftedImm operand.
	unsigned getShiftedImmShift() const {
	  assert(Kind == k_ShiftedImm && "Invalid access!");
	  const unsigned Shift = ShiftedImm.ShiftAmount;
	  return Shift;
	}

	/// Condition code of the operand. Asserts that this is a k_CondCode
	/// operand.
	AArch64CC::CondCode getCondCode() const {
	  assert(Kind == k_CondCode && "Invalid access!");
	  const AArch64CC::CondCode CC = CondCode.Code;
	  return CC;
	}

	/// Floating-point immediate rebuilt as an IEEE double from the stored
	/// 64-bit pattern. Asserts that this is a k_FPImm operand.
	APFloat getFPImm() const {
	  assert (Kind == k_FPImm && "Invalid access!");
	  const APInt Bits(64, FPImm.Val, true);
	  return APFloat(APFloat::IEEEdouble(), Bits);
	}

	/// Whether the parsed floating-point value was exact. Asserts that this is
	/// a k_FPImm operand.
	bool getFPImmIsExact() const {
	  assert (Kind == k_FPImm && "Invalid access!");
	  const bool Exact = FPImm.IsExact;
	  return Exact;
	}

	/// Numeric value of a barrier operand. Asserts that this is a k_Barrier
	/// operand.
	unsigned getBarrier() const {
	  assert(Kind == k_Barrier && "Invalid access!");
	  const unsigned Value = Barrier.Val;
	  return Value;
	}

	/// Returns the barrier's name text (Barrier.Data, Barrier.Length). Asserts
	/// that this operand is a k_Barrier.
	StringRef getBarrierName() const {
	  assert(Kind == k_Barrier && "Invalid access!");
	  StringRef Name(Barrier.Data, Barrier.Length);
	  return Name;
	}

	/// Returns the register number of a register operand. Asserts that this
	/// operand is a k_Register.
	unsigned getReg() const override {
	  assert(Kind == k_Register && "Invalid access!");
	  const unsigned Number = Reg.RegNum;
	  return Number;
	}

	/// Returns the register-constraint equality type stored with the register.
	/// Asserts that this operand is a k_Register.
	RegConstraintEqualityTy getRegEqualityTy() const {
	  assert(Kind == k_Register && "Invalid access!");
	  const RegConstraintEqualityTy Ty = Reg.EqualityTy;
	  return Ty;
	}

	/// Returns the RegNum recorded for a vector list (its starting register).
	/// Asserts that this operand is a k_VectorList.
	unsigned getVectorListStart() const {
	  assert(Kind == k_VectorList && "Invalid access!");
	  const unsigned First = VectorList.RegNum;
	  return First;
	}

	/// Returns the Count recorded for a vector list. Asserts that this operand
	/// is a k_VectorList.
	unsigned getVectorListCount() const {
	  assert(Kind == k_VectorList && "Invalid access!");
	  const unsigned Count = VectorList.Count;
	  return Count;
	}

	/// Returns the stored vector index value. Asserts that this operand is a
	/// k_VectorIndex.
	unsigned getVectorIndex() const {
	  assert(Kind == k_VectorIndex && "Invalid access!");
	  const unsigned Index = VectorIndex.Val;
	  return Index;
	}

	/// Returns the system-register text (SysReg.Data, SysReg.Length). Asserts
	/// that this operand is a k_SysReg.
	StringRef getSysReg() const {
	  assert(Kind == k_SysReg && "Invalid access!");
	  StringRef Name(SysReg.Data, SysReg.Length);
	  return Name;
	}

	/// Returns the value stored in SysCRImm. Asserts that this operand is a
	/// k_SysCR.
	unsigned getSysCR() const {
	  assert(Kind == k_SysCR && "Invalid access!");
	  const unsigned Value = SysCRImm.Val;
	  return Value;
	}

	/// Returns the numeric value of a prefetch operand. Asserts that this
	/// operand is a k_Prefetch.
	unsigned getPrefetch() const {
	  assert(Kind == k_Prefetch && "Invalid access!");
	  const unsigned Value = Prefetch.Val;
	  return Value;
	}

	/// Returns the numeric value of a PSB hint operand. Asserts that this
	/// operand is a k_PSBHint.
	unsigned getPSBHint() const {
	  assert(Kind == k_PSBHint && "Invalid access!");
	  const unsigned Value = PSBHint.Val;
	  return Value;
	}

	/// Returns the PSB hint's name text (PSBHint.Data, PSBHint.Length). Asserts
	/// that this operand is a k_PSBHint.
	StringRef getPSBHintName() const {
	  assert(Kind == k_PSBHint && "Invalid access!");
	  StringRef Name(PSBHint.Data, PSBHint.Length);
	  return Name;
	}

	/// Returns the numeric value of a BTI hint operand. Asserts that this
	/// operand is a k_BTIHint.
	unsigned getBTIHint() const {
	  assert(Kind == k_BTIHint && "Invalid access!");
	  const unsigned Value = BTIHint.Val;
	  return Value;
	}

	/// Returns the BTI hint's name text (BTIHint.Data, BTIHint.Length). Asserts
	/// that this operand is a k_BTIHint.
	StringRef getBTIHintName() const {
	  assert(Kind == k_BTIHint && "Invalid access!");
	  StringRef Name(BTIHint.Data, BTIHint.Length);
	  return Name;
	}

	/// Returns the prefetch operand's name text (Prefetch.Data,
	/// Prefetch.Length). Asserts that this operand is a k_Prefetch.
	StringRef getPrefetchName() const {
	  assert(Kind == k_Prefetch && "Invalid access!");
	  StringRef Name(Prefetch.Data, Prefetch.Length);
	  return Name;
	}

	/// Returns the shift/extend type, taken either from a stand-alone
	/// shift/extend operand or from the shift/extend attached to a register
	/// operand. Any other operand kind is an invalid access.
	AArch64_AM::ShiftExtendType getShiftExtendType() const {
	  switch (Kind) {
	  case k_ShiftExtend:
	    return ShiftExtend.Type;
	  case k_Register:
	    return Reg.ShiftExtend.Type;
	  default:
	    llvm_unreachable("Invalid access!");
	  }
	}

	/// Returns the shift/extend amount, taken either from a stand-alone
	/// shift/extend operand or from the shift/extend attached to a register
	/// operand. Any other operand kind is an invalid access.
	unsigned getShiftExtendAmount() const {
	  switch (Kind) {
	  case k_ShiftExtend:
	    return ShiftExtend.Amount;
	  case k_Register:
	    return Reg.ShiftExtend.Amount;
	  default:
	    llvm_unreachable("Invalid access!");
	  }
	}

	/// Returns the HasExplicitAmount flag, taken either from a stand-alone
	/// shift/extend operand or from the shift/extend attached to a register
	/// operand. Any other operand kind is an invalid access.
	bool hasShiftExtendAmount() const {
	  switch (Kind) {
	  case k_ShiftExtend:
	    return ShiftExtend.HasExplicitAmount;
	  case k_Register:
	    return Reg.ShiftExtend.HasExplicitAmount;
	  default:
	    llvm_unreachable("Invalid access!");
	  }
	}

	/// True when this operand is a plain immediate (k_Immediate).
	bool isImm() const override {
	  return Kind == k_Immediate;
	}
	/// Always false: this operand class never reports itself as a memory
	/// operand.
	bool isMem() const override {
	  return false;
	}

	bool isUImm6() const {
	if (!isImm())
	return false;
	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return false;
	int64_t Val = MCE->getValue();
	return (Val >= 0 && Val < 64);
	}

	/// True when the operand matches a signed immediate of \p Width bits
	/// (unscaled, i.e. scale factor 1).
	template <int Width> bool isSImm() const {
	  const bool Matched = isSImmScaled<Width, 1>();
	  return Matched;
	}

	/// Signed flavour of isImmScaled: forwards with Signed == true.
	template <int Bits, int Scale>
	DiagnosticPredicate isSImmScaled() const {
	  const bool Signed = true;
	  return isImmScaled<Bits, Scale>(Signed);
	}

	/// Unsigned flavour of isImmScaled: forwards with Signed == false.
	template <int Bits, int Scale>
	DiagnosticPredicate isUImmScaled() const {
	  const bool Signed = false;
	  return isImmScaled<Bits, Scale>(Signed);
	}

	/// Checks whether the operand is a constant immediate that is a multiple of
	/// \p Scale and lies within the range of a \p Bits-wide field scaled by
	/// \p Scale (signed or unsigned depending on \p Signed).
	///
	/// Returns NoMatch for non-immediates and non-constant expressions, Match
	/// when the value fits, and NearMatch for a constant that does not fit.
	template <int Bits, int Scale>
	DiagnosticPredicate isImmScaled(bool Signed) const {
	  if (!isImm())
	    return DiagnosticPredicateTy::NoMatch;

	  const auto *ConstImm = dyn_cast<MCConstantExpr>(getImm());
	  if (!ConstImm)
	    return DiagnosticPredicateTy::NoMatch;

	  int64_t Lo, Hi;
	  if (!Signed) {
	    Lo = 0;
	    Hi = ((int64_t(1) << Bits) - 1) * Scale;
	  } else {
	    // One bit of the field is spent on the sign.
	    const int64_t SignBit = Bits - 1;
	    Lo = (int64_t(1) << SignBit) * -Scale;
	    Hi = ((int64_t(1) << SignBit) - 1) * Scale;
	  }

	  const int64_t Val = ConstImm->getValue();
	  const bool InRange = Val >= Lo && Val <= Hi;
	  if (InRange && (Val % Scale) == 0)
	    return DiagnosticPredicateTy::Match;

	  return DiagnosticPredicateTy::NearMatch;
	}

	/// Matches a constant immediate in [0, 31]. Non-immediates and non-constant
	/// expressions give NoMatch; out-of-range constants give NearMatch.
	DiagnosticPredicate isSVEPattern() const {
	  if (!isImm())
	    return DiagnosticPredicateTy::NoMatch;

	  const auto *ConstImm = dyn_cast<MCConstantExpr>(getImm());
	  if (!ConstImm)
	    return DiagnosticPredicateTy::NoMatch;

	  const int64_t Pattern = ConstImm->getValue();
	  const bool InRange = Pattern >= 0 && Pattern < 32;
	  if (InRange)
	    return DiagnosticPredicateTy::Match;
	  return DiagnosticPredicateTy::NearMatch;
	}

	/// Decides whether a symbolic expression is acceptable as an unsigned
	/// 12-bit offset, based on how classifySymbolRef categorises it.
	bool isSymbolicUImm12Offset(const MCExpr *Expr) const {
	  AArch64MCExpr::VariantKind ELFRefKind;
	  MCSymbolRefExpr::VariantKind DarwinRefKind;
	  int64_t Addend;
	  const bool Classified = AArch64AsmParser::classifySymbolRef(
	      Expr, ELFRefKind, DarwinRefKind, Addend);

	  // If we don't understand the expression, assume the best and
	  // let the fixup and relocation code deal with it.
	  if (!Classified)
	    return true;

	  const bool IsAcceptedELFKind =
	      ELFRefKind == AArch64MCExpr::VK_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
	      ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
	      ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
	      ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
	      ELFRefKind == AArch64MCExpr::VK_SECREL_HI12;

	  // Note that we don't range-check the addend. It's adjusted modulo page
	  // size when converted, so there is no "out of range" condition when using
	  // @pageoff.
	  if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || IsAcceptedELFKind)
	    return true;

	  // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
	  if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
	      DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF)
	    return Addend == 0;

	  return false;
	}

	template <int Scale> bool isUImm12Offset() const {
	if (!isImm())
	return false;

	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return isSymbolicUImm12Offset(getImm());

	int64_t Val = MCE->getValue();
	return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
	}

	template <int N, int M>
	bool isImmInRange() const {
	if (!isImm())
	return false;
	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return false;
	int64_t Val = MCE->getValue();
	return (Val >= N && Val <= M);
	}

	// NOTE: Also used for isLogicalImmNot as anything that can be represented as
	// a logical immediate can always be represented when inverted.
	template <typename T>
	bool isLogicalImm() const {
	if (!isImm())
	return false;
	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return false;

	int64_t Val = MCE->getValue();
	int64_t SVal = typename std::make_signed<T>::type(Val);
	int64_t UVal = typename std::make_unsigned<T>::type(Val);
	if (Val != SVal && Val != UVal)
	return false;

	return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
	}

	/// True when this operand is a shifted immediate (k_ShiftedImm).
	bool isShiftedImm() const {
	  return Kind == k_ShiftedImm;
	}

	/// Returns the immediate value as a pair of (imm, shift) if the immediate is
	/// a shifted immediate by value 'Shift' or '0', or if it is an unshifted
	/// immediate that can be shifted by 'Shift'.
	///
	/// An empty Optional is returned when neither form yields a constant.
	template <unsigned Width>
	Optional<std::pair<int64_t, unsigned> > getShiftedVal() const {
	  using ShiftedPair = std::pair<int64_t, unsigned>;

	  // Explicitly shifted constant whose shift equals Width.
	  if (isShiftedImm() && Width == getShiftedImmShift()) {
	    if (auto *CE = dyn_cast<MCConstantExpr>(getShiftedImmVal()))
	      return ShiftedPair(CE->getValue(), Width);
	  }

	  // Plain constant: fold the shift in when the low Width bits are zero.
	  if (isImm()) {
	    if (auto *CE = dyn_cast<MCConstantExpr>(getImm())) {
	      const int64_t Val = CE->getValue();
	      const bool LowBitsClear =
	          (uint64_t(Val >> Width) << Width) == uint64_t(Val);
	      if ((Val != 0) && LowBitsClear)
	        return ShiftedPair(Val >> Width, Width);
	      return ShiftedPair(Val, 0u);
	    }
	  }

	  return {};
	}

	bool isAddSubImm() const {
	if (!isShiftedImm() && !isImm())
	return false;

	const MCExpr *Expr;

	// An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
	if (isShiftedImm()) {
	unsigned Shift = ShiftedImm.ShiftAmount;
	Expr = ShiftedImm.Val;
	if (Shift != 0 && Shift != 12)
	return false;
	} else {
	Expr = getImm();
	}

	AArch64MCExpr::VariantKind ELFRefKind;
	MCSymbolRefExpr::VariantKind DarwinRefKind;
	int64_t Addend;
	if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
	DarwinRefKind, Addend)) {
	return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
	\|\| DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
	\|\| (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
	\|\| ELFRefKind == AArch64MCExpr::VK_LO12
	\|\| ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
	\|\| ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
	\|\| ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
	\|\| ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
	\|\| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
	\|\| ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
	\|\| ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12
	\|\| ELFRefKind == AArch64MCExpr::VK_SECREL_HI12
	\|\| ELFRefKind == AArch64MCExpr::VK_SECREL_LO12;
	}

	// If it's a constant, it should be a real immediate in range.
	if (auto ShiftedVal = getShiftedVal<12>())
	return ShiftedVal->first >= 0 && ShiftedVal->first <= 0xfff;

	// If it's an expression, we hope for the best and let the fixup/relocation
	// code deal with it.
	return true;
	}

	/// True when the operand is a constant negative ADD/SUB immediate, i.e. a
	/// value in [-0xfff, -1], optionally shifted left by 12 (as resolved by
	/// getShiftedVal<12>). Non-constant expressions never match.
	bool isAddSubImmNeg() const {
	  if (!isShiftedImm() && !isImm())
	    return false;

	  // It must be a real negative immediate in range. The bound is compared
	  // directly rather than by negating the value, because negating INT64_MIN
	  // (reachable through an explicitly shifted operand) is undefined
	  // behaviour.
	  if (auto ShiftedVal = getShiftedVal<12>())
	    return ShiftedVal->first < 0 && ShiftedVal->first >= -0xfff;

	  return false;
	}

	// Signed value in the range -128 to +127. For element widths of
	// 16 bits or higher it may also be a signed multiple of 256 in the
	// range -32768 to +32512.
	// For element-width of 8 bits a range of -128 to 255 is accepted,
	// since a copy of a byte can be either signed/unsigned.
	//
	// Returns NoMatch unless the operand is a shifted immediate or a constant
	// immediate; otherwise Match when the value is accepted and NearMatch when
	// it is not.
	template <typename T>
	DiagnosticPredicate isSVECpyImm() const {
	  const bool IsConstImm = isImm() && isa<MCConstantExpr>(getImm());
	  if (!isShiftedImm() && !IsConstImm)
	    return DiagnosticPredicateTy::NoMatch;

	  const bool IsByte =
	      std::is_same<int8_t, typename std::make_signed<T>::type>::value;
	  if (auto Shifted = getShiftedVal<8>()) {
	    // Byte elements cannot take a shifted immediate.
	    const bool ByteWithShift = IsByte && Shifted->second;
	    if (!ByteWithShift &&
	        AArch64_AM::isSVECpyImm<T>(uint64_t(Shifted->first)
	                                   << Shifted->second))
	      return DiagnosticPredicateTy::Match;
	  }

	  return DiagnosticPredicateTy::NearMatch;
	}

	// Unsigned value in the range 0 to 255. For element widths of
	// 16 bits or higher it may also be an unsigned multiple of 256 in the
	// range 0 to 65280.
	//
	// Returns NoMatch unless the operand is a shifted immediate or a constant
	// immediate; otherwise Match when the value is accepted and NearMatch when
	// it is not.
	template <typename T> DiagnosticPredicate isSVEAddSubImm() const {
	  if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
	    return DiagnosticPredicateTy::NoMatch;

	  bool IsByte =
	      std::is_same<int8_t, typename std::make_signed<T>::type>::value;
	  if (auto ShiftedImm = getShiftedVal<8>())
	    // Shift in the unsigned domain (as isSVECpyImm does): left-shifting a
	    // negative signed value is undefined behaviour. The result is cast
	    // back so the callee receives the same signed 64-bit value as before.
	    if (!(IsByte && ShiftedImm->second) &&
	        AArch64_AM::isSVEAddSubImm<T>(
	            int64_t(uint64_t(ShiftedImm->first) << ShiftedImm->second)))
	      return DiagnosticPredicateTy::Match;

	  return DiagnosticPredicateTy::NearMatch;
	}

	/// Matches only when the operand is a logical immediate for T that is not
	/// also accepted by isSVECpyImm<T>(). Never yields NearMatch.
	template <typename T> DiagnosticPredicate isSVEPreferredLogicalImm() const {
	  if (!isLogicalImm<T>())
	    return DiagnosticPredicateTy::NoMatch;
	  if (isSVECpyImm<T>())
	    return DiagnosticPredicateTy::NoMatch;
	  return DiagnosticPredicateTy::Match;
	}

	/// True when this operand is a condition code (k_CondCode).
	bool isCondCode() const {
	  return Kind == k_CondCode;
	}

	bool isSIMDImmType10() const {
	if (!isImm())
	return false;
	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return false;
	return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
	}

	template<int N>
	bool isBranchTarget() const {
	if (!isImm())
	return false;
	const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
	if (!MCE)
	return true;
	int64_t Val = MCE->getValue();
	if (Val & 0x3)
	return false;
	assert(N > 0 && "Branch target immediate cannot be 0 bits!");
	return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2));
	}

	/// True when the operand is an immediate whose symbol reference classifies
	/// with no Darwin variant and with an ELF variant kind listed in
	/// \p AllowedModifiers.
	bool
	isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
	  if (!isImm())
	    return false;

	  AArch64MCExpr::VariantKind ELFRefKind;
	  MCSymbolRefExpr::VariantKind DarwinRefKind;
	  int64_t Addend;
	  const bool Classified = AArch64AsmParser::classifySymbolRef(
	      getImm(), ELFRefKind, DarwinRefKind, Addend);
	  if (!Classified || DarwinRefKind != MCSymbolRefExpr::VK_None)
	    return false;

	  for (const AArch64MCExpr::VariantKind Allowed : AllowedModifiers)
	    if (ELFRefKind == Allowed)
	      return true;

	  return false;
	}

	bool isMovZSymbolG3() const {
	return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
	}

	bool isMovZSymbolG2() const {
	return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
	AArch64MCExpr::VK_TPREL_G2,
	AArch64MCExpr::VK_DTPREL_G2});
	}

	bool isMovZSymbolG1() const {
	return isMovWSymbol({
	AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S,
	AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
	AArch64MCExpr::VK_DTPREL_G1,
	});
	}

	bool isMovZSymbolG0() const {
	return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
	AArch64MCExpr::VK_TPREL_G0,
	AArch64MCExpr::VK_DTPREL_G0});
	}

	bool isMovKSymbolG3() const {
	return isMovWSymbol(AArch64MCExpr::VK_ABS_G3);
	}

	bool isMovKSymbolG2() const {
	return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC);
	}

	bool isMovKSymbolG1() const {
	return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC,
	AArch64MCExpr::VK_TPREL_G1_NC,
	AArch64MCExpr::VK_DTPREL_G1_NC});
	}

	bool isMovKSymbolG0() const {
	return isMovWSymbol(
	{AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC,
	AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC});
	}

	template<int RegWidth, int Shift>
	bool isMOVZMovAlias() const {
	if (!isImm()) return false;

	const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
	if (!CE) return false;
	uint64_t Value = CE->getValue();

	return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
	}

	template<int RegWidth, int Shift>
	bool isMOVNMovAlias() const {
	if (!isImm()) return false;

	const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
	if (!CE) return false;
	uint64_t Value = CE->getValue();

	return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth);
	}

	/// True when this is an FP immediate operand for which
	/// AArch64_AM::getFP64Imm does not return -1.
	bool isFPImm() const {
	  if (Kind != k_FPImm)
	    return false;
	  return AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1;
	}

	/// True when this operand is a barrier (k_Barrier).
	bool isBarrier() const {
	  return Kind == k_Barrier;
	}
	/// True when this operand is a system register (k_SysReg).
	bool isSysReg() const {
	  return Kind == k_SysReg;
	}

	/// True when this is a system-register operand whose MRSReg field is set
	/// (i.e. differs from the -1U sentinel).
	bool isMRSSystemRegister() const {
	  return isSysReg() && SysReg.MRSReg != -1U;
	}

	/// True when this is a system-register operand whose MSRReg field is set
	/// (i.e. differs from the -1U sentinel).
	bool isMSRSystemRegister() const {
	  return isSysReg() && SysReg.MSRReg != -1U;
	}

	/// True when this is a system-register operand naming one of the PAN, DIT,
	/// UAO or SSBS PState fields.
	bool isSystemPStateFieldWithImm0_1() const {
	  if (!isSysReg())
	    return false;

	  const auto Field = SysReg.PStateField;
	  return Field == AArch64PState::PAN ||
	         Field == AArch64PState::DIT ||
	         Field == AArch64PState::UAO ||
	         Field == AArch64PState::SSBS;
	}

	/// True when this is a system-register operand with a valid PState field
	/// that is not one of those matched by isSystemPStateFieldWithImm0_1().
	bool isSystemPStateFieldWithImm0_15() const {
	  if (!isSysReg())
	    return false;
	  if (isSystemPStateFieldWithImm0_1())
	    return false;
	  return SysReg.PStateField != -1U;
	}

	bool isReg() const override {
	return Kind == k_Register;
	}

	/// True when this operand is a register of kind RegKind::Scalar.
	bool isScalarReg() const {
	  if (Kind != k_Register)
	    return false;
	  return Reg.Kind == RegKind::Scalar;
	}

	/// True when this operand is a register of kind RegKind::NeonVector.
	bool isNeonVectorReg() const {
	  if (Kind != k_Register)
	    return false;
	  return Reg.Kind == RegKind::NeonVector;
	}

	/// True when this operand is a NEON vector register that belongs to the
	/// FPR128_lo register class.
	bool isNeonVectorRegLo() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::NeonVector)
	    return false;
	  const auto &LoClass = AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID];
	  return LoClass.contains(Reg.RegNum);
	}

	template <unsigned Class> bool isSVEVectorReg() const {
	RegKind RK;
	switch (Class) {
	case AArch64::ZPRRegClassID:
	case AArch64::ZPR_3bRegClassID:
	case AArch64::ZPR_4bRegClassID:
	RK = RegKind::SVEDataVector;
	break;
	case AArch64::PPRRegClassID:
	case AArch64::PPR_3bRegClassID:
	RK = RegKind::SVEPredicateVector;
	break;
	default:
	llvm_unreachable("Unsupport register class");
	}

	return (Kind == k_Register && Reg.Kind == RK) &&
	AArch64MCRegisterClasses[Class].contains(getReg());
	}

	/// True when this operand is a scalar register that is a member of
	/// register class \p Class.
	template <unsigned Class> bool isFPRasZPR() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  return AArch64MCRegisterClasses[Class].contains(getReg());
	}

	/// Matches an SVE predicate register of class \p Class whose element width
	/// equals \p ElementWidth. Operands that are not SVE predicate registers
	/// give NoMatch; predicate registers failing either check give NearMatch.
	template <int ElementWidth, unsigned Class>
	DiagnosticPredicate isSVEPredicateVectorRegOfWidth() const {
	  const bool IsPredicateReg =
	      Kind == k_Register && Reg.Kind == RegKind::SVEPredicateVector;
	  if (!IsPredicateReg)
	    return DiagnosticPredicateTy::NoMatch;

	  if (!isSVEVectorReg<Class>() || Reg.ElementWidth != ElementWidth)
	    return DiagnosticPredicateTy::NearMatch;

	  return DiagnosticPredicateTy::Match;
	}

	/// Matches an SVE data vector register of class \p Class whose element
	/// width equals \p ElementWidth. Operands that are not SVE data vector
	/// registers give NoMatch; data registers failing either check give
	/// NearMatch.
	template <int ElementWidth, unsigned Class>
	DiagnosticPredicate isSVEDataVectorRegOfWidth() const {
	  const bool IsDataReg =
	      Kind == k_Register && Reg.Kind == RegKind::SVEDataVector;
	  if (!IsDataReg)
	    return DiagnosticPredicateTy::NoMatch;

	  if (!isSVEVectorReg<Class>() || Reg.ElementWidth != ElementWidth)
	    return DiagnosticPredicateTy::NearMatch;

	  return DiagnosticPredicateTy::Match;
	}

	/// Matches an SVE data vector register (per isSVEDataVectorRegOfWidth)
	/// whose attached shift/extend has type \p ShiftExtendTy and an amount of
	/// log2(ShiftWidth / 8).
	template <int ElementWidth, unsigned Class,
	          AArch64_AM::ShiftExtendType ShiftExtendTy, int ShiftWidth,
	          bool ShiftWidthAlwaysSame>
	DiagnosticPredicate isSVEDataVectorRegWithShiftExtend() const {
	  // Anything short of a full register match is reported as NoMatch.
	  const auto RegMatch = isSVEDataVectorRegOfWidth<ElementWidth, Class>();
	  if (!RegMatch.isMatch())
	    return DiagnosticPredicateTy::NoMatch;

	  const bool AmountMatches =
	      getShiftExtendAmount() == Log2_32(ShiftWidth / 8);

	  // Give a more specific diagnostic when the user has explicitly typed in
	  // a shift-amount that does not match what is expected, but for which
	  // there is also an unscaled addressing mode (e.g. sxtw/uxtw).
	  if (!AmountMatches) {
	    const bool IsWordExtend = ShiftExtendTy == AArch64_AM::UXTW ||
	                              ShiftExtendTy == AArch64_AM::SXTW;
	    if (IsWordExtend && !ShiftWidthAlwaysSame && hasShiftExtendAmount() &&
	        ShiftWidth == 8)
	      return DiagnosticPredicateTy::NoMatch;
	    return DiagnosticPredicateTy::NearMatch;
	  }

	  if (ShiftExtendTy == getShiftExtendType())
	    return DiagnosticPredicateTy::Match;

	  return DiagnosticPredicateTy::NearMatch;
	}

	/// True when this operand is a scalar register that is a member of the
	/// GPR64 register class.
	bool isGPR32as64() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  const auto &GPR64Class = AArch64MCRegisterClasses[AArch64::GPR64RegClassID];
	  return GPR64Class.contains(Reg.RegNum);
	}

	/// True when this operand is a scalar register that is a member of the
	/// GPR32 register class.
	bool isGPR64as32() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  const auto &GPR32Class = AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
	  return GPR32Class.contains(Reg.RegNum);
	}

	/// True when this operand is a scalar register that is a member of the
	/// WSeqPairsClass register class.
	bool isWSeqPair() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  const auto &PairClass =
	      AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID];
	  return PairClass.contains(Reg.RegNum);
	}

	/// True when this operand is a scalar register that is a member of the
	/// XSeqPairsClass register class.
	bool isXSeqPair() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  const auto &PairClass =
	      AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID];
	  return PairClass.contains(Reg.RegNum);
	}

	/// Matches a constant immediate that is at most 270 and leaves
	/// \p Remainder when divided by \p Angle. Non-immediates and non-constant
	/// expressions give NoMatch; other constants give NearMatch.
	template<int64_t Angle, int64_t Remainder>
	DiagnosticPredicate isComplexRotation() const {
	  if (!isImm())
	    return DiagnosticPredicateTy::NoMatch;

	  const auto *ConstImm = dyn_cast<MCConstantExpr>(getImm());
	  if (!ConstImm)
	    return DiagnosticPredicateTy::NoMatch;

	  // The value is deliberately handled as unsigned, as in the comparison
	  // and modulo below.
	  const uint64_t Value = ConstImm->getValue();
	  const bool Accepted = Value % Angle == Remainder && Value <= 270;
	  if (Accepted)
	    return DiagnosticPredicateTy::Match;
	  return DiagnosticPredicateTy::NearMatch;
	}

	/// True when this operand is a scalar register that is a member of
	/// register class \p RegClassID.
	template <unsigned RegClassID> bool isGPR64() const {
	  if (Kind != k_Register || Reg.Kind != RegKind::Scalar)
	    return false;
	  return AArch64MCRegisterClasses[RegClassID].contains(getReg());
	}

	/// Matches a scalar register of class \p RegClassID whose attached
	/// shift/extend is an LSL by log2(ExtWidth / 8). Non-scalar operands give
	/// NoMatch; scalar registers failing any check give NearMatch.
	template <unsigned RegClassID, int ExtWidth>
	DiagnosticPredicate isGPR64WithShiftExtend() const {
	  const bool IsScalarReg = Kind == k_Register && Reg.Kind == RegKind::Scalar;
	  if (!IsScalarReg)
	    return DiagnosticPredicateTy::NoMatch;

	  if (!isGPR64<RegClassID>())
	    return DiagnosticPredicateTy::NearMatch;
	  if (getShiftExtendType() != AArch64_AM::LSL)
	    return DiagnosticPredicateTy::NearMatch;
	  if (getShiftExtendAmount() != Log2_32(ExtWidth / 8))
	    return DiagnosticPredicateTy::NearMatch;
	  return DiagnosticPredicateTy::Match;
	}

	/// Is this a vector list with the type implicit (presumably attached to the
	/// instruction itself)? That is: a list of \p NumRegs registers of kind
	/// \p VectorKind whose NumElements is 0.
	template <RegKind VectorKind, unsigned NumRegs>
	bool isImplicitlyTypedVectorList() const {
	  if (Kind != k_VectorList)
	    return false;
	  if (VectorList.Count != NumRegs)
	    return false;
	  if (VectorList.NumElements != 0)
	    return false;
	  return VectorList.RegisterKind == VectorKind;
	}

	/// True when this is a vector list of \p NumRegs registers of kind
	/// \p VectorKind with exactly \p NumElements elements of width
	/// \p ElementWidth.
	template <RegKind VectorKind, unsigned NumRegs, unsigned NumElements,
	          unsigned ElementWidth>
	bool isTypedVectorList() const {
	  return Kind == k_VectorList &&
	         VectorList.Count == NumRegs &&
	         VectorList.RegisterKind == VectorKind &&
	         VectorList.ElementWidth == ElementWidth &&
	         VectorList.NumElements == NumElements;
	}

	/// Matches a vector index operand whose value lies in [Min, Max]. Other
	/// operand kinds give NoMatch; out-of-range indices give NearMatch.
	template <int Min, int Max>
	DiagnosticPredicate isVectorIndex() const {
	  if (Kind != k_VectorIndex)
	    return DiagnosticPredicateTy::NoMatch;

	  const bool InRange = VectorIndex.Val >= Min && VectorIndex.Val <= Max;
	  if (!InRange)
	    return DiagnosticPredicateTy::NearMatch;
	  return DiagnosticPredicateTy::Match;
	}

	/// True when this operand is a token (k_Token).
	bool isToken() const override {
	  return Kind == k_Token;
	}

	/// True when this operand is a token whose text equals \p Str.
	bool isTokenEqual(StringRef Str) const {
	  if (Kind != k_Token)
	    return false;
	  return getToken() == Str;
	}
	/// True when this operand is of kind k_SysCR.
	bool isSysCR() const {
	  return Kind == k_SysCR;
	}
	/// True when this operand is of kind k_Prefetch.
	bool isPrefetch() const {
	  return Kind == k_Prefetch;
	}
	/// True when this operand is of kind k_PSBHint.
	bool isPSBHint() const {
	  return Kind == k_PSBHint;
	}
	/// True when this operand is of kind k_BTIHint.
	bool isBTIHint() const {
	  return Kind == k_BTIHint;
	}
	/// True when this operand is a stand-alone shift/extend (k_ShiftExtend).
	bool isShiftExtend() const {
	  return Kind == k_ShiftExtend;
	}
	bool isShifter() const {
	if (!isShiftExtend())
	return false;

	AArch64_AM::ShiftExtendType ST = getShiftExtendType();
	return (ST == AArch64_AM::LSL \|\| ST == AArch64_AM::LSR \|\|
	ST == AArch64_AM::ASR \|\| ST == AArch64_AM::ROR \|\|
	ST == AArch64_AM::MSL);
	}

	/// Matches an exact FP immediate operand whose value is bitwise equal to
	/// the table entry identified by \p ImmEnum. Non-FP operands give NoMatch;
	/// inexact or different FP immediates give NearMatch.
	template <unsigned ImmEnum> DiagnosticPredicate isExactFPImm() const {
	  if (Kind != k_FPImm)
	    return DiagnosticPredicateTy::NoMatch;

	  if (!getFPImmIsExact())
	    return DiagnosticPredicateTy::NearMatch;

	  // Lookup the immediate from table of supported immediates.
	  auto *Desc = AArch64ExactFPImm::lookupExactFPImmByEnum(ImmEnum);
	  assert(Desc && "Unknown enum value");

	  // Calculate its FP value.
	  APFloat RealVal(APFloat::IEEEdouble());
	  const auto Status =
	      RealVal.convertFromString(Desc->Repr, APFloat::rmTowardZero);
	  if (Status != APFloat::opOK)
	    llvm_unreachable("FP immediate is not exact");

	  if (getFPImm().bitwiseIsEqual(RealVal))
	    return DiagnosticPredicateTy::Match;

	  return DiagnosticPredicateTy::NearMatch;
	}

	/// Two-value variant: Match when the operand matches either \p ImmA or
	/// \p ImmB; otherwise the result of the \p ImmB check is returned.
	template <unsigned ImmA, unsigned ImmB>
	DiagnosticPredicate isExactFPImm() const {
	  if (isExactFPImm<ImmA>())
	    return DiagnosticPredicateTy::Match;

	  DiagnosticPredicate Res = isExactFPImm<ImmB>();
	  if (Res)
	    return DiagnosticPredicateTy::Match;
	  return Res;
	}

	bool isExtend() const {
	if (!isShiftExtend())
	return false;

	AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	return (ET == AArch64_AM::UXTB \|\| ET == AArch64_AM::SXTB \|\|
	ET == AArch64_AM::UXTH \|\| ET == AArch64_AM::SXTH \|\|
	ET == AArch64_AM::UXTW \|\| ET == AArch64_AM::SXTW \|\|
	ET == AArch64_AM::UXTX \|\| ET == AArch64_AM::SXTX \|\|
	ET == AArch64_AM::LSL) &&
	getShiftExtendAmount() <= 4;
	}

	bool isExtend64() const {
	if (!isExtend())
	return false;
	// Make sure the extend expects a 32-bit source register.
	AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	return ET == AArch64_AM::UXTB \|\| ET == AArch64_AM::SXTB \|\|
	ET == AArch64_AM::UXTH \|\| ET == AArch64_AM::SXTH \|\|
	ET == AArch64_AM::UXTW \|\| ET == AArch64_AM::SXTW;
	}

	bool isExtendLSL64() const {
	if (!isExtend())
	return false;
	AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	return (ET == AArch64_AM::UXTX \|\| ET == AArch64_AM::SXTX \|\|
	ET == AArch64_AM::LSL) &&
	getShiftExtendAmount() <= 4;
	}

	template<int Width> bool isMemXExtend() const {
	if (!isExtend())
	return false;
	AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	return (ET == AArch64_AM::LSL \|\| ET == AArch64_AM::SXTX) &&
	(getShiftExtendAmount() == Log2_32(Width / 8) \|\|
	getShiftExtendAmount() == 0);
	}

	template<int Width> bool isMemWExtend() const {
	if (!isExtend())
	return false;
	AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	return (ET == AArch64_AM::UXTW \|\| ET == AArch64_AM::SXTW) &&
	(getShiftExtendAmount() == Log2_32(Width / 8) \|\|
	getShiftExtendAmount() == 0);
	}

	template <unsigned width>
	bool isArithmeticShifter() const {
	if (!isShifter())
	return false;

	// An arithmetic shifter is LSL, LSR, or ASR.
	AArch64_AM::ShiftExtendType ST = getShiftExtendType();
	return (ST == AArch64_AM::LSL \|\| ST == AArch64_AM::LSR \|\|
	ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
	}

	template <unsigned width>
	bool isLogicalShifter() const {
	if (!isShifter())
	return false;

	// A logical shifter is LSL, LSR, ASR or ROR.
	AArch64_AM::ShiftExtendType ST = getShiftExtendType();
	return (ST == AArch64_AM::LSL \|\| ST == AArch64_AM::LSR \|\|
	ST == AArch64_AM::ASR \|\| ST == AArch64_AM::ROR) &&
	getShiftExtendAmount() < width;
	}

	bool isMovImm32Shifter() const {
	if (!isShifter())
	return false;

	// A MOVi shifter is LSL of 0, 16, 32, or 48.
	AArch64_AM::ShiftExtendType ST = getShiftExtendType();
	if (ST != AArch64_AM::LSL)
	return false;
	uint64_t Val = getShiftExtendAmount();
	return (Val == 0 \|\| Val == 16);
	}

	bool isMovImm64Shifter() const {
	if (!isShifter())
	return false;

	// A MOVi shifter is LSL of 0 or 16.
	AArch64_AM::ShiftExtendType ST = getShiftExtendType();
	if (ST != AArch64_AM::LSL)
	return false;
	uint64_t Val = getShiftExtendAmount();
	return (Val == 0 \|\| Val == 16 \|\| Val == 32 \|\| Val == 48);
	}

	/// True when this is an LSL shifter with an amount of 0, 8, 16 or 24.
	bool isLogicalVecShifter() const {
	  if (!isShifter())
	    return false;

	  // A logical vector shifter is a left shift by 0, 8, 16, or 24.
	  if (getShiftExtendType() != AArch64_AM::LSL)
	    return false;

	  const unsigned Amount = getShiftExtendAmount();
	  return Amount == 0 || Amount == 8 || Amount == 16 || Amount == 24;
	}

	/// True when isLogicalVecShifter() holds and the LSL amount is 0 or 8.
	bool isLogicalVecHalfWordShifter() const {
	  if (!isLogicalVecShifter())
	    return false;

	  // A half-word logical vector shifter is a left shift by 0 or 8.
	  if (getShiftExtendType() != AArch64_AM::LSL)
	    return false;

	  const unsigned Amount = getShiftExtendAmount();
	  return Amount == 0 || Amount == 8;
	}

	/// True when this is a shift/extend operand of type MSL with an amount of
	/// 8 or 16.
	bool isMoveVecShifter() const {
	  if (!isShiftExtend())
	    return false;

	  // A move vector shifter is an MSL shift by 8 or 16.
	  if (getShiftExtendType() != AArch64_AM::MSL)
	    return false;

	  const unsigned Amount = getShiftExtendAmount();
	  return Amount == 8 || Amount == 16;
	}

	// Fallback unscaled operands are for aliases of LDR/STR that fall back
	// to LDUR/STUR when the offset is not legal for the former but is for
	// the latter. As such, in addition to checking for being a legal unscaled
	// address, also check that it is not a legal scaled address. This avoids
	// ambiguity in the matcher.
	template<int Width>
	bool isSImm9OffsetFB() const {
	  // Must be a legal signed 9-bit (unscaled) offset ...
	  if (!isSImm<9>())
	    return false;
	  // ... and must not also be a legal scaled unsigned 12-bit offset.
	  return !isUImm12Offset<Width / 8>();
	}

	bool isAdrpLabel() const {
	// Validation was handled during parsing, so we just sanity check that
	// something didn't go haywire.
	if (!isImm())
	return false;

	if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
	int64_t Val = CE->getValue();
	int64_t Min = - (4096 * (1LL << (21 - 1)));
	int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
	return (Val % 4096) == 0 && Val >= Min && Val <= Max;
	}

	return true;
	}

	bool isAdrLabel() const {
	// Validation was handled during parsing, so we just sanity check that
	// something didn't go haywire.
	if (!isImm())
	return false;

	if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
	int64_t Val = CE->getValue();
	int64_t Min = - (1LL << (21 - 1));
	int64_t Max = ((1LL << (21 - 1)) - 1);
	return Val >= Min && Val <= Max;
	}

	return true;
	}

	/// Appends \p Expr to \p Inst as an operand, using an immediate operand
	/// when the expression is null (treated as 0) or a constant, and an
	/// expression operand otherwise.
	void addExpr(MCInst &Inst, const MCExpr *Expr) const {
	  // Add as immediates when possible. Null MCExpr = 0.
	  if (!Expr) {
	    Inst.addOperand(MCOperand::createImm(0));
	    return;
	  }
	  if (const auto *CE = dyn_cast<MCConstantExpr>(Expr)) {
	    Inst.addOperand(MCOperand::createImm(CE->getValue()));
	    return;
	  }
	  Inst.addOperand(MCOperand::createExpr(Expr));
	}

	/// Appends this operand's register to \p Inst. \p N must be 1.
	void addRegOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const unsigned RegNo = getReg();
	  Inst.addOperand(MCOperand::createReg(RegNo));
	}

	/// Appends to \p Inst the GPR32 register that has the same encoding value
	/// as this operand's GPR64 register. \p N must be 1.
	void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  assert(
	      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));

	  // Look the register up in the GPR32 class by its encoding value.
	  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
	  const auto &GPR32Class = RI->getRegClass(AArch64::GPR32RegClassID);
	  uint32_t Reg32 = GPR32Class.getRegister(RI->getEncodingValue(getReg()));

	  Inst.addOperand(MCOperand::createReg(Reg32));
	}

	/// Appends to \p Inst the GPR64 register that has the same encoding value
	/// as this operand's GPR32 register. \p N must be 1.
	void addGPR64as32Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  assert(
	      AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(getReg()));

	  // Look the register up in the GPR64 class by its encoding value.
	  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
	  const auto &GPR64Class = RI->getRegClass(AArch64::GPR64RegClassID);
	  uint32_t Reg64 = GPR64Class.getRegister(RI->getEncodingValue(getReg()));

	  Inst.addOperand(MCOperand::createReg(Reg64));
	}

	/// Appends to \p Inst the Z register with the same index as this operand's
	/// scalar FP register. \p Width selects which FP register bank (B/H/S/D/Q)
	/// the operand's register number is relative to. \p N must be 1.
	template <int Width>
	void addFPRasZPRRegOperands(MCInst &Inst, unsigned N) const {
	  // Same operand-count check as the other single-operand add*Operands
	  // methods in this class.
	  assert(N == 1 && "Invalid number of operands!");
	  unsigned Base;
	  switch (Width) {
	  case 8: Base = AArch64::B0; break;
	  case 16: Base = AArch64::H0; break;
	  case 32: Base = AArch64::S0; break;
	  case 64: Base = AArch64::D0; break;
	  case 128: Base = AArch64::Q0; break;
	  default:
	    llvm_unreachable("Unsupported width");
	  }
	  // Rebase the register number from the FP bank onto Z0.
	  Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base));
	}

	/// Appends to \p Inst the D register with the same index as this operand's
	/// FPR128 (Q) register. \p N must be 1.
	void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  assert(
	      AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
	  // Rebase the register number from Q0 onto D0.
	  const unsigned DReg = AArch64::D0 + getReg() - AArch64::Q0;
	  Inst.addOperand(MCOperand::createReg(DReg));
	}

	/// Appends this operand's FPR128 register to \p Inst unchanged. \p N must
	/// be 1.
	void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  assert(
	      AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
	  const unsigned QReg = getReg();
	  Inst.addOperand(MCOperand::createReg(QReg));
	}

	/// Appends this operand's register to \p Inst unchanged. \p N must be 1.
	void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const unsigned RegNo = getReg();
	  Inst.addOperand(MCOperand::createReg(RegNo));
	}

	/// Selects the register bank used by addVectorListOperands. The numeric
	/// value is used there as the row index into its FirstRegs table, so the
	/// values must stay in step with the row order of that table.
	enum VecListIndexType {
	VecListIdx_DReg = 0, // Row 0: D-register lists.
	VecListIdx_QReg = 1, // Row 1: Q-register lists.
	VecListIdx_ZReg = 2, // Row 2: Z-register lists.
	};

	/// Appends to \p Inst the register (or register tuple) that represents a
	/// list of \p NumRegs vector registers of bank \p RegTy starting at this
	/// operand's first list register. \p N must be 1.
	template <VecListIndexType RegTy, unsigned NumRegs>
	void addVectorListOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  // Per bank: column 0 is the register the list start is numbered
	  // relative to; column K (1..4) is the first K-register tuple.
	  static const unsigned FirstRegs[][5] = {
	    /* DReg */ { AArch64::Q0,
	                 AArch64::D0,       AArch64::D0_D1,
	                 AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 },
	    /* QReg */ { AArch64::Q0,
	                 AArch64::Q0,       AArch64::Q0_Q1,
	                 AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 },
	    /* ZReg */ { AArch64::Z0,
	                 AArch64::Z0,       AArch64::Z0_Z1,
	                 AArch64::Z0_Z1_Z2, AArch64::Z0_Z1_Z2_Z3 }
	  };

	  assert((RegTy != VecListIdx_ZReg || NumRegs <= 4) &&
	         " NumRegs must be <= 4 for ZRegs");

	  const unsigned *Row = FirstRegs[(unsigned)RegTy];
	  const unsigned TupleBase = Row[NumRegs];
	  const unsigned ListBase = Row[0];
	  Inst.addOperand(
	      MCOperand::createReg(TupleBase + getVectorListStart() - ListBase));
	}

	/// Appends this operand's vector index to \p Inst as an immediate. \p N
	/// must be 1.
	void addVectorIndexOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const unsigned Index = getVectorIndex();
	  Inst.addOperand(MCOperand::createImm(Index));
	}

	/// Appends an immediate of 1 to \p Inst when this operand is the exact FP
	/// immediate \p ImmIs1, and 0 otherwise. The operand is expected to match
	/// one of the two enums. \p N must be 1.
	template <unsigned ImmIs0, unsigned ImmIs1>
	void addExactFPImmOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  assert(bool(isExactFPImm<ImmIs0, ImmIs1>()) && "Invalid operand");
	  const bool IsSecond = bool(isExactFPImm<ImmIs1>());
	  Inst.addOperand(MCOperand::createImm(IsSecond));
	}

	/// Appends this operand's immediate expression to \p Inst via addExpr.
	/// \p N must be 1.
	void addImmOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  // The expression is added as-is; no adjustment is applied to it here.
	  const MCExpr *Expr = getImm();
	  addExpr(Inst, Expr);
	}

	/// Appends two operands to \p Inst: the immediate and its shift amount.
	/// When getShiftedVal<Shift>() yields a constant pair it is used directly;
	/// otherwise the expression is added with the explicit shift of a shifted
	/// immediate, or with a shift of 0 for a plain immediate. \p N must be 2.
	template <int Shift>
	void addImmWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 2 && "Invalid number of operands!");
	  if (auto ShiftedVal = getShiftedVal<Shift>()) {
	    Inst.addOperand(MCOperand::createImm(ShiftedVal->first));
	    Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
	    return;
	  }

	  if (!isShiftedImm()) {
	    addExpr(Inst, getImm());
	    Inst.addOperand(MCOperand::createImm(0));
	    return;
	  }

	  addExpr(Inst, getShiftedImmVal());
	  Inst.addOperand(MCOperand::createImm(getShiftedImmShift()));
	}

	/// Appends two operands to \p Inst: the negated constant immediate and its
	/// shift amount, both taken from getShiftedVal<Shift>(). It is an error to
	/// call this when that lookup yields nothing. \p N must be 2.
	template <int Shift>
	void addImmNegWithOptionalShiftOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 2 && "Invalid number of operands!");
	  auto ShiftedVal = getShiftedVal<Shift>();
	  if (!ShiftedVal)
	    llvm_unreachable("Not a shifted negative immediate");

	  Inst.addOperand(MCOperand::createImm(-ShiftedVal->first));
	  Inst.addOperand(MCOperand::createImm(ShiftedVal->second));
	}

	/// Appends this operand's condition code to \p Inst as an immediate. \p N
	/// must be 1.
	void addCondCodeOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const AArch64CC::CondCode Code = getCondCode();
	  Inst.addOperand(MCOperand::createImm(Code));
	}

	/// Appends the ADRP label to \p Inst: a constant is added as an immediate
	/// shifted right by 12; anything else goes through addExpr. \p N must be 1.
	void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  if (const auto *MCE = dyn_cast<MCConstantExpr>(getImm())) {
	    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 12));
	    return;
	  }
	  addExpr(Inst, getImm());
	}

	/// Append the ADR label operand; handled exactly like a plain immediate by
	/// forwarding to addImmOperands().
	void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
	  return addImmOperands(Inst, N);
	}

	/// Append the offset operand. A constant immediate is divided by \p Scale
	/// before being added; a non-constant expression is added unchanged as an
	/// expression operand.
	template<int Scale>
	void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  if (const auto *ConstImm = dyn_cast<MCConstantExpr>(getImm())) {
	    Inst.addOperand(MCOperand::createImm(ConstImm->getValue() / Scale));
	    return;
	  }

	  Inst.addOperand(MCOperand::createExpr(getImm()));
	}

	/// Append the value of the immediate, which must be an MCConstantExpr
	/// (enforced by cast<>), as one immediate operand.
	void addUImm6Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  const auto Value = ConstImm->getValue();
	  Inst.addOperand(MCOperand::createImm(Value));
	}

	/// Append the constant immediate divided by \p Scale as one immediate
	/// operand. The immediate must be an MCConstantExpr (enforced by cast<>).
	template <int Scale>
	void addImmScaledOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  const auto Scaled = ConstImm->getValue() / Scale;
	  Inst.addOperand(MCOperand::createImm(Scaled));
	}

	/// Append the logical-immediate encoding of the constant immediate. The
	/// value is first converted to the unsigned counterpart of \p T, then
	/// encoded for a register width of sizeof(T) * 8 bits.
	template <typename T>
	void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  using UnsignedT = typename std::make_unsigned<T>::type;

	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  UnsignedT Narrowed = ConstImm->getValue();
	  uint64_t Encoded =
	      AArch64_AM::encodeLogicalImmediate(Narrowed, sizeof(T) * 8);
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append the logical-immediate encoding of the bitwise complement of the
	/// constant immediate. The complemented value is converted to the unsigned
	/// counterpart of \p T and encoded for a width of sizeof(T) * 8 bits.
	template <typename T>
	void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  using UnsignedT = typename std::make_unsigned<T>::type;

	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  UnsignedT Inverted = ~ConstImm->getValue();
	  uint64_t Encoded =
	      AArch64_AM::encodeLogicalImmediate(Inverted, sizeof(T) * 8);
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append the result of encodeAdvSIMDModImmType10() applied to the
	/// constant immediate as one immediate operand.
	void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  uint64_t Encoded =
	      AArch64_AM::encodeAdvSIMDModImmType10(ConstImm->getValue());
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append the branch target operand.
	///
	/// Branch operands don't encode the low bits, so a constant target is
	/// added shifted right by two. If it's a label (any non-constant
	/// expression), it is put on directly as there's not enough information
	/// now to do anything.
	void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  // The dyn_cast result is the only null check needed; no further assert
	  // on the pointer is required once we are inside this branch.
	  if (const auto *MCE = dyn_cast<MCConstantExpr>(getImm())) {
	    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
	    return;
	  }

	  addExpr(Inst, getImm());
	}

	/// Append the PC-relative label operand.
	///
	/// Branch operands don't encode the low bits, so a constant target is
	/// added shifted right by two. If it's a label (any non-constant
	/// expression), it is put on directly as there's not enough information
	/// now to do anything.
	void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  // The dyn_cast result is the only null check needed; no further assert
	  // on the pointer is required once we are inside this branch.
	  if (const auto *MCE = dyn_cast<MCConstantExpr>(getImm())) {
	    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
	    return;
	  }

	  addExpr(Inst, getImm());
	}

	/// Append the branch target operand.
	///
	/// Branch operands don't encode the low bits, so a constant target is
	/// added shifted right by two. If it's a label (any non-constant
	/// expression), it is put on directly as there's not enough information
	/// now to do anything.
	void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  // The dyn_cast result is the only null check needed; no further assert
	  // on the pointer is required once we are inside this branch.
	  if (const auto *MCE = dyn_cast<MCConstantExpr>(getImm())) {
	    Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2));
	    return;
	  }

	  addExpr(Inst, getImm());
	}

	/// Append the result of AArch64_AM::getFP64Imm() on the bit pattern of the
	/// stored floating point immediate as one immediate operand.
	void addFPImmOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto Encoded = AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt());
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append the barrier value of this operand as one immediate operand.
	void addBarrierOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto BarrierVal = getBarrier();
	  Inst.addOperand(MCOperand::createImm(BarrierVal));
	}

	/// Append the MRSReg field of the system register operand as one
	/// immediate operand.
	void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  MCOperand RegOperand = MCOperand::createImm(SysReg.MRSReg);
	  Inst.addOperand(RegOperand);
	}

	/// Append the MSRReg field of the system register operand as one
	/// immediate operand.
	void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  MCOperand RegOperand = MCOperand::createImm(SysReg.MSRReg);
	  Inst.addOperand(RegOperand);
	}

	/// Append the PStateField of the system register operand as one immediate
	/// operand.
	void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  MCOperand FieldOperand = MCOperand::createImm(SysReg.PStateField);
	  Inst.addOperand(FieldOperand);
	}

	/// Append the PStateField of the system register operand as one immediate
	/// operand.
	void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  MCOperand FieldOperand = MCOperand::createImm(SysReg.PStateField);
	  Inst.addOperand(FieldOperand);
	}

	/// Append the system CR value of this operand as one immediate operand.
	void addSysCROperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto CRVal = getSysCR();
	  Inst.addOperand(MCOperand::createImm(CRVal));
	}

	/// Append the prefetch value of this operand as one immediate operand.
	void addPrefetchOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto PrefetchVal = getPrefetch();
	  Inst.addOperand(MCOperand::createImm(PrefetchVal));
	}

	/// Append the PSB hint value of this operand as one immediate operand.
	void addPSBHintOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto HintVal = getPSBHint();
	  Inst.addOperand(MCOperand::createImm(HintVal));
	}

	/// Append the BTI hint value of this operand as one immediate operand.
	void addBTIHintOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto HintVal = getBTIHint();
	  Inst.addOperand(MCOperand::createImm(HintVal));
	}

	/// Append the shifter immediate built by AArch64_AM::getShifterImm() from
	/// this operand's shift/extend type and amount.
	void addShifterOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  unsigned Encoded = AArch64_AM::getShifterImm(getShiftExtendType(),
	                                               getShiftExtendAmount());
	  MCOperand ShifterOperand = MCOperand::createImm(Encoded);
	  Inst.addOperand(ShifterOperand);
	}

	/// Append the arithmetic-extend immediate for this operand. An LSL
	/// shift/extend type is substituted with UXTW before encoding.
	void addExtendOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
	  const AArch64_AM::ShiftExtendType Effective =
	      Parsed == AArch64_AM::LSL ? AArch64_AM::UXTW : Parsed;
	  unsigned Encoded =
	      AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append the arithmetic-extend immediate for this operand. An LSL
	/// shift/extend type is substituted with UXTX before encoding.
	void addExtend64Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const AArch64_AM::ShiftExtendType Parsed = getShiftExtendType();
	  const AArch64_AM::ShiftExtendType Effective =
	      Parsed == AArch64_AM::LSL ? AArch64_AM::UXTX : Parsed;
	  unsigned Encoded =
	      AArch64_AM::getArithExtendImm(Effective, getShiftExtendAmount());
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append two immediate operands describing a memory extend:
	///  1. whether the extend type is signed (SXTW or SXTX), and
	///  2. whether the shift/extend amount is non-zero.
	void addMemExtendOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 2 && "Invalid number of operands!");
	  const AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	  const bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
	  const bool HasNonZeroAmount = getShiftExtendAmount() != 0;
	  Inst.addOperand(MCOperand::createImm(IsSigned));
	  Inst.addOperand(MCOperand::createImm(HasNonZeroAmount));
	}

	/// Append two immediate operands describing a memory extend:
	///  1. whether the extend type is signed (SXTW or SXTX), and
	///  2. the result of hasShiftExtendAmount().
	///
	/// For 8-bit load/store instructions with a register offset, both the
	/// "DoShift" and "NoShift" variants have a shift of 0. Because of this,
	/// they're disambiguated by whether the shift was explicit or implicit
	/// rather than its size.
	void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
	  assert(N == 2 && "Invalid number of operands!");
	  const AArch64_AM::ShiftExtendType ET = getShiftExtendType();
	  const bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
	  Inst.addOperand(MCOperand::createImm(IsSigned));
	  Inst.addOperand(MCOperand::createImm(hasShiftExtendAmount()));
	}

	/// Append the 16-bit chunk of the constant immediate that starts at bit
	/// \p Shift as one immediate operand.
	template<int Shift>
	void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  const uint64_t Bits = ConstImm->getValue();
	  const uint64_t Chunk = (Bits >> Shift) & 0xffff;
	  Inst.addOperand(MCOperand::createImm(Chunk));
	}

	/// Append the 16-bit chunk, starting at bit \p Shift, of the bitwise
	/// complement of the constant immediate as one immediate operand.
	template<int Shift>
	void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");

	  const auto *ConstImm = cast<MCConstantExpr>(getImm());
	  const uint64_t Bits = ConstImm->getValue();
	  const uint64_t Inverted = ~Bits;
	  const uint64_t Chunk = (Inverted >> Shift) & 0xffff;
	  Inst.addOperand(MCOperand::createImm(Chunk));
	}

	/// Append the constant immediate divided by 90 as one immediate operand.
	void addComplexRotationEvenOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto *Rotation = cast<MCConstantExpr>(getImm());
	  const auto Encoded = Rotation->getValue() / 90;
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	/// Append (immediate - 90) / 180 as one immediate operand.
	void addComplexRotationOddOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const auto *Rotation = cast<MCConstantExpr>(getImm());
	  const auto Encoded = (Rotation->getValue() - 90) / 180;
	  Inst.addOperand(MCOperand::createImm(Encoded));
	}

	void print(raw_ostream &OS) const override;

	/// Build a k_Token operand for \p Str. Only the data pointer and length of
	/// \p Str are stored, not a copy of the text. Start and end locations are
	/// both set to \p S.
	static std::unique_ptr<AArch64Operand>
	CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_Token, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = S;
	  Result->Tok.IsSuffix = IsSuffix;
	  Result->Tok.Data = Str.data();
	  Result->Tok.Length = Str.size();
	  return Result;
	}

	/// Build a k_Register operand.
	///
	/// \param RegNum  register number to store.
	/// \param Kind    register kind to store.
	/// \param S, E    start and end source locations.
	/// \param EqTy    register constraint equality type.
	/// \param ExtTy, ShiftAmount, HasExplicitAmount
	///                shift/extend information recorded on the register.
	///
	/// The element width is always initialised to 0.
	static std::unique_ptr<AArch64Operand>
	CreateReg(unsigned RegNum, RegKind Kind, SMLoc S, SMLoc E, MCContext &Ctx,
	          RegConstraintEqualityTy EqTy = RegConstraintEqualityTy::EqualsReg,
	          AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
	          unsigned ShiftAmount = 0,
	          unsigned HasExplicitAmount = false) {
	  auto Result = make_unique<AArch64Operand>(k_Register, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;

	  auto &RegInfo = Result->Reg;
	  RegInfo.RegNum = RegNum;
	  RegInfo.Kind = Kind;
	  RegInfo.ElementWidth = 0;
	  RegInfo.EqualityTy = EqTy;
	  RegInfo.ShiftExtend.Type = ExtTy;
	  RegInfo.ShiftExtend.Amount = ShiftAmount;
	  RegInfo.ShiftExtend.HasExplicitAmount = HasExplicitAmount;
	  return Result;
	}

	/// Build a register operand for a vector register. \p Kind must be
	/// NeonVector, SVEDataVector or SVEPredicateVector (asserted). The operand
	/// is created through CreateReg() with the EqualsReg constraint and then
	/// given \p ElementWidth.
	static std::unique_ptr<AArch64Operand>
	CreateVectorReg(unsigned RegNum, RegKind Kind, unsigned ElementWidth,
	                SMLoc S, SMLoc E, MCContext &Ctx,
	                AArch64_AM::ShiftExtendType ExtTy = AArch64_AM::LSL,
	                unsigned ShiftAmount = 0,
	                unsigned HasExplicitAmount = false) {
	  assert((Kind == RegKind::NeonVector || Kind == RegKind::SVEDataVector ||
	          Kind == RegKind::SVEPredicateVector) &&
	         "Invalid vector kind");
	  auto Op = CreateReg(RegNum, Kind, S, E, Ctx, EqualsReg, ExtTy, ShiftAmount,
	                      HasExplicitAmount);
	  Op->Reg.ElementWidth = ElementWidth;
	  return Op;
	}

	/// Build a k_VectorList operand, recording the first register number, the
	/// register count, the number of elements, the element width and the
	/// register kind, together with the source range [S, E].
	static std::unique_ptr<AArch64Operand>
	CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements,
	                 unsigned ElementWidth, RegKind RegisterKind, SMLoc S, SMLoc E,
	                 MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_VectorList, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;

	  auto &List = Result->VectorList;
	  List.RegNum = RegNum;
	  List.Count = Count;
	  List.NumElements = NumElements;
	  List.ElementWidth = ElementWidth;
	  List.RegisterKind = RegisterKind;
	  return Result;
	}

	/// Build a k_VectorIndex operand holding \p Idx with source range [S, E].
	static std::unique_ptr<AArch64Operand>
	CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_VectorIndex, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;
	  Result->VectorIndex.Val = Idx;
	  return Result;
	}

	/// Build a k_Immediate operand holding the expression \p Val with source
	/// range [S, E]. The expression pointer is stored as given.
	static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S,
	                                                 SMLoc E, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_Immediate, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;
	  Result->Imm.Val = Val;
	  return Result;
	}

	/// Build a k_ShiftedImm operand holding the expression \p Val and its
	/// \p ShiftAmount, with source range [S, E].
	static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val,
	                                                        unsigned ShiftAmount,
	                                                        SMLoc S, SMLoc E,
	                                                        MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_ShiftedImm, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;
	  Result->ShiftedImm.Val = Val;
	  Result->ShiftedImm.ShiftAmount = ShiftAmount;
	  return Result;
	}

	/// Build a k_CondCode operand holding \p Code with source range [S, E].
	static std::unique_ptr<AArch64Operand>
	CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_CondCode, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;
	  Result->CondCode.Code = Code;
	  return Result;
	}

	/// Build a k_FPImm operand. The value is stored as the sign-extended bit
	/// pattern of \p Val, along with the \p IsExact flag. Start and end
	/// locations are both set to \p S.
	static std::unique_ptr<AArch64Operand>
	CreateFPImm(APFloat Val, bool IsExact, SMLoc S, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_FPImm, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = S;
	  Result->FPImm.IsExact = IsExact;
	  Result->FPImm.Val = Val.bitcastToAPInt().getSExtValue();
	  return Result;
	}

	/// Build a k_Barrier operand holding the value \p Val and a reference
	/// (pointer + length, not a copy) to its spelling \p Str. Start and end
	/// locations are both set to \p S.
	static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val,
	                                                     StringRef Str,
	                                                     SMLoc S,
	                                                     MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_Barrier, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = S;
	  Result->Barrier.Data = Str.data();
	  Result->Barrier.Length = Str.size();
	  Result->Barrier.Val = Val;
	  return Result;
	}

	/// Build a k_SysReg operand. Stores a reference (pointer + length, not a
	/// copy) to the spelling \p Str plus the three values \p MRSReg,
	/// \p MSRReg and \p PStateField. Start and end locations are both \p S.
	static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S,
	                                                    uint32_t MRSReg,
	                                                    uint32_t MSRReg,
	                                                    uint32_t PStateField,
	                                                    MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_SysReg, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = S;

	  auto &Info = Result->SysReg;
	  Info.Data = Str.data();
	  Info.Length = Str.size();
	  Info.MRSReg = MRSReg;
	  Info.MSRReg = MSRReg;
	  Info.PStateField = PStateField;
	  return Result;
	}

	/// Build a k_SysCR operand holding \p Val with source range [S, E].
	static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S,
	                                                   SMLoc E, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_SysCR, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;
	  Result->SysCRImm.Val = Val;
	  return Result;
	}

	/// Build a k_Prefetch operand holding the prefetch value \p Val and a
	/// reference (pointer + length, not a copy) to its spelling \p Str. Start
	/// and end locations are both set to \p S.
	static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val,
	                                                      StringRef Str,
	                                                      SMLoc S,
	                                                      MCContext &Ctx) {
	  auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx);
	  Op->Prefetch.Val = Val;
	  // Store the name through the Prefetch member that belongs to this
	  // operand kind, instead of going through the unrelated Barrier member.
	  Op->Prefetch.Data = Str.data();
	  Op->Prefetch.Length = Str.size();
	  Op->StartLoc = S;
	  Op->EndLoc = S;
	  return Op;
	}

	/// Build a k_PSBHint operand holding the hint value \p Val and a
	/// reference (pointer + length, not a copy) to its spelling \p Str. Start
	/// and end locations are both set to \p S.
	static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val,
	                                                     StringRef Str,
	                                                     SMLoc S,
	                                                     MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_PSBHint, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = S;
	  Result->PSBHint.Data = Str.data();
	  Result->PSBHint.Length = Str.size();
	  Result->PSBHint.Val = Val;
	  return Result;
	}

	/// Build a k_BTIHint operand. The stored value is (\p Val << 1) | 32; a
	/// reference (pointer + length, not a copy) to the spelling \p Str is kept
	/// as well. Start and end locations are both set to \p S.
	static std::unique_ptr<AArch64Operand> CreateBTIHint(unsigned Val,
	                                                     StringRef Str,
	                                                     SMLoc S,
	                                                     MCContext &Ctx) {
	  auto Op = make_unique<AArch64Operand>(k_BTIHint, Ctx);
	  // Parenthesised to make the shift-before-or precedence explicit.
	  Op->BTIHint.Val = (Val << 1) | 32;
	  Op->BTIHint.Data = Str.data();
	  Op->BTIHint.Length = Str.size();
	  Op->StartLoc = S;
	  Op->EndLoc = S;
	  return Op;
	}

	/// Build a k_ShiftExtend operand of type \p ShOp with amount \p Val.
	/// \p HasExplicitAmount is stored alongside the amount. The source range
	/// is [S, E].
	static std::unique_ptr<AArch64Operand>
	CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val,
	                  bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) {
	  auto Result = make_unique<AArch64Operand>(k_ShiftExtend, Ctx);
	  Result->StartLoc = S;
	  Result->EndLoc = E;

	  auto &Ext = Result->ShiftExtend;
	  Ext.Type = ShOp;
	  Ext.Amount = Val;
	  Ext.HasExplicitAmount = HasExplicitAmount;
	  return Result;
	}
	};

	} // end anonymous namespace.

	/// Write a textual representation of this operand to \p OS. The format
	/// depends on the operand kind.
	///
	/// A register that carries a shift/extend (non-zero amount, or
	/// hasShiftExtendAmount() true) prints "<register N>" followed by the
	/// shift/extend text, so the k_Register case must sit directly above
	/// k_ShiftExtend for the fallthrough to land there.
	void AArch64Operand::print(raw_ostream &OS) const {
	  switch (Kind) {
	  case k_FPImm:
	    OS << "<fpimm " << getFPImm().bitcastToAPInt().getZExtValue();
	    if (!getFPImmIsExact())
	      OS << " (inexact)";
	    OS << ">";
	    break;
	  case k_Barrier: {
	    StringRef Name = getBarrierName();
	    if (!Name.empty())
	      OS << "<barrier " << Name << ">";
	    else
	      OS << "<barrier invalid #" << getBarrier() << ">";
	    break;
	  }
	  case k_Immediate:
	    OS << *getImm();
	    break;
	  case k_ShiftedImm: {
	    unsigned Shift = getShiftedImmShift();
	    OS << "<shiftedimm ";
	    OS << *getShiftedImmVal();
	    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
	    break;
	  }
	  case k_CondCode:
	    OS << "<condcode " << getCondCode() << ">";
	    break;
	  case k_VectorList: {
	    OS << "<vectorlist ";
	    unsigned Reg = getVectorListStart();
	    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
	      OS << Reg + i << " ";
	    OS << ">";
	    break;
	  }
	  case k_VectorIndex:
	    OS << "<vectorindex " << getVectorIndex() << ">";
	    break;
	  case k_SysReg:
	    OS << "<sysreg: " << getSysReg() << '>';
	    break;
	  case k_Token:
	    OS << "'" << getToken() << "'";
	    break;
	  case k_SysCR:
	    OS << "c" << getSysCR();
	    break;
	  case k_Prefetch: {
	    StringRef Name = getPrefetchName();
	    if (!Name.empty())
	      OS << "<prfop " << Name << ">";
	    else
	      OS << "<prfop invalid #" << getPrefetch() << ">";
	    break;
	  }
	  case k_PSBHint:
	    OS << getPSBHintName();
	    break;
	  case k_BTIHint:
	    OS << getBTIHintName();
	    break;
	  case k_Register:
	    OS << "<register " << getReg() << ">";
	    if (!getShiftExtendAmount() && !hasShiftExtendAmount())
	      break;
	    // The register has a shift/extend: print it using the k_ShiftExtend
	    // formatting below.
	    LLVM_FALLTHROUGH;
	  case k_ShiftExtend:
	    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
	       << getShiftExtendAmount();
	    if (!hasShiftExtendAmount())
	      OS << "<imp>";
	    OS << '>';
	    break;
	  }
	}

	/// @name Auto-generated Match Functions
	/// {

	static unsigned MatchRegisterName(StringRef Name);

	/// }

	/// Map a NEON vector register name ("v0" .. "v31", case-insensitive) to
	/// the corresponding AArch64::Q<n> register. Returns 0 when \p Name is not
	/// one of those spellings.
	static unsigned MatchNeonVectorRegName(StringRef Name) {
	  // Matching is done on the lower-cased spelling.
	  const std::string Lowered = Name.lower();
	  StringSwitch<unsigned> Matcher(Lowered);
	  return Matcher
	      .Case("v0", AArch64::Q0)
	      .Case("v1", AArch64::Q1)
	      .Case("v2", AArch64::Q2)
	      .Case("v3", AArch64::Q3)
	      .Case("v4", AArch64::Q4)
	      .Case("v5", AArch64::Q5)
	      .Case("v6", AArch64::Q6)
	      .Case("v7", AArch64::Q7)
	      .Case("v8", AArch64::Q8)
	      .Case("v9", AArch64::Q9)
	      .Case("v10", AArch64::Q10)
	      .Case("v11", AArch64::Q11)
	      .Case("v12", AArch64::Q12)
	      .Case("v13", AArch64::Q13)
	      .Case("v14", AArch64::Q14)
	      .Case("v15", AArch64::Q15)
	      .Case("v16", AArch64::Q16)
	      .Case("v17", AArch64::Q17)
	      .Case("v18", AArch64::Q18)
	      .Case("v19", AArch64::Q19)
	      .Case("v20", AArch64::Q20)
	      .Case("v21", AArch64::Q21)
	      .Case("v22", AArch64::Q22)
	      .Case("v23", AArch64::Q23)
	      .Case("v24", AArch64::Q24)
	      .Case("v25", AArch64::Q25)
	      .Case("v26", AArch64::Q26)
	      .Case("v27", AArch64::Q27)
	      .Case("v28", AArch64::Q28)
	      .Case("v29", AArch64::Q29)
	      .Case("v30", AArch64::Q30)
	      .Case("v31", AArch64::Q31)
	      .Default(0);
	}

	/// Parse a vector kind suffix (case-insensitive) for the given register
	/// kind.
	///
	/// \returns a pair (#elements, element-width) when \p Suffix is valid for
	/// \p VectorKind, or an empty Optional otherwise. A component is 0 where
	/// the number of elements or the width is implicit or explicitly unknown
	/// but the suffix is still valid. Only NeonVector, SVEPredicateVector and
	/// SVEDataVector are supported; any other kind is unreachable.
	static Optional<std::pair<int, int>> parseVectorKind(StringRef Suffix,
	                                                     RegKind VectorKind) {
	  using KindInfo = std::pair<int, int>;
	  // Sentinel marking "suffix not recognised".
	  const KindInfo Invalid = {-1, -1};
	  KindInfo Parsed = Invalid;

	  if (VectorKind == RegKind::NeonVector) {
	    Parsed = StringSwitch<KindInfo>(Suffix.lower())
	                 .Case("", {0, 0})
	                 .Case(".1d", {1, 64})
	                 .Case(".1q", {1, 128})
	                 // '.2h' needed for fp16 scalar pairwise reductions
	                 .Case(".2h", {2, 16})
	                 .Case(".2s", {2, 32})
	                 .Case(".2d", {2, 64})
	                 // '.4b' is another special case for the ARMv8.2a dot
	                 // product operand
	                 .Case(".4b", {4, 8})
	                 .Case(".4h", {4, 16})
	                 .Case(".4s", {4, 32})
	                 .Case(".8b", {8, 8})
	                 .Case(".8h", {8, 16})
	                 .Case(".16b", {16, 8})
	                 // Accept the width neutral ones, too, for verbose syntax.
	                 // If those aren't used in the right places, the token
	                 // operand won't match so all will work out.
	                 .Case(".b", {0, 8})
	                 .Case(".h", {0, 16})
	                 .Case(".s", {0, 32})
	                 .Case(".d", {0, 64})
	                 .Default(Invalid);
	  } else if (VectorKind == RegKind::SVEPredicateVector ||
	             VectorKind == RegKind::SVEDataVector) {
	    Parsed = StringSwitch<KindInfo>(Suffix.lower())
	                 .Case("", {0, 0})
	                 .Case(".b", {0, 8})
	                 .Case(".h", {0, 16})
	                 .Case(".s", {0, 32})
	                 .Case(".d", {0, 64})
	                 .Case(".q", {0, 128})
	                 .Default(Invalid);
	  } else {
	    llvm_unreachable("Unsupported RegKind");
	  }

	  if (Parsed == Invalid)
	    return Optional<KindInfo>();
	  return Optional<KindInfo>(Parsed);
	}

	/// Returns true when parseVectorKind() accepts \p Suffix for
	/// \p VectorKind.
	static bool isValidVectorKind(StringRef Suffix, RegKind VectorKind) {
	  const auto Parsed = parseVectorKind(Suffix, VectorKind);
	  return Parsed.hasValue();
	}

	/// Map an SVE data vector register name ("z0" .. "z31", case-insensitive)
	/// to the corresponding AArch64::Z<n> register. Returns 0 when \p Name is
	/// not one of those spellings.
	static unsigned matchSVEDataVectorRegName(StringRef Name) {
	  // Matching is done on the lower-cased spelling.
	  const std::string Lowered = Name.lower();
	  StringSwitch<unsigned> Matcher(Lowered);
	  return Matcher
	      .Case("z0", AArch64::Z0)
	      .Case("z1", AArch64::Z1)
	      .Case("z2", AArch64::Z2)
	      .Case("z3", AArch64::Z3)
	      .Case("z4", AArch64::Z4)
	      .Case("z5", AArch64::Z5)
	      .Case("z6", AArch64::Z6)
	      .Case("z7", AArch64::Z7)
	      .Case("z8", AArch64::Z8)
	      .Case("z9", AArch64::Z9)
	      .Case("z10", AArch64::Z10)
	      .Case("z11", AArch64::Z11)
	      .Case("z12", AArch64::Z12)
	      .Case("z13", AArch64::Z13)
	      .Case("z14", AArch64::Z14)
	      .Case("z15", AArch64::Z15)
	      .Case("z16", AArch64::Z16)
	      .Case("z17", AArch64::Z17)
	      .Case("z18", AArch64::Z18)
	      .Case("z19", AArch64::Z19)
	      .Case("z20", AArch64::Z20)
	      .Case("z21", AArch64::Z21)
	      .Case("z22", AArch64::Z22)
	      .Case("z23", AArch64::Z23)
	      .Case("z24", AArch64::Z24)
	      .Case("z25", AArch64::Z25)
	      .Case("z26", AArch64::Z26)
	      .Case("z27", AArch64::Z27)
	      .Case("z28", AArch64::Z28)
	      .Case("z29", AArch64::Z29)
	      .Case("z30", AArch64::Z30)
	      .Case("z31", AArch64::Z31)
	      .Default(0);
	}

	/// Map an SVE predicate register name ("p0" .. "p15", case-insensitive)
	/// to the corresponding AArch64::P<n> register. Returns 0 when \p Name is
	/// not one of those spellings.
	static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
	  // Matching is done on the lower-cased spelling.
	  const std::string Lowered = Name.lower();
	  StringSwitch<unsigned> Matcher(Lowered);
	  return Matcher
	      .Case("p0", AArch64::P0)
	      .Case("p1", AArch64::P1)
	      .Case("p2", AArch64::P2)
	      .Case("p3", AArch64::P3)
	      .Case("p4", AArch64::P4)
	      .Case("p5", AArch64::P5)
	      .Case("p6", AArch64::P6)
	      .Case("p7", AArch64::P7)
	      .Case("p8", AArch64::P8)
	      .Case("p9", AArch64::P9)
	      .Case("p10", AArch64::P10)
	      .Case("p11", AArch64::P11)
	      .Case("p12", AArch64::P12)
	      .Case("p13", AArch64::P13)
	      .Case("p14", AArch64::P14)
	      .Case("p15", AArch64::P15)
	      .Default(0);
	}

	bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
	SMLoc &EndLoc) {
	StartLoc = getLoc();
	auto Res = tryParseScalarRegister(RegNo);
	EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	return Res != MatchOperand_Success;
	}

	// Matches a register name or a register alias previously defined by '.req'.
	//
	// Lookup order: SVE data vector names, SVE predicate names, NEON vector
	// names, auto-generated scalar names, a few built-in scalar aliases, and
	// finally the '.req' table. As soon as a name is recognised, the result is
	// that register if it is of the requested Kind and 0 otherwise; later
	// tables are not consulted.
	unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
	                                                  RegKind Kind) {
	  if (unsigned SVEData = matchSVEDataVectorRegName(Name))
	    return Kind == RegKind::SVEDataVector ? SVEData : 0;

	  if (unsigned SVEPred = matchSVEPredicateVectorRegName(Name))
	    return Kind == RegKind::SVEPredicateVector ? SVEPred : 0;

	  if (unsigned Neon = MatchNeonVectorRegName(Name))
	    return Kind == RegKind::NeonVector ? Neon : 0;

	  // The parsed register must be of RegKind Scalar
	  if (unsigned Scalar = MatchRegisterName(Name))
	    return Kind == RegKind::Scalar ? Scalar : 0;

	  // Handle a few common aliases of registers.
	  const unsigned Alias = StringSwitch<unsigned>(Name.lower())
	                             .Case("fp", AArch64::FP)
	                             .Case("lr", AArch64::LR)
	                             .Case("x31", AArch64::XZR)
	                             .Case("w31", AArch64::WZR)
	                             .Default(0);
	  if (Alias)
	    return Kind == RegKind::Scalar ? Alias : 0;

	  // Check for aliases registered via .req. Canonicalize to lower case.
	  // That's more consistent since register names are case insensitive, and
	  // it's how the original entry was passed in from MC/MCParser/AsmParser.
	  auto Entry = RegisterReqs.find(Name.lower());
	  if (Entry == RegisterReqs.end())
	    return 0;

	  // Only report the aliased register if it is the right kind of register.
	  unsigned Aliased = 0;
	  if (Kind == Entry->getValue().first)
	    Aliased = Entry->getValue().second;
	  return Aliased;
	}

	/// tryParseScalarRegister - Try to parse a scalar register name.
	///
	/// If the current token is an identifier that names a scalar register (or
	/// an alias of one), the token is eaten, \p RegNum is set to the register
	/// and MatchOperand_Success is returned. Otherwise nothing is consumed,
	/// \p RegNum is left untouched and MatchOperand_NoMatch is returned.
	OperandMatchResultTy
	AArch64AsmParser::tryParseScalarRegister(unsigned &RegNum) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Current = Parser.getTok();
	  if (!Current.is(AsmToken::Identifier))
	    return MatchOperand_NoMatch;

	  const std::string Spelling = Current.getString().lower();
	  const unsigned Matched = matchRegisterNameAlias(Spelling, RegKind::Scalar);
	  if (!Matched)
	    return MatchOperand_NoMatch;

	  RegNum = Matched;
	  Parser.Lex(); // Eat identifier token.
	  return MatchOperand_Success;
	}

	/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
	///
	/// Accepts an identifier of the form cN / CN with N a decimal number in
	/// [0, 15]. On success the token is eaten and a SysCR operand is pushed
	/// onto \p Operands. On any mismatch an error is reported at the operand
	/// start and MatchOperand_ParseFail is returned.
	OperandMatchResultTy
	AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();

	  // All rejection paths report the same diagnostic.
	  auto Reject = [&]() {
	    Error(S, "Expected cN operand where 0 <= N <= 15");
	    return MatchOperand_ParseFail;
	  };

	  if (Parser.getTok().isNot(AsmToken::Identifier))
	    return Reject();

	  StringRef Tok = Parser.getTok().getIdentifier();
	  if (Tok[0] != 'c' && Tok[0] != 'C')
	    return Reject();

	  uint32_t CRNum;
	  // getAsInteger() returns true when the text is not a valid number.
	  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
	  if (BadNum || CRNum > 15)
	    return Reject();

	  Parser.Lex(); // Eat identifier token.
	  Operands.push_back(
	      AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
	  return MatchOperand_Success;
	}

	/// tryParsePrefetch - Try to parse a prefetch operand.
	///
	/// The operand is either an immediate (optionally preceded by '#') in the
	/// range [0, MaxVal], where MaxVal is 15 for SVE prefetches and 31
	/// otherwise, or an identifier naming a prefetch operation. On success a
	/// Prefetch operand carrying the encoding and its name (empty when an
	/// immediate has no named equivalent) is pushed onto \p Operands.
	template <bool IsSVEPrefetch>
	OperandMatchResultTy
	AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();
	  const AsmToken &Tok = Parser.getTok();

	  auto LookupByName = [](StringRef N) {
	    if (IsSVEPrefetch) {
	      if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByName(N))
	        return Optional<unsigned>(Res->Encoding);
	    } else if (auto Res = AArch64PRFM::lookupPRFMByName(N))
	      return Optional<unsigned>(Res->Encoding);
	    return Optional<unsigned>();
	  };

	  auto LookupByEncoding = [](unsigned E) {
	    if (IsSVEPrefetch) {
	      if (auto Res = AArch64SVEPRFM::lookupSVEPRFMByEncoding(E))
	        return Optional<StringRef>(Res->Name);
	    } else if (auto Res = AArch64PRFM::lookupPRFMByEncoding(E))
	      return Optional<StringRef>(Res->Name);
	    return Optional<StringRef>();
	  };
	  unsigned MaxVal = IsSVEPrefetch ? 15 : 31;

	  // Either an identifier for named values or a 5-bit immediate.
	  // Eat optional hash.
	  if (parseOptionalToken(AsmToken::Hash) ||
	      Tok.is(AsmToken::Integer)) {
	    const MCExpr *ImmVal;
	    if (getParser().parseExpression(ImmVal))
	      return MatchOperand_ParseFail;

	    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
	    if (!MCE) {
	      TokError("immediate value expected for prefetch operand");
	      return MatchOperand_ParseFail;
	    }
	    // Range-check the full 64-bit value so that large constants cannot
	    // wrap into the valid range when narrowed to unsigned.
	    const int64_t Value = MCE->getValue();
	    if (Value < 0 || Value > static_cast<int64_t>(MaxVal)) {
	      TokError("prefetch operand out of range, [0," + utostr(MaxVal) +
	               "] expected");
	      return MatchOperand_ParseFail;
	    }
	    const unsigned prfop = static_cast<unsigned>(Value);

	    auto PRFM = LookupByEncoding(prfop);
	    Operands.push_back(AArch64Operand::CreatePrefetch(
	        prfop, PRFM.getValueOr(""), S, getContext()));
	    return MatchOperand_Success;
	  }

	  if (Tok.isNot(AsmToken::Identifier)) {
	    TokError("prefetch hint expected");
	    return MatchOperand_ParseFail;
	  }

	  // Tok is a reference to the parser's current token, so take the spelling
	  // now: after Lex() it would describe the following token instead.
	  const StringRef Name = Tok.getString();
	  auto PRFM = LookupByName(Name);
	  if (!PRFM) {
	    TokError("prefetch hint expected");
	    return MatchOperand_ParseFail;
	  }

	  Parser.Lex(); // Eat identifier token.
	  Operands.push_back(AArch64Operand::CreatePrefetch(
	      *PRFM, Name, S, getContext()));
	  return MatchOperand_Success;
	}

	/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command.
	///
	/// The current token must be an identifier known to
	/// AArch64PSBHint::lookupPSBByName(). On success the token is eaten and a
	/// PSBHint operand carrying the encoding and the hint's spelling is pushed
	/// onto \p Operands; otherwise an error is reported and
	/// MatchOperand_ParseFail is returned.
	OperandMatchResultTy
	AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();
	  const AsmToken &Tok = Parser.getTok();
	  if (Tok.isNot(AsmToken::Identifier)) {
	    TokError("invalid operand for instruction");
	    return MatchOperand_ParseFail;
	  }

	  // Tok is a reference to the parser's current token, so take the spelling
	  // now: after Lex() it would describe the following token instead.
	  const StringRef Name = Tok.getString();
	  auto PSB = AArch64PSBHint::lookupPSBByName(Name);
	  if (!PSB) {
	    TokError("invalid operand for instruction");
	    return MatchOperand_ParseFail;
	  }

	  Parser.Lex(); // Eat identifier token.
	  Operands.push_back(AArch64Operand::CreatePSBHint(
	      PSB->Encoding, Name, S, getContext()));
	  return MatchOperand_Success;
	}

	/// tryParseBTIHint - Try to parse a BTI operand, mapped to Hint command.
	///
	/// The current token must be an identifier known to
	/// AArch64BTIHint::lookupBTIByName(). On success the token is eaten and a
	/// BTIHint operand carrying the encoding and the hint's spelling is pushed
	/// onto \p Operands; otherwise an error is reported and
	/// MatchOperand_ParseFail is returned.
	OperandMatchResultTy
	AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();
	  const AsmToken &Tok = Parser.getTok();
	  if (Tok.isNot(AsmToken::Identifier)) {
	    TokError("invalid operand for instruction");
	    return MatchOperand_ParseFail;
	  }

	  // Tok is a reference to the parser's current token, so take the spelling
	  // now: after Lex() it would describe the following token instead.
	  const StringRef Name = Tok.getString();
	  auto BTI = AArch64BTIHint::lookupBTIByName(Name);
	  if (!BTI) {
	    TokError("invalid operand for instruction");
	    return MatchOperand_ParseFail;
	  }

	  Parser.Lex(); // Eat identifier token.
	  Operands.push_back(AArch64Operand::CreateBTIHint(
	      BTI->Encoding, Name, S, getContext()));
	  return MatchOperand_Success;
	}

	/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
	/// instruction.
	///
	/// An optional leading '#' is skipped. If the parsed expression classifies
	/// as a symbol reference:
	///  - with no modifier at all, it is wrapped as VK_ABS_PAGE;
	///  - a Darwin GOTPAGE/TLVPPAGE reference with a non-zero addend is an
	///    error;
	///  - any modifier other than the accepted page-style ones is an error.
	/// Expressions that do not classify as a symbol reference are accepted
	/// unchanged. On success an immediate operand is pushed onto \p Operands.
	OperandMatchResultTy
	AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();
	  const MCExpr *Expr = nullptr;

	  if (Parser.getTok().is(AsmToken::Hash))
	    Parser.Lex(); // Eat hash token.

	  if (parseSymbolicImmVal(Expr))
	    return MatchOperand_ParseFail;

	  AArch64MCExpr::VariantKind ELFRefKind;
	  MCSymbolRefExpr::VariantKind DarwinRefKind;
	  int64_t Addend;
	  if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
	    const bool IsDarwinGotOrTlvPage =
	        DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
	        DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE;
	    const bool IsAcceptedELFKind =
	        ELFRefKind == AArch64MCExpr::VK_ABS_PAGE_NC ||
	        ELFRefKind == AArch64MCExpr::VK_GOT_PAGE ||
	        ELFRefKind == AArch64MCExpr::VK_GOTTPREL_PAGE ||
	        ELFRefKind == AArch64MCExpr::VK_TLSDESC_PAGE;

	    if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
	        ELFRefKind == AArch64MCExpr::VK_INVALID) {
	      // No modifier was specified at all; this is the syntax for an ELF
	      // basic ADRP relocation (unfortunately).
	      Expr =
	          AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
	    } else if (IsDarwinGotOrTlvPage && Addend != 0) {
	      Error(S, "gotpage label reference not allowed an addend");
	      return MatchOperand_ParseFail;
	    } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
	               !IsDarwinGotOrTlvPage && !IsAcceptedELFKind) {
	      // The operand must be an @page or @gotpage qualified symbolref.
	      Error(S, "page or gotpage label reference expected");
	      return MatchOperand_ParseFail;
	    }
	  }

	  // We have either a label reference possibly with addend or an immediate.
	  // The addend is a raw value here. The linker will adjust it to only
	  // reference the page.
	  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));

	  return MatchOperand_Success;
	}

	/// tryParseAdrLabel - Parse and validate a source label for the ADR
	/// instruction.
	///
	/// Operands that start with '[' are not handled here (NoMatch). An
	/// optional leading '#' is skipped. If the parsed expression classifies as
	/// a symbol reference it must carry no modifier, in which case it is
	/// wrapped as VK_ABS; any modifier is an error. On success an immediate
	/// operand is pushed onto \p Operands.
	OperandMatchResultTy
	AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
	  const SMLoc Start = getLoc();

	  // Leave anything with a bracket to the default for SVE
	  if (getParser().getTok().is(AsmToken::LBrac))
	    return MatchOperand_NoMatch;

	  if (getParser().getTok().is(AsmToken::Hash))
	    getParser().Lex(); // Eat hash token.

	  const MCExpr *Label = nullptr;
	  if (parseSymbolicImmVal(Label))
	    return MatchOperand_ParseFail;

	  AArch64MCExpr::VariantKind ELFRefKind;
	  MCSymbolRefExpr::VariantKind DarwinRefKind;
	  int64_t Addend;
	  if (classifySymbolRef(Label, ELFRefKind, DarwinRefKind, Addend)) {
	    const bool HasNoModifier = DarwinRefKind == MCSymbolRefExpr::VK_None &&
	                               ELFRefKind == AArch64MCExpr::VK_INVALID;
	    if (!HasNoModifier) {
	      Error(Start, "unexpected adr label");
	      return MatchOperand_ParseFail;
	    }
	    // No modifier was specified at all; this is the syntax for an ELF
	    // basic ADR relocation (unfortunately).
	    Label = AArch64MCExpr::create(Label, AArch64MCExpr::VK_ABS, getContext());
	  }

	  const SMLoc End = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	  Operands.push_back(
	      AArch64Operand::CreateImm(Label, Start, End, getContext()));
	  return MatchOperand_Success;
	}

	/// tryParseFPImm - A floating point immediate expression operand.
	///
	/// Accepts an optional '#', an optional '-', then a real or integer token.
	///  - An integer spelled with a "0x" prefix is taken as an encoded value;
	///    it must not exceed 255 and must not be negated.
	///  - Anything else is converted from its text as an IEEE double (rounding
	///    toward zero); the operand records whether the conversion was exact.
	///  - When \p AddFPZeroAsLiteral is set and the value is positive zero,
	///    the two tokens "#0" and ".0" are pushed instead of an FP immediate.
	/// Returns NoMatch if no number follows and no '#' was seen, ParseFail on
	/// errors, and Success after eating the number token.
	template<bool AddFPZeroAsLiteral>
	OperandMatchResultTy
	AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();

	  bool Hash = parseOptionalToken(AsmToken::Hash);

	  // Handle negation, as that still comes through as a separate token.
	  bool isNegative = parseOptionalToken(AsmToken::Minus);

	  const AsmToken &Tok = Parser.getTok();
	  if (!Tok.is(AsmToken::Real) && !Tok.is(AsmToken::Integer)) {
	    if (!Hash)
	      return MatchOperand_NoMatch;
	    TokError("invalid floating point immediate");
	    return MatchOperand_ParseFail;
	  }

	  // Parse hexadecimal representation.
	  if (Tok.is(AsmToken::Integer) && Tok.getString().startswith("0x")) {
	    if (Tok.getIntVal() > 255 || isNegative) {
	      TokError("encoded floating point value out of range");
	      return MatchOperand_ParseFail;
	    }

	    APFloat F((double)AArch64_AM::getFPImmFloat(Tok.getIntVal()));
	    Operands.push_back(
	        AArch64Operand::CreateFPImm(F, true, S, getContext()));
	  } else {
	    // Parse FP representation.
	    APFloat RealVal(APFloat::IEEEdouble());
	    auto Status =
	        RealVal.convertFromString(Tok.getString(), APFloat::rmTowardZero);
	    if (isNegative)
	      RealVal.changeSign();

	    if (AddFPZeroAsLiteral && RealVal.isPosZero()) {
	      Operands.push_back(
	          AArch64Operand::CreateToken("#0", false, S, getContext()));
	      Operands.push_back(
	          AArch64Operand::CreateToken(".0", false, S, getContext()));
	    } else
	      Operands.push_back(AArch64Operand::CreateFPImm(
	          RealVal, Status == APFloat::opOK, S, getContext()));
	  }

	  Parser.Lex(); // Eat the token.

	  return MatchOperand_Success;
	}

	/// tryParseImmWithOptionalShift - Parse immediate operand, optionally with
	/// a shift suffix, for example '#1, lsl #12'.
	///
	/// The operand must start with '#' or an integer token, otherwise NoMatch
	/// is returned. Without a following ',' a plain immediate operand is
	/// pushed. After ',' only 'lsl' followed by an optional '#' and a
	/// non-negative integer is accepted. A shift amount of 0 produces a plain
	/// immediate operand; any other amount produces a shifted-immediate
	/// operand.
	OperandMatchResultTy
	AArch64AsmParser::tryParseImmWithOptionalShift(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  SMLoc S = getLoc();

	  if (Parser.getTok().is(AsmToken::Hash))
	    Parser.Lex(); // Eat '#'
	  else if (Parser.getTok().isNot(AsmToken::Integer))
	    // Operand should start from # or should be integer, emit error otherwise.
	    return MatchOperand_NoMatch;

	  const MCExpr *Imm = nullptr;
	  if (parseSymbolicImmVal(Imm))
	    return MatchOperand_ParseFail;
	  else if (Parser.getTok().isNot(AsmToken::Comma)) {
	    SMLoc E = Parser.getTok().getLoc();
	    Operands.push_back(
	        AArch64Operand::CreateImm(Imm, S, E, getContext()));
	    return MatchOperand_Success;
	  }

	  // Eat ','
	  Parser.Lex();

	  // The optional operand must be "lsl #N" where N is non-negative.
	  if (!Parser.getTok().is(AsmToken::Identifier) ||
	      !Parser.getTok().getIdentifier().equals_lower("lsl")) {
	    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
	    return MatchOperand_ParseFail;
	  }

	  // Eat 'lsl'
	  Parser.Lex();

	  parseOptionalToken(AsmToken::Hash);

	  if (Parser.getTok().isNot(AsmToken::Integer)) {
	    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
	    return MatchOperand_ParseFail;
	  }

	  int64_t ShiftAmount = Parser.getTok().getIntVal();

	  if (ShiftAmount < 0) {
	    Error(Parser.getTok().getLoc(), "positive shift amount required");
	    return MatchOperand_ParseFail;
	  }
	  Parser.Lex(); // Eat the number

	  // Just in case the optional lsl #0 is used for immediates other than zero.
	  if (ShiftAmount == 0 && Imm != nullptr) {
	    SMLoc E = Parser.getTok().getLoc();
	    Operands.push_back(AArch64Operand::CreateImm(Imm, S, E, getContext()));
	    return MatchOperand_Success;
	  }

	  SMLoc E = Parser.getTok().getLoc();
	  Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
	                                                      S, E, getContext()));
	  return MatchOperand_Success;
	}

	/// parseCondCodeString - Map a condition code name onto its
	/// AArch64CC::CondCode value. Matching ignores case. The SVE alias names are
	/// consulted only when the name is not a base condition code and the
	/// subtarget has FeatureSVE. Unrecognised names give AArch64CC::Invalid.
	AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
	  const std::string Lowered = Cond.lower();

	  AArch64CC::CondCode Base = StringSwitch<AArch64CC::CondCode>(Lowered)
	                                 .Case("eq", AArch64CC::EQ)
	                                 .Case("ne", AArch64CC::NE)
	                                 .Case("cs", AArch64CC::HS)
	                                 .Case("hs", AArch64CC::HS)
	                                 .Case("cc", AArch64CC::LO)
	                                 .Case("lo", AArch64CC::LO)
	                                 .Case("mi", AArch64CC::MI)
	                                 .Case("pl", AArch64CC::PL)
	                                 .Case("vs", AArch64CC::VS)
	                                 .Case("vc", AArch64CC::VC)
	                                 .Case("hi", AArch64CC::HI)
	                                 .Case("ls", AArch64CC::LS)
	                                 .Case("ge", AArch64CC::GE)
	                                 .Case("lt", AArch64CC::LT)
	                                 .Case("gt", AArch64CC::GT)
	                                 .Case("le", AArch64CC::LE)
	                                 .Case("al", AArch64CC::AL)
	                                 .Case("nv", AArch64CC::NV)
	                                 .Default(AArch64CC::Invalid);
	  if (Base != AArch64CC::Invalid)
	    return Base;

	  // The remaining spellings are only recognised when SVE is available.
	  if (!getSTI().getFeatureBits()[AArch64::FeatureSVE])
	    return AArch64CC::Invalid;

	  return StringSwitch<AArch64CC::CondCode>(Lowered)
	      .Case("none", AArch64CC::EQ)
	      .Case("any", AArch64CC::NE)
	      .Case("nlast", AArch64CC::HS)
	      .Case("last", AArch64CC::LO)
	      .Case("first", AArch64CC::MI)
	      .Case("nfrst", AArch64CC::PL)
	      .Case("pmore", AArch64CC::HI)
	      .Case("plast", AArch64CC::LS)
	      .Case("tcont", AArch64CC::GE)
	      .Case("tstop", AArch64CC::LT)
	      .Default(AArch64CC::Invalid);
	}

	/// parseCondCode - Parse a Condition Code operand from the current
	/// identifier token and append it to Operands. When invertCondCode is set
	/// the inverted condition is stored instead; AL and NV are rejected in that
	/// mode. Returns false on success.
	bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
	                                     bool invertCondCode) {
	  MCAsmParser &Parser = getParser();
	  SMLoc StartLoc = getLoc();
	  const AsmToken &Tok = Parser.getTok();
	  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");

	  AArch64CC::CondCode CC = parseCondCodeString(Tok.getString());
	  if (CC == AArch64CC::Invalid)
	    return TokError("invalid condition code");
	  Parser.Lex(); // Consume the condition name.

	  if (invertCondCode) {
	    bool HasNoInverse = (CC == AArch64CC::AL || CC == AArch64CC::NV);
	    if (HasNoInverse)
	      return TokError("condition codes AL and NV are invalid for this instruction");
	    CC = AArch64CC::getInvertedCondCode(CC);
	  }

	  Operands.push_back(
	      AArch64Operand::CreateCondCode(CC, StartLoc, getLoc(), getContext()));
	  return false;
	}

	/// tryParseOptionalShiftExtend - Some operands take an optional shift or
	/// extend argument. Parse it if present.
	///
	/// The shift operators (lsl, lsr, asr, ror, msl) must be followed by an
	/// amount. The extend operators may omit it, in which case an amount of 0 is
	/// recorded with the accompanying flag set to false. Returns
	/// MatchOperand_NoMatch when the current token is not one of the known
	/// operator names.
	OperandMatchResultTy
	AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();
	  std::string Lowered = Tok.getString().lower();
	  AArch64_AM::ShiftExtendType Kind =
	      StringSwitch<AArch64_AM::ShiftExtendType>(Lowered)
	          .Case("lsl", AArch64_AM::LSL)
	          .Case("lsr", AArch64_AM::LSR)
	          .Case("asr", AArch64_AM::ASR)
	          .Case("ror", AArch64_AM::ROR)
	          .Case("msl", AArch64_AM::MSL)
	          .Case("uxtb", AArch64_AM::UXTB)
	          .Case("uxth", AArch64_AM::UXTH)
	          .Case("uxtw", AArch64_AM::UXTW)
	          .Case("uxtx", AArch64_AM::UXTX)
	          .Case("sxtb", AArch64_AM::SXTB)
	          .Case("sxth", AArch64_AM::SXTH)
	          .Case("sxtw", AArch64_AM::SXTW)
	          .Case("sxtx", AArch64_AM::SXTX)
	          .Default(AArch64_AM::InvalidShiftExtend);

	  if (Kind == AArch64_AM::InvalidShiftExtend)
	    return MatchOperand_NoMatch;

	  SMLoc StartLoc = Tok.getLoc();
	  Parser.Lex(); // Consume the operator name.

	  bool SawHash = parseOptionalToken(AsmToken::Hash);

	  if (!SawHash && getLexer().isNot(AsmToken::Integer)) {
	    // No amount follows. That is an error for the shift operators.
	    switch (Kind) {
	    case AArch64_AM::LSL:
	    case AArch64_AM::LSR:
	    case AArch64_AM::ASR:
	    case AArch64_AM::ROR:
	    case AArch64_AM::MSL:
	      TokError("expected #imm after shift specifier");
	      return MatchOperand_ParseFail;
	    default:
	      break;
	    }

	    // Extend operators default to an implicit amount of 0.
	    SMLoc EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	    Operands.push_back(AArch64Operand::CreateShiftExtend(
	        Kind, 0, false, StartLoc, EndLoc, getContext()));
	    return MatchOperand_Success;
	  }

	  // The amount has to start with a number, an identifier or a parenthesized
	  // expression.
	  SMLoc AmountLoc = Parser.getTok().getLoc();
	  bool StartsAmount = Parser.getTok().is(AsmToken::Integer) ||
	                      Parser.getTok().is(AsmToken::LParen) ||
	                      Parser.getTok().is(AsmToken::Identifier);
	  if (!StartsAmount) {
	    Error(AmountLoc, "expected integer shift amount");
	    return MatchOperand_ParseFail;
	  }

	  const MCExpr *AmountExpr;
	  if (getParser().parseExpression(AmountExpr))
	    return MatchOperand_ParseFail;

	  const MCConstantExpr *AmountConst = dyn_cast<MCConstantExpr>(AmountExpr);
	  if (!AmountConst) {
	    Error(AmountLoc, "expected constant '#imm' after shift specifier");
	    return MatchOperand_ParseFail;
	  }

	  SMLoc EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	  Operands.push_back(AArch64Operand::CreateShiftExtend(
	      Kind, AmountConst->getValue(), true, StartLoc, EndLoc, getContext()));
	  return MatchOperand_Success;
	}

	// Table pairing each architecture extension name with the subtarget feature
	// bits it stands for. Entries with an empty feature set are names that are
	// listed but have no feature attached here (see the FIXME below).
	static const struct Extension {
	const char *Name;
	const FeatureBitset Features;
	} ExtensionMap[] = {
	{"crc", {AArch64::FeatureCRC}},
	{"sm4", {AArch64::FeatureSM4}},
	{"sha3", {AArch64::FeatureSHA3}},
	{"sha2", {AArch64::FeatureSHA2}},
	{"aes", {AArch64::FeatureAES}},
	{"crypto", {AArch64::FeatureCrypto}},
	{"fp", {AArch64::FeatureFPARMv8}},
	{"simd", {AArch64::FeatureNEON}},
	{"ras", {AArch64::FeatureRAS}},
	{"lse", {AArch64::FeatureLSE}},
	{"predres", {AArch64::FeaturePredRes}},
	{"ccdp", {AArch64::FeatureCacheDeepPersist}},
	{"mte", {AArch64::FeatureMTE}},
	{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
	{"pan-rwv", {AArch64::FeaturePAN_RWV}},
	{"ccpp", {AArch64::FeatureCCPP}},
	{"sve", {AArch64::FeatureSVE}},
	{"sve2", {AArch64::FeatureSVE2}},
	{"sve2-aes", {AArch64::FeatureSVE2AES}},
	{"sve2-sm4", {AArch64::FeatureSVE2SM4}},
	{"sve2-sha3", {AArch64::FeatureSVE2SHA3}},
	- {"bitperm", {AArch64::FeatureSVE2BitPerm}},
	+ {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}},
	// FIXME: Unsupported extensions
	{"pan", {}},
	{"lor", {}},
	{"rdma", {}},
	{"profile", {}},
	};

	/// Append to Str a name for the requirement described by FBS: an
	/// architecture version if one of the version bits is set, otherwise the
	/// name of the first ExtensionMap entry sharing a feature bit with FBS, or
	/// "(unknown)" when no entry matches.
	static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
	  // Architecture versions win over extensions and are tested from v8.1a up.
	  if (FBS[AArch64::HasV8_1aOps]) {
	    Str += "ARMv8.1a";
	    return;
	  }
	  if (FBS[AArch64::HasV8_2aOps]) {
	    Str += "ARMv8.2a";
	    return;
	  }
	  if (FBS[AArch64::HasV8_3aOps]) {
	    Str += "ARMv8.3a";
	    return;
	  }
	  if (FBS[AArch64::HasV8_4aOps]) {
	    Str += "ARMv8.4a";
	    return;
	  }
	  if (FBS[AArch64::HasV8_5aOps]) {
	    Str += "ARMv8.5a";
	    return;
	  }

	  // Intersect rather than compare for equality, since several features may
	  // be set in FBS at once.
	  const char *ExtName = "(unknown)";
	  for (const Extension &Candidate : ExtensionMap) {
	    if ((FBS & Candidate.Features) != FeatureBitset()) {
	      ExtName = Candidate.Name;
	      break;
	    }
	  }
	  Str += ExtName;
	}

	void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
	SMLoc S) {
	const uint16_t Op2 = Encoding & 7;
	const uint16_t Cm = (Encoding & 0x78) >> 3;
	const uint16_t Cn = (Encoding & 0x780) >> 7;
	const uint16_t Op1 = (Encoding & 0x3800) >> 11;

	const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());

	Operands.push_back(
	AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
	Operands.push_back(
	AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
	Operands.push_back(
	AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
	Expr = MCConstantExpr::create(Op2, getContext());
	Operands.push_back(
	AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
	}

	/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
	/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
	/// The prediction restriction mnemonics cfp, dvp and cpp are handled the same
	/// way.
	///
	/// Name is the mnemonic and NameLoc its location. A "sys" token followed by
	/// the operands produced by createSysAlias is appended to Operands, plus a
	/// register operand when one is written. Returns false on success; every
	/// error path returns the result of TokError or true.
	bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
	OperandVector &Operands) {
	// A mnemonic containing '.' is rejected outright.
	if (Name.find('.') != StringRef::npos)
	return TokError("invalid operand");

	Mnemonic = Name;
	// Whatever the alias, the instruction being built is "sys".
	Operands.push_back(
	AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));

	MCAsmParser &Parser = getParser();
	const AsmToken &Tok = Parser.getTok();
	// Op is the operation name following the mnemonic; S is where it starts.
	StringRef Op = Tok.getString();
	SMLoc S = Tok.getLoc();

	// Each branch looks the operation up by name, rejects unknown names and
	// operations whose required features are not enabled, and then expands the
	// operation's encoding into SYS operands.
	if (Mnemonic == "ic") {
	const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
	if (!IC)
	return TokError("invalid operand for IC instruction");
	else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
	std::string Str("IC " + std::string(IC->Name) + " requires ");
	setRequiredFeatureString(IC->getRequiredFeatures(), Str);
	return TokError(Str.c_str());
	}
	createSysAlias(IC->Encoding, Operands, S);
	} else if (Mnemonic == "dc") {
	const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
	if (!DC)
	return TokError("invalid operand for DC instruction");
	else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
	std::string Str("DC " + std::string(DC->Name) + " requires ");
	setRequiredFeatureString(DC->getRequiredFeatures(), Str);
	return TokError(Str.c_str());
	}
	createSysAlias(DC->Encoding, Operands, S);
	} else if (Mnemonic == "at") {
	const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
	if (!AT)
	return TokError("invalid operand for AT instruction");
	else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
	std::string Str("AT " + std::string(AT->Name) + " requires ");
	setRequiredFeatureString(AT->getRequiredFeatures(), Str);
	return TokError(Str.c_str());
	}
	createSysAlias(AT->Encoding, Operands, S);
	} else if (Mnemonic == "tlbi") {
	const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
	if (!TLBI)
	return TokError("invalid operand for TLBI instruction");
	else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
	std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
	setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
	return TokError(Str.c_str());
	}
	createSysAlias(TLBI->Encoding, Operands, S);
	} else if (Mnemonic == "cfp" \|\| Mnemonic == "dvp" \|\| Mnemonic == "cpp") {
	const AArch64PRCTX::PRCTX *PRCTX = AArch64PRCTX::lookupPRCTXByName(Op);
	if (!PRCTX)
	return TokError("invalid operand for prediction restriction instruction");
	else if (!PRCTX->haveFeatures(getSTI().getFeatureBits())) {
	// The message joins the upper-cased mnemonic and the operation name with
	// no separator between them.
	std::string Str(
	Mnemonic.upper() + std::string(PRCTX->Name) + " requires ");
	setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str);
	return TokError(Str.c_str());
	}
	// The mnemonic selects the low three bits; the table encoding supplies the
	// bits above them.
	uint16_t PRCTX_Op2 =
	Mnemonic == "cfp" ? 4 :
	Mnemonic == "dvp" ? 5 :
	Mnemonic == "cpp" ? 7 :
	0;
	assert(PRCTX_Op2 && "Invalid mnemonic for prediction restriction instruction");
	createSysAlias(PRCTX->Encoding << 3 \| PRCTX_Op2 , Operands, S);
	}

	Parser.Lex(); // Eat operand.

	// An operation takes a register unless its name contains "all" (compared
	// case-insensitively).
	bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
	bool HasRegister = false;

	// Check for the optional register operand.
	if (parseOptionalToken(AsmToken::Comma)) {
	if (Tok.isNot(AsmToken::Identifier) \|\| parseRegister(Operands))
	return TokError("expected register operand");
	HasRegister = true;
	}

	// The presence of a register has to agree with what the operation expects.
	if (ExpectRegister && !HasRegister)
	return TokError("specified " + Mnemonic + " op requires a register");
	else if (!ExpectRegister && HasRegister)
	return TokError("specified " + Mnemonic + " op does not use a register");

	if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
	return true;

	return false;
	}

	/// tryParseBarrierOperand - Parse the operand of a barrier instruction,
	/// written either as an immediate in the range 0..15 (with or without '#')
	/// or as a named option. For "isb" the only accepted name is 'sy'; for "tsb"
	/// the only accepted operand is the name 'csync'.
	OperandMatchResultTy
	AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();

	  if (Mnemonic == "tsb" && Tok.isNot(AsmToken::Identifier)) {
	    TokError("'csync' operand expected");
	    return MatchOperand_ParseFail;
	  }

	  // Can be either a #imm style literal or an option name.
	  if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) {
	    // Immediate form.
	    SMLoc ExprLoc = getLoc();
	    const MCExpr *ImmExpr;
	    if (getParser().parseExpression(ImmExpr))
	      return MatchOperand_ParseFail;

	    const MCConstantExpr *ImmConst = dyn_cast<MCConstantExpr>(ImmExpr);
	    if (!ImmConst) {
	      Error(ExprLoc, "immediate value expected for barrier operand");
	      return MatchOperand_ParseFail;
	    }

	    bool InRange = ImmConst->getValue() >= 0 && ImmConst->getValue() <= 15;
	    if (!InRange) {
	      Error(ExprLoc, "barrier operand out of range");
	      return MatchOperand_ParseFail;
	    }

	    // Attach the option's name when the encoding has one.
	    auto DB = AArch64DB::lookupDBByEncoding(ImmConst->getValue());
	    Operands.push_back(AArch64Operand::CreateBarrier(
	        ImmConst->getValue(), DB ? DB->Name : "", ExprLoc, getContext()));
	    return MatchOperand_Success;
	  }

	  // Named form.
	  if (Tok.isNot(AsmToken::Identifier)) {
	    TokError("invalid operand for instruction");
	    return MatchOperand_ParseFail;
	  }

	  auto TSB = AArch64TSB::lookupTSBByName(Tok.getString());
	  auto DB = AArch64DB::lookupDBByName(Tok.getString());

	  // The only valid named option for ISB is 'sy'.
	  if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
	    TokError("'sy' or #imm operand expected");
	    return MatchOperand_ParseFail;
	  }
	  // The only valid named option for TSB is 'csync'.
	  if (Mnemonic == "tsb" && (!TSB || TSB->Encoding != AArch64TSB::csync)) {
	    TokError("'csync' operand expected");
	    return MatchOperand_ParseFail;
	  }
	  if (!DB && !TSB) {
	    TokError("invalid barrier option name");
	    return MatchOperand_ParseFail;
	  }

	  Operands.push_back(AArch64Operand::CreateBarrier(
	      DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(),
	      getContext()));
	  Parser.Lex(); // Consume the option name.

	  return MatchOperand_Success;
	}

	/// tryParseSysReg - Parse an identifier as a system register operand.
	///
	/// The operand records three values: the encoding to use when reading, the
	/// encoding to use when writing, and a PState encoding. The first two are -1
	/// when a named register is not readable/writeable; names that are not known
	/// (or whose features are not enabled) go through parseGenericRegister
	/// instead. The PState value stays at -1 (as unsigned) unless the name is a
	/// known PState field with its features enabled.
	OperandMatchResultTy
	AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();

	  if (Tok.isNot(AsmToken::Identifier))
	    return MatchOperand_NoMatch;

	  int MRSReg;
	  int MSRReg;
	  auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString());
	  bool IsUsableNamedReg =
	      SysReg && SysReg->haveFeatures(getSTI().getFeatureBits());
	  if (IsUsableNamedReg) {
	    MRSReg = SysReg->Readable ? SysReg->Encoding : -1;
	    MSRReg = SysReg->Writeable ? SysReg->Encoding : -1;
	  } else {
	    MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString());
	    MRSReg = MSRReg;
	  }

	  unsigned PStateImm = -1;
	  auto PState = AArch64PState::lookupPStateByName(Tok.getString());
	  if (PState && PState->haveFeatures(getSTI().getFeatureBits()))
	    PStateImm = PState->Encoding;

	  Operands.push_back(AArch64Operand::CreateSysReg(
	      Tok.getString(), getLoc(), MRSReg, MSRReg, PStateImm, getContext()));
	  Parser.Lex(); // Consume the identifier.

	  return MatchOperand_Success;
	}

	/// tryParseNeonVectorRegister - Parse a Neon vector register operand, its
	/// optional kind qualifier and an optional trailing vector index. Returns
	/// true when no vector register was parsed or the index failed to parse.
	bool AArch64AsmParser::tryParseNeonVectorRegister(OperandVector &Operands) {
	  if (getParser().getTok().isNot(AsmToken::Identifier))
	    return true;

	  SMLoc StartLoc = getLoc();

	  // Register name first, together with any ".<kind>" qualifier.
	  StringRef Qualifier;
	  unsigned VecReg;
	  if (tryParseVectorRegister(VecReg, Qualifier, RegKind::NeonVector) !=
	      MatchOperand_Success)
	    return true;

	  const auto &KindInfo = parseVectorKind(Qualifier, RegKind::NeonVector);
	  if (!KindInfo)
	    return true;

	  unsigned ElementWidth = KindInfo->second;
	  Operands.push_back(AArch64Operand::CreateVectorReg(
	      VecReg, RegKind::NeonVector, ElementWidth, StartLoc, getLoc(),
	      getContext()));

	  // An explicit qualifier is also emitted as a literal text operand.
	  if (!Qualifier.empty())
	    Operands.push_back(
	        AArch64Operand::CreateToken(Qualifier, false, StartLoc, getContext()));

	  return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
	}

	/// tryParseVectorIndex - Parse an optional "[<imm>]" vector index and append
	/// it to Operands. Returns MatchOperand_NoMatch when no '[' is present.
	OperandMatchResultTy
	AArch64AsmParser::tryParseVectorIndex(OperandVector &Operands) {
	  SMLoc IndexStart = getLoc();
	  if (!parseOptionalToken(AsmToken::LBrac))
	    return MatchOperand_NoMatch;

	  const MCExpr *IndexExpr;
	  // NOTE(review): this reports NoMatch although '[' has already been
	  // consumed — confirm callers rely on that.
	  if (getParser().parseExpression(IndexExpr))
	    return MatchOperand_NoMatch;

	  const MCConstantExpr *IndexConst = dyn_cast<MCConstantExpr>(IndexExpr);
	  if (!IndexConst) {
	    TokError("immediate value expected for vector index");
	    return MatchOperand_ParseFail;
	  }

	  // The end location is taken before the closing bracket is consumed.
	  SMLoc IndexEnd = getLoc();

	  if (parseToken(AsmToken::RBrac, "']' expected"))
	    return MatchOperand_ParseFail;

	  Operands.push_back(AArch64Operand::CreateVectorIndex(
	      IndexConst->getValue(), IndexStart, IndexEnd, getContext()));
	  return MatchOperand_Success;
	}

	// tryParseVectorRegister - Try to parse a vector register name of the given
	// MatchKind, optionally followed by a ".<kind>" specifier. On success the
	// token is consumed, Reg receives the register and Kind the specifier
	// (including the leading '.') if one was written.
	OperandMatchResultTy
	AArch64AsmParser::tryParseVectorRegister(unsigned &Reg, StringRef &Kind,
	                                         RegKind MatchKind) {
	  MCAsmParser &Parser = getParser();
	  const AsmToken &Tok = Parser.getTok();

	  if (Tok.isNot(AsmToken::Identifier))
	    return MatchOperand_NoMatch;

	  // The register name is everything up to the first '.', if there is one.
	  StringRef Spelling = Tok.getString();
	  size_t DotPos = Spelling.find('.');
	  StringRef RegName = Spelling.slice(0, DotPos);

	  unsigned Matched = matchRegisterNameAlias(RegName, MatchKind);
	  if (!Matched)
	    return MatchOperand_NoMatch;

	  bool HasSpecifier = (DotPos != StringRef::npos);
	  if (HasSpecifier) {
	    Kind = Spelling.slice(DotPos, StringRef::npos);
	    if (!isValidVectorKind(Kind, MatchKind)) {
	      TokError("invalid vector kind qualifier");
	      return MatchOperand_ParseFail;
	    }
	  }

	  Parser.Lex(); // Consume the register token.

	  Reg = Matched;
	  return MatchOperand_Success;
	}

	/// tryParseSVEPredicateVector - Parse a SVE predicate register operand,
	/// together with an optional "/z" or "/m" predication qualifier. The
	/// qualifier is emitted as two literal tokens ("/" and "z" or "m") and is
	/// rejected when the register carries an element size suffix.
	OperandMatchResultTy
	AArch64AsmParser::tryParseSVEPredicateVector(OperandVector &Operands) {
	  const SMLoc StartLoc = getLoc();

	  // The predicate register itself comes first.
	  StringRef Suffix;
	  unsigned PredReg;
	  auto RegRes =
	      tryParseVectorRegister(PredReg, Suffix, RegKind::SVEPredicateVector);
	  if (RegRes != MatchOperand_Success)
	    return RegRes;

	  const auto &KindInfo = parseVectorKind(Suffix, RegKind::SVEPredicateVector);
	  if (!KindInfo)
	    return MatchOperand_NoMatch;

	  unsigned ElementWidth = KindInfo->second;
	  Operands.push_back(AArch64Operand::CreateVectorReg(
	      PredReg, RegKind::SVEPredicateVector, ElementWidth, StartLoc, getLoc(),
	      getContext()));

	  // A '/m' or '/z' qualifier is optional.
	  MCAsmParser &Parser = getParser();
	  if (Parser.getTok().isNot(AsmToken::Slash))
	    return MatchOperand_Success;

	  // With a qualifier present, an element type suffix is not allowed.
	  if (!Suffix.empty()) {
	    Error(StartLoc, "not expecting size suffix");
	    return MatchOperand_ParseFail;
	  }

	  // The slash goes in as a literal token.
	  Operands.push_back(
	      AArch64Operand::CreateToken("/", false, getLoc(), getContext()));
	  Parser.Lex(); // Consume '/'.

	  // Zeroing or merging?
	  auto Predication = Parser.getTok().getString().lower();
	  bool IsZeroing = (Predication == "z");
	  if (!IsZeroing && Predication != "m") {
	    Error(getLoc(), "expecting 'm' or 'z' predication");
	    return MatchOperand_ParseFail;
	  }

	  const char *PredicationTok = IsZeroing ? "z" : "m";
	  Operands.push_back(AArch64Operand::CreateToken(PredicationTok, false,
	                                                 getLoc(), getContext()));
	  Parser.Lex(); // Consume the zero/merge token.

	  return MatchOperand_Success;
	}

	/// parseRegister - Parse a register operand: a Neon vector register is tried
	/// first, then a scalar register. Returns false if either one was parsed.
	bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
	  // The scalar attempt only runs when the Neon attempt reported failure.
	  return tryParseNeonVectorRegister(Operands) &&
	         tryParseGPROperand<false>(Operands) != MatchOperand_Success;
	}

	/// parseSymbolicImmVal - Parse an expression that may be prefixed with a
	/// relocation specifier of the form ":<name>:". The parsed expression is
	/// stored in ImmVal; when a specifier was present the expression is wrapped
	/// in an AArch64MCExpr carrying the matching variant kind. Returns false on
	/// success; every error path returns the result of TokError or true.
	bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
	MCAsmParser &Parser = getParser();
	bool HasELFModifier = false;
	// Only assigned, and only read, when a specifier is present.
	AArch64MCExpr::VariantKind RefKind;

	if (parseOptionalToken(AsmToken::Colon)) {
	HasELFModifier = true;

	if (Parser.getTok().isNot(AsmToken::Identifier))
	return TokError("expect relocation specifier in operand after ':'");

	// Specifier names are matched case-insensitively.
	std::string LowerCase = Parser.getTok().getIdentifier().lower();
	RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
	.Case("lo12", AArch64MCExpr::VK_LO12)
	.Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
	.Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
	.Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
	.Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
	.Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
	.Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
	.Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
	.Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
	.Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
	.Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
	.Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
	.Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
	.Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
	.Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
	.Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
	.Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
	.Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
	.Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
	.Case("pg_hi21_nc", AArch64MCExpr::VK_ABS_PAGE_NC)
	.Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
	.Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
	.Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
	.Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
	.Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
	.Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
	.Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
	.Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
	.Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
	.Case("got", AArch64MCExpr::VK_GOT_PAGE)
	.Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
	.Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
	.Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
	.Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
	.Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
	.Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
	.Case("secrel_lo12", AArch64MCExpr::VK_SECREL_LO12)
	.Case("secrel_hi12", AArch64MCExpr::VK_SECREL_HI12)
	.Default(AArch64MCExpr::VK_INVALID);

	// An unknown name gets the same diagnostic as a missing one.
	if (RefKind == AArch64MCExpr::VK_INVALID)
	return TokError("expect relocation specifier in operand after ':'");

	Parser.Lex(); // Eat identifier

	// The specifier is closed by a second ':'.
	if (parseToken(AsmToken::Colon, "expect ':' after relocation specifier"))
	return true;
	}

	if (getParser().parseExpression(ImmVal))
	return true;

	// Attach the specifier to the expression it qualifies.
	if (HasELFModifier)
	ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext());

	return false;
	}

	/// tryParseVectorList - Parse a brace-enclosed list of vector registers of
	/// kind VectorKind, written either as a range ("{ a - b }") or as a
	/// comma-separated sequence, and append a single vector list operand.
	///
	/// ExpectMatch controls what happens when the first element is not a
	/// register of this kind: if true it is reported as an error, otherwise the
	/// '{' is pushed back and MatchOperand_NoMatch is returned so that another
	/// list parser can be tried.
	template <RegKind VectorKind>
	OperandMatchResultTy
	AArch64AsmParser::tryParseVectorList(OperandVector &Operands,
	bool ExpectMatch) {
	MCAsmParser &Parser = getParser();
	if (!Parser.getTok().is(AsmToken::LCurly))
	return MatchOperand_NoMatch;

	// Wrapper around parse function
	// Parses one register and reports "vector register expected" at Loc when
	// the token is not an identifier, when parsing failed outright, or when
	// there was no match and NoMatchIsError is set.
	auto ParseVector = [this, &Parser](unsigned &Reg, StringRef &Kind, SMLoc Loc,
	bool NoMatchIsError) {
	// Copy the token first: a successful parse consumes it.
	auto RegTok = Parser.getTok();
	auto ParseRes = tryParseVectorRegister(Reg, Kind, VectorKind);
	if (ParseRes == MatchOperand_Success) {
	if (parseVectorKind(Kind, VectorKind))
	return ParseRes;
	llvm_unreachable("Expected a valid vector kind");
	}

	if (RegTok.isNot(AsmToken::Identifier) \|\|
	ParseRes == MatchOperand_ParseFail \|\|
	(ParseRes == MatchOperand_NoMatch && NoMatchIsError)) {
	Error(Loc, "vector register expected");
	return MatchOperand_ParseFail;
	}

	return MatchOperand_NoMatch;
	};

	SMLoc S = getLoc();
	// Keep a copy of the '{' token so it can be pushed back below.
	auto LCurly = Parser.getTok();
	Parser.Lex(); // Eat left bracket token.

	StringRef Kind;
	unsigned FirstReg;
	auto ParseRes = ParseVector(FirstReg, Kind, getLoc(), ExpectMatch);

	// Put back the original left bracket if there was no match, so that
	// different types of list-operands can be matched (e.g. SVE, Neon).
	if (ParseRes == MatchOperand_NoMatch)
	Parser.getLexer().UnLex(LCurly);

	if (ParseRes != MatchOperand_Success)
	return ParseRes;

	int64_t PrevReg = FirstReg;
	unsigned Count = 1;

	// Range form: "{ first - last }".
	if (parseOptionalToken(AsmToken::Minus)) {
	SMLoc Loc = getLoc();
	StringRef NextKind;

	unsigned Reg;
	ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
	if (ParseRes != MatchOperand_Success)
	return ParseRes;

	// Any Kind suffixes must match on all regs in the list.
	if (Kind != NextKind) {
	Error(Loc, "mismatched register size suffix");
	return MatchOperand_ParseFail;
	}

	// Distance from the first to the last register, adding 32 when the last
	// register is not greater than the first.
	unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);

	// Together with the first register this allows two to four registers.
	if (Space == 0 \|\| Space > 3) {
	Error(Loc, "invalid number of vectors");
	return MatchOperand_ParseFail;
	}

	Count += Space;
	}
	else {
	// Sequence form: "{ a, b, ... }".
	while (parseOptionalToken(AsmToken::Comma)) {
	SMLoc Loc = getLoc();
	StringRef NextKind;
	unsigned Reg;
	ParseRes = ParseVector(Reg, NextKind, getLoc(), true);
	if (ParseRes != MatchOperand_Success)
	return ParseRes;

	// Any Kind suffixes must match on all regs in the list.
	if (Kind != NextKind) {
	Error(Loc, "mismatched register size suffix");
	return MatchOperand_ParseFail;
	}

	// Registers must be incremental (with wraparound at 31)
	if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
	(getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) {
	Error(Loc, "registers must be sequential");
	return MatchOperand_ParseFail;
	}

	PrevReg = Reg;
	++Count;
	}
	}

	if (parseToken(AsmToken::RCurly, "'}' expected"))
	return MatchOperand_ParseFail;

	// The sequence form is not bounded while parsing, so check the total here.
	if (Count > 4) {
	Error(S, "invalid number of vectors");
	return MatchOperand_ParseFail;
	}

	// Both stay 0 when the registers were written without a kind suffix.
	unsigned NumElements = 0;
	unsigned ElementWidth = 0;
	if (!Kind.empty()) {
	if (const auto &VK = parseVectorKind(Kind, VectorKind))
	std::tie(NumElements, ElementWidth) = *VK;
	}

	Operands.push_back(AArch64Operand::CreateVectorList(
	FirstReg, Count, NumElements, ElementWidth, VectorKind, S, getLoc(),
	getContext()));

	return MatchOperand_Success;
	}

	/// parseNeonVectorList - Parse a vector list operand for AdvSIMD
	/// instructions, followed by an optional vector index. Returns true when the
	/// list could not be parsed or the index failed to parse.
	bool AArch64AsmParser::parseNeonVectorList(OperandVector &Operands) {
	  // Anything short of a successful list parse is a failure here.
	  if (tryParseVectorList<RegKind::NeonVector>(Operands, true) !=
	      MatchOperand_Success)
	    return true;

	  return tryParseVectorIndex(Operands) == MatchOperand_ParseFail;
	}

	/// tryParseGPR64sp0Operand - Parse a scalar register that may be followed by
	/// ", #0" (the '#' being optional). Any other index is rejected with "index
	/// must be absent or #0". The index does not appear in the resulting
	/// operand.
	OperandMatchResultTy
	AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
	  SMLoc StartLoc = getLoc();

	  unsigned RegNum;
	  OperandMatchResultTy RegRes = tryParseScalarRegister(RegNum);
	  if (RegRes != MatchOperand_Success)
	    return RegRes;

	  if (parseOptionalToken(AsmToken::Comma)) {
	    parseOptionalToken(AsmToken::Hash);

	    if (getParser().getTok().isNot(AsmToken::Integer)) {
	      Error(getLoc(), "index must be absent or #0");
	      return MatchOperand_ParseFail;
	    }

	    // The index has to be an expression that evaluates to constant zero.
	    const MCExpr *IndexExpr;
	    bool IsZeroIndex = false;
	    if (!getParser().parseExpression(IndexExpr)) {
	      if (const auto *IndexConst = dyn_cast<MCConstantExpr>(IndexExpr))
	        IsZeroIndex = (IndexConst->getValue() == 0);
	    }
	    if (!IsZeroIndex) {
	      Error(getLoc(), "index must be absent or #0");
	      return MatchOperand_ParseFail;
	    }
	  }

	  Operands.push_back(AArch64Operand::CreateReg(
	      RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext()));
	  return MatchOperand_Success;
	}

	/// tryParseGPROperand - Parse a scalar register operand. When
	/// ParseShiftExtend is set and a comma follows the register, a shift/extend
	/// specifier is parsed as well and folded into the register operand. EqTy is
	/// passed through to the created operand.
	template <bool ParseShiftExtend, RegConstraintEqualityTy EqTy>
	OperandMatchResultTy
	AArch64AsmParser::tryParseGPROperand(OperandVector &Operands) {
	  SMLoc StartLoc = getLoc();

	  unsigned RegNum;
	  OperandMatchResultTy Res = tryParseScalarRegister(RegNum);
	  if (Res != MatchOperand_Success)
	    return Res;

	  // Without a shift/extend to parse, the bare register is the operand.
	  bool WantsShiftExtend =
	      ParseShiftExtend && getParser().getTok().is(AsmToken::Comma);
	  if (!WantsShiftExtend) {
	    Operands.push_back(AArch64Operand::CreateReg(
	        RegNum, RegKind::Scalar, StartLoc, getLoc(), getContext(), EqTy));
	    return MatchOperand_Success;
	  }

	  getParser().Lex(); // Consume ','.

	  // Parse the specifier into a scratch list and copy its details across.
	  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ScratchOps;
	  Res = tryParseOptionalShiftExtend(ScratchOps);
	  if (Res != MatchOperand_Success)
	    return Res;

	  auto ShiftExt = static_cast<AArch64Operand*>(ScratchOps.back().get());
	  Operands.push_back(AArch64Operand::CreateReg(
	      RegNum, RegKind::Scalar, StartLoc, ShiftExt->getEndLoc(), getContext(),
	      EqTy, ShiftExt->getShiftExtendType(), ShiftExt->getShiftExtendAmount(),
	      ShiftExt->hasShiftExtendAmount()));

	  return MatchOperand_Success;
	}

	/// parseOptionalMulOperand - Parse the "mul vl" decoration or the
	/// "mul #<imm>" operand that follows the immediate of some SVE instructions.
	///
	/// Returns true without consuming anything when the current token is not
	/// "mul" followed by "vl" or '#'. On success the pieces are appended to
	/// Operands as tokens (plus an immediate for the '#' form) and false is
	/// returned. If '#' is not followed by a constant expression an error is
	/// reported and its result returned.
	bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();

	  // Some SVE instructions have a decoration after the immediate, i.e.
	  // "mul vl". We parse them here and add tokens, which must be present in the
	  // asm string in the tablegen instruction.
	  const AsmToken NextTok = Parser.getLexer().peekTok();
	  bool NextIsVL = NextTok.getString().equals_lower("vl");
	  bool NextIsHash = NextTok.is(AsmToken::Hash);
	  if (!Parser.getTok().getString().equals_lower("mul") ||
	      !(NextIsVL || NextIsHash))
	    return true;

	  Operands.push_back(
	      AArch64Operand::CreateToken("mul", false, getLoc(), getContext()));
	  Parser.Lex(); // Eat the "mul"

	  if (NextIsVL) {
	    Operands.push_back(
	        AArch64Operand::CreateToken("vl", false, getLoc(), getContext()));
	    Parser.Lex(); // Eat the "vl"
	    return false;
	  }

	  if (NextIsHash) {
	    Parser.Lex(); // Eat the #
	    SMLoc S = getLoc();

	    // Parse immediate operand.
	    const MCExpr *ImmVal;
	    if (!Parser.parseExpression(ImmVal)) {
	      if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal)) {
	        Operands.push_back(AArch64Operand::CreateImm(
	            MCConstantExpr::create(MCE->getValue(), getContext()), S,
	            getLoc(), getContext()));
	        // This function reports success as false; it must not return an
	        // OperandMatchResultTy enumerator here.
	        return false;
	      }
	    }
	  }

	  return Error(getLoc(), "expected 'vl' or '#<imm>'");
	}

	/// parseOperand - Parse a single AArch64 instruction operand and append the
	/// resulting operand(s) to \p Operands. For now this parses the operand
	/// regardless of the mnemonic.
	///
	/// The custom operand parsers (MatchOperandParserImpl) are tried first; the
	/// generic parsing below, driven by the kind of the current lexer token, only
	/// runs when none of them matched.
	///
	/// \param Operands       operand list that parsed operands are appended to.
	/// \param isCondCode     when true, an identifier token is parsed as a
	///                       condition code via parseCondCode.
	/// \param invertCondCode forwarded to parseCondCode when \p isCondCode is set.
	/// \returns false when an operand was parsed, true when parsing failed.
	bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
	                                    bool invertCondCode) {
	  MCAsmParser &Parser = getParser();

	  // Check if the current operand has a custom associated parser, if so, try to
	  // custom parse the operand, or fallback to the general approach.
	  OperandMatchResultTy ResTy =
	      MatchOperandParserImpl(Operands, Mnemonic, /*ParseForAllFeatures=*/true);
	  if (ResTy == MatchOperand_Success)
	    return false;
	  // If there wasn't a custom match, try the generic matcher below. Otherwise,
	  // there was a match, but an error occurred, in which case, just return that
	  // the operand parsing failed.
	  if (ResTy == MatchOperand_ParseFail)
	    return true;

	  // Nothing custom, so do general case parsing.
	  SMLoc S, E;
	  switch (getLexer().getKind()) {
	  default: {
	    // Any other token kind: try to parse it as a symbolic immediate.
	    S = getLoc();
	    const MCExpr *Expr;
	    if (parseSymbolicImmVal(Expr))
	      return Error(S, "invalid operand");

	    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
	    return false;
	  }
	  case AsmToken::LBrac: {
	    SMLoc Loc = Parser.getTok().getLoc();
	    Operands.push_back(
	        AArch64Operand::CreateToken("[", false, Loc, getContext()));
	    Parser.Lex(); // Eat '['

	    // There's no comma after a '[', so we can parse the next operand
	    // immediately.
	    return parseOperand(Operands, false, false);
	  }
	  case AsmToken::LCurly:
	    return parseNeonVectorList(Operands);
	  case AsmToken::Identifier: {
	    // If we're expecting a Condition Code operand, then just parse that.
	    if (isCondCode)
	      return parseCondCode(Operands, invertCondCode);

	    // If it's a register name, parse it.
	    if (!parseRegister(Operands))
	      return false;

	    // See if this is a "mul vl" decoration or "mul #<int>" operand used
	    // by SVE instructions.
	    if (!parseOptionalMulOperand(Operands))
	      return false;

	    // This could be an optional "shift" or "extend" operand.
	    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
	    // We can only continue if no tokens were eaten.
	    // NOTE(review): the match result is returned through an implicit
	    // conversion to bool; this presumably relies on MatchOperand_Success
	    // being zero -- confirm against the enum definition.
	    if (GotShift != MatchOperand_NoMatch)
	      return GotShift;

	    // This was not a register so parse other operands that start with an
	    // identifier (like labels) as expressions and create them as immediates.
	    const MCExpr *IdVal;
	    S = getLoc();
	    if (getParser().parseExpression(IdVal))
	      return true;
	    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
	    return false;
	  }
	  case AsmToken::Integer:
	  case AsmToken::Real:
	  case AsmToken::Hash: {
	    // #42 -> immediate.
	    S = getLoc();

	    // The leading '#' is optional; the result is deliberately ignored.
	    parseOptionalToken(AsmToken::Hash);

	    // Parse a negative sign
	    bool isNegative = false;
	    if (Parser.getTok().is(AsmToken::Minus)) {
	      isNegative = true;
	      // We need to consume this token only when we have a Real, otherwise
	      // we let parseSymbolicImmVal take care of it
	      if (Parser.getLexer().peekTok().is(AsmToken::Real))
	        Parser.Lex();
	    }

	    // The only Real that should come through here is a literal #0.0 for
	    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
	    // so convert the value.
	    const AsmToken &Tok = Parser.getTok();
	    if (Tok.is(AsmToken::Real)) {
	      APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
	      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
	      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
	          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
	          Mnemonic != "fcmlt" && Mnemonic != "fcmne")
	        return TokError("unexpected floating point literal");
	      // Only an all-zero bit pattern without a preceding '-' is accepted.
	      if (IntVal != 0 || isNegative)
	        return TokError("expected floating-point constant #0.0");
	      Parser.Lex(); // Eat the token.

	      Operands.push_back(
	          AArch64Operand::CreateToken("#0", false, S, getContext()));
	      Operands.push_back(
	          AArch64Operand::CreateToken(".0", false, S, getContext()));
	      return false;
	    }

	    const MCExpr *ImmVal;
	    if (parseSymbolicImmVal(ImmVal))
	      return true;

	    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
	    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
	    return false;
	  }
	  case AsmToken::Equal: {
	    SMLoc Loc = getLoc();
	    if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val)
	      return TokError("unexpected token in operand");
	    Parser.Lex(); // Eat '='
	    const MCExpr *SubExprVal;
	    if (getParser().parseExpression(SubExprVal))
	      return true;

	    // Operands[0] is the mnemonic token, so Operands[1] is the first real
	    // operand of the instruction.
	    if (Operands.size() < 2 ||
	        !static_cast<AArch64Operand &>(*Operands[1]).isScalarReg())
	      return Error(Loc, "Only valid when first operand is register");

	    bool IsXReg =
	        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	            Operands[1]->getReg());

	    MCContext &Ctx = getContext();
	    // NOTE(review): S is never assigned on this path, so the operands created
	    // below carry a default-constructed start location, and E is derived from
	    // the location of the '=' token -- confirm this is intended.
	    E = SMLoc::getFromPointer(Loc.getPointer() - 1);
	    // If the op is an imm and can be fit into a mov, then replace ldr with mov.
	    if (isa<MCConstantExpr>(SubExprVal)) {
	      uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
	      uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
	      // Strip whole zero halfwords from the bottom, remembering the shift.
	      while (Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
	        ShiftAmt += 16;
	        Imm >>= 16;
	      }
	      if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) {
	        Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx);
	        Operands.push_back(AArch64Operand::CreateImm(
	            MCConstantExpr::create(Imm, Ctx), S, E, Ctx));
	        if (ShiftAmt)
	          Operands.push_back(AArch64Operand::CreateShiftExtend(
	              AArch64_AM::LSL, ShiftAmt, true, S, E, Ctx));
	        return false;
	      }
	      APInt Simm = APInt(64, Imm << ShiftAmt);
	      // check if the immediate is an unsigned or signed 32-bit int for W regs
	      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
	        return Error(Loc, "Immediate too large for register");
	    }
	    // If it is a label or an imm that cannot fit in a movz, put it into CP.
	    const MCExpr *CPLoc =
	        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc);
	    Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
	    return false;
	  }
	  }
	}

	bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1,
	const MCParsedAsmOperand &Op2) const {
	auto &AOp1 = static_cast<const AArch64Operand&>(Op1);
	auto &AOp2 = static_cast<const AArch64Operand&>(Op2);
	if (AOp1.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg &&
	AOp2.getRegEqualityTy() == RegConstraintEqualityTy::EqualsReg)
	return MCTargetAsmParser::regsEqual(Op1, Op2);

	assert(AOp1.isScalarReg() && AOp2.isScalarReg() &&
	"Testing equality of non-scalar registers not supported");

	// Check if a registers match their sub/super register classes.
	if (AOp1.getRegEqualityTy() == EqualsSuperReg)
	return getXRegFromWReg(Op1.getReg()) == Op2.getReg();
	if (AOp1.getRegEqualityTy() == EqualsSubReg)
	return getWRegFromXReg(Op1.getReg()) == Op2.getReg();
	if (AOp2.getRegEqualityTy() == EqualsSuperReg)
	return getXRegFromWReg(Op2.getReg()) == Op1.getReg();
	if (AOp2.getRegEqualityTy() == EqualsSubReg)
	return getWRegFromXReg(Op2.getReg()) == Op1.getReg();

	return false;
	}

	/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
	/// operands.
	///
	/// The mnemonic is split at '.' characters into tokens that are pushed onto
	/// \p Operands; the operands of the statement are then parsed one by one with
	/// parseOperand and appended after them.
	///
	/// \param Info     not referenced by this function.
	/// \param Name     mnemonic text. The dot-less conditional branch spellings
	///                 ("beq", "bne", ...) are rewritten to "b.<cond>" first.
	/// \param NameLoc  location of the mnemonic; used for the tokens created here
	///                 and for diagnostics.
	/// \param Operands receives the mnemonic tokens and the parsed operands.
	/// \returns true on error, and also after handling a ".req" directive (which
	///          needs no instruction matching); false on success.
	bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
	                                        StringRef Name, SMLoc NameLoc,
	                                        OperandVector &Operands) {
	  MCAsmParser &Parser = getParser();
	  // Map the dot-less conditional branch spellings to their "b.<cond>" form.
	  // The comparison uses the lower-cased name; a name that is not in the table
	  // is kept exactly as it was passed in.
	  Name = StringSwitch<StringRef>(Name.lower())
	             .Case("beq", "b.eq")
	             .Case("bne", "b.ne")
	             .Case("bhs", "b.hs")
	             .Case("bcs", "b.cs")
	             .Case("blo", "b.lo")
	             .Case("bcc", "b.cc")
	             .Case("bmi", "b.mi")
	             .Case("bpl", "b.pl")
	             .Case("bvs", "b.vs")
	             .Case("bvc", "b.vc")
	             .Case("bhi", "b.hi")
	             .Case("bls", "b.ls")
	             .Case("bge", "b.ge")
	             .Case("blt", "b.lt")
	             .Case("bgt", "b.gt")
	             .Case("ble", "b.le")
	             .Case("bal", "b.al")
	             .Case("bnv", "b.nv")
	             .Default(Name);

	  // First check for the AArch64-specific .req directive.
	  if (Parser.getTok().is(AsmToken::Identifier) &&
	      Parser.getTok().getIdentifier() == ".req") {
	    parseDirectiveReq(Name, NameLoc);
	    // We always return 'error' for this, as we're done with this
	    // statement and don't need to match the 'instruction'.
	    return true;
	  }

	  // Create the leading tokens for the mnemonic, split by '.' characters.
	  size_t Start = 0, Next = Name.find('.');
	  StringRef Head = Name.slice(Start, Next);

	  // IC, DC, AT, TLBI and Prediction invalidation instructions are aliases for
	  // the SYS instruction.
	  if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi" ||
	      Head == "cfp" || Head == "dvp" || Head == "cpp")
	    return parseSysAlias(Head, NameLoc, Operands);

	  Operands.push_back(
	      AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
	  Mnemonic = Head;

	  // Handle condition codes for a branch mnemonic
	  if (Head == "b" && Next != StringRef::npos) {
	    Start = Next;
	    Next = Name.find('.', Start + 1);
	    Head = Name.slice(Start + 1, Next);

	    // Location of the condition code text inside the mnemonic.
	    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
	                                            (Head.data() - Name.data()));
	    AArch64CC::CondCode CC = parseCondCodeString(Head);
	    if (CC == AArch64CC::Invalid)
	      return Error(SuffixLoc, "invalid condition code");
	    Operands.push_back(
	        AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
	    Operands.push_back(
	        AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
	  }

	  // Add the remaining tokens in the mnemonic.
	  while (Next != StringRef::npos) {
	    Start = Next;
	    Next = Name.find('.', Start + 1);
	    // Unlike the branch case above, the token keeps its leading '.'.
	    Head = Name.slice(Start, Next);
	    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
	                                            (Head.data() - Name.data()) + 1);
	    Operands.push_back(
	        AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
	  }

	  // Conditional compare instructions have a Condition Code operand, which needs
	  // to be parsed and an immediate operand created.
	  bool condCodeFourthOperand =
	      (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
	       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
	       Head == "csinc" || Head == "csinv" || Head == "csneg");

	  // These instructions are aliases to some of the conditional select
	  // instructions. However, the condition code is inverted in the aliased
	  // instruction.
	  //
	  // FIXME: Is this the correct way to handle these? Or should the parser
	  // generate the aliased instructions directly?
	  bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
	  bool condCodeThirdOperand =
	      (Head == "cinc" || Head == "cinv" || Head == "cneg");

	  // Read the remaining operands.
	  if (getLexer().isNot(AsmToken::EndOfStatement)) {

	    // 1-based position of the operand about to be parsed.
	    unsigned N = 1;
	    do {
	      // Parse and remember the operand.
	      if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
	                                     (N == 3 && condCodeThirdOperand) ||
	                                     (N == 2 && condCodeSecondOperand),
	                       condCodeSecondOperand || condCodeThirdOperand)) {
	        return true;
	      }

	      // After successfully parsing some operands there are two special cases to
	      // consider (i.e. notional operands not separated by commas). Both are due
	      // to memory specifiers:
	      //  + An RBrac will end an address for load/store/prefetch
	      //  + An '!' will indicate a pre-indexed operation.
	      //
	      // It's someone else's responsibility to make sure these tokens are sane
	      // in the given context!

	      SMLoc RLoc = Parser.getTok().getLoc();
	      if (parseOptionalToken(AsmToken::RBrac))
	        Operands.push_back(
	            AArch64Operand::CreateToken("]", false, RLoc, getContext()));
	      SMLoc ELoc = Parser.getTok().getLoc();
	      if (parseOptionalToken(AsmToken::Exclaim))
	        Operands.push_back(
	            AArch64Operand::CreateToken("!", false, ELoc, getContext()));

	      ++N;
	    } while (parseOptionalToken(AsmToken::Comma));
	  }

	  if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
	    return true;

	  return false;
	}

	/// Return true when \p Reg sits at the same offset from one of the register
	/// bases B0, H0, S0, D0, Q0 or Z0 as \p ZReg sits from Z0.
	///
	/// \param ZReg register to compare against; asserted to lie in [Z0, Z31].
	/// \param Reg  register being tested.
	static inline bool isMatchingOrAlias(unsigned ZReg, unsigned Reg) {
	  assert((ZReg >= AArch64::Z0) && (ZReg <= AArch64::Z31));
	  return (ZReg == ((Reg - AArch64::B0) + AArch64::Z0)) ||
	         (ZReg == ((Reg - AArch64::H0) + AArch64::Z0)) ||
	         (ZReg == ((Reg - AArch64::S0) + AArch64::Z0)) ||
	         (ZReg == ((Reg - AArch64::D0) + AArch64::Z0)) ||
	         (ZReg == ((Reg - AArch64::Q0) + AArch64::Z0)) ||
	         (ZReg == ((Reg - AArch64::Z0) + AArch64::Z0));
	}

	// FIXME: This entire function is a giant hack to provide us with decent
	// operand range validation/diagnostics until TableGen/MC can be extended
	// to support autogeneration of this kind of validation.
	//
	/// Validate an instruction and report a diagnostic when it is rejected.
	///
	/// Three groups of checks run in order:
	///  1. rules that apply when the instruction follows an active prefix
	///     (movprfx),
	///  2. register-overlap rules for pair/writeback/exclusive loads and stores,
	///  3. symbol-reference rules for the immediate of ADD/SUB (immediate).
	///
	/// As a side effect NextPrefix is updated from \p Inst before any check runs.
	///
	/// \param Inst  the instruction to validate.
	/// \param IDLoc location used for diagnostics about the whole instruction.
	/// \param Loc   locations used for operand-specific diagnostics (indexed as
	///              Loc[0], Loc[1] and Loc.back() below).
	/// \returns true when an error was reported, false when \p Inst is accepted.
	bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
	                                           SmallVectorImpl<SMLoc> &Loc) {
	  const MCRegisterInfo *RI = getContext().getRegisterInfo();
	  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());

	  // A prefix only applies to the instruction following it.  Here we extract
	  // prefix information for the next instruction before validating the current
	  // one so that in the case of failure we don't erroneously continue using the
	  // current prefix.
	  PrefixInfo Prefix = NextPrefix;
	  NextPrefix = PrefixInfo::CreateFromInst(Inst, MCID.TSFlags);

	  // Before validating the instruction in isolation we run through the rules
	  // applicable when it follows a prefix instruction.
	  // NOTE: brk & hlt can be prefixed but require no additional validation.
	  if (Prefix.isActive() &&
	      (Inst.getOpcode() != AArch64::BRK) &&
	      (Inst.getOpcode() != AArch64::HLT)) {

	    // Prefixed instructions must have a destructive operand.
	    if ((MCID.TSFlags & AArch64::DestructiveInstTypeMask) ==
	        AArch64::NotDestructive)
	      return Error(IDLoc, "instruction is unpredictable when following a"
	                          " movprfx, suggest replacing movprfx with mov");

	    // Destination operands must match.
	    if (Inst.getOperand(0).getReg() != Prefix.getDstReg())
	      return Error(Loc[0], "instruction is unpredictable when following a"
	                           " movprfx writing to a different destination");

	    // Destination operand must not be used in any other location.
	    for (unsigned i = 1; i < Inst.getNumOperands(); ++i) {
	      if (Inst.getOperand(i).isReg() &&
	          (MCID.getOperandConstraint(i, MCOI::TIED_TO) == -1) &&
	          isMatchingOrAlias(Prefix.getDstReg(), Inst.getOperand(i).getReg()))
	        return Error(Loc[0], "instruction is unpredictable when following a"
	                             " movprfx and destination also used as"
	                             " non-destructive source");
	    }

	    // Bind by reference: the register class table entry is only queried.
	    const auto &PPRRegClass = AArch64MCRegisterClasses[AArch64::PPRRegClassID];
	    if (Prefix.isPredicated()) {
	      int PgIdx = -1;

	      // Find the instruction's general predicate: the first register operand
	      // (other than operand 0) that belongs to the PPR register class.
	      for (unsigned i = 1; i < Inst.getNumOperands(); ++i) {
	        if (Inst.getOperand(i).isReg() &&
	            PPRRegClass.contains(Inst.getOperand(i).getReg())) {
	          PgIdx = i;
	          break;
	        }
	      }

	      // Instruction must be predicated if the movprfx is predicated.
	      if (PgIdx == -1 ||
	          (MCID.TSFlags & AArch64::ElementSizeMask) == AArch64::ElementSizeNone)
	        return Error(IDLoc, "instruction is unpredictable when following a"
	                            " predicated movprfx, suggest using unpredicated"
	                            " movprfx");

	      // Instruction must use same general predicate as the movprfx.
	      if (Inst.getOperand(PgIdx).getReg() != Prefix.getPgReg())
	        return Error(IDLoc, "instruction is unpredictable when following a"
	                            " predicated movprfx using a different general"
	                            " predicate");

	      // Instruction element type must match the movprfx.
	      if ((MCID.TSFlags & AArch64::ElementSizeMask) != Prefix.getElementSize())
	        return Error(IDLoc, "instruction is unpredictable when following a"
	                            " predicated movprfx with a different element"
	                            " size");
	    }
	  }

	  // Check for indexed addressing modes w/ the base register being the
	  // same as a destination/source register or pair load where
	  // the Rt == Rt2. All of those are undefined behaviour.
	  switch (Inst.getOpcode()) {
	  case AArch64::LDPSWpre:
	  case AArch64::LDPWpost:
	  case AArch64::LDPWpre:
	  case AArch64::LDPXpost:
	  case AArch64::LDPXpre: {
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rt2 = Inst.getOperand(2).getReg();
	    unsigned Rn = Inst.getOperand(3).getReg();
	    if (RI->isSubRegisterEq(Rn, Rt))
	      return Error(Loc[0], "unpredictable LDP instruction, writeback base "
	                           "is also a destination");
	    if (RI->isSubRegisterEq(Rn, Rt2))
	      return Error(Loc[1], "unpredictable LDP instruction, writeback base "
	                           "is also a destination");
	    // In these writeback forms Rt and Rt2 are operands 1 and 2, so the
	    // Rt == Rt2 check is done here rather than by falling through to the
	    // offset-form case below, which compares operands 0 and 1.
	    if (Rt == Rt2)
	      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
	    break;
	  }
	  case AArch64::LDPDi:
	  case AArch64::LDPQi:
	  case AArch64::LDPSi:
	  case AArch64::LDPSWi:
	  case AArch64::LDPWi:
	  case AArch64::LDPXi: {
	    unsigned Rt = Inst.getOperand(0).getReg();
	    unsigned Rt2 = Inst.getOperand(1).getReg();
	    if (Rt == Rt2)
	      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
	    break;
	  }
	  // NOTE(review): LDPSWpost is grouped with the cases below and therefore
	  // gets no writeback-base check, unlike LDPSWpre above -- confirm whether
	  // that is intended.
	  case AArch64::LDPDpost:
	  case AArch64::LDPDpre:
	  case AArch64::LDPQpost:
	  case AArch64::LDPQpre:
	  case AArch64::LDPSpost:
	  case AArch64::LDPSpre:
	  case AArch64::LDPSWpost: {
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rt2 = Inst.getOperand(2).getReg();
	    if (Rt == Rt2)
	      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
	    break;
	  }
	  case AArch64::STPDpost:
	  case AArch64::STPDpre:
	  case AArch64::STPQpost:
	  case AArch64::STPQpre:
	  case AArch64::STPSpost:
	  case AArch64::STPSpre:
	  case AArch64::STPWpost:
	  case AArch64::STPWpre:
	  case AArch64::STPXpost:
	  case AArch64::STPXpre: {
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rt2 = Inst.getOperand(2).getReg();
	    unsigned Rn = Inst.getOperand(3).getReg();
	    if (RI->isSubRegisterEq(Rn, Rt))
	      return Error(Loc[0], "unpredictable STP instruction, writeback base "
	                           "is also a source");
	    if (RI->isSubRegisterEq(Rn, Rt2))
	      return Error(Loc[1], "unpredictable STP instruction, writeback base "
	                           "is also a source");
	    break;
	  }
	  case AArch64::LDRBBpre:
	  case AArch64::LDRBpre:
	  case AArch64::LDRHHpre:
	  case AArch64::LDRHpre:
	  case AArch64::LDRSBWpre:
	  case AArch64::LDRSBXpre:
	  case AArch64::LDRSHWpre:
	  case AArch64::LDRSHXpre:
	  case AArch64::LDRSWpre:
	  case AArch64::LDRWpre:
	  case AArch64::LDRXpre:
	  case AArch64::LDRBBpost:
	  case AArch64::LDRBpost:
	  case AArch64::LDRHHpost:
	  case AArch64::LDRHpost:
	  case AArch64::LDRSBWpost:
	  case AArch64::LDRSBXpost:
	  case AArch64::LDRSHWpost:
	  case AArch64::LDRSHXpost:
	  case AArch64::LDRSWpost:
	  case AArch64::LDRWpost:
	  case AArch64::LDRXpost: {
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rn = Inst.getOperand(2).getReg();
	    if (RI->isSubRegisterEq(Rn, Rt))
	      return Error(Loc[0], "unpredictable LDR instruction, writeback base "
	                           "is also a source");
	    break;
	  }
	  case AArch64::STRBBpost:
	  case AArch64::STRBpost:
	  case AArch64::STRHHpost:
	  case AArch64::STRHpost:
	  case AArch64::STRWpost:
	  case AArch64::STRXpost:
	  case AArch64::STRBBpre:
	  case AArch64::STRBpre:
	  case AArch64::STRHHpre:
	  case AArch64::STRHpre:
	  case AArch64::STRWpre:
	  case AArch64::STRXpre: {
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rn = Inst.getOperand(2).getReg();
	    if (RI->isSubRegisterEq(Rn, Rt))
	      return Error(Loc[0], "unpredictable STR instruction, writeback base "
	                           "is also a source");
	    break;
	  }
	  case AArch64::STXRB:
	  case AArch64::STXRH:
	  case AArch64::STXRW:
	  case AArch64::STXRX:
	  case AArch64::STLXRB:
	  case AArch64::STLXRH:
	  case AArch64::STLXRW:
	  case AArch64::STLXRX: {
	    unsigned Rs = Inst.getOperand(0).getReg();
	    unsigned Rt = Inst.getOperand(1).getReg();
	    unsigned Rn = Inst.getOperand(2).getReg();
	    // The status register must not overlap the stored register, nor the
	    // base register unless the base is SP.
	    if (RI->isSubRegisterEq(Rt, Rs) ||
	        (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
	      return Error(Loc[0],
	                   "unpredictable STXR instruction, status is also a source");
	    break;
	  }
	  case AArch64::STXPW:
	  case AArch64::STXPX:
	  case AArch64::STLXPW:
	  case AArch64::STLXPX: {
	    unsigned Rs = Inst.getOperand(0).getReg();
	    unsigned Rt1 = Inst.getOperand(1).getReg();
	    unsigned Rt2 = Inst.getOperand(2).getReg();
	    unsigned Rn = Inst.getOperand(3).getReg();
	    if (RI->isSubRegisterEq(Rt1, Rs) || RI->isSubRegisterEq(Rt2, Rs) ||
	        (RI->isSubRegisterEq(Rn, Rs) && Rn != AArch64::SP))
	      return Error(Loc[0],
	                   "unpredictable STXP instruction, status is also a source");
	    break;
	  }
	  }


	  // Now check immediate ranges. Separate from the above as there is overlap
	  // in the instructions being checked and this keeps the nested conditionals
	  // to a minimum.
	  switch (Inst.getOpcode()) {
	  case AArch64::ADDSWri:
	  case AArch64::ADDSXri:
	  case AArch64::ADDWri:
	  case AArch64::ADDXri:
	  case AArch64::SUBSWri:
	  case AArch64::SUBSXri:
	  case AArch64::SUBWri:
	  case AArch64::SUBXri: {
	    // Annoyingly we can't do this in the isAddSubImm predicate, so there is
	    // some slight duplication here.
	    if (Inst.getOperand(2).isExpr()) {
	      const MCExpr *Expr = Inst.getOperand(2).getExpr();
	      AArch64MCExpr::VariantKind ELFRefKind;
	      MCSymbolRefExpr::VariantKind DarwinRefKind;
	      int64_t Addend;
	      if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {

	        // Only allow these with ADDXri.
	        if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
	             DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
	            Inst.getOpcode() == AArch64::ADDXri)
	          return false;

	        // Only allow these with ADDXri/ADDWri
	        if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
	             ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
	             ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
	             ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
	             ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
	             ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
	             ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
	             ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 ||
	             ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 ||
	             ELFRefKind == AArch64MCExpr::VK_SECREL_HI12) &&
	            (Inst.getOpcode() == AArch64::ADDXri ||
	             Inst.getOpcode() == AArch64::ADDWri))
	          return false;

	        // Don't allow symbol refs in the immediate field otherwise
	        // Note: Loc.back() may be Loc[1] or Loc[2] depending on the number of
	        // operands of the original instruction (i.e. 'add w0, w1, borked' vs
	        // 'cmp w0, borked')
	        return Error(Loc.back(), "invalid immediate expression");
	      }
	      // We don't validate more complex expressions here
	    }
	    return false;
	  }
	  default:
	    return false;
	  }
	}

	// Forward declaration; the definition is not part of the code shown up to
	// this point. showMatchError appends the returned string to its
	// "unrecognized instruction mnemonic" diagnostic.
	static std::string AArch64MnemonicSpellCheck(StringRef S,
	const FeatureBitset &FBS,
	unsigned VariantID = 0);

	bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
	uint64_t ErrorInfo,
	OperandVector &Operands) {
	switch (ErrCode) {
	case Match_InvalidTiedOperand: {
	RegConstraintEqualityTy EqTy =
	static_cast<const AArch64Operand &>(*Operands[ErrorInfo])
	.getRegEqualityTy();
	switch (EqTy) {
	case RegConstraintEqualityTy::EqualsSubReg:
	return Error(Loc, "operand must be 64-bit form of destination register");
	case RegConstraintEqualityTy::EqualsSuperReg:
	return Error(Loc, "operand must be 32-bit form of destination register");
	case RegConstraintEqualityTy::EqualsReg:
	return Error(Loc, "operand must match destination register");
	}
	llvm_unreachable("Unknown RegConstraintEqualityTy");
	}
	case Match_MissingFeature:
	return Error(Loc,
	"instruction requires a CPU feature not currently enabled");
	case Match_InvalidOperand:
	return Error(Loc, "invalid operand for instruction");
	case Match_InvalidSuffix:
	return Error(Loc, "invalid type suffix for instruction");
	case Match_InvalidCondCode:
	return Error(Loc, "expected AArch64 condition code");
	case Match_AddSubRegExtendSmall:
	return Error(Loc,
	"expected '[su]xt[bhw]' with optional integer in range [0, 4]");
	case Match_AddSubRegExtendLarge:
	return Error(Loc,
	"expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
	case Match_AddSubSecondSource:
	return Error(Loc,
	"expected compatible register, symbol or integer in range [0, 4095]");
	case Match_LogicalSecondSource:
	return Error(Loc, "expected compatible register or logical immediate");
	case Match_InvalidMovImm32Shift:
	return Error(Loc, "expected 'lsl' with optional integer 0 or 16");
	case Match_InvalidMovImm64Shift:
	return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48");
	case Match_AddSubRegShift32:
	return Error(Loc,
	"expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
	case Match_AddSubRegShift64:
	return Error(Loc,
	"expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
	case Match_InvalidFPImm:
	return Error(Loc,
	"expected compatible register or floating-point constant");
	case Match_InvalidMemoryIndexedSImm6:
	return Error(Loc, "index must be an integer in range [-32, 31].");
	case Match_InvalidMemoryIndexedSImm5:
	return Error(Loc, "index must be an integer in range [-16, 15].");
	case Match_InvalidMemoryIndexed1SImm4:
	return Error(Loc, "index must be an integer in range [-8, 7].");
	case Match_InvalidMemoryIndexed2SImm4:
	return Error(Loc, "index must be a multiple of 2 in range [-16, 14].");
	case Match_InvalidMemoryIndexed3SImm4:
	return Error(Loc, "index must be a multiple of 3 in range [-24, 21].");
	case Match_InvalidMemoryIndexed4SImm4:
	return Error(Loc, "index must be a multiple of 4 in range [-32, 28].");
	case Match_InvalidMemoryIndexed16SImm4:
	return Error(Loc, "index must be a multiple of 16 in range [-128, 112].");
	case Match_InvalidMemoryIndexed1SImm6:
	return Error(Loc, "index must be an integer in range [-32, 31].");
	case Match_InvalidMemoryIndexedSImm8:
	return Error(Loc, "index must be an integer in range [-128, 127].");
	case Match_InvalidMemoryIndexedSImm9:
	return Error(Loc, "index must be an integer in range [-256, 255].");
	case Match_InvalidMemoryIndexed16SImm9:
	return Error(Loc, "index must be a multiple of 16 in range [-4096, 4080].");
	case Match_InvalidMemoryIndexed8SImm10:
	return Error(Loc, "index must be a multiple of 8 in range [-4096, 4088].");
	case Match_InvalidMemoryIndexed4SImm7:
	return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
	case Match_InvalidMemoryIndexed8SImm7:
	return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
	case Match_InvalidMemoryIndexed16SImm7:
	return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
	case Match_InvalidMemoryIndexed8UImm5:
	return Error(Loc, "index must be a multiple of 8 in range [0, 248].");
	case Match_InvalidMemoryIndexed4UImm5:
	return Error(Loc, "index must be a multiple of 4 in range [0, 124].");
	case Match_InvalidMemoryIndexed2UImm5:
	return Error(Loc, "index must be a multiple of 2 in range [0, 62].");
	case Match_InvalidMemoryIndexed8UImm6:
	return Error(Loc, "index must be a multiple of 8 in range [0, 504].");
	case Match_InvalidMemoryIndexed16UImm6:
	return Error(Loc, "index must be a multiple of 16 in range [0, 1008].");
	case Match_InvalidMemoryIndexed4UImm6:
	return Error(Loc, "index must be a multiple of 4 in range [0, 252].");
	case Match_InvalidMemoryIndexed2UImm6:
	return Error(Loc, "index must be a multiple of 2 in range [0, 126].");
	case Match_InvalidMemoryIndexed1UImm6:
	return Error(Loc, "index must be in range [0, 63].");
	case Match_InvalidMemoryWExtend8:
	return Error(Loc,
	"expected 'uxtw' or 'sxtw' with optional shift of #0");
	case Match_InvalidMemoryWExtend16:
	return Error(Loc,
	"expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
	case Match_InvalidMemoryWExtend32:
	return Error(Loc,
	"expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
	case Match_InvalidMemoryWExtend64:
	return Error(Loc,
	"expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
	case Match_InvalidMemoryWExtend128:
	return Error(Loc,
	"expected 'uxtw' or 'sxtw' with optional shift of #0 or #4");
	case Match_InvalidMemoryXExtend8:
	return Error(Loc,
	"expected 'lsl' or 'sxtx' with optional shift of #0");
	case Match_InvalidMemoryXExtend16:
	return Error(Loc,
	"expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
	case Match_InvalidMemoryXExtend32:
	return Error(Loc,
	"expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
	case Match_InvalidMemoryXExtend64:
	return Error(Loc,
	"expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
	case Match_InvalidMemoryXExtend128:
	return Error(Loc,
	"expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
	case Match_InvalidMemoryIndexed1:
	return Error(Loc, "index must be an integer in range [0, 4095].");
	case Match_InvalidMemoryIndexed2:
	return Error(Loc, "index must be a multiple of 2 in range [0, 8190].");
	case Match_InvalidMemoryIndexed4:
	return Error(Loc, "index must be a multiple of 4 in range [0, 16380].");
	case Match_InvalidMemoryIndexed8:
	return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
	case Match_InvalidMemoryIndexed16:
	return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
	case Match_InvalidImm0_1:
	return Error(Loc, "immediate must be an integer in range [0, 1].");
	case Match_InvalidImm0_7:
	return Error(Loc, "immediate must be an integer in range [0, 7].");
	case Match_InvalidImm0_15:
	return Error(Loc, "immediate must be an integer in range [0, 15].");
	case Match_InvalidImm0_31:
	return Error(Loc, "immediate must be an integer in range [0, 31].");
	case Match_InvalidImm0_63:
	return Error(Loc, "immediate must be an integer in range [0, 63].");
	case Match_InvalidImm0_127:
	return Error(Loc, "immediate must be an integer in range [0, 127].");
	case Match_InvalidImm0_255:
	return Error(Loc, "immediate must be an integer in range [0, 255].");
	case Match_InvalidImm0_65535:
	return Error(Loc, "immediate must be an integer in range [0, 65535].");
	case Match_InvalidImm1_8:
	return Error(Loc, "immediate must be an integer in range [1, 8].");
	case Match_InvalidImm1_16:
	return Error(Loc, "immediate must be an integer in range [1, 16].");
	case Match_InvalidImm1_32:
	return Error(Loc, "immediate must be an integer in range [1, 32].");
	case Match_InvalidImm1_64:
	return Error(Loc, "immediate must be an integer in range [1, 64].");
	case Match_InvalidSVEAddSubImm8:
	return Error(Loc, "immediate must be an integer in range [0, 255]"
	" with a shift amount of 0");
	case Match_InvalidSVEAddSubImm16:
	case Match_InvalidSVEAddSubImm32:
	case Match_InvalidSVEAddSubImm64:
	return Error(Loc, "immediate must be an integer in range [0, 255] or a "
	"multiple of 256 in range [256, 65280]");
	case Match_InvalidSVECpyImm8:
	return Error(Loc, "immediate must be an integer in range [-128, 255]"
	" with a shift amount of 0");
	case Match_InvalidSVECpyImm16:
	return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
	"multiple of 256 in range [-32768, 65280]");
	case Match_InvalidSVECpyImm32:
	case Match_InvalidSVECpyImm64:
	return Error(Loc, "immediate must be an integer in range [-128, 127] or a "
	"multiple of 256 in range [-32768, 32512]");
	case Match_InvalidIndexRange1_1:
	return Error(Loc, "expected lane specifier '[1]'");
	case Match_InvalidIndexRange0_15:
	return Error(Loc, "vector lane must be an integer in range [0, 15].");
	case Match_InvalidIndexRange0_7:
	return Error(Loc, "vector lane must be an integer in range [0, 7].");
	case Match_InvalidIndexRange0_3:
	return Error(Loc, "vector lane must be an integer in range [0, 3].");
	case Match_InvalidIndexRange0_1:
	return Error(Loc, "vector lane must be an integer in range [0, 1].");
	case Match_InvalidSVEIndexRange0_63:
	return Error(Loc, "vector lane must be an integer in range [0, 63].");
	case Match_InvalidSVEIndexRange0_31:
	return Error(Loc, "vector lane must be an integer in range [0, 31].");
	case Match_InvalidSVEIndexRange0_15:
	return Error(Loc, "vector lane must be an integer in range [0, 15].");
	case Match_InvalidSVEIndexRange0_7:
	return Error(Loc, "vector lane must be an integer in range [0, 7].");
	case Match_InvalidSVEIndexRange0_3:
	return Error(Loc, "vector lane must be an integer in range [0, 3].");
	case Match_InvalidLabel:
	return Error(Loc, "expected label or encodable integer pc offset");
	case Match_MRS:
	return Error(Loc, "expected readable system register");
	case Match_MSR:
	return Error(Loc, "expected writable system register or pstate");
	case Match_InvalidComplexRotationEven:
	return Error(Loc, "complex rotation must be 0, 90, 180 or 270.");
	case Match_InvalidComplexRotationOdd:
	return Error(Loc, "complex rotation must be 90 or 270.");
	case Match_MnemonicFail: {
	std::string Suggestion = AArch64MnemonicSpellCheck(
	((AArch64Operand &)*Operands[0]).getToken(),
	ComputeAvailableFeatures(STI->getFeatureBits()));
	return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
	}
	case Match_InvalidGPR64shifted8:
	return Error(Loc, "register must be x0..x30 or xzr, without shift");
	case Match_InvalidGPR64shifted16:
	return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #1'");
	case Match_InvalidGPR64shifted32:
	return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #2'");
	case Match_InvalidGPR64shifted64:
	return Error(Loc, "register must be x0..x30 or xzr, with required shift 'lsl #3'");
	case Match_InvalidGPR64NoXZRshifted8:
	return Error(Loc, "register must be x0..x30 without shift");
	case Match_InvalidGPR64NoXZRshifted16:
	return Error(Loc, "register must be x0..x30 with required shift 'lsl #1'");
	case Match_InvalidGPR64NoXZRshifted32:
	return Error(Loc, "register must be x0..x30 with required shift 'lsl #2'");
	case Match_InvalidGPR64NoXZRshifted64:
	return Error(Loc, "register must be x0..x30 with required shift 'lsl #3'");
	case Match_InvalidZPR32UXTW8:
	case Match_InvalidZPR32SXTW8:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw\|sxtw)'");
	case Match_InvalidZPR32UXTW16:
	case Match_InvalidZPR32SXTW16:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw\|sxtw) #1'");
	case Match_InvalidZPR32UXTW32:
	case Match_InvalidZPR32SXTW32:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw\|sxtw) #2'");
	case Match_InvalidZPR32UXTW64:
	case Match_InvalidZPR32SXTW64:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, (uxtw\|sxtw) #3'");
	case Match_InvalidZPR64UXTW8:
	case Match_InvalidZPR64SXTW8:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (uxtw\|sxtw)'");
	case Match_InvalidZPR64UXTW16:
	case Match_InvalidZPR64SXTW16:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl\|uxtw\|sxtw) #1'");
	case Match_InvalidZPR64UXTW32:
	case Match_InvalidZPR64SXTW32:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl\|uxtw\|sxtw) #2'");
	case Match_InvalidZPR64UXTW64:
	case Match_InvalidZPR64SXTW64:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, (lsl\|uxtw\|sxtw) #3'");
	case Match_InvalidZPR32LSL8:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s'");
	case Match_InvalidZPR32LSL16:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #1'");
	case Match_InvalidZPR32LSL32:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #2'");
	case Match_InvalidZPR32LSL64:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].s, lsl #3'");
	case Match_InvalidZPR64LSL8:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d'");
	case Match_InvalidZPR64LSL16:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #1'");
	case Match_InvalidZPR64LSL32:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #2'");
	case Match_InvalidZPR64LSL64:
	return Error(Loc, "invalid shift/extend specified, expected 'z[0..31].d, lsl #3'");
	case Match_InvalidZPR0:
	return Error(Loc, "expected register without element width suffix");
	case Match_InvalidZPR8:
	case Match_InvalidZPR16:
	case Match_InvalidZPR32:
	case Match_InvalidZPR64:
	case Match_InvalidZPR128:
	return Error(Loc, "invalid element width");
	case Match_InvalidZPR_3b8:
	return Error(Loc, "Invalid restricted vector register, expected z0.b..z7.b");
	case Match_InvalidZPR_3b16:
	return Error(Loc, "Invalid restricted vector register, expected z0.h..z7.h");
	case Match_InvalidZPR_3b32:
	return Error(Loc, "Invalid restricted vector register, expected z0.s..z7.s");
	case Match_InvalidZPR_4b16:
	return Error(Loc, "Invalid restricted vector register, expected z0.h..z15.h");
	case Match_InvalidZPR_4b32:
	return Error(Loc, "Invalid restricted vector register, expected z0.s..z15.s");
	case Match_InvalidZPR_4b64:
	return Error(Loc, "Invalid restricted vector register, expected z0.d..z15.d");
	case Match_InvalidSVEPattern:
	return Error(Loc, "invalid predicate pattern");
	case Match_InvalidSVEPredicateAnyReg:
	case Match_InvalidSVEPredicateBReg:
	case Match_InvalidSVEPredicateHReg:
	case Match_InvalidSVEPredicateSReg:
	case Match_InvalidSVEPredicateDReg:
	return Error(Loc, "invalid predicate register.");
	case Match_InvalidSVEPredicate3bAnyReg:
	return Error(Loc, "invalid restricted predicate register, expected p0..p7 (without element suffix)");
	case Match_InvalidSVEPredicate3bBReg:
	return Error(Loc, "invalid restricted predicate register, expected p0.b..p7.b");
	case Match_InvalidSVEPredicate3bHReg:
	return Error(Loc, "invalid restricted predicate register, expected p0.h..p7.h");
	case Match_InvalidSVEPredicate3bSReg:
	return Error(Loc, "invalid restricted predicate register, expected p0.s..p7.s");
	case Match_InvalidSVEPredicate3bDReg:
	return Error(Loc, "invalid restricted predicate register, expected p0.d..p7.d");
	case Match_InvalidSVEExactFPImmOperandHalfOne:
	return Error(Loc, "Invalid floating point constant, expected 0.5 or 1.0.");
	case Match_InvalidSVEExactFPImmOperandHalfTwo:
	return Error(Loc, "Invalid floating point constant, expected 0.5 or 2.0.");
	case Match_InvalidSVEExactFPImmOperandZeroOne:
	return Error(Loc, "Invalid floating point constant, expected 0.0 or 1.0.");
	default:
	llvm_unreachable("unexpected error code!");
	}
	}

	static const char *getSubtargetFeatureName(uint64_t Val);

	/// Match the parsed operand list against the generated matcher tables and, on
	/// success, validate and emit the resulting instruction to \p Out.
	///
	/// Before matching, a handful of alias mnemonics (lsl, bfc, bfi/sbfiz/ubfiz,
	/// bfxil/sbfx/ubfx, movi.2d #0 under the ZCZ FP workaround, and the
	/// sxt*/uxt* register-width mismatches) are rewritten in place in
	/// \p Operands so that the table-driven matcher can handle them.
	///
	/// \param IDLoc             Location used for diagnostics and recorded on the
	///                          emitted instruction.
	/// \param Opcode            Not referenced by this implementation.
	/// \param Operands          Parsed operands; Operands[0] must be the mnemonic
	///                          token. May be modified by the alias rewrites.
	/// \param Out               Streamer receiving the matched instruction.
	/// \param ErrorInfo         Filled in by MatchInstructionImpl; on operand
	///                          failures it is used as an index into \p Operands.
	/// \param MatchingInlineAsm Forwarded unchanged to MatchInstructionImpl.
	/// \returns false after emitting an instruction; otherwise the result of the
	///          diagnostic helper that reported the failure.
	bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	                                               OperandVector &Operands,
	                                               MCStreamer &Out,
	                                               uint64_t &ErrorInfo,
	                                               bool MatchingInlineAsm) {
	  assert(!Operands.empty() && "Unexpect empty operand list!");
	  AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
	  assert(Op.isToken() && "Leading operand should always be a mnemonic!");

	  StringRef Tok = Op.getToken();
	  unsigned NumOperands = Operands.size();

	  if (NumOperands == 4 && Tok == "lsl") {
	    // Rewrite "lsl Rd, Rn, #imm" as "ubfm Rd, Rn, #immr, #imms".
	    AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
	    AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
	    if (Op2.isScalarReg() && Op3.isImm()) {
	      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
	      if (Op3CE) {
	        uint64_t Op3Val = Op3CE->getValue();
	        uint64_t NewOp3Val = 0;
	        uint64_t NewOp4Val = 0;
	        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
	                Op2.getReg())) {
	          NewOp3Val = (32 - Op3Val) & 0x1f;
	          NewOp4Val = 31 - Op3Val;
	        } else {
	          NewOp3Val = (64 - Op3Val) & 0x3f;
	          NewOp4Val = 63 - Op3Val;
	        }

	        const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext());
	        const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext());

	        Operands[0] = AArch64Operand::CreateToken(
	            "ubfm", false, Op.getStartLoc(), getContext());
	        // Append the new fourth operand first: it reuses the source range of
	        // the original immediate, which is replaced on the next statement.
	        Operands.push_back(AArch64Operand::CreateImm(
	            NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext()));
	        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(),
	                                                Op3.getEndLoc(), getContext());
	      }
	    }
	  } else if (NumOperands == 4 && Tok == "bfc") {
	    // FIXME: Horrible hack to handle BFC->BFM alias.
	    AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
	    // These two are deliberately copies, not references: Operands[2] and
	    // Operands[3] are overwritten below while their locations are still
	    // needed.
	    AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]);
	    AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]);

	    if (Op1.isScalarReg() && LSBOp.isImm() && WidthOp.isImm()) {
	      const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm());
	      const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm());

	      if (LSBCE && WidthCE) {
	        uint64_t LSB = LSBCE->getValue();
	        uint64_t Width = WidthCE->getValue();

	        uint64_t RegWidth = 0;
	        if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	                Op1.getReg()))
	          RegWidth = 64;
	        else
	          RegWidth = 32;

	        // NOTE(review): the range texts below are fixed strings and are also
	        // emitted when RegWidth is 64.
	        if (LSB >= RegWidth)
	          return Error(LSBOp.getStartLoc(),
	                       "expected integer in range [0, 31]");
	        if (Width < 1 || Width > RegWidth)
	          return Error(WidthOp.getStartLoc(),
	                       "expected integer in range [1, 32]");

	        uint64_t ImmR = 0;
	        if (RegWidth == 32)
	          ImmR = (32 - LSB) & 0x1f;
	        else
	          ImmR = (64 - LSB) & 0x3f;

	        uint64_t ImmS = Width - 1;

	        if (ImmR != 0 && ImmS >= ImmR)
	          return Error(WidthOp.getStartLoc(),
	                       "requested insert overflows register");

	        const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext());
	        const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext());
	        Operands[0] = AArch64Operand::CreateToken(
	            "bfm", false, Op.getStartLoc(), getContext());
	        // The inserted source register is the zero register of matching width.
	        Operands[2] = AArch64Operand::CreateReg(
	            RegWidth == 32 ? AArch64::WZR : AArch64::XZR, RegKind::Scalar,
	            SMLoc(), SMLoc(), getContext());
	        Operands[3] = AArch64Operand::CreateImm(
	            ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext());
	        Operands.emplace_back(
	            AArch64Operand::CreateImm(ImmSExpr, WidthOp.getStartLoc(),
	                                      WidthOp.getEndLoc(), getContext()));
	      }
	    }
	  } else if (NumOperands == 5) {
	    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
	    // UBFIZ -> UBFM aliases.
	    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
	      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
	      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
	      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);

	      if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
	        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
	        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());

	        if (Op3CE && Op4CE) {
	          uint64_t Op3Val = Op3CE->getValue();
	          uint64_t Op4Val = Op4CE->getValue();

	          uint64_t RegWidth = 0;
	          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	                  Op1.getReg()))
	            RegWidth = 64;
	          else
	            RegWidth = 32;

	          if (Op3Val >= RegWidth)
	            return Error(Op3.getStartLoc(),
	                         "expected integer in range [0, 31]");
	          if (Op4Val < 1 || Op4Val > RegWidth)
	            return Error(Op4.getStartLoc(),
	                         "expected integer in range [1, 32]");

	          uint64_t NewOp3Val = 0;
	          if (RegWidth == 32)
	            NewOp3Val = (32 - Op3Val) & 0x1f;
	          else
	            NewOp3Val = (64 - Op3Val) & 0x3f;

	          uint64_t NewOp4Val = Op4Val - 1;

	          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
	            return Error(Op4.getStartLoc(),
	                         "requested insert overflows register");

	          const MCExpr *NewOp3 =
	              MCConstantExpr::create(NewOp3Val, getContext());
	          const MCExpr *NewOp4 =
	              MCConstantExpr::create(NewOp4Val, getContext());
	          Operands[3] = AArch64Operand::CreateImm(
	              NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext());
	          Operands[4] = AArch64Operand::CreateImm(
	              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
	          if (Tok == "bfi")
	            Operands[0] = AArch64Operand::CreateToken(
	                "bfm", false, Op.getStartLoc(), getContext());
	          else if (Tok == "sbfiz")
	            Operands[0] = AArch64Operand::CreateToken(
	                "sbfm", false, Op.getStartLoc(), getContext());
	          else if (Tok == "ubfiz")
	            Operands[0] = AArch64Operand::CreateToken(
	                "ubfm", false, Op.getStartLoc(), getContext());
	          else
	            llvm_unreachable("No valid mnemonic for alias?");
	        }
	      }

	      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
	      // UBFX -> UBFM aliases.
	    } else if (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx") {
	      AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
	      AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
	      AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]);

	      if (Op1.isScalarReg() && Op3.isImm() && Op4.isImm()) {
	        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm());
	        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm());

	        if (Op3CE && Op4CE) {
	          uint64_t Op3Val = Op3CE->getValue();
	          uint64_t Op4Val = Op4CE->getValue();

	          uint64_t RegWidth = 0;
	          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	                  Op1.getReg()))
	            RegWidth = 64;
	          else
	            RegWidth = 32;

	          if (Op3Val >= RegWidth)
	            return Error(Op3.getStartLoc(),
	                         "expected integer in range [0, 31]");
	          if (Op4Val < 1 || Op4Val > RegWidth)
	            return Error(Op4.getStartLoc(),
	                         "expected integer in range [1, 32]");

	          // The extract aliases take (lsb, width); the underlying form takes
	          // (lsb, msb), so only the last operand needs recomputing.
	          uint64_t NewOp4Val = Op3Val + Op4Val - 1;

	          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
	            return Error(Op4.getStartLoc(),
	                         "requested extract overflows register");

	          const MCExpr *NewOp4 =
	              MCConstantExpr::create(NewOp4Val, getContext());
	          Operands[4] = AArch64Operand::CreateImm(
	              NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext());
	          if (Tok == "bfxil")
	            Operands[0] = AArch64Operand::CreateToken(
	                "bfm", false, Op.getStartLoc(), getContext());
	          else if (Tok == "sbfx")
	            Operands[0] = AArch64Operand::CreateToken(
	                "sbfm", false, Op.getStartLoc(), getContext());
	          else if (Tok == "ubfx")
	            Operands[0] = AArch64Operand::CreateToken(
	                "ubfm", false, Op.getStartLoc(), getContext());
	          else
	            llvm_unreachable("No valid mnemonic for alias?");
	        }
	      }
	    }
	  }

	  // The Cyclone CPU and early successors didn't execute the zero-cycle zeroing
	  // instruction for FP registers correctly in some rare circumstances. Convert
	  // it to a safe instruction and warn (because silently changing someone's
	  // assembly is rude).
	  if (getSTI().getFeatureBits()[AArch64::FeatureZCZeroingFPWorkaround] &&
	      NumOperands == 4 && Tok == "movi") {
	    AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]);
	    AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]);
	    AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]);
	    if ((Op1.isToken() && Op2.isNeonVectorReg() && Op3.isImm()) ||
	        (Op1.isNeonVectorReg() && Op2.isToken() && Op3.isImm())) {
	      StringRef Suffix = Op1.isToken() ? Op1.getToken() : Op2.getToken();
	      // isImm() does not guarantee a constant expression, so use dyn_cast and
	      // leave non-constant immediates for the matcher to diagnose.
	      const MCConstantExpr *ImmCE = dyn_cast<MCConstantExpr>(Op3.getImm());
	      if (Suffix.lower() == ".2d" && ImmCE && ImmCE->getValue() == 0) {
	        Warning(IDLoc, "instruction movi.2d with immediate #0 may not function"
	                       " correctly on this CPU, converting to equivalent movi.16b");
	        // Switch the suffix to .16b.
	        unsigned Idx = Op1.isToken() ? 1 : 2;
	        Operands[Idx] = AArch64Operand::CreateToken(".16b", false, IDLoc,
	                                                    getContext());
	      }
	    }
	  }

	  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
	  //        InstAlias can't quite handle this since the reg classes aren't
	  //        subclasses.
	  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
	    // The source register can be Wn here, but the matcher expects a
	    // GPR64. Twiddle it here if necessary.
	    AArch64Operand &Src = static_cast<AArch64Operand &>(*Operands[2]);
	    if (Src.isScalarReg()) {
	      unsigned Reg = getXRegFromWReg(Src.getReg());
	      Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
	                                              Src.getStartLoc(),
	                                              Src.getEndLoc(), getContext());
	    }
	  }
	  // FIXME: Likewise for sxt[bh] with a Xd dst operand
	  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
	    AArch64Operand &Dst = static_cast<AArch64Operand &>(*Operands[1]);
	    if (Dst.isScalarReg() &&
	        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	            Dst.getReg())) {
	      // The source register can be Wn here, but the matcher expects a
	      // GPR64. Twiddle it here if necessary.
	      AArch64Operand &Src = static_cast<AArch64Operand &>(*Operands[2]);
	      if (Src.isScalarReg()) {
	        unsigned Reg = getXRegFromWReg(Src.getReg());
	        Operands[2] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
	                                                Src.getStartLoc(),
	                                                Src.getEndLoc(), getContext());
	      }
	    }
	  }
	  // FIXME: Likewise for uxt[bh] with a Xd dst operand
	  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
	    AArch64Operand &Dst = static_cast<AArch64Operand &>(*Operands[1]);
	    if (Dst.isScalarReg() &&
	        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
	            Dst.getReg())) {
	      // The destination register is Xd here, but the matcher expects a
	      // GPR32. Replace it with the corresponding W register.
	      unsigned Reg = getWRegFromXReg(Dst.getReg());
	      Operands[1] = AArch64Operand::CreateReg(Reg, RegKind::Scalar,
	                                              Dst.getStartLoc(),
	                                              Dst.getEndLoc(), getContext());
	    }
	  }

	  MCInst Inst;
	  FeatureBitset MissingFeatures;
	  // First try to match against the secondary set of tables containing the
	  // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
	  unsigned MatchResult =
	      MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
	                           MatchingInlineAsm, 1);

	  // If that fails, try against the alternate table containing long-form NEON:
	  // "fadd v0.2s, v1.2s, v2.2s"
	  if (MatchResult != Match_Success) {
	    // But first, save the short-form match result: we can use it in case the
	    // long-form match also fails.
	    auto ShortFormNEONErrorInfo = ErrorInfo;
	    auto ShortFormNEONMatchResult = MatchResult;
	    auto ShortFormNEONMissingFeatures = MissingFeatures;

	    MatchResult =
	        MatchInstructionImpl(Operands, Inst, ErrorInfo, MissingFeatures,
	                             MatchingInlineAsm, 0);

	    // Now, both matches failed, and the long-form match failed on the mnemonic
	    // suffix token operand.  The short-form match failure is probably more
	    // relevant: use it instead.
	    if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 &&
	        Operands.size() > 1 &&
	        static_cast<AArch64Operand &>(*Operands[1]).isToken() &&
	        static_cast<AArch64Operand &>(*Operands[1]).isTokenSuffix()) {
	      MatchResult = ShortFormNEONMatchResult;
	      ErrorInfo = ShortFormNEONErrorInfo;
	      MissingFeatures = ShortFormNEONMissingFeatures;
	    }
	  }

	  switch (MatchResult) {
	  case Match_Success: {
	    // Perform range checking and other semantic validations
	    SmallVector<SMLoc, 8> OperandLocs;
	    NumOperands = Operands.size();
	    for (unsigned i = 1; i < NumOperands; ++i)
	      OperandLocs.push_back(Operands[i]->getStartLoc());
	    if (validateInstruction(Inst, IDLoc, OperandLocs))
	      return true;

	    Inst.setLoc(IDLoc);
	    Out.EmitInstruction(Inst, getSTI());
	    return false;
	  }
	  case Match_MissingFeature: {
	    assert(MissingFeatures.any() && "Unknown missing feature!");
	    // List the name of every missing subtarget feature in one diagnostic.
	    std::string Msg = "instruction requires:";
	    for (unsigned i = 0, e = MissingFeatures.size(); i != e; ++i) {
	      if (MissingFeatures[i]) {
	        Msg += " ";
	        Msg += getSubtargetFeatureName(i);
	      }
	    }
	    return Error(IDLoc, Msg);
	  }
	  case Match_MnemonicFail:
	    return showMatchError(IDLoc, MatchResult, ErrorInfo, Operands);
	  case Match_InvalidOperand: {
	    SMLoc ErrorLoc = IDLoc;

	    // ErrorInfo == ~0ULL means no specific operand was identified; in that
	    // case Operands must not be indexed with it.
	    if (ErrorInfo != ~0ULL) {
	      if (ErrorInfo >= Operands.size())
	        return Error(IDLoc, "too few operands for instruction",
	                     SMRange(IDLoc, getTok().getLoc()));

	      AArch64Operand &BadOp =
	          static_cast<AArch64Operand &>(*Operands[ErrorInfo]);
	      ErrorLoc = BadOp.getStartLoc();
	      if (ErrorLoc == SMLoc())
	        ErrorLoc = IDLoc;

	      // If the match failed on a suffix token operand, tweak the diagnostic
	      // accordingly.
	      if (BadOp.isToken() && BadOp.isTokenSuffix())
	        MatchResult = Match_InvalidSuffix;
	    }

	    return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
	  }
	  case Match_InvalidTiedOperand:
	  case Match_InvalidMemoryIndexed1:
	  case Match_InvalidMemoryIndexed2:
	  case Match_InvalidMemoryIndexed4:
	  case Match_InvalidMemoryIndexed8:
	  case Match_InvalidMemoryIndexed16:
	  case Match_InvalidCondCode:
	  case Match_AddSubRegExtendSmall:
	  case Match_AddSubRegExtendLarge:
	  case Match_AddSubSecondSource:
	  case Match_LogicalSecondSource:
	  case Match_AddSubRegShift32:
	  case Match_AddSubRegShift64:
	  case Match_InvalidMovImm32Shift:
	  case Match_InvalidMovImm64Shift:
	  case Match_InvalidFPImm:
	  case Match_InvalidMemoryWExtend8:
	  case Match_InvalidMemoryWExtend16:
	  case Match_InvalidMemoryWExtend32:
	  case Match_InvalidMemoryWExtend64:
	  case Match_InvalidMemoryWExtend128:
	  case Match_InvalidMemoryXExtend8:
	  case Match_InvalidMemoryXExtend16:
	  case Match_InvalidMemoryXExtend32:
	  case Match_InvalidMemoryXExtend64:
	  case Match_InvalidMemoryXExtend128:
	  case Match_InvalidMemoryIndexed1SImm4:
	  case Match_InvalidMemoryIndexed2SImm4:
	  case Match_InvalidMemoryIndexed3SImm4:
	  case Match_InvalidMemoryIndexed4SImm4:
	  case Match_InvalidMemoryIndexed1SImm6:
	  case Match_InvalidMemoryIndexed16SImm4:
	  case Match_InvalidMemoryIndexed4SImm7:
	  case Match_InvalidMemoryIndexed8SImm7:
	  case Match_InvalidMemoryIndexed16SImm7:
	  case Match_InvalidMemoryIndexed8UImm5:
	  case Match_InvalidMemoryIndexed4UImm5:
	  case Match_InvalidMemoryIndexed2UImm5:
	  case Match_InvalidMemoryIndexed1UImm6:
	  case Match_InvalidMemoryIndexed2UImm6:
	  case Match_InvalidMemoryIndexed4UImm6:
	  case Match_InvalidMemoryIndexed8UImm6:
	  case Match_InvalidMemoryIndexed16UImm6:
	  case Match_InvalidMemoryIndexedSImm6:
	  case Match_InvalidMemoryIndexedSImm5:
	  case Match_InvalidMemoryIndexedSImm8:
	  case Match_InvalidMemoryIndexedSImm9:
	  case Match_InvalidMemoryIndexed16SImm9:
	  case Match_InvalidMemoryIndexed8SImm10:
	  case Match_InvalidImm0_1:
	  case Match_InvalidImm0_7:
	  case Match_InvalidImm0_15:
	  case Match_InvalidImm0_31:
	  case Match_InvalidImm0_63:
	  case Match_InvalidImm0_127:
	  case Match_InvalidImm0_255:
	  case Match_InvalidImm0_65535:
	  case Match_InvalidImm1_8:
	  case Match_InvalidImm1_16:
	  case Match_InvalidImm1_32:
	  case Match_InvalidImm1_64:
	  case Match_InvalidSVEAddSubImm8:
	  case Match_InvalidSVEAddSubImm16:
	  case Match_InvalidSVEAddSubImm32:
	  case Match_InvalidSVEAddSubImm64:
	  case Match_InvalidSVECpyImm8:
	  case Match_InvalidSVECpyImm16:
	  case Match_InvalidSVECpyImm32:
	  case Match_InvalidSVECpyImm64:
	  case Match_InvalidIndexRange1_1:
	  case Match_InvalidIndexRange0_15:
	  case Match_InvalidIndexRange0_7:
	  case Match_InvalidIndexRange0_3:
	  case Match_InvalidIndexRange0_1:
	  case Match_InvalidSVEIndexRange0_63:
	  case Match_InvalidSVEIndexRange0_31:
	  case Match_InvalidSVEIndexRange0_15:
	  case Match_InvalidSVEIndexRange0_7:
	  case Match_InvalidSVEIndexRange0_3:
	  case Match_InvalidLabel:
	  case Match_InvalidComplexRotationEven:
	  case Match_InvalidComplexRotationOdd:
	  case Match_InvalidGPR64shifted8:
	  case Match_InvalidGPR64shifted16:
	  case Match_InvalidGPR64shifted32:
	  case Match_InvalidGPR64shifted64:
	  case Match_InvalidGPR64NoXZRshifted8:
	  case Match_InvalidGPR64NoXZRshifted16:
	  case Match_InvalidGPR64NoXZRshifted32:
	  case Match_InvalidGPR64NoXZRshifted64:
	  case Match_InvalidZPR32UXTW8:
	  case Match_InvalidZPR32UXTW16:
	  case Match_InvalidZPR32UXTW32:
	  case Match_InvalidZPR32UXTW64:
	  case Match_InvalidZPR32SXTW8:
	  case Match_InvalidZPR32SXTW16:
	  case Match_InvalidZPR32SXTW32:
	  case Match_InvalidZPR32SXTW64:
	  case Match_InvalidZPR64UXTW8:
	  case Match_InvalidZPR64SXTW8:
	  case Match_InvalidZPR64UXTW16:
	  case Match_InvalidZPR64SXTW16:
	  case Match_InvalidZPR64UXTW32:
	  case Match_InvalidZPR64SXTW32:
	  case Match_InvalidZPR64UXTW64:
	  case Match_InvalidZPR64SXTW64:
	  case Match_InvalidZPR32LSL8:
	  case Match_InvalidZPR32LSL16:
	  case Match_InvalidZPR32LSL32:
	  case Match_InvalidZPR32LSL64:
	  case Match_InvalidZPR64LSL8:
	  case Match_InvalidZPR64LSL16:
	  case Match_InvalidZPR64LSL32:
	  case Match_InvalidZPR64LSL64:
	  case Match_InvalidZPR0:
	  case Match_InvalidZPR8:
	  case Match_InvalidZPR16:
	  case Match_InvalidZPR32:
	  case Match_InvalidZPR64:
	  case Match_InvalidZPR128:
	  case Match_InvalidZPR_3b8:
	  case Match_InvalidZPR_3b16:
	  case Match_InvalidZPR_3b32:
	  case Match_InvalidZPR_4b16:
	  case Match_InvalidZPR_4b32:
	  case Match_InvalidZPR_4b64:
	  case Match_InvalidSVEPredicateAnyReg:
	  case Match_InvalidSVEPattern:
	  case Match_InvalidSVEPredicateBReg:
	  case Match_InvalidSVEPredicateHReg:
	  case Match_InvalidSVEPredicateSReg:
	  case Match_InvalidSVEPredicateDReg:
	  case Match_InvalidSVEPredicate3bAnyReg:
	  case Match_InvalidSVEPredicate3bBReg:
	  case Match_InvalidSVEPredicate3bHReg:
	  case Match_InvalidSVEPredicate3bSReg:
	  case Match_InvalidSVEPredicate3bDReg:
	  case Match_InvalidSVEExactFPImmOperandHalfOne:
	  case Match_InvalidSVEExactFPImmOperandHalfTwo:
	  case Match_InvalidSVEExactFPImmOperandZeroOne:
	  case Match_MSR:
	  case Match_MRS: {
	    if (ErrorInfo >= Operands.size())
	      return Error(IDLoc, "too few operands for instruction", SMRange(IDLoc, (*Operands.back()).getEndLoc()));
	    // Any time we get here, there's nothing fancy to do. Just get the
	    // operand SMLoc and display the diagnostic.
	    SMLoc ErrorLoc =
	        static_cast<AArch64Operand &>(*Operands[ErrorInfo]).getStartLoc();
	    if (ErrorLoc == SMLoc())
	      ErrorLoc = IDLoc;
	    return showMatchError(ErrorLoc, MatchResult, ErrorInfo, Operands);
	  }
	  }

	  llvm_unreachable("Implement any new match types added!");
	}

	/// Dispatch an AArch64-specific assembler directive to its handler.
	///
	/// The handlers' own return values are discarded: once a directive name is
	/// recognised this function returns false. It returns true only when the
	/// directive is not one of those handled here.
	bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
	  const MCObjectFileInfo::Environment ObjFormat =
	      getContext().getObjectFileInfo()->getObjectFileType();
	  const bool TargetsMachO = (ObjFormat == MCObjectFileInfo::IsMachO);

	  const StringRef Directive = DirectiveID.getIdentifier();
	  const SMLoc DirectiveLoc = DirectiveID.getLoc();

	  if (Directive == ".arch") {
	    parseDirectiveArch(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".cpu") {
	    parseDirectiveCPU(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".tlsdesccall") {
	    parseDirectiveTLSDescCall(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".ltorg" || Directive == ".pool") {
	    parseDirectiveLtorg(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".unreq") {
	    parseDirectiveUnreq(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".inst") {
	    parseDirectiveInst(DirectiveLoc);
	    return false;
	  }
	  if (Directive == ".cfi_negate_ra_state") {
	    parseDirectiveCFINegateRAState();
	    return false;
	  }
	  if (Directive == ".cfi_b_key_frame") {
	    parseDirectiveCFIBKeyFrame();
	    return false;
	  }
	  if (Directive == ".arch_extension") {
	    parseDirectiveArchExtension(DirectiveLoc);
	    return false;
	  }

	  // The linker-optimization-hint directive is only accepted for MachO.
	  if (!TargetsMachO || !(Directive == MCLOHDirectiveName()))
	    return true;

	  parseDirectiveLOH(Directive, DirectiveLoc);
	  return false;
	}

	/// Expand a requested "crypto" / "nocrypto" pseudo-extension into the
	/// concrete extension names it stands for, appending them to
	/// \p RequestedExtensions.
	///
	/// For ARMV8_4A and ARMV8_5A the expansion is sm4, sha3, sha2, aes; for every
	/// other architecture kind it is sha2, aes. When "nocrypto" is present the
	/// "no"-prefixed names are appended instead, and "nocrypto" takes precedence
	/// over "crypto". Nothing is appended when neither name is requested.
	static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
	                            SmallVector<StringRef, 4> &RequestedExtensions) {
	  auto IsRequested = [&RequestedExtensions](StringRef Ext) {
	    return std::find(RequestedExtensions.begin(), RequestedExtensions.end(),
	                     Ext) != std::end(RequestedExtensions);
	  };

	  const bool WantsNoCrypto = IsRequested("nocrypto");
	  const bool WantsCrypto = IsRequested("crypto");
	  if (!WantsNoCrypto && !WantsCrypto)
	    return;

	  // Only these two architecture kinds pull in sm4 and sha3; 'generic' and all
	  // remaining kinds get the traditional sha2 + aes meaning of crypto.
	  const bool IncludesSM4AndSHA3 = ArchKind == AArch64::ArchKind::ARMV8_4A ||
	                                  ArchKind == AArch64::ArchKind::ARMV8_5A;

	  if (WantsNoCrypto) {
	    if (IncludesSM4AndSHA3) {
	      RequestedExtensions.push_back("nosm4");
	      RequestedExtensions.push_back("nosha3");
	    }
	    RequestedExtensions.push_back("nosha2");
	    RequestedExtensions.push_back("noaes");
	    return;
	  }

	  if (IncludesSM4AndSHA3) {
	    RequestedExtensions.push_back("sm4");
	    RequestedExtensions.push_back("sha3");
	  }
	  RequestedExtensions.push_back("sha2");
	  RequestedExtensions.push_back("aes");
	}

	/// parseDirectiveArch
	///   ::= .arch token
	///
	/// Parses "<arch>[+ext[+ext...]]", resets a copy of the subtarget to the
	/// default features of that architecture, then enables or disables each
	/// requested extension ("no"-prefixed names disable) in the order written.
	/// Extension names that are not found in ExtensionMap are skipped without a
	/// diagnostic.
	///
	/// \param L Location of the directive; not referenced by this implementation.
	/// \returns the result of Error()/parseToken() on a parse failure, false
	///          otherwise.
	bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
	  SMLoc ArchLoc = getLoc();

	  StringRef Arch, ExtensionString;
	  std::tie(Arch, ExtensionString) =
	      getParser().parseStringToEndOfStatement().trim().split('+');

	  AArch64::ArchKind ID = AArch64::parseArch(Arch);
	  if (ID == AArch64::ArchKind::INVALID)
	    return Error(ArchLoc, "unknown arch name");

	  if (parseToken(AsmToken::EndOfStatement))
	    return true;

	  // Get the architecture and extension features.
	  std::vector<StringRef> AArch64Features;
	  AArch64::getArchFeatures(ID, AArch64Features);
	  AArch64::getExtensionFeatures(AArch64::getDefaultExtensions("generic", ID),
	                                AArch64Features);

	  MCSubtargetInfo &STI = copySTI();
	  std::vector<std::string> ArchFeatures(AArch64Features.begin(), AArch64Features.end());
	  STI.setDefaultFeatures("generic", join(ArchFeatures.begin(), ArchFeatures.end(), ","));

	  SmallVector<StringRef, 4> RequestedExtensions;
	  if (!ExtensionString.empty())
	    ExtensionString.split(RequestedExtensions, '+');

	  ExpandCryptoAEK(ID, RequestedExtensions);

	  for (auto Name : RequestedExtensions) {
	    bool EnableFeature = true;

	    if (Name.startswith_lower("no")) {
	      EnableFeature = false;
	      Name = Name.substr(2);
	    }

	    for (const auto &Extension : ExtensionMap) {
	      if (Extension.Name != Name)
	        continue;

	      if (Extension.Features.none())
	        report_fatal_error("unsupported architectural extension: " + Name);

	      // Read the subtarget's current bits for every extension. Using a
	      // snapshot taken before the loop would compute the toggle mask from
	      // stale state once an earlier extension has already flipped a bit
	      // (e.g. "+crc+nocrc" would leave crc enabled).
	      const FeatureBitset CurrentFeatures = STI.getFeatureBits();
	      const FeatureBitset ToggleFeatures =
	          EnableFeature ? (~CurrentFeatures & Extension.Features)
	                        : (CurrentFeatures & Extension.Features);
	      setAvailableFeatures(
	          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)));
	      break;
	    }
	  }
	  return false;
	}

	/// parseDirectiveArchExtension
	///   ::= .arch_extension [no]feature
	///
	/// Enables the named extension on a copy of the subtarget, or disables it
	/// when the name carries a "no" prefix. An error is reported when the name is
	/// absent from ExtensionMap or when its entry has no feature bits.
	bool AArch64AsmParser::parseDirectiveArchExtension(SMLoc L) {
	  const SMLoc ExtLoc = getLoc();

	  StringRef Name = getParser().parseStringToEndOfStatement().trim();

	  const bool HasTrailingTokens = parseToken(
	      AsmToken::EndOfStatement,
	      "unexpected token in '.arch_extension' directive");
	  if (HasTrailingTokens)
	    return true;

	  const bool Disable = Name.startswith_lower("no");
	  if (Disable)
	    Name = Name.substr(2);

	  MCSubtargetInfo &STI = copySTI();
	  const FeatureBitset Current = STI.getFeatureBits();

	  for (const auto &Entry : ExtensionMap) {
	    if (Entry.Name == Name) {
	      if (Entry.Features.none())
	        return Error(ExtLoc, "unsupported architectural extension: " + Name);

	      // Flip only the bits that are not already in the requested state.
	      FeatureBitset Mask;
	      if (Disable)
	        Mask = Current & Entry.Features;
	      else
	        Mask = ~Current & Entry.Features;

	      setAvailableFeatures(ComputeAvailableFeatures(STI.ToggleFeature(Mask)));
	      return false;
	    }
	  }

	  return Error(ExtLoc, "unknown architectural extension: " + Name);
	}

	/// Return a source location \p Offset characters away from \p L.
	static SMLoc incrementLoc(SMLoc L, int Offset) {
	  const char *Shifted = L.getPointer() + Offset;
	  return SMLoc::getFromPointer(Shifted);
	}

	/// parseDirectiveCPU
	///   ::= .cpu id
	///
	/// Parses "<cpu>[+ext[+ext...]]", resets a copy of the subtarget to the
	/// default features of that CPU, then enables or disables each requested
	/// extension ("no"-prefixed names disable) in the order written. An error is
	/// reported at the extension's own location for each name that is not found
	/// in ExtensionMap.
	///
	/// \param L Location of the directive; not referenced by this implementation.
	/// \returns true only when the end-of-statement token cannot be parsed. An
	///          unknown CPU name or extension is reported through Error() but
	///          this function still returns false.
	bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
	  SMLoc CurLoc = getLoc();

	  StringRef CPU, ExtensionString;
	  std::tie(CPU, ExtensionString) =
	      getParser().parseStringToEndOfStatement().trim().split('+');

	  if (parseToken(AsmToken::EndOfStatement))
	    return true;

	  SmallVector<StringRef, 4> RequestedExtensions;
	  if (!ExtensionString.empty())
	    ExtensionString.split(RequestedExtensions, '+');

	  // FIXME This is using tablegen data, but should be moved to ARMTargetParser
	  // once that is tablegen'ed
	  if (!getSTI().isCPUStringValid(CPU)) {
	    Error(CurLoc, "unknown CPU name");
	    return false;
	  }

	  MCSubtargetInfo &STI = copySTI();
	  STI.setDefaultFeatures(CPU, "");
	  CurLoc = incrementLoc(CurLoc, CPU.size());

	  ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions);

	  for (auto Name : RequestedExtensions) {
	    // Advance source location past '+'.
	    CurLoc = incrementLoc(CurLoc, 1);

	    // Remember the length as spelled in the source: the "no" prefix is
	    // stripped below, but CurLoc has to advance over the full text so that
	    // diagnostics for later extensions point at the right column.
	    const size_t SpelledLength = Name.size();

	    bool EnableFeature = true;

	    if (Name.startswith_lower("no")) {
	      EnableFeature = false;
	      Name = Name.substr(2);
	    }

	    bool FoundExtension = false;
	    for (const auto &Extension : ExtensionMap) {
	      if (Extension.Name != Name)
	        continue;

	      if (Extension.Features.none())
	        report_fatal_error("unsupported architectural extension: " + Name);

	      // Read the subtarget's current bits for every extension. Using a
	      // snapshot taken before the loop would compute the toggle mask from
	      // stale state once an earlier extension has already flipped a bit
	      // (e.g. "+crc+nocrc" would leave crc enabled).
	      const FeatureBitset CurrentFeatures = STI.getFeatureBits();
	      const FeatureBitset ToggleFeatures =
	          EnableFeature ? (~CurrentFeatures & Extension.Features)
	                        : (CurrentFeatures & Extension.Features);
	      setAvailableFeatures(
	          ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)));
	      FoundExtension = true;

	      break;
	    }

	    if (!FoundExtension)
	      Error(CurLoc, "unsupported architectural extension");

	    CurLoc = incrementLoc(CurLoc, SpelledLength);
	  }
	  return false;
	}

	/// parseDirectiveInst
	///  ::= .inst opcode [, ...]
	///
	/// Each operand must parse as an expression that is a constant; its value is
	/// passed to the target streamer's emitInst().
	bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
	  const bool NoOperands = getLexer().is(AsmToken::EndOfStatement);
	  if (NoOperands)
	    return Error(Loc, "expected expression following '.inst' directive");

	  // Parses and emits a single operand; returns true on failure.
	  auto EmitOneOpcode = [&]() -> bool {
	    const SMLoc ExprLoc = getLoc();
	    const MCExpr *Parsed;
	    if (check(getParser().parseExpression(Parsed), ExprLoc,
	              "expected expression"))
	      return true;

	    const auto *Constant = dyn_cast_or_null<MCConstantExpr>(Parsed);
	    if (check(!Constant, ExprLoc, "expected constant expression"))
	      return true;

	    getTargetStreamer().emitInst(Constant->getValue());
	    return false;
	  };

	  return parseMany(EmitOneOpcode) ? addErrorSuffix(" in '.inst' directive")
	                                  : false;
	}

	// parseDirectiveTLSDescCall:
	//   ::= .tlsdesccall symbol
	//
	// Emits a TLSDESCCALL instruction whose single operand is a VK_TLSDESC
	// expression wrapping a reference to the named symbol.
	bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
	  StringRef SymbolName;
	  if (check(getParser().parseIdentifier(SymbolName), L,
	            "expected symbol after directive"))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement))
	    return true;

	  auto &Ctx = getContext();
	  const MCExpr *SymbolRef =
	      MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(SymbolName), Ctx);
	  const MCExpr *TLSDescRef =
	      AArch64MCExpr::create(SymbolRef, AArch64MCExpr::VK_TLSDESC, Ctx);

	  MCInst Call;
	  Call.setOpcode(AArch64::TLSDESCCALL);
	  Call.addOperand(MCOperand::createExpr(TLSDescRef));

	  getParser().getStreamer().EmitInstruction(Call, getSTI());
	  return false;
	}

	/// ::= .loh <lohName | lohId> label1, ..., labelN
	/// The number of arguments depends on the loh identifier.
	///
	/// \param IDVal Spelling of the directive, used in diagnostics.
	/// \param Loc   Location of the directive; not referenced by this
	///              implementation.
	/// \returns the result of TokError()/parseToken() on a parse failure, false
	///          after the directive has been passed to the streamer.
	bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
	  MCLOHType Kind;
	  if (getParser().getTok().isNot(AsmToken::Identifier)) {
	    if (getParser().getTok().isNot(AsmToken::Integer))
	      return TokError("expected an identifier or a number in directive");
	    // We successfully get a numeric value for the identifier.
	    // Check if it is valid.
	    int64_t Id = getParser().getTok().getIntVal();
	    // Reject values that do not fit in an unsigned before asking whether the
	    // kind is valid; otherwise an oversized number would skip validation and
	    // be cast to MCLOHType unchecked.
	    const bool FitsUnsigned = Id >= 0 && Id <= static_cast<int64_t>(~0U);
	    if (!FitsUnsigned || !isValidMCLOHType(Id))
	      return TokError("invalid numeric identifier in directive");
	    Kind = (MCLOHType)Id;
	  } else {
	    StringRef Name = getTok().getIdentifier();
	    // We successfully parse an identifier.
	    // Check if it is a recognized one.
	    int Id = MCLOHNameToId(Name);

	    if (Id == -1)
	      return TokError("invalid identifier in directive");
	    Kind = (MCLOHType)Id;
	  }
	  // Consume the identifier.
	  Lex();
	  // Get the number of arguments of this LOH.
	  int NbArgs = MCLOHIdToNbArgs(Kind);

	  assert(NbArgs != -1 && "Invalid number of arguments");

	  SmallVector<MCSymbol *, 3> Args;
	  for (int Idx = 0; Idx < NbArgs; ++Idx) {
	    StringRef Name;
	    if (getParser().parseIdentifier(Name))
	      return TokError("expected identifier in directive");
	    Args.push_back(getContext().getOrCreateSymbol(Name));

	    // No separator is expected after the last label.
	    if (Idx + 1 == NbArgs)
	      break;
	    if (parseToken(AsmToken::Comma,
	                   "unexpected token in '" + Twine(IDVal) + "' directive"))
	      return true;
	  }
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  getStreamer().EmitLOHDirective(Kind, Args);
	  return false;
	}

	/// parseDirectiveLtorg
	///  ::= .ltorg | .pool
	///
	/// The directive takes no operands. Once the end of the statement has been
	/// consumed, emitCurrentConstantPool() is called on the target streamer.
	/// \returns true if a diagnostic was emitted, false on success.
	bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
	  const bool Failed =
	      parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
	  if (!Failed)
	    getTargetStreamer().emitCurrentConstantPool();
	  return Failed;
	}

	/// parseDirectiveReq
	///  ::= name .req registername
	///
	/// Tries the register as a scalar first, then as a NEON vector, an SVE data
	/// vector and an SVE predicate, in that order. Vector registers must be
	/// written without a type suffix. The resulting (kind, register) pair is
	/// recorded in RegisterReqs under \p Name; an existing alias with a different
	/// value is kept and a warning is issued.
	/// \returns true if a diagnostic was emitted, false on success.
	bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
	  // Vector flavours to fall back on, each with the diagnostic used when the
	  // register was written with a type suffix.
	  struct VectorAlternative {
	    RegKind Kind;
	    const char *SuffixedMsg;
	  };
	  const VectorAlternative Alternatives[] = {
	      {RegKind::NeonVector,
	       "vector register without type specifier expected"},
	      {RegKind::SVEDataVector,
	       "sve vector register without type specifier expected"},
	      {RegKind::SVEPredicateVector,
	       "sve predicate register without type specifier expected"},
	  };

	  getParser().Lex(); // Eat the '.req' token.
	  const SMLoc RegLoc = getLoc();

	  unsigned RegNum;
	  RegKind AliasKind = RegKind::Scalar;
	  OperandMatchResultTy Status = tryParseScalarRegister(RegNum);

	  for (const VectorAlternative &Alt : Alternatives) {
	    // Stop at the first flavour that matched.
	    if (Status == MatchOperand_Success)
	      break;

	    StringRef Suffix;
	    AliasKind = Alt.Kind;
	    Status = tryParseVectorRegister(RegNum, Suffix, Alt.Kind);

	    if (Status == MatchOperand_ParseFail)
	      return true;
	    if (Status == MatchOperand_Success && !Suffix.empty())
	      return Error(RegLoc, Alt.SuffixedMsg);
	  }

	  if (Status != MatchOperand_Success)
	    return Error(RegLoc, "register name or alias expected");

	  // Shouldn't be anything else.
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected input in .req directive"))
	    return true;

	  const auto Alias = std::make_pair(AliasKind, (unsigned)RegNum);
	  const auto Inserted = RegisterReqs.insert(std::make_pair(Name, Alias));
	  // insert() leaves an existing entry untouched; a differing stored value
	  // means this directive tried to redefine the alias.
	  if (Inserted.first->second != Alias)
	    Warning(L, "ignoring redefinition of register alias '" + Name + "'");

	  return false;
	}

	/// parseDirectiveUnreq
	///  ::= .unreq registername
	///
	/// Erases the lower-cased identifier from RegisterReqs and requires the
	/// statement to end right after it.
	/// \returns true if a diagnostic was emitted, false on success.
	bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
	  const AsmToken &AliasTok = getParser().getTok();
	  if (AliasTok.isNot(AsmToken::Identifier))
	    return TokError("unexpected input in .unreq directive.");

	  RegisterReqs.erase(AliasTok.getIdentifier().lower());
	  getParser().Lex(); // Eat the identifier.

	  if (!parseToken(AsmToken::EndOfStatement))
	    return false;
	  return addErrorSuffix("in '.unreq' directive");
	}

	/// parseDirectiveCFINegateRAState
	///
	/// The directive takes no operands; after the end of the statement has been
	/// consumed, EmitCFINegateRAState() is called on the streamer.
	/// \returns true if a diagnostic was emitted, false on success.
	bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
	  const bool HasTrailingTokens =
	      parseToken(AsmToken::EndOfStatement, "unexpected token in directive");
	  if (HasTrailingTokens)
	    return true;

	  getStreamer().EmitCFINegateRAState();
	  return false;
	}

	/// parseDirectiveCFIBKeyFrame
	///  ::= .cfi_b_key_frame
	///
	/// The directive takes no operands; after the end of the statement has been
	/// consumed, EmitCFIBKeyFrame() is called on the streamer.
	/// \returns true if a diagnostic was emitted, false on success.
	bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
	  const bool Failed = parseToken(AsmToken::EndOfStatement,
	                                 "unexpected token in '.cfi_b_key_frame'");
	  if (!Failed)
	    getStreamer().EmitCFIBKeyFrame();
	  return Failed;
	}

	/// Classify a symbol-reference expression.
	///
	/// The out-parameters are first reset (ELFRefKind = VK_INVALID,
	/// DarwinRefKind = VK_None, Addend = 0). An outer AArch64MCExpr, if present,
	/// supplies ELFRefKind and is peeled off. What remains is accepted when it is
	/// either a bare MCSymbolRefExpr, or evaluates as relocatable to
	/// "symbol + constant" with no second (subtracted) symbol.
	///
	/// \param Expr          Expression to classify.
	/// \param ELFRefKind    Receives the AArch64MCExpr kind, if any.
	/// \param DarwinRefKind Receives the MCSymbolRefExpr kind, if any.
	/// \param Addend        Receives the constant addend.
	/// \returns true if the expression is an acceptable symbol reference; false
	///          otherwise, including when both an ELF and a Darwin kind are
	///          present on a "symbol + constant" expression.
	bool
	AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
	                                    AArch64MCExpr::VariantKind &ELFRefKind,
	                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
	                                    int64_t &Addend) {
	  ELFRefKind = AArch64MCExpr::VK_INVALID;
	  DarwinRefKind = MCSymbolRefExpr::VK_None;
	  Addend = 0;

	  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
	    ELFRefKind = AE->getKind();
	    Expr = AE->getSubExpr();
	  }

	  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
	  if (SE) {
	    // It's a simple symbol reference with no addend.
	    DarwinRefKind = SE->getKind();
	    return true;
	  }

	  // Check that it looks like a symbol + an addend
	  MCValue Res;
	  bool Relocatable = Expr->evaluateAsRelocatable(Res, nullptr, nullptr);
	  if (!Relocatable || Res.getSymB())
	    return false;

	  // Treat expressions with an ELFRefKind (like ":abs_g1:3", or
	  // ":abs_g1:x" where x is constant) as symbolic even if there is no symbol.
	  if (!Res.getSymA() && ELFRefKind == AArch64MCExpr::VK_INVALID)
	    return false;

	  if (Res.getSymA())
	    DarwinRefKind = Res.getSymA()->getKind();
	  Addend = Res.getConstant();

	  // It's some symbol reference + a constant addend, but really
	  // shouldn't use both Darwin and ELF syntax.
	  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
	         DarwinRefKind == MCSymbolRefExpr::VK_None;
	}

	/// Force static initialization.
	extern "C" void LLVMInitializeAArch64AsmParser() {
	RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
	RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
	RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
	RegisterMCAsmParser<AArch64AsmParser> W(getTheARM64_32Target());
	RegisterMCAsmParser<AArch64AsmParser> V(getTheAArch64_32Target());
	}

	#define GET_REGISTER_MATCHER
	#define GET_SUBTARGET_FEATURE_NAME
	#define GET_MATCHER_IMPLEMENTATION
	#define GET_MNEMONIC_SPELL_CHECKER
	#include "AArch64GenAsmMatcher.inc"

	// This matcher hook has to come after the auto-generated include above,
	// because it relies on the match class enum declared there.
	//
	// For InstAliases that spell a fixed immediate in their syntax, the match
	// class is a literal-immediate token (MCK__35_<n>). The operand matches when
	// it is a constant immediate equal to <n>; every other kind or operand is
	// reported as Match_InvalidOperand.
	unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
	                                                      unsigned Kind) {
	  // Map the token class to the literal it stands for.
	  int64_t Literal;
	  switch (Kind) {
	  case MCK__35_0:  Literal = 0;  break;
	  case MCK__35_1:  Literal = 1;  break;
	  case MCK__35_2:  Literal = 2;  break;
	  case MCK__35_3:  Literal = 3;  break;
	  case MCK__35_4:  Literal = 4;  break;
	  case MCK__35_6:  Literal = 6;  break;
	  case MCK__35_8:  Literal = 8;  break;
	  case MCK__35_12: Literal = 12; break;
	  case MCK__35_16: Literal = 16; break;
	  case MCK__35_24: Literal = 24; break;
	  case MCK__35_32: Literal = 32; break;
	  case MCK__35_48: Literal = 48; break;
	  case MCK__35_64: Literal = 64; break;
	  default:
	    return Match_InvalidOperand;
	  }

	  AArch64Operand &Operand = static_cast<AArch64Operand &>(AsmOp);
	  if (Operand.isImm())
	    if (const MCConstantExpr *Const =
	            dyn_cast<MCConstantExpr>(Operand.getImm()))
	      if (Const->getValue() == Literal)
	        return Match_Success;

	  return Match_InvalidOperand;
	}

	/// Parse "<even reg>, <odd reg>" as one sequential register-pair operand.
	///
	/// The first register must be in GPR32 or GPR64 and have an even encoding;
	/// the second must be in the same class and have the next encoding. On
	/// success the matching super-register from WSeqPairsClass/XSeqPairsClass is
	/// pushed onto \p Operands as a scalar register operand.
	///
	/// \returns MatchOperand_Success, or MatchOperand_ParseFail when either
	///          register or the comma between them is missing or the pair is
	///          not a valid even/odd pair.
	OperandMatchResultTy
	AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) {

	  SMLoc S = getLoc();

	  if (getParser().getTok().isNot(AsmToken::Identifier)) {
	    Error(S, "expected register");
	    return MatchOperand_ParseFail;
	  }

	  unsigned FirstReg;
	  OperandMatchResultTy Res = tryParseScalarRegister(FirstReg);
	  if (Res != MatchOperand_Success)
	    return MatchOperand_ParseFail;

	  const MCRegisterClass &WRegClass =
	      AArch64MCRegisterClasses[AArch64::GPR32RegClassID];
	  const MCRegisterClass &XRegClass =
	      AArch64MCRegisterClasses[AArch64::GPR64RegClassID];

	  bool isXReg = XRegClass.contains(FirstReg),
	       isWReg = WRegClass.contains(FirstReg);
	  if (!isXReg && !isWReg) {
	    Error(S, "expected first even register of a "
	             "consecutive same-size even/odd register pair");
	    return MatchOperand_ParseFail;
	  }

	  const MCRegisterInfo *RI = getContext().getRegisterInfo();
	  unsigned FirstEncoding = RI->getEncodingValue(FirstReg);

	  // The pair has to start on an even-encoded register.
	  if (FirstEncoding & 0x1) {
	    Error(S, "expected first even register of a "
	             "consecutive same-size even/odd register pair");
	    return MatchOperand_ParseFail;
	  }

	  if (getParser().getTok().isNot(AsmToken::Comma)) {
	    Error(getLoc(), "expected comma");
	    return MatchOperand_ParseFail;
	  }
	  // Eat the comma
	  getParser().Lex();

	  SMLoc E = getLoc();
	  unsigned SecondReg;
	  Res = tryParseScalarRegister(SecondReg);
	  if (Res != MatchOperand_Success)
	    return MatchOperand_ParseFail;

	  // The second register must directly follow the first one and come from
	  // the same register class.
	  if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 ||
	      (isXReg && !XRegClass.contains(SecondReg)) ||
	      (isWReg && !WRegClass.contains(SecondReg))) {
	    Error(E,"expected second odd register of a "
	            "consecutive same-size even/odd register pair");
	    return MatchOperand_ParseFail;
	  }

	  unsigned Pair = 0;
	  if (isXReg) {
	    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64,
	           &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]);
	  } else {
	    Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube32,
	           &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]);
	  }

	  Operands.push_back(AArch64Operand::CreateReg(Pair, RegKind::Scalar, S,
	                                               getLoc(), getContext()));

	  return MatchOperand_Success;
	}

	/// Parse an SVE data vector register operand.
	///
	/// \tparam ParseShiftExtend When true, a comma after the register introduces
	///         a shift/extend that is folded into the register operand.
	/// \tparam ParseSuffix When true, a register written without a type suffix
	///         is not matched.
	///
	/// Without a shift/extend, the register operand is pushed and an optional
	/// vector index is parsed after it.
	///
	/// \returns the result of the underlying register/shift-extend parse when
	///          that is not a success, MatchOperand_NoMatch when the suffix is
	///          missing or not a known vector kind, MatchOperand_ParseFail when
	///          the vector index fails to parse, else MatchOperand_Success.
	template <bool ParseShiftExtend, bool ParseSuffix>
	OperandMatchResultTy
	AArch64AsmParser::tryParseSVEDataVector(OperandVector &Operands) {
	  const SMLoc S = getLoc();
	  // Check for a SVE vector register specifier first.
	  unsigned RegNum;
	  StringRef Kind;

	  OperandMatchResultTy Res =
	      tryParseVectorRegister(RegNum, Kind, RegKind::SVEDataVector);

	  if (Res != MatchOperand_Success)
	    return Res;

	  if (ParseSuffix && Kind.empty())
	    return MatchOperand_NoMatch;

	  const auto &KindRes = parseVectorKind(Kind, RegKind::SVEDataVector);
	  if (!KindRes)
	    return MatchOperand_NoMatch;

	  unsigned ElementWidth = KindRes->second;

	  // No shift/extend is the default.
	  if (!ParseShiftExtend || getParser().getTok().isNot(AsmToken::Comma)) {
	    Operands.push_back(AArch64Operand::CreateVectorReg(
	        RegNum, RegKind::SVEDataVector, ElementWidth, S, S, getContext()));

	    // Only a hard failure is propagated; the result is otherwise a success.
	    OperandMatchResultTy IndexRes = tryParseVectorIndex(Operands);
	    if (IndexRes == MatchOperand_ParseFail)
	      return MatchOperand_ParseFail;
	    return MatchOperand_Success;
	  }

	  // Eat the comma
	  getParser().Lex();

	  // Match the shift
	  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> ExtOpnd;
	  Res = tryParseOptionalShiftExtend(ExtOpnd);
	  if (Res != MatchOperand_Success)
	    return Res;

	  // Fold the parsed shift/extend into the register operand itself.
	  auto Ext = static_cast<AArch64Operand *>(ExtOpnd.back().get());
	  Operands.push_back(AArch64Operand::CreateVectorReg(
	      RegNum, RegKind::SVEDataVector, ElementWidth, S, Ext->getEndLoc(),
	      getContext(), Ext->getShiftExtendType(), Ext->getShiftExtendAmount(),
	      Ext->hasShiftExtendAmount()));

	  return MatchOperand_Success;
	}

	/// Parse an SVE predicate pattern operand.
	///
	/// The pattern is written either as '#' followed by a constant expression,
	/// or as a name known to AArch64SVEPredPattern::lookupSVEPREDPATByName. In
	/// both cases a constant immediate operand is pushed onto \p Operands.
	///
	/// \returns MatchOperand_NoMatch when the current token is neither '#' nor a
	///          known pattern name, MatchOperand_ParseFail when the expression
	///          after '#' fails to parse or is not constant, else
	///          MatchOperand_Success.
	OperandMatchResultTy
	AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) {
	  MCAsmParser &P = getParser();
	  SMLoc StartLoc = getLoc();
	  const AsmToken &Tok = P.getTok();

	  int64_t Encoding;
	  if (Tok.is(AsmToken::Hash)) {
	    P.Lex(); // Eat hash

	    // Parse the immediate operand.
	    const MCExpr *Value;
	    StartLoc = getLoc();
	    if (P.parseExpression(Value))
	      return MatchOperand_ParseFail;

	    auto *Const = dyn_cast<MCConstantExpr>(Value);
	    if (!Const)
	      return MatchOperand_ParseFail;

	    Encoding = Const->getValue();
	  } else if (Tok.is(AsmToken::Identifier)) {
	    // Parse the pattern
	    auto Named = AArch64SVEPredPattern::lookupSVEPREDPATByName(Tok.getString());
	    if (!Named)
	      return MatchOperand_NoMatch;

	    P.Lex();
	    Encoding = Named->Encoding;
	    assert(Encoding >= 0 && Encoding < 32);
	  } else {
	    return MatchOperand_NoMatch;
	  }

	  const MCExpr *PatternExpr = MCConstantExpr::create(Encoding, getContext());
	  Operands.push_back(
	      AArch64Operand::CreateImm(PatternExpr, StartLoc, getLoc(), getContext()));

	  return MatchOperand_Success;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AArch64/SVEInstrFormats.td (revision 351303)
	@@ -1,5716 +1,5833 @@
	//=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -- tablegen ---=//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// AArch64 Scalable Vector Extension (SVE) Instruction Class Definitions.
	//
	//===----------------------------------------------------------------------===//

	// Assembler operand class "SVEPattern": names the parser, predicate and
	// render methods and the diagnostic type used for this operand.
	def SVEPatternOperand : AsmOperandClass {
	let Name = "SVEPattern";
	let ParserMethod = "tryParseSVEPattern";
	let PredicateMethod = "isSVEPattern";
	let RenderMethod = "addImmOperands";
	let DiagnosticType = "InvalidSVEPattern";
	}

	// i32 immediate operand matched through SVEPatternOperand. The ImmLeaf
	// accepts values whose unsigned interpretation is below 32.
	def sve_pred_enum : Operand<i32>, ImmLeaf<i32, [{
	return (((uint32_t)Imm) < 32);
	}]> {

	let PrintMethod = "printSVEPattern";
	let ParserMatchClass = SVEPatternOperand;
	}

	// Assembler operand class "SVEPrefetch": parsed by tryParsePrefetch<true>,
	// checked by isPrefetch and rendered by addPrefetchOperands.
	def SVEPrefetchOperand : AsmOperandClass {
	let Name = "SVEPrefetch";
	let ParserMethod = "tryParsePrefetch<true>";
	let PredicateMethod = "isPrefetch";
	let RenderMethod = "addPrefetchOperands";
	}

	// i32 immediate operand matched through SVEPrefetchOperand. The ImmLeaf
	// accepts values whose unsigned interpretation is at most 15.
	def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
	return (((uint32_t)Imm) <= 15);
	}]> {
	let PrintMethod = "printPrefetchOp<true>";
	let ParserMatchClass = SVEPrefetchOperand;
	}

	// Assembler operand class for a logical immediate of the given element
	// Width. The predicate and render methods are instantiated with the
	// matching int<Width>_t type.
	class SVELogicalImmOperand<int Width> : AsmOperandClass {
	let Name = "SVELogicalImm" # Width;
	let DiagnosticType = "LogicalSecondSource";
	let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
	let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
	}

	// i64 logical-immediate operand for 8-bit elements. The MC operand predicate
	// decodes the immediate as a 64-bit logical immediate and accepts it when
	// isSVEMaskOfIdenticalElements<int8_t> holds for the decoded value.
	def sve_logical_imm8 : Operand<i64> {
	let ParserMatchClass = SVELogicalImmOperand<8>;
	let PrintMethod = "printLogicalImm<int8_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int8_t>(Val);
	}];
	}

	// i64 logical-immediate operand for 16-bit elements. The MC operand
	// predicate decodes the immediate as a 64-bit logical immediate and accepts
	// it when isSVEMaskOfIdenticalElements<int16_t> holds for the decoded value.
	def sve_logical_imm16 : Operand<i64> {
	let ParserMatchClass = SVELogicalImmOperand<16>;
	let PrintMethod = "printLogicalImm<int16_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val);
	}];
	}

	// i64 logical-immediate operand for 32-bit elements. The MC operand
	// predicate decodes the immediate as a 64-bit logical immediate and accepts
	// it when isSVEMaskOfIdenticalElements<int32_t> holds for the decoded value.
	def sve_logical_imm32 : Operand<i64> {
	let ParserMatchClass = SVELogicalImmOperand<32>;
	let PrintMethod = "printLogicalImm<int32_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val);
	}];
	}

	// Assembler operand class for a "preferred" logical immediate of the given
	// element Width; checked by isSVEPreferredLogicalImm<int<Width>_t> and
	// rendered by addLogicalImmOperands<int<Width>_t>.
	class SVEPreferredLogicalImmOperand<int Width> : AsmOperandClass {
	let Name = "SVEPreferredLogicalImm" # Width;
	let PredicateMethod = "isSVEPreferredLogicalImm<int" # Width # "_t>";
	let RenderMethod = "addLogicalImmOperands<int" # Width # "_t>";
	}

	// i64 preferred-logical-immediate operand for 16-bit elements. The decoded
	// value must satisfy both isSVEMaskOfIdenticalElements<int16_t> and
	// isSVEMoveMaskPreferredLogicalImmediate.
	def sve_preferred_logical_imm16 : Operand<i64> {
	let ParserMatchClass = SVEPreferredLogicalImmOperand<16>;
	let PrintMethod = "printSVELogicalImm<int16_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int16_t>(Val) &&
	AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
	}];
	}

	// i64 preferred-logical-immediate operand for 32-bit elements. The decoded
	// value must satisfy both isSVEMaskOfIdenticalElements<int32_t> and
	// isSVEMoveMaskPreferredLogicalImmediate.
	def sve_preferred_logical_imm32 : Operand<i64> {
	let ParserMatchClass = SVEPreferredLogicalImmOperand<32>;
	let PrintMethod = "printSVELogicalImm<int32_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int32_t>(Val) &&
	AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
	}];
	}

	// i64 preferred-logical-immediate operand for 64-bit elements. The decoded
	// value must satisfy both isSVEMaskOfIdenticalElements<int64_t> and
	// isSVEMoveMaskPreferredLogicalImmediate.
	def sve_preferred_logical_imm64 : Operand<i64> {
	let ParserMatchClass = SVEPreferredLogicalImmOperand<64>;
	let PrintMethod = "printSVELogicalImm<int64_t>";

	let MCOperandPredicate = [{
	if (!MCOp.isImm())
	return false;
	int64_t Val = AArch64_AM::decodeLogicalImmediate(MCOp.getImm(), 64);
	return AArch64_AM::isSVEMaskOfIdenticalElements<int64_t>(Val) &&
	AArch64_AM::isSVEMoveMaskPreferredLogicalImmediate(Val);
	}];
	}

	// "Not" variant of the logical-immediate operand class: same predicate as
	// SVELogicalImmOperand (isLogicalImm<int<Width>_t>), but rendered through
	// addLogicalImmNotOperands<int<Width>_t>.
	class SVELogicalImmNotOperand<int Width> : AsmOperandClass {
	let Name = "SVELogicalImm" # Width # "Not";
	let DiagnosticType = "LogicalSecondSource";
	let PredicateMethod = "isLogicalImm<int" # Width # "_t>";
	let RenderMethod = "addLogicalImmNotOperands<int" # Width # "_t>";
	}

	// i64 operands that use the "Not" logical-immediate match classes for 8-,
	// 16- and 32-bit elements.
	def sve_logical_imm8_not : Operand<i64> {
	let ParserMatchClass = SVELogicalImmNotOperand<8>;
	}

	def sve_logical_imm16_not : Operand<i64> {
	let ParserMatchClass = SVELogicalImmNotOperand<16>;
	}

	def sve_logical_imm32_not : Operand<i64> {
	let ParserMatchClass = SVELogicalImmNotOperand<32>;
	}

	// Assembler operand class for an immediate with an optional shift. The
	// class name is built from Infix and ElementWidth, the diagnostic type is
	// "Invalid" # Name, and the predicate method is supplied by the caller.
	class SVEShiftedImmOperand<int ElementWidth, string Infix, string Predicate>
	: AsmOperandClass {
	let Name = "SVE" # Infix # "Imm" # ElementWidth;
	let DiagnosticType = "Invalid" # Name;
	let RenderMethod = "addImmWithOptionalShiftOperands<8>";
	let ParserMethod = "tryParseImmWithOptionalShift";
	let PredicateMethod = Predicate;
	}

	// Shifted-immediate operand classes for the "Cpy" and "AddSub" families, one
	// per element width, each checked by the matching isSVECpyImm/isSVEAddSubImm
	// instantiation.
	def SVECpyImmOperand8 : SVEShiftedImmOperand<8, "Cpy", "isSVECpyImm<int8_t>">;
	def SVECpyImmOperand16 : SVEShiftedImmOperand<16, "Cpy", "isSVECpyImm<int16_t>">;
	def SVECpyImmOperand32 : SVEShiftedImmOperand<32, "Cpy", "isSVECpyImm<int32_t>">;
	def SVECpyImmOperand64 : SVEShiftedImmOperand<64, "Cpy", "isSVECpyImm<int64_t>">;

	def SVEAddSubImmOperand8 : SVEShiftedImmOperand<8, "AddSub", "isSVEAddSubImm<int8_t>">;
	def SVEAddSubImmOperand16 : SVEShiftedImmOperand<16, "AddSub", "isSVEAddSubImm<int16_t>">;
	def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<int32_t>">;
	def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;

	// Operand/ImmLeaf for an immediate with optional shift. ElementWidth selects
	// the decoder instantiation, printType the printer instantiation, OpndClass
	// the assembler match class and Predicate the ImmLeaf check. MIOperandInfo
	// declares two i32imm sub-operands.
	class imm8_opt_lsl<int ElementWidth, string printType,
	AsmOperandClass OpndClass, code Predicate>
	: Operand<i32>, ImmLeaf<i32, Predicate> {
	let EncoderMethod = "getImm8OptLsl";
	let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
	let PrintMethod = "printImm8OptLsl<" # printType # ">";
	let ParserMatchClass = OpndClass;
	let MIOperandInfo = (ops i32imm, i32imm);
	}

	// "Cpy" immediates per element width: printed with the signed intN_t type
	// and accepted when AArch64_AM::isSVECpyImm<intN_t> holds.
	def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
	return AArch64_AM::isSVECpyImm<int8_t>(Imm);
	}]>;
	def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
	return AArch64_AM::isSVECpyImm<int16_t>(Imm);
	}]>;
	def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
	return AArch64_AM::isSVECpyImm<int32_t>(Imm);
	}]>;
	def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
	return AArch64_AM::isSVECpyImm<int64_t>(Imm);
	}]>;

	// "AddSub" immediates per element width: printed with the unsigned uintN_t
	// type, while the ImmLeaf check uses AArch64_AM::isSVEAddSubImm<intN_t>.
	def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
	return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
	}]>;
	def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
	return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
	}]>;
	def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
	return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
	}]>;
	def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
	return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
	}]>;

	// Assembler operand class for a floating-point immediate restricted to the
	// two values ValA and ValB, which are passed as template arguments to the
	// predicate and render methods.
	class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
	let Name = "SVEExactFPImmOperand" # Suffix;
	let DiagnosticType = "Invalid" # Name;
	let ParserMethod = "tryParseFPImm<false>";
	let PredicateMethod = "isExactFPImm<" # ValA # ", " # ValB # ">";
	let RenderMethod = "addExactFPImmOperands<" # ValA # ", " # ValB # ">";
	}

	// i32 operand wrapping SVEExactFPImm with the same Suffix/ValA/ValB; the
	// printer is instantiated with ValA and ValB as well.
	class SVEExactFPImmOperand<string Suffix, string ValA, string ValB> : Operand<i32> {
	let PrintMethod = "printExactFPImm<" # ValA # ", " # ValB # ">";
	let ParserMatchClass = SVEExactFPImm<Suffix, ValA, ValB>;
	}

	// Exact FP immediate operands for the value pairs half/one, half/two and
	// zero/one.
	def sve_fpimm_half_one
	: SVEExactFPImmOperand<"HalfOne", "AArch64ExactFPImm::half",
	"AArch64ExactFPImm::one">;
	def sve_fpimm_half_two
	: SVEExactFPImmOperand<"HalfTwo", "AArch64ExactFPImm::half",
	"AArch64ExactFPImm::two">;
	def sve_fpimm_zero_one
	: SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero",
	"AArch64ExactFPImm::one">;

	// i32 multiplier immediate: the ImmLeaf accepts unsigned values 1..16. It is
	// matched with Imm1_16Operand and uses dedicated encoder/decoder methods.
	def sve_incdec_imm : Operand<i32>, ImmLeaf<i32, [{
	return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
	}]> {
	let ParserMatchClass = Imm1_16Operand;
	let EncoderMethod = "getSVEIncDecImm";
	let DecoderMethod = "DecodeSVEIncDecImm";
	}

	//===----------------------------------------------------------------------===//
	// SVE PTrue - These are used extensively throughout the pattern matching so
	// it's important we define them first.
	//===----------------------------------------------------------------------===//

	// Instruction class with a predicate output $Pd and an sve_pred_enum
	// $pattern input. sz8_64 goes to bits 23-22, opc is split across bits 18-16,
	// pattern to bits 9-5 and Pd to bits 3-0. NZCV is listed in Defs only when
	// opc{0} is 1.
	class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty>
	: I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern),
	asm, "\t$Pd, $pattern",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<5> pattern;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b011;
	let Inst{18-17} = opc{2-1};
	let Inst{16} = opc{0};
	let Inst{15-10} = 0b111000;
	let Inst{9-5} = pattern;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pd;

	let Defs = !if(!eq (opc{0}, 1), [NZCV], []);
	}

	// Instantiates the _B/_H/_S/_D forms and, for each, an alias that takes
	// only $Pd and supplies 0b11111 as the pattern.
	multiclass sve_int_ptrue<bits<3> opc, string asm> {
	def _B : sve_int_ptrue<0b00, opc, asm, PPR8>;
	def _H : sve_int_ptrue<0b01, opc, asm, PPR16>;
	def _S : sve_int_ptrue<0b10, opc, asm, PPR32>;
	def _D : sve_int_ptrue<0b11, opc, asm, PPR64>;

	def : InstAlias<asm # "\t$Pd",
	(!cast<Instruction>(NAME # _B) PPR8:$Pd, 0b11111), 1>;
	def : InstAlias<asm # "\t$Pd",
	(!cast<Instruction>(NAME # _H) PPR16:$Pd, 0b11111), 1>;
	def : InstAlias<asm # "\t$Pd",
	(!cast<Instruction>(NAME # _S) PPR32:$Pd, 0b11111), 1>;
	def : InstAlias<asm # "\t$Pd",
	(!cast<Instruction>(NAME # _D) PPR64:$Pd, 0b11111), 1>;
	}

	// PTRUE (opc 0b000) and PTRUES (opc 0b001), both guarded by HasSVE.
	let Predicates = [HasSVE] in {
	defm PTRUE : sve_int_ptrue<0b000, "ptrue">;
	defm PTRUES : sve_int_ptrue<0b001, "ptrues">;
	}


	//===----------------------------------------------------------------------===//
	// SVE Predicate Misc Group
	//===----------------------------------------------------------------------===//

	// Instruction class with a single PPR8 output $Pd and no inputs. The 6-bit
	// opc is scattered over bits 23-22, 18-16 and 9; Pd sits in bits 3-0.
	class sve_int_pfalse<bits<6> opc, string asm>
	: I<(outs PPR8:$Pd), (ins),
	asm, "\t$Pd",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = opc{5-4};
	let Inst{21-19} = 0b011;
	let Inst{18-16} = opc{3-1};
	let Inst{15-10} = 0b111001;
	let Inst{9} = opc{0};
	let Inst{8-4} = 0b00000;
	let Inst{3-0} = Pd;
	}

	// Instruction class with no outputs and two predicate inputs ($Pg in bits
	// 13-10, $Pn in bits 8-5). It lists NZCV in Defs.
	class sve_int_ptest<bits<6> opc, string asm>
	: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
	asm, "\t$Pg, $Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pg;
	bits<4> Pn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = opc{5-4};
	let Inst{21-19} = 0b010;
	let Inst{18-16} = opc{3-1};
	let Inst{15-14} = 0b11;
	let Inst{13-10} = Pg;
	let Inst{9} = opc{0};
	let Inst{8-5} = Pn;
	let Inst{4-0} = 0b00000;

	let Defs = [NZCV];
	}

	// Instruction class whose predicate output $Pdn is tied to the input $_Pdn
	// (see Constraints); $Pg is encoded in bits 8-5 and Pdn in bits 3-0. It
	// lists NZCV in Defs.
	class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm,
	PPRRegOp pprty>
	: I<(outs pprty:$Pdn), (ins PPRAny:$Pg, pprty:$_Pdn),
	asm, "\t$Pdn, $Pg, $_Pdn",
	"",
	[]>, Sched<[]> {
	bits<4> Pdn;
	bits<4> Pg;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b011;
	let Inst{18-16} = opc{4-2};
	let Inst{15-11} = 0b11000;
	let Inst{10-9} = opc{1-0};
	let Inst{8-5} = Pg;
	let Inst{4} = 0;
	let Inst{3-0} = Pdn;

	let Constraints = "$Pdn = $_Pdn";
	let Defs = [NZCV];
	}

	// Single unsuffixed instantiation: PPR8 operands with the size field fixed
	// to 0b01.
	multiclass sve_int_pfirst<bits<5> opc, string asm> {
	def : sve_int_pfirst_next<0b01, opc, asm, PPR8>;
	}

	// One instantiation per element size (_B/_H/_S/_D) with the matching
	// PPR8..PPR64 operand class.
	multiclass sve_int_pnext<bits<5> opc, string asm> {
	def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>;
	def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>;
	def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>;
	def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Predicate Count Group
	//===----------------------------------------------------------------------===//

	// Instruction class with a scalar destination $Rdn tied to the source
	// $_Rdn (see Constraints) and a predicate $Pg in bits 8-5. The AsmString is
	// overridden below so that the source register is printed only when
	// opc{4,2-0} is 0b0000.
	class sve_int_count_r<bits<2> sz8_64, bits<5> opc, string asm,
	RegisterOperand dty, PPRRegOp pprty, RegisterOperand sty>
	: I<(outs dty:$Rdn), (ins pprty:$Pg, sty:$_Rdn),
	asm, "\t$Rdn, $Pg",
	"",
	[]>, Sched<[]> {
	bits<5> Rdn;
	bits<4> Pg;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b101;
	let Inst{18-16} = opc{4-2};
	let Inst{15-11} = 0b10001;
	let Inst{10-9} = opc{1-0};
	let Inst{8-5} = Pg;
	let Inst{4-0} = Rdn;

	// Signed 32bit forms require their GPR operand printed.
	let AsmString = !if(!eq(opc{4,2-0}, 0b0000),
	!strconcat(asm, "\t$Rdn, $Pg, $_Rdn"),
	!strconcat(asm, "\t$Rdn, $Pg"));
	let Constraints = "$Rdn = $_Rdn";
	}

	// _B/_H/_S/_D forms with a GPR64z destination and a GPR64as32 source.
	multiclass sve_int_count_r_s32<bits<5> opc, string asm> {
	def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>;
	def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>;
	def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>;
	def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>;
	}

	// _B/_H/_S/_D forms with GPR32z as both destination and source.
	multiclass sve_int_count_r_u32<bits<5> opc, string asm> {
	def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>;
	def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>;
	def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>;
	def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>;
	}

	// _B/_H/_S/_D forms with GPR64z as both destination and source.
	multiclass sve_int_count_r_x64<bits<5> opc, string asm> {
	def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>;
	def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>;
	def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>;
	def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>;
	}

	// Vector destination $Zdn tied to $_Zdn with a predicate in bits 8-5.
	// Diff hunk: the predicate operand changes from PPRAny:$Pg to pprty:$Pm,
	// where pprty is a newly added PPRRegOp template parameter.
	class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
	- ZPRRegOp zprty>
	-: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg),
	- asm, "\t$Zdn, $Pg",
	+ ZPRRegOp zprty, PPRRegOp pprty>
	+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm),
	+ asm, "\t$Zdn, $Pm",
	"",
	[]>, Sched<[]> {
	- bits<4> Pg;
	+ bits<4> Pm;
	bits<5> Zdn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b101;
	let Inst{18-16} = opc{4-2};
	let Inst{15-11} = 0b10000;
	let Inst{10-9} = opc{1-0};
	- let Inst{8-5} = Pg;
	+ let Inst{8-5} = Pm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _H/_S/_D forms (there is no _B form here).
	// Diff hunk: each instantiation now passes the matching PPR16/PPR32/PPR64
	// class, and one InstAlias per form is added whose result takes PPRAny:$Pm.
	multiclass sve_int_count_v<bits<5> opc, string asm> {
	- def _H : sve_int_count_v<0b01, opc, asm, ZPR16>;
	- def _S : sve_int_count_v<0b10, opc, asm, ZPR32>;
	- def _D : sve_int_count_v<0b11, opc, asm, ZPR64>;
	+ def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>;
	+ def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>;
	+ def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>;
	+
	+ def : InstAlias<asm # "\t$Zdn, $Pm",
	+ (!cast<Instruction>(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>;
	+ def : InstAlias<asm # "\t$Zdn, $Pm",
	+ (!cast<Instruction>(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>;
	+ def : InstAlias<asm # "\t$Zdn, $Pm",
	+ (!cast<Instruction>(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>;
	}

	// Instruction class with a GPR64 output $Rd (bits 4-0) and two predicate
	// inputs: $Pg in bits 13-10 and $Pn in bits 8-5.
	class sve_int_pcount_pred<bits<2> sz8_64, bits<4> opc, string asm,
	PPRRegOp pprty>
	: I<(outs GPR64:$Rd), (ins PPRAny:$Pg, pprty:$Pn),
	asm, "\t$Rd, $Pg, $Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pg;
	bits<4> Pn;
	bits<5> Rd;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b100;
	let Inst{18-16} = opc{3-1};
	let Inst{15-14} = 0b10;
	let Inst{13-10} = Pg;
	let Inst{9} = opc{0};
	let Inst{8-5} = Pn;
	let Inst{4-0} = Rd;
	}

	// One instantiation per element size (_B/_H/_S/_D) with the matching
	// PPR8..PPR64 class for $Pn.
	multiclass sve_int_pcount_pred<bits<4> opc, string asm> {
	def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>;
	def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>;
	def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>;
	def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Element Count Group
	//===----------------------------------------------------------------------===//

	// Instruction class with a GPR64 output $Rd and two immediates: the
	// pattern (bits 9-5) and the multiplier imm4 (bits 19-16), written as
	// "$Rd, $pattern, mul $imm4".
	class sve_int_count<bits<3> opc, string asm>
	: I<(outs GPR64:$Rd), (ins sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
	asm, "\t$Rd, $pattern, mul $imm4",
	"",
	[]>, Sched<[]> {
	bits<5> Rd;
	bits<4> imm4;
	bits<5> pattern;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{2-1};
	let Inst{21-20} = 0b10;
	let Inst{19-16} = imm4;
	let Inst{15-11} = 0b11100;
	let Inst{10} = opc{0};
	let Inst{9-5} = pattern;
	let Inst{4-0} = Rd;
	}

	// Defines the instruction plus two shorter aliases: one that omits the
	// multiplier (supplying 1) and one that also omits the pattern (supplying
	// 0b11111).
	multiclass sve_int_count<bits<3> opc, string asm> {
	def NAME : sve_int_count<opc, asm>;

	def : InstAlias<asm # "\t$Rd, $pattern",
	(!cast<Instruction>(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Rd",
	(!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>;
	}

	// Vector form: $Zdn is tied to $_Zdn (see Constraints) and takes a pattern
	// (bits 9-5) and a multiplier imm4 (bits 19-16). Marked Destructive with
	// ElementSizeNone.
	class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
	asm, "\t$Zdn, $pattern, mul $imm4",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> pattern;
	bits<4> imm4;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{4-3};
	let Inst{21} = 0b1;
	let Inst{20} = opc{2};
	let Inst{19-16} = imm4;
	let Inst{15-12} = 0b1100;
	let Inst{11-10} = opc{1-0};
	let Inst{9-5} = pattern;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the instruction plus two shorter aliases: one that omits the
	// multiplier (supplying 1) and one that also omits the pattern (supplying
	// 0b11111).
	multiclass sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty> {
	def NAME : sve_int_countvlv<opc, asm, zprty>;

	def : InstAlias<asm # "\t$Zdn, $pattern",
	(!cast<Instruction>(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Zdn",
	(!cast<Instruction>(NAME) zprty:$Zdn, 0b11111, 1), 2>;
	}

	// GPR64 form: $Rdn is tied to $_Rdn (see Constraints) and takes a pattern
	// (bits 9-5) and a multiplier imm4 (bits 19-16).
	class sve_int_pred_pattern_a<bits<3> opc, string asm>
	: I<(outs GPR64:$Rdn), (ins GPR64:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
	asm, "\t$Rdn, $pattern, mul $imm4",
	"",
	[]>, Sched<[]> {
	bits<5> Rdn;
	bits<5> pattern;
	bits<4> imm4;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{2-1};
	let Inst{21-20} = 0b11;
	let Inst{19-16} = imm4;
	let Inst{15-11} = 0b11100;
	let Inst{10} = opc{0};
	let Inst{9-5} = pattern;
	let Inst{4-0} = Rdn;

	let Constraints = "$Rdn = $_Rdn";
	}

	// Defines the instruction plus two shorter aliases: one that omits the
	// multiplier (supplying 1) and one that also omits the pattern (supplying
	// 0b11111).
	multiclass sve_int_pred_pattern_a<bits<3> opc, string asm> {
	def NAME : sve_int_pred_pattern_a<opc, asm>;

	def : InstAlias<asm # "\t$Rdn, $pattern",
	(!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Rdn",
	(!cast<Instruction>(NAME) GPR64:$Rdn, 0b11111, 1), 2>;
	}

	// Scalar form with separately typed destination (dt) and source (st);
	// $Rdn is tied to $_Rdn (see Constraints). The AsmString is overridden
	// below so that the source register is printed only when opc{2,0} is 0b00.
	class sve_int_pred_pattern_b<bits<5> opc, string asm, RegisterOperand dt,
	RegisterOperand st>
	: I<(outs dt:$Rdn), (ins st:$_Rdn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4),
	asm, "\t$Rdn, $pattern, mul $imm4",
	"",
	[]>, Sched<[]> {
	bits<5> Rdn;
	bits<5> pattern;
	bits<4> imm4;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{4-3};
	let Inst{21} = 0b1;
	let Inst{20} = opc{2};
	let Inst{19-16} = imm4;
	let Inst{15-12} = 0b1111;
	let Inst{11-10} = opc{1-0};
	let Inst{9-5} = pattern;
	let Inst{4-0} = Rdn;

	// Signed 32bit forms require their GPR operand printed.
	let AsmString = !if(!eq(opc{2,0}, 0b00),
	!strconcat(asm, "\t$Rdn, $_Rdn, $pattern, mul $imm4"),
	!strconcat(asm, "\t$Rdn, $pattern, mul $imm4"));

	let Constraints = "$Rdn = $_Rdn";
	}

	// GPR64z destination with a GPR64as32 source. The two aliases keep both
	// registers in the syntax and supply 1 for the multiplier and, in the
	// shortest form, 0b11111 for the pattern.
	multiclass sve_int_pred_pattern_b_s32<bits<5> opc, string asm> {
	def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64as32>;

	def : InstAlias<asm # "\t$Rd, $Rn, $pattern",
	(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Rd, $Rn",
	(!cast<Instruction>(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>;
	}

	// GPR32z destination and source. The two aliases supply 1 for the
	// multiplier and, in the shortest form, 0b11111 for the pattern.
	multiclass sve_int_pred_pattern_b_u32<bits<5> opc, string asm> {
	def NAME : sve_int_pred_pattern_b<opc, asm, GPR32z, GPR32z>;

	def : InstAlias<asm # "\t$Rdn, $pattern",
	(!cast<Instruction>(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Rdn",
	(!cast<Instruction>(NAME) GPR32z:$Rdn, 0b11111, 1), 2>;
	}

	// GPR64z destination and source. The two aliases supply 1 for the
	// multiplier and, in the shortest form, 0b11111 for the pattern.
	multiclass sve_int_pred_pattern_b_x64<bits<5> opc, string asm> {
	def NAME : sve_int_pred_pattern_b<opc, asm, GPR64z, GPR64z>;

	def : InstAlias<asm # "\t$Rdn, $pattern",
	(!cast<Instruction>(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>;
	def : InstAlias<asm # "\t$Rdn",
	(!cast<Instruction>(NAME) GPR64z:$Rdn, 0b11111, 1), 2>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Permute - Cross Lane Group
	//===----------------------------------------------------------------------===//

	// Instruction class with a vector output $Zd (bits 4-0) and a scalar
	// register input $Rn (bits 9-5); sz8_64 goes to bits 23-22.
	class sve_int_perm_dup_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType>
	: I<(outs zprty:$Zd), (ins srcRegType:$Rn),
	asm, "\t$Zd, $Rn",
	"",
	[]>, Sched<[]> {
	bits<5> Rn;
	bits<5> Zd;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-10} = 0b100000001110;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_dup_r (sz8_64 = 0b00 to
	// 0b11). Only _D takes its source from GPR64sp; the others use GPR32sp.
	// Each variant also gets a "mov $Zd, $Rn" assembly alias.
	multiclass sve_int_perm_dup_r<string asm> {
	def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, GPR32sp>;
	def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>;
	def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>;
	def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>;

	def : InstAlias<"mov $Zd, $Rn",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Rn",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Rn",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Rn",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, GPR64sp:$Rn), 1>;
	}

	// Encoding for an indexed vector-to-vector instruction: source $Zn with an
	// index operand $idx (printed directly after $Zn), destination $Zd.
	// Inst{23-22} is left unset here and tsz may contain unset ('?') bits; the
	// definitions that instantiate this class fill those bits from idx.
	class sve_int_perm_dup_i<bits<5> tsz, Operand immtype, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$idx),
	asm, "\t$Zd, $Zn$idx",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<7> idx;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = {?,?}; // imm3h
	let Inst{21} = 0b1;
	let Inst{20-16} = tsz;
	let Inst{15-10} = 0b001000;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D/_Q variants of sve_int_perm_dup_i. Each variant
	// passes a tsz whose lowest set bit marks the element size and whose '?'
	// bits are then overwritten with the low bits of idx; the top two idx bits
	// used by the variant go into Inst{23-22}. Wider elements use fewer idx bits.
	multiclass sve_int_perm_dup_i<string asm> {
	def _B : sve_int_perm_dup_i<{?,?,?,?,1}, sve_elm_idx_extdup_b, asm, ZPR8> {
	let Inst{23-22} = idx{5-4};
	let Inst{20-17} = idx{3-0};
	}
	def _H : sve_int_perm_dup_i<{?,?,?,1,0}, sve_elm_idx_extdup_h, asm, ZPR16> {
	let Inst{23-22} = idx{4-3};
	let Inst{20-18} = idx{2-0};
	}
	def _S : sve_int_perm_dup_i<{?,?,1,0,0}, sve_elm_idx_extdup_s, asm, ZPR32> {
	let Inst{23-22} = idx{3-2};
	let Inst{20-19} = idx{1-0};
	}
	def _D : sve_int_perm_dup_i<{?,1,0,0,0}, sve_elm_idx_extdup_d, asm, ZPR64> {
	let Inst{23-22} = idx{2-1};
	let Inst{20} = idx{0};
	}
	def _Q : sve_int_perm_dup_i<{1,0,0,0,0}, sve_elm_idx_extdup_q, asm, ZPR128> {
	let Inst{23-22} = idx{1-0};
	}

	// "mov" spellings that keep the explicit index operand.
	def : InstAlias<"mov $Zd, $Zn$idx",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, sve_elm_idx_extdup_b:$idx), 1>;
	def : InstAlias<"mov $Zd, $Zn$idx",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, sve_elm_idx_extdup_h:$idx), 1>;
	def : InstAlias<"mov $Zd, $Zn$idx",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, sve_elm_idx_extdup_s:$idx), 1>;
	def : InstAlias<"mov $Zd, $Zn$idx",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, sve_elm_idx_extdup_d:$idx), 1>;
	def : InstAlias<"mov $Zd, $Zn$idx",
	(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, ZPR128:$Zn, sve_elm_idx_extdup_q:$idx), 1>;
	// "mov" spellings whose source uses an FPR*asZPR operand and whose index is
	// fixed to 0.
	def : InstAlias<"mov $Zd, $Bn",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, FPR8asZPR:$Bn, 0), 2>;
	def : InstAlias<"mov $Zd, $Hn",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, FPR16asZPR:$Hn, 0), 2>;
	def : InstAlias<"mov $Zd, $Sn",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, FPR32asZPR:$Sn, 0), 2>;
	def : InstAlias<"mov $Zd, $Dn",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>;
	def : InstAlias<"mov $Zd, $Qn",
	(!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>;
	}

	// Encoding for a three-operand vector instruction whose first source $Zn is
	// given as a vector-list operand (VecList) and whose second source $Zm and
	// destination $Zd use zprty. opc is placed in Inst{12-11}.
	class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm,
	ZPRRegOp zprty, RegisterOperand VecList>
	: I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b001;
	let Inst{12-11} = opc;
	let Inst{10} = 0b0;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines _B/_H/_S/_D variants of sve_int_perm_tbl with opc = 0b10 and the
	// Z_b/Z_h/Z_s/Z_d list operands. The aliases accept the same mnemonic with
	// $Zn written as a plain ZPR operand instead of a list.
	multiclass sve_int_perm_tbl<string asm> {
	def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>;
	def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>;
	def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>;
	def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>;

	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>;
	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 0>;
	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>;
	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>;
	}

	// Defines _B/_H/_S/_D variants of sve_int_perm_tbl with opc = 0b01 and the
	// ZZ_b/ZZ_h/ZZ_s/ZZ_d list operands for $Zn. No aliases are added.
	multiclass sve2_int_perm_tbl<string asm> {
	def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
	def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
	def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
	def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
	}

	// Encoding for a three-operand vector instruction ($Zd, $Zn, $Zm) with
	// Inst{15-10} fixed to 0b001011.
	// The '+' lines in this hunk add an extra input $_Zd and tie it to the
	// output with the "$Zd = $_Zd" constraint, so the destination register is
	// also modelled as an input; the '-' line is the previous operand list.
	class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
	-: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-10} = 0b001011;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	}

	// Defines the _B/_H/_S/_D variants of sve2_int_perm_tbx (sz8_64 = 0b00 to
	// 0b11).
	multiclass sve2_int_perm_tbx<string asm> {
	def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
	def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
	def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
	def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
	}

	// Encoding for a one-source vector instruction ($Zd, $Zn) with
	// Inst{21-10} fixed to 0b111000001110 and sz8_64 in Inst{23-22}.
	class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn),
	asm, "\t$Zd, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-10} = 0b111000001110;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_reverse_z.
	multiclass sve_int_perm_reverse_z<string asm> {
	def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>;
	def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>;
	def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>;
	def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>;
	}

	// Encoding for a one-source predicate instruction ($Pd, $Pn). Predicate
	// register numbers are 4 bits wide; Inst{4} between them is fixed to 0.
	class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty>
	: I<(outs pprty:$Pd), (ins pprty:$Pn),
	asm, "\t$Pd, $Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-9} = 0b1101000100000;
	let Inst{8-5} = Pn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_reverse_p.
	multiclass sve_int_perm_reverse_p<string asm> {
	def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>;
	def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>;
	def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>;
	def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>;
	}

	// Encoding for a one-source vector instruction whose destination ($Zd,
	// zprty1) and source ($Zn, zprty2) may use different operand classes.
	// opc is placed in Inst{17-16}.
	class sve_int_perm_unpk<bits<2> sz16_64, bits<2> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
	asm, "\t$Zd, $Zn",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz16_64;
	let Inst{21-18} = 0b1100;
	let Inst{17-16} = opc;
	let Inst{15-10} = 0b001110;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines _H/_S/_D variants of sve_int_perm_unpk; in each, the source
	// operand class is one element size narrower than the destination's.
	// There is no _B variant.
	multiclass sve_int_perm_unpk<bits<2> opc, string asm> {
	def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>;
	def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>;
	def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>;
	}

	// Encoding for a destructive instruction taking a scalar register $Rm
	// (class srcRegType): the output $Zdn is tied to the input $_Zdn, and only
	// $Zdn and $Rm appear in the assembly string.
	class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Rm),
	asm, "\t$Zdn, $Rm",
	"",
	[]>, Sched<[]> {
	bits<5> Rm;
	bits<5> Zdn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-10} = 0b100100001110;
	let Inst{9-5} = Rm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_insrs. Only _D takes its
	// scalar from GPR64; the others use GPR32.
	multiclass sve_int_perm_insrs<string asm> {
	def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>;
	def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>;
	def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>;
	def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>;
	}

	// Same shape as sve_int_perm_insrs, but the second operand is named $Vm and
	// Inst{21-10} is 0b110100001110. The output $Zdn is tied to $_Zdn.
	class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm),
	asm, "\t$Zdn, $Vm",
	"",
	[]>, Sched<[]> {
	bits<5> Vm;
	bits<5> Zdn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-10} = 0b110100001110;
	let Inst{9-5} = Vm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_insrv, pairing each
	// vector operand class with the FPR class of matching width.
	multiclass sve_int_perm_insrv<string asm> {
	def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>;
	def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>;
	def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>;
	def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Permute - Extract Group
	//===----------------------------------------------------------------------===//

	// Encoding for a destructive byte-vector instruction with an 8-bit
	// immediate. The immediate is split across two fields: imm8{7-3} goes to
	// Inst{20-16} and imm8{2-0} to Inst{12-10}. $Zdn is tied to $_Zdn, and both
	// are printed in the assembly string.
	class sve_int_perm_extract_i<string asm>
	: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn, ZPR8:$Zm, imm0_255:$imm8),
	asm, "\t$Zdn, $_Zdn, $Zm, $imm8",
	"", []>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	bits<8> imm8;
	let Inst{31-21} = 0b00000101001;
	let Inst{20-16} = imm8{7-3};
	let Inst{15-13} = 0b000;
	let Inst{12-10} = imm8{2-0};
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Non-tied counterpart of sve_int_perm_extract_i: the source $Zn is a ZZ_b
	// list operand, the destination $Zd is a separate register, and the 8-bit
	// immediate is split across Inst{20-16} and Inst{12-10} in the same way.
	class sve2_int_perm_extract_i_cons<string asm>
	: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8),
	asm, "\t$Zd, $Zn, $imm8",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<8> imm8;
	let Inst{31-21} = 0b00000101011;
	let Inst{20-16} = imm8{7-3};
	let Inst{15-13} = 0b000;
	let Inst{12-10} = imm8{2-0};
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	//===----------------------------------------------------------------------===//
	// SVE Vector Select Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated three-operand vector instruction: predicate
	// $Pg (4-bit field, PPRAny) plus vector sources $Zn and $Zm, destination
	// $Zd. The predicate is printed without a /m or /z qualifier.
	class sve_int_sel_vvv<bits<2> sz8_64, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins PPRAny:$Pg, zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Pg, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<4> Pg;
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b11;
	let Inst{13-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_sel_vvv. Each also gets a
	// "mov $Zd, $Pg/m, $Zn" alias in which the last source operand is the same
	// register as the destination ($Zd is passed in the $Zm position).
	multiclass sve_int_sel_vvv<string asm> {
	def _B : sve_int_sel_vvv<0b00, asm, ZPR8>;
	def _H : sve_int_sel_vvv<0b01, asm, ZPR16>;
	def _S : sve_int_sel_vvv<0b10, asm, ZPR32>;
	def _D : sve_int_sel_vvv<0b11, asm, ZPR64>;

	def : InstAlias<"mov $Zd, $Pg/m, $Zn",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Zn",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, ZPR16:$Zn, ZPR16:$Zd), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Zn",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, ZPR32:$Zn, ZPR32:$Zd), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Zn",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, ZPR64:$Zn, ZPR64:$Zd), 1>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Predicate Logical Operations Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated operation on PPR8 predicate registers
	// ($Pd, $Pg, $Pn, $Pm). The 4-bit opc is scattered: opc{3-2} to
	// Inst{23-22}, opc{1} to Inst{9} and opc{0} to Inst{4}.
	class sve_int_pred_log<bits<4> opc, string asm>
	: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
	asm, "\t$Pd, $Pg/z, $Pn, $Pm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pg;
	bits<4> Pm;
	bits<4> Pn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = opc{3-2};
	let Inst{21-20} = 0b00;
	let Inst{19-16} = Pm;
	let Inst{15-14} = 0b01;
	let Inst{13-10} = Pg;
	let Inst{9} = opc{1};
	let Inst{8-5} = Pn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	// SEL has no predication qualifier.
	// (opc == 0b0011 selects the string without "/z"; every other opc keeps it.)
	let AsmString = !if(!eq(opc, 0b0011),
	!strconcat(asm, "\t$Pd, $Pg, $Pn, $Pm"),
	!strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm"));

	// Variants with opc{2} set are marked as defining NZCV.
	let Defs = !if(!eq (opc{2}, 1), [NZCV], []);
	}


	//===----------------------------------------------------------------------===//
	// SVE Logical Mask Immediate Group
	//===----------------------------------------------------------------------===//

	// Encoding for a destructive ZPR64 instruction with a 13-bit immediate
	// field (operand type logical_imm64). $Zdn is tied to $_Zdn, and decoding
	// is delegated to the custom DecodeSVELogicalImmInstruction method.
	class sve_int_log_imm<bits<2> opc, string asm>
	: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, logical_imm64:$imms13),
	asm, "\t$Zdn, $_Zdn, $imms13",
	"", []>, Sched<[]> {
	bits<5> Zdn;
	bits<13> imms13;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = opc;
	let Inst{21-18} = 0b0000;
	let Inst{17-5} = imms13;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DecoderMethod = "DecodeSVELogicalImmInstruction";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the single sve_int_log_imm instruction plus two groups of
	// aliases onto it: the same mnemonic with ZPR8/ZPR16/ZPR32 registers and
	// element-sized immediate operands, and a second mnemonic (`alias`) that
	// uses the *_not immediate operand classes for all four element sizes.
	multiclass sve_int_log_imm<bits<2> opc, string asm, string alias> {
	def NAME : sve_int_log_imm<opc, asm>;

	def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>;
	def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16:$imm), 3>;
	def : InstAlias<asm # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32:$imm), 2>;

	def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR8:$Zdn, sve_logical_imm8_not:$imm), 0>;
	def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR16:$Zdn, sve_logical_imm16_not:$imm), 0>;
	def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR32:$Zdn, sve_logical_imm32_not:$imm), 0>;
	def : InstAlias<alias # "\t$Zdn, $Zdn, $imm",
	(!cast<Instruction>(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>;
	}

	// Encoding for a ZPR64 instruction whose only input is a 13-bit immediate
	// field (operand type logical_imm64). It is marked rematerializable and
	// uses the custom DecodeSVELogicalImmInstruction decoder.
	class sve_int_dup_mask_imm<string asm>
	: I<(outs ZPR64:$Zd), (ins logical_imm64:$imms),
	asm, "\t$Zd, $imms",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<13> imms;
	let Inst{31-18} = 0b00000101110000;
	let Inst{17-5} = imms;
	let Inst{4-0} = Zd;

	let isReMaterializable = 1;
	let DecoderMethod = "DecodeSVELogicalImmInstruction";
	}

	// Defines the single sve_int_dup_mask_imm instruction plus aliases onto it:
	// "dupm" spellings for ZPR8/ZPR16/ZPR32 with element-sized immediates, and
	// "mov" spellings for ZPR16/ZPR32/ZPR64 using the sve_preferred_logical_imm*
	// operand classes.
	multiclass sve_int_dup_mask_imm<string asm> {
	def NAME : sve_int_dup_mask_imm<asm>;

	def : InstAlias<"dupm $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR8:$Zd, sve_logical_imm8:$imm), 4>;
	def : InstAlias<"dupm $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR16:$Zd, sve_logical_imm16:$imm), 3>;
	def : InstAlias<"dupm $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR32:$Zd, sve_logical_imm32:$imm), 2>;

	// All Zd.b forms have a CPY/DUP equivalent, hence no byte alias here.
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR16:$Zd, sve_preferred_logical_imm16:$imm), 7>;
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Arithmetic - Unpredicated Group.
	//===----------------------------------------------------------------------===//

	// Encoding for an unpredicated, non-tied three-operand vector instruction
	// ($Zd, $Zn, $Zm). The 3-bit opc is placed in Inst{12-10}.
	class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b000;
	let Inst{12-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_bin_cons_arit_0 for one opc.
	multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm> {
	def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
	def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
	def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
	def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Arithmetic - Predicated Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated, destructive instruction with a one-bit
	// immediate operand $i1 (operand type imm_ty), stored in Inst{5}. The
	// governing predicate $Pg is a 3-bit field (PPR3bAny), $Zdn is tied to
	// $_Zdn, and ElementSize is taken from the vector operand class.
	class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty,
	Operand imm_ty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, imm_ty:$i1),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $i1",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bit i1;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b011;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-6} = 0b0000;
	let Inst{5} = i1;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_2op_i_p_zds (no _B variant),
	// all sharing the same immediate operand type.
	multiclass sve_fp_2op_i_p_zds<bits<3> opc, string asm, Operand imm_ty> {
	def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>;
	def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>;
	def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>;
	}

	// Encoding for a predicated, destructive two-source vector instruction:
	// $Zdn (tied to $_Zdn) and $Zm under the 3-bit predicate $Pg. The 4-bit opc
	// is placed in Inst{19-16}.
	class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-20} = 0b00;
	let Inst{19-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_2op_p_zds for one opc.
	multiclass sve_fp_2op_p_zds<bits<4> opc, string asm> {
	def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
	def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
	def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
	}

	// Encoding for an unpredicated, destructive instruction with a 3-bit
	// immediate (imm0_7) stored in Inst{18-16}. $Zdn is tied to $_Zdn.
	class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm0_7:$imm3),
	asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	bits<3> imm3;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b010;
	let Inst{18-16} = imm3;
	let Inst{15-10} = 0b100000;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the _H/_S/_D variants of sve_fp_ftmad.
	multiclass sve_fp_ftmad<string asm> {
	def _H : sve_fp_ftmad<0b01, asm, ZPR16>;
	def _S : sve_fp_ftmad<0b10, asm, ZPR32>;
	def _D : sve_fp_ftmad<0b11, asm, ZPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Floating Point Arithmetic - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Encoding for an unpredicated, non-tied three-operand vector instruction
	// ($Zd, $Zn, $Zm) in the 0b01100101 top-byte space; opc goes to
	// Inst{12-10} and Inst{21} is fixed to 0.
	class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b000;
	let Inst{12-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _H/_S/_D variants of sve_fp_3op_u_zd for one opc.
	multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
	def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
	def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
	def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Fused Multiply-Add Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated, destructive instruction with an accumulator
	// operand: $Zda is tied to $_Zda and is encoded in Inst{4-0}, with sources
	// $Zn and $Zm. The tied input is not repeated in the assembly string.
	// Inst{15} is fixed to 0 (sve_fp_3op_p_zds_b uses 1 there).
	class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
	asm, "\t$Zda, $Pg/m, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zda;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b0;
	let Inst{14-13} = opc;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_3op_p_zds_a for one opc.
	multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm> {
	def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>;
	def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>;
	def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>;
	}

	// Counterpart of sve_fp_3op_p_zds_a with Inst{15} fixed to 1. Here the tied
	// register is $Zdn (Inst{4-0}), $Zm is encoded in Inst{9-5} and the third
	// register $Za in Inst{20-16}.
	class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
	asm, "\t$Zdn, $Pg/m, $Zm, $Za",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Za;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Za;
	let Inst{15} = 0b1;
	let Inst{14-13} = opc;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_3op_p_zds_b for one opc.
	multiclass sve_fp_3op_p_zds_b<bits<2> opc, string asm> {
	def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>;
	def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>;
	def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Multiply-Add - Indexed Group
	//===----------------------------------------------------------------------===//

	// Encoding for a destructive accumulate instruction whose last source is
	// an indexed element ($Zm followed directly by $iop). This class leaves
	// Inst{20-16} unassigned and declares neither Zm nor iop; the definitions
	// that instantiate it add those fields with element-size-specific widths.
	class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
	ZPRRegOp zprty1,
	ZPRRegOp zprty2, Operand itype>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty1:$Zn, zprty2:$Zm, itype:$iop),
	asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-11} = 0;
	let Inst{10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines the _H/_S/_D variants of sve_fp_fma_by_indexed_elem. Wider
	// elements get fewer index bits and more register bits: _H and _S use a
	// 3-bit Zm (ZPR3b16 / ZPR3b32) with 3- and 2-bit indices, _D uses a 4-bit
	// Zm (ZPR4b64) with a 1-bit index.
	multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm> {
	// _H passes sz = {0, ?}: Inst{22} is left open and then set to iop{2}.
	def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{22} = iop{2};
	let Inst{20-19} = iop{1-0};
	let Inst{18-16} = Zm;
	}
	def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS> {
	bits<3> Zm;
	bits<2> iop;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD> {
	bits<4> Zm;
	bit iop;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}


	//===----------------------------------------------------------------------===//
	// SVE Floating Point Multiply - Indexed Group
	//===----------------------------------------------------------------------===//

	// Non-tied counterpart of sve_fp_fma_by_indexed_elem: destination $Zd,
	// source $Zn and an indexed source ($Zm followed by $iop). Inst{20-16} is
	// left for the instantiating definitions to fill with Zm and iop.
	class sve_fp_fmul_by_indexed_elem<bits<2> sz, string asm, ZPRRegOp zprty,
	ZPRRegOp zprty2, Operand itype>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
	asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-10} = 0b001000;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _H/_S/_D variants of sve_fp_fmul_by_indexed_elem, using the
	// same Zm / iop field layout per element size as the fma-by-indexed-element
	// multiclass.
	multiclass sve_fp_fmul_by_indexed_elem<string asm> {
	// _H passes sz = {0, ?}: Inst{22} is left open and then set to iop{2}.
	def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{22} = iop{2};
	let Inst{20-19} = iop{1-0};
	let Inst{18-16} = Zm;
	}
	def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS> {
	bits<3> Zm;
	bits<2> iop;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD> {
	bits<4> Zm;
	bit iop;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Complex Multiply-Add Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated, destructive accumulate instruction with a
	// rotation operand (complexrotateop) stored as a 2-bit field in
	// Inst{14-13}. $Zda is tied to $_Zda.
	class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm,
	complexrotateop:$imm),
	asm, "\t$Zda, $Pg/m, $Zn, $Zm, $imm",
	"", []>, Sched<[]> {
	bits<5> Zda;
	bits<3> Pg;
	bits<5> Zn;
	bits<5> Zm;
	bits<2> imm;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21} = 0;
	let Inst{20-16} = Zm;
	let Inst{15} = 0;
	let Inst{14-13} = imm;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_fcmla.
	multiclass sve_fp_fcmla<string asm> {
	def _H : sve_fp_fcmla<0b01, asm, ZPR16>;
	def _S : sve_fp_fcmla<0b10, asm, ZPR32>;
	def _D : sve_fp_fcmla<0b11, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Complex Multiply-Add - Indexed Group
	//===----------------------------------------------------------------------===//

	// Unpredicated, destructive variant taking an indexed source ($Zm followed
	// by $iop) and a rotation operand stored in Inst{11-10}. Inst{20-16} is
	// left for the instantiating definitions to fill with Zm and iop.
	class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
	ZPRRegOp zprty,
	ZPRRegOp zprty2, Operand itype>
	: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty2:$Zm, itype:$iop,
	complexrotateop:$imm),
	asm, "\t$Zda, $Zn, $Zm$iop, $imm",
	"", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<2> imm;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-12} = 0b0001;
	let Inst{11-10} = imm;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines only _H and _S variants. Unlike the other multiclasses in this
	// file section, the sz values and index operand types are one step wider
	// than the element suffix: _H uses sz = 0b10 with VectorIndexS (2-bit
	// index), _S uses sz = 0b11 with VectorIndexD (1-bit index).
	// NOTE(review): presumably because the index selects a pair of elements;
	// confirm against the architecture reference.
	multiclass sve_fp_fcmla_by_indexed_elem<string asm> {
	def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS> {
	bits<3> Zm;
	bits<2> iop;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD> {
	bits<4> Zm;
	bits<1> iop;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Complex Addition Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated, destructive two-source instruction with a
	// rotation operand (complexrotateopodd) stored as a single bit in Inst{16}.
	// $Zdn is tied to $_Zdn and both are printed in the assembly string.
	class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm,
	complexrotateopodd:$imm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm, $imm",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	bits<3> Pg;
	bit imm;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21-17} = 0;
	let Inst{16} = imm;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve_fp_fcadd.
	multiclass sve_fp_fcadd<string asm> {
	def _H : sve_fp_fcadd<0b01, asm, ZPR16>;
	def _S : sve_fp_fcadd<0b10, asm, ZPR32>;
	def _D : sve_fp_fcadd<0b11, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Floating Point Convert Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated one-source instruction whose destination
	// (zprty1) and source (zprty2) operand classes differ. The 4-bit opc is
	// split: opc{3-2} to Inst{23-22} and opc{1-0} to Inst{17-16}.
	// The '+' lines in this hunk add an input $_Zd (typed like the output) and
	// tie it to $Zd, so the destination register is also modelled as an input;
	// the '-' line is the previous operand list.
	class sve2_fp_convert_precision<bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	-: I<(outs zprty1:$Zd), (ins PPR3bAny:$Pg, zprty2:$Zn),
	+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn),
	asm, "\t$Zd, $Pg/m, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<3> Pg;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = opc{3-2};
	let Inst{21-18} = 0b0010;
	let Inst{17-16} = opc{1-0};
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	}

	// Two sve2_fp_convert_precision variants whose destination operand class is
	// narrower than the source: ZPR32 to ZPR16 and ZPR64 to ZPR32.
	multiclass sve2_fp_convert_down_narrow<string asm> {
	def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
	def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
	}

	// Two sve2_fp_convert_precision variants whose destination operand class is
	// wider than the source: ZPR16 to ZPR32 and ZPR32 to ZPR64.
	multiclass sve2_fp_convert_up_long<string asm> {
	def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
	def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
	}

	// A single ZPR64-to-ZPR32 sve2_fp_convert_precision variant with
	// opc = 0b0010.
	multiclass sve2_fp_convert_down_odd_rounding<string asm> {
	def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Floating Point Pairwise Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated, destructive two-source instruction in the
	// 0b01100100 top-byte space; the 3-bit opc goes to Inst{18-16}. $Zdn is
	// tied to $_Zdn.
	class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zm;
	bits<5> Zdn;
	let Inst{31-24} = 0b01100100;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b010;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants of sve2_fp_pairwise_pred for one opc.
	multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm> {
	def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>;
	def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>;
	def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Floating Point Widening Multiply-Add - Indexed Group
	//===----------------------------------------------------------------------===//

	// Encoding for a destructive accumulate into ZPR32 from ZPR16 sources, with
	// the last source indexed ($Zm followed by $iop). Both opc and the 3-bit
	// index are split: iop{2-1} to Inst{20-19}, iop{0} to Inst{11}; opc{1} to
	// Inst{13}, opc{0} to Inst{10}. Zm is a 3-bit register field (ZPR3b16).
	class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
	: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm,
	VectorIndexH:$iop),
	asm, "\t$Zda, $Zn, $Zm$iop",
	"",
	[]>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<3> Zm;
	bits<3> iop;
	let Inst{31-21} = 0b01100100101;
	let Inst{20-19} = iop{2-1};
	let Inst{18-16} = Zm;
	let Inst{15-14} = 0b01;
	let Inst{13} = opc{1};
	let Inst{12} = 0b0;
	let Inst{11} = iop{0};
	let Inst{10} = opc{0};
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Floating Point Widening Multiply-Add Group
	//===----------------------------------------------------------------------===//

	// Non-indexed counterpart of sve2_fp_mla_long_by_indexed_elem: destructive
	// accumulate into ZPR32 from two ZPR16 sources, with a full 5-bit Zm in
	// Inst{20-16}. opc{1} goes to Inst{13} and opc{0} to Inst{10}.
	class sve2_fp_mla_long<bits<2> opc, string asm>
	: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
	asm, "\t$Zda, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-21} = 0b01100100101;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b10;
	let Inst{13} = opc{1};
	let Inst{12-11} = 0b00;
	let Inst{10} = opc{0};
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	//===----------------------------------------------------------------------===//
	// SVE Stack Allocation Group
	//===----------------------------------------------------------------------===//

	// Encoding for a scalar instruction on GPR64sp registers ($Rd, $Rn) with a
	// 6-bit immediate (operand type simm6_32b) in Inst{10-5}. Note that $Rn is
	// encoded in Inst{20-16}, not in the usual Inst{9-5} position.
	class sve_int_arith_vl<bit opc, string asm>
	: I<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, simm6_32b:$imm6),
	asm, "\t$Rd, $Rn, $imm6",
	"",
	[]>, Sched<[]> {
	bits<5> Rd;
	bits<5> Rn;
	bits<6> imm6;
	let Inst{31-23} = 0b000001000;
	let Inst{22} = opc;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rn;
	let Inst{15-11} = 0b01010;
	let Inst{10-5} = imm6;
	let Inst{4-0} = Rd;
	}

	// Encoding for a scalar instruction writing a GPR64 register from a 6-bit
	// immediate only. It has the same layout as sve_int_arith_vl except that
	// Inst{23} is 1 and Inst{20-16} holds the constant opc2 instead of a
	// register.
	class sve_int_read_vl_a<bit op, bits<5> opc2, string asm>
	: I<(outs GPR64:$Rd), (ins simm6_32b:$imm6),
	asm, "\t$Rd, $imm6",
	"",
	[]>, Sched<[]> {
	bits<5> Rd;
	bits<6> imm6;
	let Inst{31-23} = 0b000001001;
	let Inst{22} = op;
	let Inst{21} = 0b1;
	let Inst{20-16} = opc2{4-0};
	let Inst{15-11} = 0b01010;
	let Inst{10-5} = imm6;
	let Inst{4-0} = Rd;
	}

	//===----------------------------------------------------------------------===//
	// SVE Permute - In Lane Group
	//===----------------------------------------------------------------------===//

	// Encoding for an unpredicated, non-tied three-operand vector instruction
	// ($Zd, $Zn, $Zm) with Inst{15-13} fixed to 0b011 and the 3-bit opc in
	// Inst{12-10}. Note the parameter order: opc comes before sz8_64.
	class sve_int_perm_bin_perm_zz<bits<3> opc, bits<2> sz8_64, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b011;
	let Inst{12-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines the _B/_H/_S/_D variants of sve_int_perm_bin_perm_zz for one opc.
	multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm> {
	def _B : sve_int_perm_bin_perm_zz<opc, 0b00, asm, ZPR8>;
	def _H : sve_int_perm_bin_perm_zz<opc, 0b01, asm, ZPR16>;
	def _S : sve_int_perm_bin_perm_zz<opc, 0b10, asm, ZPR32>;
	def _D : sve_int_perm_bin_perm_zz<opc, 0b11, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Unary Operations Group
	//===----------------------------------------------------------------------===//

	// Predicated one-source format, printed as "<asm>\t$Zd, $Pg/m, $Zn".
	// The output $Zd is tied to the extra input $_Zd via Constraints and the
	// record is marked Destructive.
	//   opc       - 7-bit opcode: opc{6-5} -> Inst{23-22}, opc{4-0} -> Inst{20-16}.
	//   asm       - mnemonic string.
	//   i_zprtype - register operand used for the inputs $_Zd and $Zn.
	//   o_zprtype - register operand used for the output $Zd.
	//   size      - value assigned to ElementSize.
	// Field placement: Pg (3 bits, PPR3bAny) -> Inst{12-10}, Zn -> Inst{9-5},
	// Zd -> Inst{4-0}.
	class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
	RegisterOperand o_zprtype, ElementSizeEnum size>
	: I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn),
	asm, "\t$Zd, $Pg/m, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = opc{6-5};
	let Inst{21} = 0b0;
	let Inst{20-16} = opc{4-0};
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = size;
	}

	// Instantiates sve_fp_2op_p_zd for the _H/_S/_D variants. The 7-bit opcode is
	// built by prefixing the 5-bit 'opc' with 0b01/0b10/0b11 respectively; input
	// and output use the same register operand (ZPR16/ZPR32/ZPR64).
	multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm> {
	def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>;
	def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>;
	def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>;
	}

	// Instantiates sve_fp_2op_p_zd for the _H/_S/_D variants with fixed 7-bit
	// opcodes (0b0011010, 0b0011100, 0b0011110); only the mnemonic is a parameter.
	multiclass sve2_fp_flogb<string asm> {
	def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>;
	def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>;
	def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Unary Operations - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated one-source format: output $Zd and input $Zn both use 'zprty';
	// printed as "<asm>\t$Zd, $Zn". No operand is tied.
	//   sz  - 2-bit size field, placed in Inst{23-22}.
	//   opc - 3-bit opcode, placed in Inst{18-16}.
	//   asm - mnemonic string.
	// Field placement: Zn -> Inst{9-5}, Zd -> Inst{4-0}.
	class sve_fp_2op_u_zd<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn),
	asm, "\t$Zd, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b001;
	let Inst{18-16} = opc;
	let Inst{15-10} = 0b001100;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the sve_fp_2op_u_zd class for _H/_S/_D with sz = 0b01/0b10/0b11
	// and ZPR16/ZPR32/ZPR64. There is no _B variant.
	multiclass sve_fp_2op_u_zd<bits<3> opc, string asm> {
	def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>;
	def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>;
	def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Arithmetic - Binary Predicated Group
	//===----------------------------------------------------------------------===//

	// Predicated two-source format, printed as "<asm>\t$Zdn, $Pg/m, $_Zdn, $Zm".
	// The output $Zdn is tied to the input $_Zdn via Constraints and the record
	// is marked Destructive.
	//   sz8_64 - 2-bit size field, placed in Inst{23-22}.
	//   fmt    - 2-bit field placed in Inst{20-19}.
	//   opc    - 3-bit opcode, placed in Inst{18-16}.
	//   asm    - mnemonic string.
	//   zprty  - register operand for all vector operands; also supplies
	//            ElementSize (zprty.ElementSize).
	// Field placement: Pg -> Inst{12-10}, Zm -> Inst{9-5}, Zdn -> Inst{4-0}.
	class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
	string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b0;
	let Inst{20-19} = fmt;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b000;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// _B/_H/_S/_D instantiations of sve_int_bin_pred_arit_log with fmt = 0b11.
	multiclass sve_int_bin_pred_log<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>;
	def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>;
	def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>;
	def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>;
	}

	// _B/_H/_S/_D instantiations of sve_int_bin_pred_arit_log with fmt = 0b00.
	multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
	def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
	def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
	def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
	}

	// _B/_H/_S/_D instantiations of sve_int_bin_pred_arit_log with fmt = 0b01.
	multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>;
	def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>;
	def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>;
	def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>;
	}

	// _B/_H/_S/_D instantiations of sve_int_bin_pred_arit_log with fmt = 0b10.
	multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>;
	def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>;
	def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
	def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
	}

	// Special case for divides which are not defined for 8b/16b elements.
	// Same fmt = 0b10 encoding as sve_int_bin_pred_arit_2, but only the _S and _D
	// variants are instantiated.
	multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm> {
	def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
	def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Multiply-Add Group
	//===----------------------------------------------------------------------===//

	// Predicated three-source format, printed as "<asm>\t$Zdn, $Pg/m, $Zm, $Za".
	// The output $Zdn is tied to the input $_Zdn via Constraints and the record
	// is marked Destructive.
	//   sz8_64 - 2-bit size field, placed in Inst{23-22}.
	//   opc    - single opcode bit, placed in Inst{13}.
	//   asm    - mnemonic string.
	//   zprty  - register operand for all vector operands; also supplies
	//            ElementSize.
	// Field placement: Zm -> Inst{20-16}, Pg -> Inst{12-10}, Za -> Inst{9-5},
	// Zdn -> Inst{4-0}. Inst{15-14} is fixed to 0b11.
	class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm, zprty:$Za),
	asm, "\t$Zdn, $Pg/m, $Zm, $Za",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Za;
	bits<5> Zm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b11;
	let Inst{13} = opc;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Za;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// _B/_H/_S/_D instantiations of the sve_int_mladdsub_vvv_pred class with
	// sz8_64 = 0b00/0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32/ZPR64.
	multiclass sve_int_mladdsub_vvv_pred<bits<1> opc, string asm> {
	def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>;
	def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>;
	def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>;
	def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>;
	}

	// Predicated three-source format, printed as "<asm>\t$Zda, $Pg/m, $Zn, $Zm".
	// The output $Zda is tied to the input $_Zda via Constraints and the record
	// is marked Destructive.
	//   sz8_64 - 2-bit size field, placed in Inst{23-22}.
	//   opc    - single opcode bit, placed in Inst{13}.
	//   asm    - mnemonic string.
	//   zprty  - register operand for all vector operands; also supplies
	//            ElementSize.
	// Field placement: Zm -> Inst{20-16}, Pg -> Inst{12-10}, Zn -> Inst{9-5},
	// Zda -> Inst{4-0}. Inst{15-14} is fixed to 0b01 (the sibling
	// sve_int_mladdsub_vvv_pred format uses 0b11 there).
	class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zda), (ins PPR3bAny:$Pg, zprty:$_Zda, zprty:$Zn, zprty:$Zm),
	asm, "\t$Zda, $Pg/m, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zda;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b01;
	let Inst{13} = opc;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// _B/_H/_S/_D instantiations of the sve_int_mlas_vvv_pred class with
	// sz8_64 = 0b00/0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32/ZPR64.
	multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm> {
	def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>;
	def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>;
	def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>;
	def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer Multiply-Add - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-operand format, printed as "<asm>\t$Zda, $Zn, $Zm".
	// The output $Zda is tied to the input $_Zda via Constraints and the record
	// is marked Destructive with ElementSizeNone.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 5-bit opcode, placed in Inst{14-10}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for the sources $Zn and $Zm.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zda -> Inst{4-0}.
	class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
	asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b0;
	let Inst{14-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_mla class where accumulator and
	// sources share one register operand. The 5-bit opcode is { 0b1110, S }, i.e.
	// the bit 'S' selects the lowest opcode bit.
	multiclass sve2_int_mla<bit S, string asm> {
	def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
	def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
	def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
	def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
	}

	// _H/_S/_D instantiations of the sve2_int_mla class in which the sources use
	// a narrower register operand than the accumulator (ZPR16<-ZPR8,
	// ZPR32<-ZPR16, ZPR64<-ZPR32). The full 5-bit opcode is passed by the caller.
	multiclass sve2_int_mla_long<bits<5> opc, string asm> {
	def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
	def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
	def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer Multiply-Add - Indexed Group
	//===----------------------------------------------------------------------===//

	// Indexed-element three-operand format, printed as
	// "<asm>\t$Zda, $Zn, $Zm$iop" (the index operand is appended directly to
	// $Zm). The output $Zda is tied to the input $_Zda; the record is marked
	// Destructive with ElementSizeNone.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 6-bit opcode, placed in Inst{15-10}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for $Zn.
	//   zprty3 - register operand for $Zm.
	//   itype  - operand type of the index $iop.
	// Inst{20-16} is not assigned here and neither Zm nor iop is declared; each
	// instantiating def declares them and maps them onto those bits.
	class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2,
	ZPRRegOp zprty3, Operand itype>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
	asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _H/_S/_D instantiations of the sve2_int_mla_by_indexed_elem class. The
	// 6-bit opcode is { 0b000, opc, S }. Each def declares its own Zm/iop widths
	// and fills Inst{20-16}:
	//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} (iop{2}) and
	//       Inst{20-19}; sz is passed as {0, ?} so its low bit is left unset and
	//       supplied by the Inst{22} assignment.
	//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
	//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
	multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> {
	def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{22} = iop{2};
	let Inst{20-19} = iop{1-0};
	let Inst{18-16} = Zm;
	}
	def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
	bits<3> Zm;
	bits<2> iop;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
	bits<4> Zm;
	bit iop;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer Multiply-Add Long - Indexed Group
	//===----------------------------------------------------------------------===//

	// _S/_D instantiations of sve2_int_mla_by_indexed_elem in which the sources
	// use a narrower register operand than the accumulator. The 4-bit 'opc' is
	// spread over the 6-bit opcode as { opc{3}, 0b0, opc{2-1}, ?, opc{0} }; the
	// '?' position (Inst{11}) is left unset and supplied by iop{0} in each def.
	//   _S: 3-bit Zm in Inst{18-16}, 3-bit iop in Inst{20-19} and Inst{11}.
	//   _D: 4-bit Zm in Inst{19-16}, 2-bit iop in Inst{20} and Inst{11}.
	multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
	def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
	asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{20-19} = iop{2-1};
	let Inst{18-16} = Zm;
	let Inst{11} = iop{0};
	}
	def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
	asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
	bits<4> Zm;
	bits<2> iop;
	let Inst{20} = iop{1};
	let Inst{19-16} = Zm;
	let Inst{11} = iop{0};
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Dot Product Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-operand format, printed as "<asm>\t$Zda, $Zn, $Zm".
	// The output $Zda is tied to the input $_Zda; the record is marked
	// Destructive and takes ElementSize from zprty1.
	//   sz     - single size bit, placed in Inst{22}.
	//   U      - single bit placed in Inst{10}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for the sources $Zn and $Zm.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zda -> Inst{4-0}.
	class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
	ZPRRegOp zprty2>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm,
	"\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-23} = 0b010001001;
	let Inst{22} = sz;
	let Inst{21} = 0;
	let Inst{20-16} = Zm;
	let Inst{15-11} = 0;
	let Inst{10} = U;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty1.ElementSize;
	}

	// _S/_D instantiations of the sve_intx_dot class: _S uses sz = 0 with a ZPR32
	// accumulator and ZPR8 sources, _D uses sz = 1 with ZPR64 and ZPR16. 'opc' is
	// forwarded as the class's U bit.
	multiclass sve_intx_dot<bit opc, string asm> {
	def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>;
	def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Dot Product - Indexed Group
	//===----------------------------------------------------------------------===//

	// Indexed-element variant of the dot-product format, printed as
	// "<asm>\t$Zda, $Zn, $Zm$iop". The output $Zda is tied to the input $_Zda;
	// the record is marked Destructive with ElementSizeNone.
	//   sz     - single size bit, placed in Inst{22}.
	//   U      - single bit placed in Inst{10}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for $Zn.
	//   zprty3 - register operand for $Zm.
	//   itype  - operand type of the index $iop.
	// Inst{20-16} is not assigned here and neither Zm nor iop is declared; each
	// instantiating def declares them and maps them onto those bits.
	class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2,
	ZPRRegOp zprty3, Operand itype>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop),
	asm, "\t$Zda, $Zn, $Zm$iop",
	"", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	let Inst{31-23} = 0b010001001;
	let Inst{22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-11} = 0;
	let Inst{10} = U;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _S/_D instantiations of the sve_intx_dot_by_indexed_elem class; 'opc' is
	// forwarded as the class's U bit.
	//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
	//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
	multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm> {
	def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
	bits<2> iop;
	bits<3> Zm;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
	bits<1> iop;
	bits<4> Zm;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Complex Integer Dot Product Group
	//===----------------------------------------------------------------------===//

	// Three-operand format with an extra rotation operand, printed as
	// "<asm>\t$Zda, $Zn, $Zm, $rot". The output $Zda is tied to the input $_Zda;
	// the record is marked Destructive with ElementSizeNone.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 4-bit opcode, placed in Inst{15-12}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for the sources $Zn and $Zm.
	// Field placement: Zm -> Inst{20-16}, rot (2 bits, complexrotateop) ->
	// Inst{11-10}, Zn -> Inst{9-5}, Zda -> Inst{4-0}.
	class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm,
	complexrotateop:$rot),
	asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<5> Zm;
	bits<2> rot;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-12} = opc;
	let Inst{11-10} = rot;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _S/_D instantiations of sve2_complex_int_arith with the fixed opcode 0b0001:
	// _S pairs a ZPR32 accumulator with ZPR8 sources, _D pairs ZPR64 with ZPR16.
	multiclass sve2_cintx_dot<string asm> {
	def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
	def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Complex Multiply-Add Group
	//===----------------------------------------------------------------------===//

	// _B/_H/_S/_D instantiations of sve2_complex_int_arith where accumulator and
	// sources share one register operand. The 4-bit opcode is { 0b001, opc }.
	multiclass sve2_int_cmla<bit opc, string asm> {
	def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
	def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
	def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
	def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Complex Integer Dot Product - Indexed Group
	//===----------------------------------------------------------------------===//

	// Indexed-element variant of sve2_complex_int_arith, printed as
	// "<asm>\t$Zda, $Zn, $Zm$iop, $rot". The output $Zda is tied to the input
	// $_Zda; the record is marked Destructive with ElementSizeNone.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 4-bit opcode, placed in Inst{15-12}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for $Zn.
	//   zprty3 - register operand for $Zm.
	//   itype  - operand type of the index $iop.
	// rot goes to Inst{11-10}. Inst{20-16} is not assigned here and neither Zm
	// nor iop is declared; each instantiating def declares and encodes them.
	class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2,
	ZPRRegOp zprty3, Operand itype>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop,
	complexrotateop:$rot),
	asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<2> rot;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-12} = opc;
	let Inst{11-10} = rot;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _S/_D instantiations of sve2_complex_int_arith_indexed with the fixed
	// opcode 0b0100.
	//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
	//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
	multiclass sve2_cintx_dot_by_indexed_elem<string asm> {
	def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
	bits<2> iop;
	bits<3> Zm;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
	bit iop;
	bits<4> Zm;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Complex Multiply-Add - Indexed Group
	//===----------------------------------------------------------------------===//

	// _H/_S instantiations of sve2_complex_int_arith_indexed with the opcode
	// { 0b011, opc }. Unlike most multiclasses in this file, the _H def passes
	// sz = 0b10 (with VectorIndexS) and the _S def passes sz = 0b11 (with
	// VectorIndexD).
	//   _H: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
	//   _S: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
	multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> {
	def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> {
	bits<2> iop;
	bits<3> Zm;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> {
	bit iop;
	bits<4> Zm;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer Multiply - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-register format: output $Zd and inputs $Zn, $Zm all use
	// 'zprty'; printed as "<asm>\t$Zd, $Zn, $Zm". No operand is tied.
	//   sz  - 2-bit size field, placed in Inst{23-22}.
	//   opc - 3-bit opcode, placed in Inst{12-10}.
	//   asm - mnemonic string.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zd -> Inst{4-0}.
	class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b011;
	let Inst{12-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_mul class with
	// sz = 0b00/0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32/ZPR64.
	multiclass sve2_int_mul<bits<3> opc, string asm> {
	def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
	def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
	def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
	def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer Multiply - Indexed Group
	//===----------------------------------------------------------------------===//

	// Indexed-element format without a tied operand, printed as
	// "<asm>\t$Zd, $Zn, $Zm$iop".
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 4-bit opcode, placed in Inst{13-10} (Inst{15-14} is fixed 0b11).
	//   asm    - mnemonic string.
	//   zprty1 - register operand for the output $Zd.
	//   zprty2 - register operand for $Zn.
	//   zprty3 - register operand for $Zm.
	//   itype  - operand type of the index $iop.
	// Inst{20-16} is not assigned here and neither Zm nor iop is declared; each
	// instantiating def declares them and maps them onto those bits.
	class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2,
	ZPRRegOp zprty3, Operand itype>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop),
	asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{15-14} = 0b11;
	let Inst{13-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _H/_S/_D instantiations of the sve2_int_mul_by_indexed_elem class where
	// all vector operands share one element size.
	//   _H: 3-bit Zm in Inst{18-16}, 3-bit iop split over Inst{22} (iop{2}) and
	//       Inst{20-19}; sz is passed as {0, ?} so its low bit is left unset and
	//       supplied by the Inst{22} assignment.
	//   _S: 3-bit Zm in Inst{18-16}, 2-bit iop in Inst{20-19}.
	//   _D: 4-bit Zm in Inst{19-16}, 1-bit iop in Inst{20}.
	multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> {
	def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{22} = iop{2};
	let Inst{20-19} = iop{1-0};
	let Inst{18-16} = Zm;
	}
	def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
	bits<3> Zm;
	bits<2> iop;
	let Inst{20-19} = iop;
	let Inst{18-16} = Zm;
	}
	def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
	bits<4> Zm;
	bit iop;
	let Inst{20} = iop;
	let Inst{19-16} = Zm;
	}
	}

	// _S/_D instantiations of sve2_int_mul_by_indexed_elem in which the sources
	// use a narrower register operand than the destination. The 3-bit 'opc' is
	// spread over the 4-bit opcode as { opc{2-1}, ?, opc{0} }; the '?' position
	// (Inst{11}) is left unset and supplied by iop{0} in each def.
	//   _S: 3-bit Zm in Inst{18-16}, 3-bit iop in Inst{20-19} and Inst{11}.
	//   _D: 4-bit Zm in Inst{19-16}, 2-bit iop in Inst{20} and Inst{11}.
	multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
	def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
	ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
	bits<3> Zm;
	bits<3> iop;
	let Inst{20-19} = iop{2-1};
	let Inst{18-16} = Zm;
	let Inst{11} = iop{0};
	}
	def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
	ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
	bits<4> Zm;
	bits<2> iop;
	let Inst{20} = iop{1};
	let Inst{19-16} = Zm;
	let Inst{11} = iop{0};
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Integer - Predicated Group
	//===----------------------------------------------------------------------===//

	// Predicated two-source format, printed as "<asm>\t$Zdn, $Pg/m, $_Zdn, $Zm".
	// The output $Zdn is tied to the input $_Zdn via Constraints and the record
	// is marked Destructive.
	//   sz    - 2-bit size field, placed in Inst{23-22}.
	//   opc   - 6-bit opcode: opc{5-1} -> Inst{20-16}, opc{0} -> Inst{13}.
	//   asm   - mnemonic string.
	//   zprty - register operand for all vector operands; also supplies
	//           ElementSize.
	// Field placement: Pg -> Inst{12-10}, Zm -> Inst{9-5}, Zdn -> Inst{4-0}.
	class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zm;
	bits<5> Zdn;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = opc{5-1};
	let Inst{15-14} = 0b10;
	let Inst{13} = opc{0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_arith_pred class with
	// sz = 0b00/0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32/ZPR64.
	multiclass sve2_int_arith_pred<bits<6> opc, string asm> {
	def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
	def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
	def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
	def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
	}

	// Predicated one-source accumulating format, printed as
	// "<asm>\t$Zda, $Pg/m, $Zn". The output $Zda is tied to the input $_Zda; the
	// record is marked Destructive and takes ElementSize from zprty1.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   U      - single bit placed in Inst{16}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for the source $Zn.
	// Field placement: Pg -> Inst{12-10}, Zn -> Inst{9-5}, Zda -> Inst{4-0}.
	class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn),
	asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> Zda;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21-17} = 0b00010;
	let Inst{16} = U;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty1.ElementSize;
	}

	// _H/_S/_D instantiations of the sve2_int_sadd_long_accum_pairwise class; the
	// source uses a narrower register operand than the accumulator
	// (ZPR16<-ZPR8, ZPR32<-ZPR16, ZPR64<-ZPR32).
	multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> {
	def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
	def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
	def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
	}

	// Predicated one-source format, printed as "<asm>\t$Zd, $Pg/m, $Zn". The
	// output $Zd is tied to the extra input $_Zd via Constraints and the record
	// is marked Destructive.
	//   sz    - 2-bit size field, placed in Inst{23-22}.
	//   Q     - single bit placed in Inst{19}.
	//   opc   - 2-bit opcode, placed in Inst{17-16}.
	//   asm   - mnemonic string.
	//   zprty - register operand for all vector operands; also supplies
	//           ElementSize.
	// Field placement: Pg -> Inst{12-10}, Zn -> Inst{9-5}, Zd -> Inst{4-0}.
	class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
	string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Zd, $Pg/m, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b01000100;
	let Inst{23-22} = sz;
	let Inst{21-20} = 0b00;
	let Inst{19} = Q;
	let Inst{18} = 0b0;
	let Inst{17-16} = opc;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Instantiates only the _S (sz = 0b10, ZPR32) variant of
	// sve2_int_un_pred_arit. The 3-bit 'opc' is split into Q = opc{2} and the
	// class's 2-bit opcode = opc{1-0}.
	multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> {
	def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_un_pred_arit class. The 3-bit
	// 'opc' is split into Q = opc{2} and the class's 2-bit opcode = opc{1-0}.
	multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> {
	def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
	def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
	def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
	def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Widening Integer Arithmetic Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-register format in which each operand may use a
	// different register operand; printed as "<asm>\t$Zd, $Zn, $Zm". No operand
	// is tied.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 5-bit opcode, placed in Inst{14-10}.
	//   asm    - mnemonic string.
	//   zprty1 - register operand for the output $Zd.
	//   zprty2 - register operand for $Zn.
	//   zprty3 - register operand for $Zm.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zd -> Inst{4-0}.
	class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm),
	asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b0;
	let Inst{14-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _H/_S/_D instantiations of sve2_wide_int_arith where both sources use the
	// narrower register operand (ZPR8/ZPR16/ZPR32) and the destination the wider
	// one (ZPR16/ZPR32/ZPR64). The full 5-bit opcode is passed by the caller.
	multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> {
	def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
	def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
	def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
	}

	// _H/_S/_D instantiations of sve2_wide_int_arith where $Zd and $Zn use the
	// wider register operand and only $Zm uses the narrower one. The 5-bit opcode
	// is { 0b10, opc }.
	multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
	def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
	def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
	def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
	}

	// Instantiates only the _H (ZPR16 from ZPR8) and _D (ZPR64 from ZPR32)
	// variants of sve2_wide_int_arith; no _S variant is defined here. The 5-bit
	// opcode is { 0b1101, opc }.
	multiclass sve2_pmul_long<bits<1> opc, string asm> {
	def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
	def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Misc Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-register format, printed as "<asm>\t$Zd, $Zn, $Zm". No
	// operand is tied.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 4-bit opcode, placed in Inst{13-10} (Inst{15-14} is fixed 0b10).
	//   asm    - mnemonic string.
	//   zprty1 - register operand for the output $Zd.
	//   zprty2 - register operand for the sources $Zn and $Zm.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zd -> Inst{4-0}.
	class sve2_misc<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
	asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b10;
	let Inst{13-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _B/_H/_S/_D instantiations of sve2_misc where destination and sources share
	// one register operand; the 4-bit opcode is passed through unchanged.
	multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
	def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
	def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
	def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
	def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
	}

	-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
	- let DestructiveInstType = Destructive, ElementSize = ElementSizeNone in {
	- def _B : sve2_misc<0b00, { 0b010, opc }, asm, ZPR8, ZPR8>;
	- def _H : sve2_misc<0b01, { 0b010, opc }, asm, ZPR16, ZPR16>;
	- def _S : sve2_misc<0b10, { 0b010, opc }, asm, ZPR32, ZPR32>;
	- def _D : sve2_misc<0b11, { 0b010, opc }, asm, ZPR64, ZPR64>;
	- }
	-}
	-
	// _H/_S/_D instantiations of sve2_misc where the sources use a narrower
	// register operand than the destination (ZPR16<-ZPR8, ZPR32<-ZPR16,
	// ZPR64<-ZPR32). The 4-bit opcode is { 0b00, opc }.
	multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
	def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
	def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
	def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
	}

	+class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
	+ ZPRRegOp zprty1, ZPRRegOp zprty2>
	+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
	+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	+ bits<5> Zd;
	+ bits<5> Zn;
	+ bits<5> Zm;
	+ let Inst{31-24} = 0b01000101;
	+ let Inst{23-22} = sz;
	+ let Inst{21} = 0b0;
	+ let Inst{20-16} = Zm;
	+ let Inst{15-11} = 0b10010;
	+ let Inst{10} = opc;
	+ let Inst{9-5} = Zn;
	+ let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	+ let DestructiveInstType = Destructive;
	+ let ElementSize = ElementSizeNone;
	+}
	+
	+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
	+ def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
	+ def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
	+ def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
	+ def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
	+}
	+
	// Register-plus-immediate format, printed as "<asm>\t$Zd, $Zn, $imm". No
	// operand is tied.
	//   tsz8_64 - 3-bit field split across Inst{22} (tsz8_64{2}) and Inst{20-19}
	//             (tsz8_64{1-0}).
	//   opc     - 2-bit opcode, placed in Inst{11-10}.
	//   asm     - mnemonic string.
	//   zprty1  - register operand for the output $Zd.
	//   zprty2  - register operand for the source $Zn.
	//   immtype - operand type of the immediate $imm.
	// Only imm{2-0} is encoded here (Inst{18-16}); 'imm' is declared 5 bits wide
	// and the instantiating defs place its upper bits where needed.
	class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2,
	Operand immtype>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
	asm, "\t$Zd, $Zn, $imm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> imm;
	let Inst{31-23} = 0b010001010;
	let Inst{22} = tsz8_64{2};
	let Inst{21} = 0b0;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-12} = 0b1010;
	let Inst{11-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _H/_S/_D instantiations of the sve2_bitwise_shift_left_long class. The
	// tsz8_64 pattern marks the size with its highest set bit; '?' positions are
	// left unset and supplied by the upper immediate bits in the def body:
	//   _H: {0,0,1}, vecshiftL8  - no extra immediate bits.
	//   _S: {0,1,?}, vecshiftL16 - Inst{19} = imm{3}.
	//   _D: {1,?,?}, vecshiftL32 - Inst{20-19} = imm{4-3}.
	multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
	def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
	ZPR16, ZPR8, vecshiftL8>;
	def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
	ZPR32, ZPR16, vecshiftL16> {
	let Inst{19} = imm{3};
	}
	def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm,
	ZPR64, ZPR32, vecshiftL32> {
	let Inst{20-19} = imm{4-3};
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Accumulate Group
	//===----------------------------------------------------------------------===//

	-class sve2_int_bin_cons_shift_imm<bits<4> tsz8_64, bit opc, string asm,
	- ZPRRegOp zprty, Operand immtype>
	-: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
	+class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
	+ ZPRRegOp zprty, Operand immtype>
	+: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm),
	asm, "\t$Zd, $Zn, $imm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<6> imm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = tsz8_64{3-2};
	let Inst{21} = 0b0;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-11} = 0b11110;
	let Inst{10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	}

	-multiclass sve2_int_bin_cons_shift_imm_left<bit opc, string asm> {
	- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
	- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
	+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
	+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
	+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
	let Inst{19} = imm{3};
	}
	- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
	+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
	let Inst{20-19} = imm{4-3};
	}
	- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
	+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}

	-multiclass sve2_int_bin_cons_shift_imm_right<bit opc, string asm> {
	- def _B : sve2_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
	- def _H : sve2_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
	+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
	+ def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
	+ def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
	let Inst{19} = imm{3};
	}
	- def _S : sve2_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
	+ def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
	let Inst{20-19} = imm{4-3};
	}
	- def _D : sve2_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
	+ def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}

	-class sve2_int_bin_accum_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
	- ZPRRegOp zprty, Operand immtype>
	+class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
	+ ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm),
	asm, "\t$Zda, $Zn, $imm",
	"", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<6> imm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = tsz8_64{3-2};
	let Inst{21} = 0b0;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-12} = 0b1110;
	let Inst{11-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	-multiclass sve2_int_bin_accum_cons_shift_imm_right<bits<2> opc, string asm> {
	- def _B : sve2_int_bin_accum_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
	- def _H : sve2_int_bin_accum_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
	+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
	+ def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
	+ def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
	let Inst{19} = imm{3};
	}
	- def _S : sve2_int_bin_accum_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
	+ def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
	let Inst{20-19} = imm{4-3};
	}
	- def _D : sve2_int_bin_accum_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
	+ def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}

	// Two-source format with a rotation operand, printed as
	// "<asm>\t$Zdn, $_Zdn, $Zm, $rot". The output $Zdn is tied to the input
	// $_Zdn; the record is marked Destructive with ElementSizeNone.
	//   sz    - 2-bit size field, placed in Inst{23-22}.
	//   opc   - single opcode bit, placed in Inst{16}.
	//   asm   - mnemonic string.
	//   zprty - register operand for all vector operands.
	// Field placement: rot (1 bit, complexrotateopodd) -> Inst{10},
	// Zm -> Inst{9-5}, Zdn -> Inst{4-0}.
	class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot),
	asm, "\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	bit rot;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = sz;
	let Inst{21-17} = 0b00000;
	let Inst{16} = opc;
	let Inst{15-11} = 0b11011;
	let Inst{10} = rot;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_cadd class with
	// sz = 0b00/0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32/ZPR64.
	multiclass sve2_int_cadd<bit opc, string asm> {
	def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
	def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
	def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
	def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
	}

	// Unpredicated accumulating three-operand format, printed as
	// "<asm>\t$Zda, $Zn, $Zm". The output $Zda is tied to the input $_Zda; the
	// record is marked Destructive with ElementSizeNone.
	//   sz     - 2-bit size field, placed in Inst{23-22}.
	//   opc    - 4-bit opcode, placed in Inst{13-10} (Inst{15-14} is fixed 0b11).
	//   asm    - mnemonic string.
	//   zprty1 - register operand for $Zda / $_Zda.
	//   zprty2 - register operand for the sources $Zn and $Zm.
	// Field placement: Zm -> Inst{20-16}, Zn -> Inst{9-5}, Zda -> Inst{4-0}.
	class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
	ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm),
	asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zda;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15-14} = 0b11;
	let Inst{13-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zda;

	let Constraints = "$Zda = $_Zda";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of the sve2_int_absdiff_accum class where
	// accumulator and sources share one register operand. The 4-bit opcode is
	// { 0b111, opc }.
	multiclass sve2_int_absdiff_accum<bit opc, string asm> {
	def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
	def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
	def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
	def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
	}

	// Variants of sve2_int_absdiff_accum whose source registers are one element
	// size narrower than the accumulator (ZPR8 -> ZPR16, ZPR16 -> ZPR32,
	// ZPR32 -> ZPR64); there is no _B variant. The 4-bit opcode is 0b00 followed
	// by the caller's 2-bit opc.
	multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> {
	def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
	def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
	def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
	}

	// Reuses the sve2_int_absdiff_accum encoding for _S and _D variants only.
	// The caller's 2-bit opc is split: opc{1} becomes the high bit of the size
	// field and opc{0} the low bit of the 4-bit opcode (prefix 0b010). The low
	// bit of the size field is what distinguishes _S (0) from _D (1).
	multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
	def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
	ZPR32, ZPR32>;
	def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
	ZPR64, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Narrowing Group
	//===----------------------------------------------------------------------===//

	// Shift-by-immediate narrowing format, "bottom" form. Lines marked -/+ are
	// the removed/added sides of the change; this comment describes the added
	// form.
	//   - $Zd (type zprty1) is a plain output with no tied input.
	//   - tsz8_64 is split across Inst{22} and Inst{20-19}; the low three
	//     immediate bits go to Inst{18-16}.
	//   - The 3-bit opc occupies Inst{13-11} and Inst{10} is fixed to 0.
	// Instantiations may override the '?' bits of tsz8_64 with upper immediate
	// bits via extra `let Inst{...}` assignments.
	-class sve2_int_bin_cons_shift_imm_narrow<bits<3> tsz8_64, bits<4> opc,
	- string asm, ZPRRegOp zprty1,
	- ZPRRegOp zprty2, Operand immtype>
	+class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
	+ string asm, ZPRRegOp zprty1,
	+ ZPRRegOp zprty2, Operand immtype>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm),
	asm, "\t$Zd, $Zn, $imm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> imm;
	let Inst{31-23} = 0b010001010;
	let Inst{22} = tsz8_64{2};
	let Inst{21} = 0b1;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-14} = 0b00;
	- let Inst{13-10} = opc;
	+ let Inst{13-11} = opc;
	+ let Inst{10} = 0b0;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _B/_H/_S instantiations of the bottom narrowing-shift format, taking
	// ZPR16/ZPR32/ZPR64 sources and producing ZPR8/ZPR16/ZPR32 results. The
	// tsz8_64 pattern fixes the leading set bit per size; its '?' positions are
	// filled with imm{3} (_H) or imm{4-3} (_S). Lines marked -/+ are the
	// removed/added sides of the change.
	-multiclass sve2_int_bin_cons_shift_imm_right_narrow<bits<4> opc, string asm> {
	- def _B : sve2_int_bin_cons_shift_imm_narrow<{0,0,1}, opc, asm, ZPR8, ZPR16,
	- vecshiftR8>;
	- def _H : sve2_int_bin_cons_shift_imm_narrow<{0,1,?}, opc, asm, ZPR16, ZPR32,
	- vecshiftR16> {
	+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
	+ def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
	+ vecshiftR8>;
	+ def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
	+ vecshiftR16> {
	let Inst{19} = imm{3};
	}
	- def _S : sve2_int_bin_cons_shift_imm_narrow<{1,?,?}, opc, asm, ZPR32, ZPR64,
	- vecshiftR32> {
	+ def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
	+ vecshiftR32> {
	let Inst{20-19} = imm{4-3};
	}
	}

	-class sve2_int_addsub_narrow_high<bits<2> sz, bits<3> opc, string asm,
	- ZPRRegOp zprty1, ZPRRegOp zprty2>
	// Shift-by-immediate narrowing format, "top" form. It uses the same field
	// layout as the bottom form, but Inst{10} is fixed to 1 and $Zd is tied to an
	// extra $_Zd input, so the destination register is also read. $_Zd is not
	// printed in the assembly string.
	+class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
	+ string asm, ZPRRegOp zprty1,
	+ ZPRRegOp zprty2, Operand immtype>
	+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm),
	+ asm, "\t$Zd, $Zn, $imm",
	+ "", []>, Sched<[]> {
	+ bits<5> Zd;
	+ bits<5> Zn;
	+ bits<5> imm;
	+ let Inst{31-23} = 0b010001010;
	+ let Inst{22} = tsz8_64{2};
	+ let Inst{21} = 0b1;
	+ let Inst{20-19} = tsz8_64{1-0};
	+ let Inst{18-16} = imm{2-0}; // imm3
	+ let Inst{15-14} = 0b00;
	+ let Inst{13-11} = opc;
	+ let Inst{10} = 0b1;
	+ let Inst{9-5} = Zn;
	+ let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	+}
	+
	// _B/_H/_S instantiations of the top narrowing-shift format. The size
	// patterns and immediate-bit overrides are the same as for the bottom
	// multiclass: imm{3} fills Inst{19} for _H, and imm{4-3} fills Inst{20-19}
	// for _S.
	+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
	+ def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
	+ vecshiftR8>;
	+ def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
	+ vecshiftR16> {
	+ let Inst{19} = imm{3};
	+ }
	+ def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
	+ vecshiftR32> {
	+ let Inst{20-19} = imm{4-3};
	+ }
	+}
	+
	// Two-source narrowing format, "bottom" form: $Zd (zprty1) is produced from
	// $Zn and $Zm (both zprty2) with no tied input. The 2-bit opc occupies
	// Inst{12-11}, and Inst{10}, the T bit of the former 3-bit S/R/T opcode, is
	// fixed to 0. Lines marked -/+ are the removed/added sides of the change.
	+class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
	+ ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm),
	asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b01000101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b011;
	- let Inst{12-10} = opc; // S, R, T
	+ let Inst{12-11} = opc; // S, R
	+ let Inst{10} = 0b0; // T (top) bit clear: bottom form
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _B/_H/_S instantiations of the bottom two-source narrowing format, using
	// sz = 0b01/0b10/0b11 and ZPR8/ZPR16/ZPR32 results from ZPR16/ZPR32/ZPR64
	// sources. Lines marked -/+ are the removed/added sides of the change.
	-multiclass sve2_int_addsub_narrow_high<bits<3> opc, string asm> {
	- def _B : sve2_int_addsub_narrow_high<0b01, opc, asm, ZPR8, ZPR16>;
	- def _H : sve2_int_addsub_narrow_high<0b10, opc, asm, ZPR16, ZPR32>;
	- def _S : sve2_int_addsub_narrow_high<0b11, opc, asm, ZPR32, ZPR64>;
	+multiclass sve2_int_addsub_narrow_high_bottom<bits<2> opc, string asm> {
	+ def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>;
	+ def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>;
	+ def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>;
	}

	-class sve2_int_sat_extract_narrow<bits<3> tsz8_64, bits<3> opc, string asm,
	- ZPRRegOp zprty1, ZPRRegOp zprty2>
	// Two-source narrowing format, "top" form. It has the same layout as the
	// bottom form, but Inst{10} is fixed to 1 and $Zd is tied to an extra $_Zd
	// input, so the destination register is also read. $_Zd is not printed in
	// the assembly string.
	+class sve2_int_addsub_narrow_high_top<bits<2> sz, bits<2> opc, string asm,
	+ ZPRRegOp zprty1, ZPRRegOp zprty2>
	+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm),
	+ asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> {
	+ bits<5> Zd;
	+ bits<5> Zn;
	+ bits<5> Zm;
	+ let Inst{31-24} = 0b01000101;
	+ let Inst{23-22} = sz;
	+ let Inst{21} = 0b1;
	+ let Inst{20-16} = Zm;
	+ let Inst{15-13} = 0b011;
	+ let Inst{12-11} = opc; // S, R
	+ let Inst{10} = 0b1; // Top
	+ let Inst{9-5} = Zn;
	+ let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	+}
	+
	// _B/_H/_S instantiations of the top two-source narrowing format, with
	// sz = 0b01/0b10/0b11.
	+multiclass sve2_int_addsub_narrow_high_top<bits<2> opc, string asm> {
	+ def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>;
	+ def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>;
	+ def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>;
	+}
	+
	// Single-source narrowing format, "bottom" form: $Zd (zprty1) is produced
	// from $Zn (zprty2) with no tied input. tsz8_64 is split across Inst{22} and
	// Inst{20-19}; the 2-bit opc occupies Inst{12-11} and Inst{10} is fixed to 0.
	// Lines marked -/+ are the removed/added sides of the change.
	+class sve2_int_sat_extract_narrow_bottom<bits<3> tsz8_64, bits<2> opc, string asm,
	+ ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs zprty1:$Zd), (ins zprty2:$Zn),
	asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-23} = 0b010001010;
	let Inst{22} = tsz8_64{2};
	let Inst{21} = 0b1;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-13} = 0b000010;
	- let Inst{12-10} = opc;
	+ let Inst{12-11} = opc;
	+ let Inst{10} = 0b0;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// _B/_H/_S instantiations of the bottom single-source narrowing format, with
	// one-hot tsz8_64 values 0b001/0b010/0b100. Lines marked -/+ are the
	// removed/added sides of the change.
	-multiclass sve2_int_sat_extract_narrow<bits<3> opc, string asm> {
	- def _B : sve2_int_sat_extract_narrow<0b001, opc, asm, ZPR8, ZPR16>;
	- def _H : sve2_int_sat_extract_narrow<0b010, opc, asm, ZPR16, ZPR32>;
	- def _S : sve2_int_sat_extract_narrow<0b100, opc, asm, ZPR32, ZPR64>;
	+multiclass sve2_int_sat_extract_narrow_bottom<bits<2> opc, string asm> {
	+ def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>;
	+ def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>;
	+ def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>;
	}

	// Single-source narrowing format, "top" form. It has the same layout as the
	// bottom form, but Inst{10} is fixed to 1 and $Zd is tied to an extra $_Zd
	// input, so the destination register is also read. $_Zd is not printed in
	// the assembly string.
	+class sve2_int_sat_extract_narrow_top<bits<3> tsz8_64, bits<2> opc, string asm,
	+ ZPRRegOp zprty1, ZPRRegOp zprty2>
	+: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn),
	+ asm, "\t$Zd, $Zn", "", []>, Sched<[]> {
	+ bits<5> Zd;
	+ bits<5> Zn;
	+ let Inst{31-23} = 0b010001010;
	+ let Inst{22} = tsz8_64{2};
	+ let Inst{21} = 0b1;
	+ let Inst{20-19} = tsz8_64{1-0};
	+ let Inst{18-13} = 0b000010;
	+ let Inst{12-11} = opc;
	+ let Inst{10} = 0b1;
	+ let Inst{9-5} = Zn;
	+ let Inst{4-0} = Zd;
	+
	+ let Constraints = "$Zd = $_Zd";
	+}
	+
	// _B/_H/_S instantiations of the top single-source narrowing format, with
	// one-hot tsz8_64 values 0b001/0b010/0b100.
	+multiclass sve2_int_sat_extract_narrow_top<bits<2> opc, string asm> {
	+ def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>;
	+ def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>;
	+ def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>;
	+}
	+
	//===----------------------------------------------------------------------===//
	// SVE Integer Arithmetic - Unary Predicated Group
	//===----------------------------------------------------------------------===//

	// Predicated unary format: $Zd is computed from $Zn under the 3-bit
	// governing predicate $Pg (PPR3bAny), printed with the "/m" qualifier. $Zd
	// is tied to the $_Zd input. The 4-bit opc is stored rotated: its lowest bit
	// goes to Inst{19} and its upper three bits to Inst{18-16}. ElementSize is
	// taken from the register operand type.
	class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
	string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Zd, $Pg/m, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21-20} = 0b01;
	let Inst{19} = opc{0};
	let Inst{18-16} = opc{3-1};
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// All four element sizes (_B/_H/_S/_D) of sve_int_un_pred_arit, with the
	// low opcode bit fixed to 0 ({ opc, 0b0 }).
	multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm> {
	def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>;
	def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
	def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
	}

	// As sve_int_un_pred_arit_0, but without the _B variant: only _H/_S/_D are
	// defined.
	multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm> {
	def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>;
	def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
	}

	// As sve_int_un_pred_arit_0, but only the _S and _D variants are defined.
	multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm> {
	def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>;
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
	}

	// As sve_int_un_pred_arit_0, but only the _D variant is defined.
	multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm> {
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>;
	}

	// All four element sizes (_B/_H/_S/_D) of sve_int_un_pred_arit, with the
	// low opcode bit fixed to 1 ({ opc, 0b1 }).
	multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm> {
	def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>;
	def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
	def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
	}

	// As sve_int_un_pred_arit_1, but without the _B variant: only _H/_S/_D are
	// defined.
	multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm> {
	def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>;
	def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>;
	def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Wide Immediate - Unpredicated Group
	//===----------------------------------------------------------------------===//
	// Unpredicated format that produces $Zd from a single immediate operand. The
	// operand is carried as a 9-bit value: imm{8} is the "sh" bit in Inst{13}
	// and imm{7-0} is the imm8 field in Inst{12-5}. The instruction is marked
	// rematerializable.
	class sve_int_dup_imm<bits<2> sz8_64, string asm,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zd), (ins immtype:$imm),
	asm, "\t$Zd, $imm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<9> imm;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-14} = 0b11100011;
	let Inst{13} = imm{8}; // sh
	let Inst{12-5} = imm{7-0}; // imm8
	let Inst{4-0} = Zd;

	let isReMaterializable = 1;
	}

	// Defines the _B/_H/_S/_D immediate-duplicate instructions, each with its
	// size-specific cpy_imm8_opt_lsl_* operand, plus assembler aliases:
	//   - "mov $Zd, $imm" for every element size;
	//   - "fmov $Zd, #0.0" for _H/_S/_D, which passes the two literal operand
	//     values 0, 0 in place of the immediate operand.
	// The trailing 1 on each alias is the third InstAlias argument.
	multiclass sve_int_dup_imm<string asm> {
	def _B : sve_int_dup_imm<0b00, asm, ZPR8, cpy_imm8_opt_lsl_i8>;
	def _H : sve_int_dup_imm<0b01, asm, ZPR16, cpy_imm8_opt_lsl_i16>;
	def _S : sve_int_dup_imm<0b10, asm, ZPR32, cpy_imm8_opt_lsl_i32>;
	def _D : sve_int_dup_imm<0b11, asm, ZPR64, cpy_imm8_opt_lsl_i64>;

	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, cpy_imm8_opt_lsl_i8:$imm), 1>;
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, cpy_imm8_opt_lsl_i16:$imm), 1>;
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, cpy_imm8_opt_lsl_i32:$imm), 1>;
	def : InstAlias<"mov $Zd, $imm",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, cpy_imm8_opt_lsl_i64:$imm), 1>;

	def : InstAlias<"fmov $Zd, #0.0",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, 0, 0), 1>;
	def : InstAlias<"fmov $Zd, #0.0",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, 0, 0), 1>;
	def : InstAlias<"fmov $Zd, #0.0",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, 0, 0), 1>;
	}

	// Unpredicated format that produces $Zd from an operand of type fpimmtype,
	// encoded as the 8-bit field imm8 in Inst{12-5}. Inst{13} is fixed to 0.
	// The instruction is marked rematerializable.
	class sve_int_dup_fpimm<bits<2> sz8_64, Operand fpimmtype,
	string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins fpimmtype:$imm8),
	asm, "\t$Zd, $imm8",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<8> imm8;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-14} = 0b11100111;
	let Inst{13} = 0b0;
	let Inst{12-5} = imm8;
	let Inst{4-0} = Zd;

	let isReMaterializable = 1;
	}

	// Defines the _H/_S/_D variants, using fpimm16/fpimm32/fpimm64 operands;
	// there is no _B variant. Each variant also gets an "fmov $Zd, $imm8"
	// assembler alias.
	multiclass sve_int_dup_fpimm<string asm> {
	def _H : sve_int_dup_fpimm<0b01, fpimm16, asm, ZPR16>;
	def _S : sve_int_dup_fpimm<0b10, fpimm32, asm, ZPR32>;
	def _D : sve_int_dup_fpimm<0b11, fpimm64, asm, ZPR64>;

	def : InstAlias<"fmov $Zd, $imm8",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, fpimm16:$imm8), 1>;
	def : InstAlias<"fmov $Zd, $imm8",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, fpimm32:$imm8), 1>;
	def : InstAlias<"fmov $Zd, $imm8",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, fpimm64:$imm8), 1>;
	}

	// Destructive vector-with-immediate format: $Zdn is tied to $_Zdn and
	// combined with a 9-bit immediate. imm{8} is the "sh" bit in Inst{13} and
	// imm{7-0} is the imm8 field in Inst{12-5}. The 3-bit opc occupies
	// Inst{18-16}.
	class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
	asm, "\t$Zdn, $_Zdn, $imm",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<9> imm;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-19} = 0b100;
	let Inst{18-16} = opc;
	let Inst{15-14} = 0b11;
	let Inst{13} = imm{8}; // sh
	let Inst{12-5} = imm{7-0}; // imm8
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of sve_int_arith_imm0, each paired with the
	// size-specific addsub_imm8_opt_lsl_* immediate operand.
	multiclass sve_int_arith_imm0<bits<3> opc, string asm> {
	def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
	def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
	def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
	def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>;
	}

	// Destructive vector-with-immediate format with a plain 8-bit immediate and
	// no shift bit. $Zdn is tied to $_Zdn. The 6-bit opc fills Inst{21-16} and
	// imm fills Inst{12-5}.
	class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, immtype:$imm),
	asm, "\t$Zdn, $_Zdn, $imm",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<8> imm;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21-16} = opc;
	let Inst{15-13} = 0b110;
	let Inst{12-5} = imm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of sve_int_arith_imm. The 6-bit opcode is the
	// fixed prefix 0b1010 followed by the caller's 2-bit opc. The immediate
	// operand type is supplied by the caller and shared by all sizes.
	multiclass sve_int_arith_imm1<bits<2> opc, string asm, Operand immtype> {
	def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, immtype>;
	def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, immtype>;
	def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, immtype>;
	def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, immtype>;
	}

	// _B/_H/_S/_D instantiations of sve_int_arith_imm, with the opcode fixed to
	// 0b110000 and a simm8 immediate operand.
	multiclass sve_int_arith_imm2<string asm> {
	def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>;
	def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>;
	def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>;
	def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Bitwise Logical - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated, non-destructive three-register format whose operands are
	// all declared as ZPR64. The 2-bit opc occupies Inst{23-22}, the position
	// other formats in this file use for the element size.
	class sve_int_bin_cons_log<bits<2> opc, string asm>
	: I<(outs ZPR64:$Zd), (ins ZPR64:$Zn, ZPR64:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{1-0};
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-10} = 0b001100;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Defines a single instruction, NAME, using the ZPR64-typed format, plus
	// aliases that accept the same assembly syntax with ZPR8, ZPR16 or ZPR32
	// operands and map them onto that one instruction.
	multiclass sve_int_bin_cons_log<bits<2> opc, string asm> {
	def NAME : sve_int_bin_cons_log<opc, asm>;

	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>;
	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>;
	def : InstAlias<asm # "\t$Zd, $Zn, $Zm",
	(!cast<Instruction>(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>;
	}

	// Destructive three-source format with all operands declared as ZPR64.
	// $Zdn is tied to $_Zdn; $Zm goes to Inst{20-16} and $Zk to Inst{9-5}. The
	// 3-bit opc is split: opc{2-1} goes to Inst{23-22} and opc{0} to Inst{10}.
	class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
	: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk),
	asm, "\t$Zdn, $_Zdn, $Zm, $Zk",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zk;
	bits<5> Zm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{2-1};
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-11} = 0b00111;
	let Inst{10} = opc{0};
	let Inst{9-5} = Zk;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Defines a single instruction, NAME, using the ZPR64-typed ternary format,
	// plus aliases that accept ZPR8, ZPR16 or ZPR32 operands. In the alias
	// syntax $Zdn is written twice, as destination and as first source, while
	// the result dag lists it once.
	multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
	def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;

	def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
	(!cast<Instruction>(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>;
	def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
	(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
	def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
	(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
	}

	// Destructive vector/vector/immediate format. $Zdn is tied to $_Zdn and $Zm
	// is encoded in Inst{9-5}. The 4-bit tsz8_64 is split across Inst{23-22} and
	// Inst{20-19}, and the low three bits of the 6-bit immediate go to
	// Inst{18-16}. Instantiations override the '?' bits of tsz8_64 with the
	// upper immediate bits.
	class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm),
	asm, "\t$Zdn, $_Zdn, $Zm, $imm",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	bits<6> imm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = tsz8_64{3-2};
	let Inst{21} = 0b1;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-10} = 0b001101;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// _B/_H/_S/_D instantiations of sve2_int_rotate_right_imm. The tsz8_64
	// pattern fixes the leading set bit for each size. The remaining '?' bits
	// are overwritten with upper immediate bits: imm{3} for _H, imm{4-3} for _S,
	// and imm{5} plus imm{4-3} for _D.
	multiclass sve2_int_rotate_right_imm<string asm> {
	def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
	def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
	let Inst{19} = imm{3};
	}
	def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> {
	let Inst{20-19} = imm{4-3};
	}
	def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Wide Immediate - Predicated Group
	//===----------------------------------------------------------------------===//

	// Predicated format that writes an 8-bit encoded immediate (operand type
	// fpimmtype) to $Zd under a 4-bit predicate $Pg (PPRAny), printed with the
	// "/m" qualifier. $Zd is tied to the $_Zd input. Pg occupies Inst{19-16}
	// and imm8 occupies Inst{12-5}.
	class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
	string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPRAny:$Pg, fpimmtype:$imm8),
	asm, "\t$Zd, $Pg/m, $imm8",
	"",
	[]>, Sched<[]> {
	bits<4> Pg;
	bits<5> Zd;
	bits<8> imm8;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz;
	let Inst{21-20} = 0b01;
	let Inst{19-16} = Pg;
	let Inst{15-13} = 0b110;
	let Inst{12-5} = imm8;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Defines the _H/_S/_D variants, using fpimm16/fpimm32/fpimm64 operands;
	// there is no _B variant. Each variant also gets an
	// "fmov $Zd, $Pg/m, $imm8" assembler alias.
	multiclass sve_int_dup_fpimm_pred<string asm> {
	def _H : sve_int_dup_fpimm_pred<0b01, fpimm16, asm, ZPR16>;
	def _S : sve_int_dup_fpimm_pred<0b10, fpimm32, asm, ZPR32>;
	def _D : sve_int_dup_fpimm_pred<0b11, fpimm64, asm, ZPR64>;

	def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, fpimm16:$imm8), 1>;
	def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, fpimm32:$imm8), 1>;
	def : InstAlias<"fmov $Zd, $Pg/m, $imm8",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, fpimm64:$imm8), 1>;
	}

	// Predicated immediate-to-vector format shared by the merging and zeroing
	// multiclasses. The caller supplies the input operand dag (iops) and the
	// predicate qualifier string that is appended to $Pg in the assembly syntax.
	// The m bit is placed in Inst{14}. The 9-bit immediate is split into
	// "sh" (imm{8}, Inst{13}) and imm8 (imm{7-0}, Inst{12-5}). This class sets
	// no operand constraint itself; a tied input, where needed, is added by the
	// instantiating multiclass.
	class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
	ZPRRegOp zprty, string pred_qual, dag iops>
	: I<(outs zprty:$Zd), iops,
	asm, "\t$Zd, $Pg"#pred_qual#", $imm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<4> Pg;
	bits<9> imm;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-20} = 0b01;
	let Inst{19-16} = Pg;
	let Inst{15} = 0b0;
	let Inst{14} = m;
	let Inst{13} = imm{8}; // sh
	let Inst{12-5} = imm{7-0}; // imm8
	let Inst{4-0} = Zd;

	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Merging ("/m") instantiations of sve_int_dup_imm_pred, with m = 1. The
	// input dag includes $_Zd, and the enclosing `let` ties it to $Zd. Aliases:
	//   - "mov $Zd, $Pg/m, $imm" for every element size;
	//   - "fmov $Zd, $Pg/m, #0.0" for _H/_S/_D, passing literal 0, 0 for the
	//     immediate operand. These pass 0 as the third InstAlias argument,
	//     whereas the "mov" aliases pass 1.
	multiclass sve_int_dup_imm_pred_merge<string asm> {
	let Constraints = "$Zd = $_Zd" in {
	def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
	def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
	def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
	def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
	}

	def : InstAlias<"mov $Zd, $Pg/m, $imm",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $imm",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $imm",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $imm",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;

	def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
	def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, 0, 0), 0>;
	def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
	}

	// Zeroing ("/z") instantiations of sve_int_dup_imm_pred, with m = 0. The
	// input dag holds only the predicate and the immediate, so there is no tied
	// $_Zd input. A "mov $Zd, $Pg/z, $imm" alias is provided for every element
	// size.
	multiclass sve_int_dup_imm_pred_zero<string asm> {
	def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
	def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
	def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
	def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;

	def : InstAlias<"mov $Zd, $Pg/z, $imm",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/z, $imm",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/z, $imm",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
	def : InstAlias<"mov $Zd, $Pg/z, $imm",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Compare - Vectors Group
	//===----------------------------------------------------------------------===//

	// Vector/vector compare format that produces a predicate $Pd under the
	// 3-bit governing predicate $Pg, printed with "/z". The two vector sources
	// may have different register types (zprty1, zprty2). The 3-bit opc is
	// scattered: opc{2} -> Inst{15}, opc{1} -> Inst{13}, opc{0} -> Inst{4}.
	// cmp_1 is placed in Inst{14}. The instruction is declared to define NZCV.
	class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
	PPRRegOp pprty, ZPRRegOp zprty1, ZPRRegOp zprty2>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty1:$Zn, zprty2:$Zm),
	asm, "\t$Pd, $Pg/z, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00100100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15} = opc{2};
	let Inst{14} = cmp_1;
	let Inst{13} = opc{1};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// Same-width compares with cmp_1 = 0. Both vector sources use the element
	// size of the variant (_B/_H/_S/_D).
	multiclass sve_int_cmp_0<bits<3> opc, string asm> {
	def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
	def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
	def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
	def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
	}

	// "Wide" compares with cmp_1 = 0. The second source is always ZPR64, while
	// the first source and the result use the variant's element size. Only
	// _B/_H/_S are defined.
	multiclass sve_int_cmp_0_wide<bits<3> opc, string asm> {
	def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
	def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
	def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
	}

	// "Wide" compares with cmp_1 = 1. Apart from the cmp_1 bit, these are
	// identical to the sve_int_cmp_0_wide variants.
	multiclass sve_int_cmp_1_wide<bits<3> opc, string asm> {
	def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>;
	def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>;
	def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Integer Compare - Signed Immediate Group
	//===----------------------------------------------------------------------===//

	// Vector/immediate compare format with a 5-bit immediate in Inst{20-16}.
	// It produces the predicate $Pd under the 3-bit governing predicate $Pg,
	// printed with "/z". The 3-bit opc is scattered: opc{2} -> Inst{15},
	// opc{1} -> Inst{13}, opc{0} -> Inst{4}. Inst{14} is fixed to 0. The
	// instruction is declared to define NZCV.
	class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
	ZPRRegOp zprty,
	Operand immtype>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm5),
	asm, "\t$Pd, $Pg/z, $Zn, $imm5",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zn;
	bits<5> imm5;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b0;
	let Inst{20-16} = imm5;
	let Inst{15} = opc{2};
	let Inst{14} = 0b0;
	let Inst{13} = opc{1};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// _B/_H/_S/_D instantiations of sve_int_scmp_vi. The _B/_H/_S variants use
	// the simm5_32b immediate operand; _D uses simm5_64b.
	multiclass sve_int_scmp_vi<bits<3> opc, string asm> {
	def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
	def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
	def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
	def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Integer Compare - Unsigned Immediate Group
	//===----------------------------------------------------------------------===//

	// Vector/immediate compare format with a 7-bit immediate in Inst{20-14}.
	// It produces the predicate $Pd under the 3-bit governing predicate $Pg,
	// printed with "/z". The 2-bit opc is split: opc{1} -> Inst{13} and
	// opc{0} -> Inst{4}. The instruction is declared to define NZCV.
	class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, immtype:$imm7),
	asm, "\t$Pd, $Pg/z, $Zn, $imm7",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zn;
	bits<7> imm7;
	let Inst{31-24} = 0b00100100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 1;
	let Inst{20-14} = imm7;
	let Inst{13} = opc{1};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// _B/_H/_S/_D instantiations of sve_int_ucmp_vi. All sizes use the
	// imm0_127 immediate operand.
	multiclass sve_int_ucmp_vi<bits<2> opc, string asm> {
	def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
	def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
	def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
	def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Integer Compare - Scalars Group
	//===----------------------------------------------------------------------===//

	// Scalar/scalar format with no register outputs: it takes two registers of
	// class rt and is declared to define NZCV only. sz occupies Inst{22} and
	// opc occupies Inst{4}; the low four bits are fixed to 0.
	class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt>
	: I<(outs), (ins rt:$Rn, rt:$Rm),
	asm, "\t$Rn, $Rm",
	"",
	[]>, Sched<[]> {
	bits<5> Rm;
	bits<5> Rn;
	let Inst{31-23} = 0b001001011;
	let Inst{22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rm;
	let Inst{15-10} = 0b001000;
	let Inst{9-5} = Rn;
	let Inst{4} = opc;
	let Inst{3-0} = 0b0000;

	let Defs = [NZCV];
	}

	// Format that produces a predicate $Pd from two scalar registers of class
	// gprty. The 4-bit opc is split: opc{3-1} -> Inst{12-10} and
	// opc{0} -> Inst{4}. The instruction is declared to define NZCV.
	class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm,
	RegisterClass gprty, PPRRegOp pprty>
	: I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm),
	asm, "\t$Pd, $Rn, $Rm",
	"", []>, Sched<[]> {
	bits<4> Pd;
	bits<5> Rm;
	bits<5> Rn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b000;
	let Inst{12-10} = opc{3-1};
	let Inst{9-5} = Rn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// _B/_H/_S/_D instantiations of sve_int_while_rr with GPR32 scalar
	// operands. The leading bit of the 4-bit opcode is 0.
	multiclass sve_int_while4_rr<bits<3> opc, string asm> {
	def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>;
	def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>;
	def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>;
	def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>;
	}

	// _B/_H/_S/_D instantiations of sve_int_while_rr with GPR64 scalar
	// operands. The leading bit of the 4-bit opcode is 1.
	multiclass sve_int_while8_rr<bits<3> opc, string asm> {
	def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>;
	def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>;
	def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>;
	def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>;
	}

	// Format that produces a predicate $Pd from two GPR64 scalar operands; the
	// register class is fixed rather than passed as a parameter. The single rw
	// bit is placed in Inst{4}. The instruction is declared to define NZCV.
	class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
	PPRRegOp pprty>
	: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm),
	asm, "\t$Pd, $Rn, $Rm",
	"", []>, Sched<[]> {
	bits<4> Pd;
	bits<5> Rm;
	bits<5> Rn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rm;
	let Inst{15-10} = 0b001100;
	let Inst{9-5} = Rn;
	let Inst{4} = rw;
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// _B/_H/_S/_D instantiations of sve2_int_while_rr; the variants differ only
	// in the size field and the predicate register type.
	multiclass sve2_int_while_rr<bits<1> rw, string asm> {
	def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
	def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
	def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
	def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Fast Reduction Group
	//===----------------------------------------------------------------------===//

	// Predicated format that produces $Vd, a register of class dstRegClass
	// rather than a Z register, from vector $Zn under the 3-bit predicate $Pg.
	// The predicate is printed without a "/m" or "/z" qualifier. The 3-bit opc
	// occupies Inst{18-16}.
	class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty, RegisterClass dstRegClass>
	: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Vd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zn;
	bits<5> Vd;
	bits<3> Pg;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b000;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Vd;
	}

	// _H/_S/_D instantiations of sve_fp_fast_red, pairing ZPR16/ZPR32/ZPR64
	// sources with FPR16/FPR32/FPR64 destinations; there is no _B variant.
	multiclass sve_fp_fast_red<bits<3> opc, string asm> {
	def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
	def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
	def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Floating Point Accumulating Reduction Group
	//===----------------------------------------------------------------------===//

	// Predicated format whose destination $Vdn (class dstRegClass) is tied to
	// the $_Vdn input, so it also serves as a source. The vector source $Zm is
	// encoded in Inst{9-5} and the 3-bit opc in Inst{18-16}. This format sets
	// only the operand constraint; unlike the destructive vector formats above
	// it does not set DestructiveInstType or ElementSize.
	class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
	ZPRRegOp zprty, RegisterClass dstRegClass>
	: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
	asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
	"",
	[]>,
	Sched<[]> {
	bits<3> Pg;
	bits<5> Vdn;
	bits<5> Zm;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-19} = 0b011;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Vdn;

	let Constraints = "$Vdn = $_Vdn";
	}

	// _H/_S/_D instantiations of sve_fp_2op_p_vd, pairing ZPR16/ZPR32/ZPR64
	// sources with FPR16/FPR32/FPR64 accumulators; there is no _B variant.
	multiclass sve_fp_2op_p_vd<bits<3> opc, string asm> {
	def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
	def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
	def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Floating Point Compare - Vectors Group
	//===----------------------------------------------------------------------===//

	// Vector/vector compare format that produces the predicate $Pd under the
	// 3-bit governing predicate $Pg, printed with "/z". The 3-bit opc is
	// scattered: opc{2} -> Inst{15}, opc{1} -> Inst{13}, opc{0} -> Inst{4}.
	// Inst{14} is fixed to 1. Unlike the integer compare formats in this file,
	// no Defs list is set here.
	class sve_fp_3op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
	ZPRRegOp zprty>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
	asm, "\t$Pd, $Pg/z, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21} = 0b0;
	let Inst{20-16} = Zm;
	let Inst{15} = opc{2};
	let Inst{14} = 0b1;
	let Inst{13} = opc{1};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;
	}

	// _H/_S/_D instantiations of sve_fp_3op_p_pd; there is no _B variant.
	multiclass sve_fp_3op_p_pd<bits<3> opc, string asm> {
	def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
	def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
	def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Floating Point Compare - with Zero Group
	//===----------------------------------------------------------------------===//

	// Compare-with-zero format: the assembly string ends in the literal "#0.0",
	// so the only vector operand is $Zn. It produces the predicate $Pd under the
	// 3-bit governing predicate $Pg, printed with "/z". The 3-bit opc is split:
	// opc{2-1} -> Inst{17-16} and opc{0} -> Inst{4}.
	class sve_fp_2op_p_pd<bits<2> sz, bits<3> opc, string asm, PPRRegOp pprty,
	ZPRRegOp zprty>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Pd, $Pg/z, $Zn, #0.0",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zn;
	let Inst{31-24} = 0b01100101;
	let Inst{23-22} = sz;
	let Inst{21-18} = 0b0100;
	let Inst{17-16} = opc{2-1};
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;
	}

	// _H/_S/_D instantiations of sve_fp_2op_p_pd; there is no _B variant.
	multiclass sve_fp_2op_p_pd<bits<3> opc, string asm> {
	def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>;
	def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>;
	def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Index Generation Group
	//===----------------------------------------------------------------------===//

	// Index-generation format taking two immediates (immediate, immediate):
	// writes vector $Zd from $imm5 (first asm operand, Inst{9-5}) and $imm5b
	// (second asm operand, Inst{20-16}).
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   imm_ty - operand type used for both immediates.
	class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	Operand imm_ty>
	: I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b),
	asm, "\t$Zd, $imm5, $imm5b",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> imm5;
	bits<5> imm5b;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = imm5b;
	let Inst{15-10} = 0b010000;
	let Inst{9-5} = imm5;
	let Inst{4-0} = Zd;
	}

	// Instantiates the immediate/immediate index format for _B, _H, _S and _D.
	// The _D variant uses simm5_64b; the other three use simm5_32b.
	multiclass sve_int_index_ii<string asm> {
	def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
	def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
	def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
	def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
	}

	// Index-generation format taking an immediate then a scalar register:
	// writes vector $Zd from $imm5 (Inst{9-5}) and register $Rm (Inst{20-16}).
	//   sz8_64     - 2-bit element-size field, placed in Inst{23-22}.
	//   srcRegType - register class of $Rm.
	class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType, Operand imm_ty>
	: I<(outs zprty:$Zd), (ins imm_ty:$imm5, srcRegType:$Rm),
	asm, "\t$Zd, $imm5, $Rm",
	"", []>, Sched<[]> {
	bits<5> Rm;
	bits<5> Zd;
	bits<5> imm5;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rm;
	let Inst{15-10} = 0b010010;
	let Inst{9-5} = imm5;
	let Inst{4-0} = Zd;
	}

	// Instantiates the immediate/register index format for _B, _H, _S and _D.
	// Only _D uses GPR64 with simm5_64b; the rest use GPR32 with simm5_32b.
	multiclass sve_int_index_ir<string asm> {
	def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
	def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
	def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
	def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
	}

	// Index-generation format taking a scalar register then an immediate:
	// writes vector $Zd from register $Rn (Inst{9-5}) and $imm5 (Inst{20-16}).
	//   sz8_64     - 2-bit element-size field, placed in Inst{23-22}.
	//   srcRegType - register class of $Rn.
	class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType, Operand imm_ty>
	: I<(outs zprty:$Zd), (ins srcRegType:$Rn, imm_ty:$imm5),
	asm, "\t$Zd, $Rn, $imm5",
	"", []>, Sched<[]> {
	bits<5> Rn;
	bits<5> Zd;
	bits<5> imm5;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = imm5;
	let Inst{15-10} = 0b010001;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the register/immediate index format for _B, _H, _S and _D.
	// Only _D uses GPR64 with simm5_64b; the rest use GPR32 with simm5_32b.
	multiclass sve_int_index_ri<string asm> {
	def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
	def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
	def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
	def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
	}

	// Index-generation format taking two scalar registers: writes vector $Zd
	// from $Rn (Inst{9-5}) and $Rm (Inst{20-16}), both of class srcRegType.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType>
	: I<(outs zprty:$Zd), (ins srcRegType:$Rn, srcRegType:$Rm),
	asm, "\t$Zd, $Rn, $Rm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Rm;
	bits<5> Rn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Rm;
	let Inst{15-10} = 0b010011;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the register/register index format for _B, _H, _S and _D.
	// Only _D takes GPR64 sources; the other three take GPR32.
	multiclass sve_int_index_rr<string asm> {
	def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
	def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
	def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
	def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
	}
	//
	//===----------------------------------------------------------------------===//
	// SVE Bitwise Shift - Predicated Group
	//===----------------------------------------------------------------------===//
	// Predicated shift-by-immediate format. Destructive: $Zdn is tied to the
	// source $_Zdn, and the predicate is printed as "$Pg/m".
	//   tsz8_64 - 4-bit size field split across Inst{23-22} (bits 3-2) and
	//             Inst{9-8} (bits 1-0). Instantiations may leave low bits
	//             unset ('?') and override them with upper immediate bits.
	//   opc     - 4-bit opcode, placed in Inst{19-16}.
	//   size    - value assigned to ElementSize.
	// Only imm{2-0} is encoded here (Inst{7-5}); the remaining bits of the
	// 6-bit immediate are placed by the instantiating defs.
	class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
	ZPRRegOp zprty, Operand immtype,
	ElementSizeEnum size>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<6> imm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = tsz8_64{3-2};
	let Inst{21-20} = 0b00;
	let Inst{19-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-8} = tsz8_64{1-0};
	let Inst{7-5} = imm{2-0}; // imm3
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = size;
	}

	// Predicated shift-by-immediate instantiations using the vecshiftL*
	// immediate operands. The tsz pattern marks the element size with its
	// highest set bit; the '?' positions below it are overridden per def with
	// the upper bits of the immediate:
	//   _B: no extra immediate bits (only imm{2-0} from the base class).
	//   _H: imm{3}   -> Inst{8}.
	//   _S: imm{4-3} -> Inst{9-8}.
	//   _D: imm{5}   -> Inst{22}, imm{4-3} -> Inst{9-8}.
	multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm> {
	def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8,
	ElementSizeB>;
	def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16,
	ElementSizeH> {
	let Inst{8} = imm{3};
	}
	def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32,
	ElementSizeS> {
	let Inst{9-8} = imm{4-3};
	}
	def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64,
	ElementSizeD> {
	let Inst{22} = imm{5};
	let Inst{9-8} = imm{4-3};
	}
	}

	// Predicated shift-by-immediate instantiations using the vecshiftR*
	// immediate operands. The bit placement is the same as for the vecshiftL*
	// variant: the '?' positions of tsz are overridden with upper immediate bits.
	//   _B: no extra immediate bits (only imm{2-0} from the base class).
	//   _H: imm{3}   -> Inst{8}.
	//   _S: imm{4-3} -> Inst{9-8}.
	//   _D: imm{5}   -> Inst{22}, imm{4-3} -> Inst{9-8}.
	multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm> {
	def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8,
	ElementSizeB>;
	def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16,
	ElementSizeH> {
	let Inst{8} = imm{3};
	}
	def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32,
	ElementSizeS> {
	let Inst{9-8} = imm{4-3};
	}
	def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64,
	ElementSizeD> {
	let Inst{22} = imm{5};
	let Inst{9-8} = imm{4-3};
	}
	}

	// Predicated shift-by-vector format. Destructive: $Zdn is tied to $_Zdn and
	// the predicate is printed as "$Pg/m". The shift-amount vector $Zm may use a
	// different register operand type (zprty2) than the data vector (zprty).
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   wide   - single bit placed in Inst{19}.
	//   opc    - 3-bit opcode, placed in Inst{18-16}.
	// ElementSize is taken from the data vector type (zprty), not from zprty2.
	class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
	string asm, ZPRRegOp zprty, ZPRRegOp zprty2>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm),
	asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21-20} = 0b01;
	let Inst{19} = wide;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Predicated shift-by-vector instantiations with wide = 0: the shift-amount
	// vector has the same element type as the data vector (_B, _H, _S, _D).
	multiclass sve_int_bin_pred_shift<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
	def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
	def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
	def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
	}

	// Predicated shift-by-vector instantiations with wide = 1: the shift-amount
	// vector is always ZPR64 while the data vector is ZPR8/16/32. No _D variant
	// is defined here.
	multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm> {
	def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
	def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>;
	def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Shift - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated shift format with a ZPR64 shift-amount vector: writes $Zd
	// from $Zn (type zprty) and $Zm (always ZPR64). Not destructive: $Zd is a
	// separate output with no tied operand.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   opc    - 2-bit opcode, placed in Inst{11-10}.
	class sve_int_bin_cons_shift_wide<bits<2> sz8_64, bits<2> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, ZPR64:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_64;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-12} = 0b1000;
	let Inst{11-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the unpredicated wide-shift format for _B, _H and _S data
	// vectors. No _D variant is defined here.
	multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
	def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>;
	def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>;
	def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>;
	}

	// Unpredicated shift-by-immediate format: writes $Zd from $Zn and $imm.
	//   tsz8_64 - 4-bit size field split across Inst{23-22} (bits 3-2) and
	//             Inst{20-19} (bits 1-0). Instantiations may leave low bits
	//             unset ('?') and override them with upper immediate bits.
	//   opc     - 2-bit opcode, placed in Inst{11-10}.
	// Only imm{2-0} is encoded here (Inst{18-16}); the remaining bits of the
	// 6-bit immediate are placed by the instantiating defs.
	class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
	ZPRRegOp zprty, Operand immtype>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
	asm, "\t$Zd, $Zn, $imm",
	"", []>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<6> imm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = tsz8_64{3-2};
	let Inst{21} = 0b1;
	let Inst{20-19} = tsz8_64{1-0};
	let Inst{18-16} = imm{2-0}; // imm3
	let Inst{15-12} = 0b1001;
	let Inst{11-10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Unpredicated shift-by-immediate instantiations using the vecshiftL*
	// immediate operands. The '?' positions of tsz are overridden per def with
	// the upper bits of the immediate:
	//   _B: no extra immediate bits (only imm{2-0} from the base class).
	//   _H: imm{3}   -> Inst{19}.
	//   _S: imm{4-3} -> Inst{20-19}.
	//   _D: imm{5}   -> Inst{22}, imm{4-3} -> Inst{20-19}.
	multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
	def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
	def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
	let Inst{19} = imm{3};
	}
	def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
	let Inst{20-19} = imm{4-3};
	}
	def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}

	// Unpredicated shift-by-immediate instantiations using the vecshiftR*
	// immediate operands. The bit placement is the same as for the vecshiftL*
	// variant:
	//   _B: no extra immediate bits (only imm{2-0} from the base class).
	//   _H: imm{3}   -> Inst{19}.
	//   _S: imm{4-3} -> Inst{20-19}.
	//   _D: imm{5}   -> Inst{22}, imm{4-3} -> Inst{20-19}.
	multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
	def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
	def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
	let Inst{19} = imm{3};
	}
	def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
	let Inst{20-19} = imm{4-3};
	}
	def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
	let Inst{22} = imm{5};
	let Inst{20-19} = imm{4-3};
	}
	}
	//===----------------------------------------------------------------------===//
	// SVE Memory - Store Group
	//===----------------------------------------------------------------------===//

	// Predicated store format with a scalar base plus immediate offset,
	// printed as "[$Rn, $imm4, mul vl]". Has no outputs; $Zt is an input and
	// mayStore is set.
	//   msz - 2-bit field placed in Inst{24-23}.
	//   esz - 2-bit field placed in Inst{22-21}.
	// Inst{20} is fixed to 0 for this format.
	class sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
	RegisterOperand VecList>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
	asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zt;
	bits<4> imm4;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = msz;
	let Inst{22-21} = esz;
	let Inst{20} = 0;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the store instruction under NAME plus three assembler aliases:
	//   1. $Zt given as a plain vector register (zprty) with an explicit
	//      immediate.
	//   2. $Zt given as a plain vector register with the offset omitted
	//      ("[$Rn]", immediate 0).
	//   3. $Zt given as a register list (listty) with the offset omitted.
	// The last InstAlias argument is its Emit value: only alias 3 passes 1, so
	// it is the one eligible for printing; aliases 1 and 2 pass 0.
	multiclass sve_mem_cst_si<bits<2> msz, bits<2> esz, string asm,
	RegisterOperand listty, ZPRRegOp zprty>
	{
	def NAME : sve_mem_cst_si<msz, esz, asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Predicated store format with a scalar base plus immediate offset where
	// the immediate operand type is a parameter (immtype). Has no outputs and
	// sets mayStore.
	//   sz    - 2-bit field placed in Inst{24-23}.
	//   nregs - 2-bit field placed in Inst{22-21}.
	// Inst{20} is fixed to 1 for this format.
	class sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, Operand immtype>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
	asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zt;
	bits<4> imm4;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = sz;
	let Inst{22-21} = nregs;
	let Inst{20} = 1;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the store instruction under NAME plus one alias that accepts the
	// address with the offset omitted ("[$Rn]", immediate 0). The alias passes
	// Emit = 1.
	multiclass sve_mem_est_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, Operand immtype> {
	def NAME : sve_mem_est_si<sz, nregs, VecList, asm, immtype>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Predicated store format with a scalar base plus scalar register offset,
	// printed as "[$Rn, $Rm]". Has no outputs and sets mayStore.
	//   sz    - 2-bit field placed in Inst{24-23}.
	//   nregs - 2-bit field placed in Inst{22-21}.
	//   gprty - register operand type of the offset $Rm (Inst{20-16}).
	class sve_mem_est_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, RegisterOperand gprty>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = sz;
	let Inst{22-21} = nregs;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b011;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Predicated store format with a scalar base plus scalar register offset,
	// printed as "[$Rn, $Rm]". Has no outputs and sets mayStore.
	//   dtype - 4-bit field placed in Inst{24-21}.
	//   gprty - register operand type of the offset $Rm (Inst{20-16}).
	class sve_mem_cst_ss_base<bits<4> dtype, string asm,
	RegisterOperand listty, RegisterOperand gprty>
	: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-21} = dtype;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b010;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the scalar+scalar store under NAME (taking a register list) plus
	// an alias, with Emit = 0, that accepts $Zt as a plain vector register
	// (zprty) instead of the list form.
	multiclass sve_mem_cst_ss<bits<4> dtype, string asm,
	RegisterOperand listty, ZPRRegOp zprty,
	RegisterOperand gprty> {
	def NAME : sve_mem_cst_ss_base<dtype, asm, listty, gprty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
	}

	// Predicated store format with a scalar base plus immediate offset,
	// printed as "[$Rn, $imm4, mul vl]". Has no outputs and sets mayStore.
	//   msz - 2-bit field placed in Inst{24-23}.
	// Inst{22-20} is fixed to 0b001 for this format.
	class sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand VecList>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
	asm, "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zt;
	bits<4> imm4;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = msz;
	let Inst{22-20} = 0b001;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the store instruction under NAME plus three assembler aliases:
	//   1. $Zt as a plain vector register (zprty), offset omitted (immediate 0).
	//   2. $Zt as a plain vector register with an explicit immediate.
	//   3. $Zt as a register list (listty), offset omitted (immediate 0).
	// Only alias 3 passes Emit = 1; aliases 1 and 2 pass 0.
	multiclass sve_mem_cstnt_si<bits<2> msz, string asm, RegisterOperand listty,
	ZPRRegOp zprty> {
	def NAME : sve_mem_cstnt_si<msz, asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $imm4, mul vl]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Predicated store format with a scalar base plus scalar register offset,
	// printed as "[$Rn, $Rm]". Has no outputs and sets mayStore.
	//   msz   - 2-bit field placed in Inst{24-23}.
	//   gprty - register operand type of the offset $Rm (Inst{20-16}).
	// Inst{22-21} is fixed to 0b00 for this format.
	class sve_mem_cstnt_ss_base<bits<2> msz, string asm, RegisterOperand listty,
	RegisterOperand gprty>
	: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = msz;
	let Inst{22-21} = 0b00;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b011;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the scalar+scalar store under NAME (taking a register list) plus
	// an alias, with Emit = 0, that accepts $Zt as a plain vector register
	// (zprty) instead of the list form.
	multiclass sve_mem_cstnt_ss<bits<2> msz, string asm, RegisterOperand listty,
	ZPRRegOp zprty, RegisterOperand gprty> {
	def NAME : sve_mem_cstnt_ss_base<msz, asm, listty, gprty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Rm]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
	}

	-class sve2_mem_cstnt_vs_base<bits<3> opc, dag iops, string asm,
	- RegisterOperand VecList>
	-: I<(outs VecList:$Zt), iops,
	+class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
	+ RegisterOperand listty, ZPRRegOp zprty>
	+: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
	asm, "\t$Zt, $Pg, [$Zn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Zn;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-22} = opc;
	let Inst{21} = 0b0;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	-multiclass sve2_mem_cstnt_vs<bits<3> opc, string asm,
	+multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
	RegisterOperand listty, ZPRRegOp zprty> {
	- def _REAL : sve2_mem_cstnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
	- asm, listty>;
	+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
	- def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
	- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
	(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
	}

	// Predicated store format with a scalar base plus vector offset, printed as
	// "[$Rn, $Zm]". Has no outputs and sets mayStore.
	//   opc    - 3-bit opcode, placed in Inst{24-22}.
	//   xs     - single bit placed in Inst{14}.
	//   scaled - single bit placed in Inst{21}.
	//   zprext - register operand type of the offset vector $Zm.
	class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
	RegisterOperand VecList, RegisterOperand zprext>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$Zt, $Pg, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-22} = opc;
	let Inst{21} = scaled;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b1;
	let Inst{14} = xs;
	let Inst{13} = 0;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the scaled (scaled = 1) scalar+vector stores:
	//   _UXTW_SCALED - xs = 0, offset operand uxtw_opnd.
	//   _SXTW_SCALED - xs = 1, offset operand sxtw_opnd.
	// Each gets an alias, with Emit = 0, accepting $Zt as a plain vector
	// register (zprty) instead of the list form.
	multiclass sve_mem_sst_sv_32_scaled<bits<3> opc, string asm,
	RegisterOperand listty,
	ZPRRegOp zprty,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd > {
	def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, listty, uxtw_opnd>;
	def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, listty, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}

	// Defines the unscaled (scaled = 0) scalar+vector stores:
	//   _UXTW - xs = 0, offset operand uxtw_opnd.
	//   _SXTW - xs = 1, offset operand sxtw_opnd.
	// Each gets an alias, with Emit = 0, accepting $Zt as a plain vector
	// register (zprty) instead of the list form.
	multiclass sve_mem_sst_sv_32_unscaled<bits<3> opc, string asm,
	RegisterOperand listty,
	ZPRRegOp zprty,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, listty, uxtw_opnd>;
	def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, listty, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}

	// Predicated store format with a scalar base plus vector offset where the
	// data operand is always Z_d. Has no outputs and sets mayStore.
	//   msz    - 2-bit field placed in Inst{24-23}.
	//   scaled - single bit placed in Inst{21}.
	//   zprext - register operand type of the offset vector $Zm.
	// Inst{22} is fixed to 0 for this format.
	class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
	RegisterOperand zprext>
	: I<(outs), (ins Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$Zt, $Pg, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = msz;
	let Inst{22} = 0b0;
	let Inst{21} = scaled;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the scaled (scaled = 1) Z_d scalar+vector store under the bare
	// multiclass name (def "") plus an alias, with Emit = 0, accepting $Zt as a
	// plain ZPR64 register.
	multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
	RegisterOperand zprext> {
	def "" : sve_mem_sst_sv2<msz, 1, asm, zprext>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;

	}

	// Defines the unscaled (scaled = 0) Z_d scalar+vector store under the bare
	// multiclass name (def ""), always using ZPR64ExtLSL8 for the offset, plus
	// an alias, with Emit = 0, accepting $Zt as a plain ZPR64 register.
	multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm> {
	def "" : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
	(!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
	}

	// Predicated store format with a vector base plus immediate offset,
	// printed as "[$Zn, $imm5]". Has no outputs and sets mayStore.
	//   opc    - 3-bit opcode; bits 2-1 go to Inst{24-23}, bit 0 to Inst{21}.
	//   zprty  - register operand type of the base vector $Zn.
	//   imm_ty - operand type of the 5-bit immediate (Inst{20-16}).
	// Inst{22} is fixed to 1 for this format.
	class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
	RegisterOperand VecList, Operand imm_ty>
	: I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5),
	asm, "\t$Zt, $Pg, [$Zn, $imm5]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> imm5;
	bits<5> Zn;
	bits<5> Zt;
	let Inst{31-25} = 0b1110010;
	let Inst{24-23} = opc{2-1};
	let Inst{22} = 0b1;
	let Inst{21} = opc{0};
	let Inst{20-16} = imm5;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the vector+immediate store as NAME # _IMM plus three aliases:
	//   1. $Zt as a plain vector register (zprty), offset omitted (immediate 0).
	//   2. $Zt as a plain vector register with an explicit immediate.
	//   3. $Zt as a register list (listty), offset omitted (immediate 0).
	// Only alias 3 passes Emit = 1; aliases 1 and 2 pass 0.
	multiclass sve_mem_sst_vi_ptrs<bits<3> opc, string asm, RegisterOperand listty,
	ZPRRegOp zprty, Operand imm_ty> {
	def _IMM : sve_mem_sst_vi<opc, asm, zprty, listty, imm_ty>;

	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
	(!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
	(!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
	(!cast<Instruction>(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>;
	}

	// Unpredicated store format for a whole vector register (ZPRAny $Zt) to
	// "[$Rn, $imm9, mul vl]". Has no outputs and sets mayStore.
	// The 9-bit immediate is split: imm9{8-3} goes to Inst{21-16} and
	// imm9{2-0} goes to Inst{12-10}.
	class sve_mem_z_spill<string asm>
	: I<(outs), (ins ZPRAny:$Zt, GPR64sp:$Rn, simm9:$imm9),
	asm, "\t$Zt, [$Rn, $imm9, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<5> Rn;
	bits<5> Zt;
	bits<9> imm9;
	let Inst{31-22} = 0b1110010110;
	let Inst{21-16} = imm9{8-3};
	let Inst{15-13} = 0b010;
	let Inst{12-10} = imm9{2-0};
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayStore = 1;
	}

	// Defines the vector-register store under NAME plus an alias, with
	// Emit = 1, for the form with the offset omitted ("[$Rn]", immediate 0).
	multiclass sve_mem_z_spill<string asm> {
	def NAME : sve_mem_z_spill<asm>;

	def : InstAlias<asm # "\t$Zt, [$Rn]",
	(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
	}

	// Unpredicated store format for a predicate register (PPRAny $Pt) to
	// "[$Rn, $imm9, mul vl]". Has no outputs and sets mayStore.
	// The 9-bit immediate is split: imm9{8-3} goes to Inst{21-16} and
	// imm9{2-0} goes to Inst{12-10}. $Pt is 4 bits wide, so Inst{4} is fixed
	// to 0 and only Inst{3-0} carries the register number.
	class sve_mem_p_spill<string asm>
	: I<(outs), (ins PPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9),
	asm, "\t$Pt, [$Rn, $imm9, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<4> Pt;
	bits<5> Rn;
	bits<9> imm9;
	let Inst{31-22} = 0b1110010110;
	let Inst{21-16} = imm9{8-3};
	let Inst{15-13} = 0b000;
	let Inst{12-10} = imm9{2-0};
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pt;

	let mayStore = 1;
	}

	// Defines the predicate-register store under NAME plus an alias, with
	// Emit = 1, for the form with the offset omitted ("[$Rn]", immediate 0).
	multiclass sve_mem_p_spill<string asm> {
	def NAME : sve_mem_p_spill<asm>;

	def : InstAlias<asm # "\t$Pt, [$Rn]",
	(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Permute - Predicates Group
	//===----------------------------------------------------------------------===//

	// Format for a binary permute on predicate registers: writes $Pd from $Pn
	// and $Pm, all of the same predicate operand type.
	//   opc    - 3-bit opcode, placed in Inst{12-10}.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	// Predicate register numbers are 4 bits wide, so Inst{9} and Inst{4} are
	// fixed to 0 beside the $Pn and $Pd fields.
	class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm,
	PPRRegOp pprty>
	: I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm),
	asm, "\t$Pd, $Pn, $Pm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pm;
	bits<4> Pn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-20} = 0b10;
	let Inst{19-16} = Pm;
	let Inst{15-13} = 0b010;
	let Inst{12-10} = opc;
	let Inst{9} = 0b0;
	let Inst{8-5} = Pn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pd;
	}

	// Instantiates the predicate binary-permute format for all four element
	// sizes (_B, _H, _S, _D). Note that opc precedes the size argument here.
	multiclass sve_int_perm_bin_perm_pp<bits<3> opc, string asm> {
	def _B : sve_int_perm_bin_perm_pp<opc, 0b00, asm, PPR8>;
	def _H : sve_int_perm_bin_perm_pp<opc, 0b01, asm, PPR16>;
	def _S : sve_int_perm_bin_perm_pp<opc, 0b10, asm, PPR32>;
	def _D : sve_int_perm_bin_perm_pp<opc, 0b11, asm, PPR64>;
	}

	// Format for a predicate unpack: reads a PPR8 predicate $Pn and writes a
	// PPR16 predicate $Pd. The operand types are fixed, so there is no size
	// parameter.
	//   opc - single bit placed in Inst{16}.
	class sve_int_perm_punpk<bit opc, string asm>
	: I<(outs PPR16:$Pd), (ins PPR8:$Pn),
	asm, "\t$Pd, $Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pn;
	let Inst{31-17} = 0b000001010011000;
	let Inst{16} = opc;
	let Inst{15-9} = 0b0100000;
	let Inst{8-5} = Pn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pd;
	}

	// Format for a predicated read of the FFR register: writes PPR8 $Pd under
	// predicate $Pg (printed as "$Pg/z"; 4-bit field, any predicate register).
	//   s - single bit placed in Inst{22}; when it is 1 the instruction also
	//       defines NZCV.
	// FFR is declared as an implicit use.
	class sve_int_rdffr_pred<bit s, string asm>
	: I<(outs PPR8:$Pd), (ins PPRAny:$Pg),
	asm, "\t$Pd, $Pg/z",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pg;
	let Inst{31-23} = 0b001001010;
	let Inst{22} = s;
	let Inst{21-9} = 0b0110001111000;
	let Inst{8-5} = Pg;
	let Inst{4} = 0;
	let Inst{3-0} = Pd;

	let Defs = !if(!eq (s, 1), [NZCV], []);
	let Uses = [FFR];
	}

	// Format for an unpredicated read of the FFR register: writes PPR8 $Pd and
	// takes no explicit inputs. Everything except the 4-bit $Pd field is a
	// fixed encoding. FFR is declared as an implicit use.
	class sve_int_rdffr_unpred<string asm> : I<
	(outs PPR8:$Pd), (ins),
	asm, "\t$Pd",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	let Inst{31-4} = 0b0010010100011001111100000000;
	let Inst{3-0} = Pd;

	let Uses = [FFR];
	}

	// Format for a write to the FFR register from PPR8 $Pn. Has no explicit
	// outputs; FFR is declared as an implicit def and hasSideEffects is set.
	class sve_int_wrffr<string asm>
	: I<(outs), (ins PPR8:$Pn),
	asm, "\t$Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pn;
	let Inst{31-9} = 0b00100101001010001001000;
	let Inst{8-5} = Pn;
	let Inst{4-0} = 0b00000;

	let hasSideEffects = 1;
	let Defs = [FFR];
	}

	// Format for an operand-less instruction that defines FFR: no inputs, no
	// outputs, an empty operand string and a fully fixed 32-bit encoding.
	// hasSideEffects is set and FFR is declared as an implicit def.
	class sve_int_setffr<string asm>
	: I<(outs), (ins),
	asm, "",
	"",
	[]>, Sched<[]> {
	let Inst{31-0} = 0b00100101001011001001000000000000;

	let hasSideEffects = 1;
	let Defs = [FFR];
	}

	//===----------------------------------------------------------------------===//
	// SVE Permute Vector - Predicated Group
	//===----------------------------------------------------------------------===//

	// Predicated permute format with a scalar register destination that is
	// also a source: $Rdn (class rt) is tied to $_Rdn, and the second source is
	// vector $Zm under predicate $Pg.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   ab     - single bit placed in Inst{16}.
	class sve_int_perm_clast_rz<bits<2> sz8_64, bit ab, string asm,
	ZPRRegOp zprty, RegisterClass rt>
	: I<(outs rt:$Rdn), (ins PPR3bAny:$Pg, rt:$_Rdn, zprty:$Zm),
	asm, "\t$Rdn, $Pg, $_Rdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-17} = 0b11000;
	let Inst{16} = ab;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Rdn;

	let Constraints = "$Rdn = $_Rdn";
	}

	// Instantiates the scalar-register-destination format for _B, _H, _S and
	// _D. Only _D uses GPR64; the other three use GPR32.
	multiclass sve_int_perm_clast_rz<bit ab, string asm> {
	def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>;
	def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>;
	def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>;
	def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>;
	}

	// Predicated permute format with a register destination of class rt that is
	// also a source: $Vdn is tied to $_Vdn, and the second source is vector $Zm
	// under predicate $Pg.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   ab     - single bit placed in Inst{16}.
	class sve_int_perm_clast_vz<bits<2> sz8_64, bit ab, string asm,
	ZPRRegOp zprty, RegisterClass rt>
	: I<(outs rt:$Vdn), (ins PPR3bAny:$Pg, rt:$_Vdn, zprty:$Zm),
	asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Vdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-17} = 0b10101;
	let Inst{16} = ab;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Vdn;

	let Constraints = "$Vdn = $_Vdn";
	}

	// Instantiates the $Vdn-destination format for _B, _H, _S and _D, pairing
	// each vector element size with the FPR class of matching width
	// (FPR8/16/32/64).
	multiclass sve_int_perm_clast_vz<bit ab, string asm> {
	def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>;
	def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>;
	def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>;
	def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>;
	}

	// Predicated permute format with a vector destination that is also a
	// source: $Zdn is tied to $_Zdn, and the second source is vector $Zm under
	// predicate $Pg. Marked Destructive with ElementSize = ElementSizeNone.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   ab     - single bit placed in Inst{16}.
	class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-17} = 0b10100;
	let Inst{16} = ab;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Instantiates the vector-destination format for all four element sizes
	// (_B, _H, _S, _D).
	multiclass sve_int_perm_clast_zz<bit ab, string asm> {
	def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>;
	def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>;
	def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>;
	def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>;
	}

	// Predicated permute format producing a scalar register: writes $Rd (class
	// resultRegType) from vector $Zn under predicate $Pg. Unlike the tied
	// $Rdn form, $Rd is a pure output.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   ab     - single bit placed in Inst{16}.
	class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm,
	ZPRRegOp zprty, RegisterClass resultRegType>
	: I<(outs resultRegType:$Rd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Rd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-17} = 0b10000;
	let Inst{16} = ab;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Rd;
	}

	// Instantiates the scalar-result format for _B, _H, _S and _D. Only _D
	// produces a GPR64; the other three produce a GPR32.
	multiclass sve_int_perm_last_r<bit ab, string asm> {
	def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>;
	def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>;
	def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>;
	def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>;
	}

	// Predicated permute format producing a register of class dstRegtype:
	// writes $Vd from vector $Zn under predicate $Pg. $Vd is a pure output.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   ab     - single bit placed in Inst{16}.
	class sve_int_perm_last_v<bits<2> sz8_64, bit ab, string asm,
	ZPRRegOp zprty, RegisterClass dstRegtype>
	: I<(outs dstRegtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Vd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Vd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-17} = 0b10001;
	let Inst{16} = ab;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Vd;
	}

	// Instantiates the $Vd-result format for _B, _H, _S and _D, pairing each
	// vector element size with the FPR class of matching width
	// (FPR8/16/32/64).
	multiclass sve_int_perm_last_v<bit ab, string asm> {
	def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>;
	def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>;
	def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>;
	def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>;
	}

	// Predicated two-vector permute format in destructive form: $Zdn is tied
	// to $_Zdn and the second source is $Zm, under predicate $Pg. Marked
	// Destructive with ElementSize = ElementSizeNone.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $Pg, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-13} = 0b101100100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	let Constraints = "$Zdn = $_Zdn";
	let DestructiveInstType = Destructive;
	let ElementSize = ElementSizeNone;
	}

	// Instantiates the destructive two-vector permute format for all four
	// element sizes (_B, _H, _S, _D).
	multiclass sve_int_perm_splice<string asm> {
	def _B : sve_int_perm_splice<0b00, asm, ZPR8>;
	def _H : sve_int_perm_splice<0b01, asm, ZPR16>;
	def _S : sve_int_perm_splice<0b10, asm, ZPR32>;
	def _D : sve_int_perm_splice<0b11, asm, ZPR64>;
	}

	// Predicated permute format in non-tied form: $Zd is a pure output and the
	// single source $Zn is a register-list operand (VecList), under predicate
	// $Pg. Only the 5-bit $Zn field is encoded for the list (Inst{9-5}).
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
	ZPRRegOp zprty, RegisterOperand VecList>
	: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn),
	asm, "\t$Zd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> Zd;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-13} = 0b101101100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the register-list permute format for _B, _H, _S and _D,
	// pairing each destination type with the ZZ_* list operand of the same
	// element suffix.
	multiclass sve2_int_perm_splice_cons<string asm> {
	def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>;
	def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>;
	def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>;
	def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>;
	}

	// Predicated unary permute format: writes $Zd from $Zn under predicate $Pg
	// (printed as "$Pg/m"). The tied source $_Zd comes first in the input list
	// and does not appear in the asm string. Marked Destructive, with
	// ElementSize taken from zprty.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	//   opc    - 2-bit opcode, placed in Inst{17-16}.
	class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
	ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Zd, $Pg/m, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<3> Pg;
	bits<5> Zn;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-18} = 0b1001;
	let Inst{17-16} = opc;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Instantiates sve_int_perm_rev with opc = 0b11 for all four element sizes
	// (_B, _H, _S, _D).
	multiclass sve_int_perm_rev_rbit<string asm> {
	def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>;
	def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>;
	def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>;
	def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>;
	}

	// Instantiates sve_int_perm_rev with opc = 0b00 for _H, _S and _D only; no
	// _B variant is defined.
	multiclass sve_int_perm_rev_revb<string asm> {
	def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>;
	def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>;
	def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>;
	}

	// Instantiates sve_int_perm_rev with opc = 0b01 for _S and _D only.
	multiclass sve_int_perm_rev_revh<string asm> {
	def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>;
	def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>;
	}

	// Instantiates sve_int_perm_rev with opc = 0b10 for _D only.
	multiclass sve_int_perm_rev_revw<string asm> {
	def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>;
	}

	// Predicated copy format from a scalar register: writes $Zd from $Rn
	// (class srcRegType) under predicate $Pg (printed as "$Pg/m"). The tied
	// source $_Zd comes first in the input list and does not appear in the asm
	// string. Marked Destructive, with ElementSize taken from zprty.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegType>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegType:$Rn),
	asm, "\t$Zd, $Pg/m, $Rn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zd;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-13} = 0b101000101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Instantiates the scalar-register copy format for _B, _H, _S and _D. Only
	// _D takes GPR64sp; the other three take GPR32sp. Each variant also gets a
	// "mov $Zd, $Pg/m, $Rn" alias with Emit = 1.
	multiclass sve_int_perm_cpy_r<string asm> {
	def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
	def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
	def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
	def _D : sve_int_perm_cpy_r<0b11, asm, ZPR64, GPR64sp>;

	def : InstAlias<"mov $Zd, $Pg/m, $Rn",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Rn",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Rn",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Rn",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
	}

	// Predicated copy format from a register of class srcRegtype: writes $Zd
	// from $Vn under predicate $Pg (printed as "$Pg/m"). The tied source $_Zd
	// comes first in the input list and does not appear in the asm string.
	// Marked Destructive, with ElementSize taken from zprty.
	//   sz8_64 - 2-bit element-size field, placed in Inst{23-22}.
	class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
	RegisterClass srcRegtype>
	: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, srcRegtype:$Vn),
	asm, "\t$Zd, $Pg/m, $Vn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Vn;
	bits<5> Zd;
	let Inst{31-24} = 0b00000101;
	let Inst{23-22} = sz8_64;
	let Inst{21-13} = 0b100000100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Vn;
	let Inst{4-0} = Zd;

	let Constraints = "$Zd = $_Zd";
	let DestructiveInstType = Destructive;
	let ElementSize = zprty.ElementSize;
	}

	// Instantiates the $Vn copy format for _B, _H, _S and _D, pairing each
	// vector element size with the FPR class of matching width. Each variant
	// also gets a "mov $Zd, $Pg/m, $Vn" alias with Emit = 1.
	multiclass sve_int_perm_cpy_v<string asm> {
	def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
	def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
	def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
	def _D : sve_int_perm_cpy_v<0b11, asm, ZPR64, FPR64>;

	def : InstAlias<"mov $Zd, $Pg/m, $Vn",
	(!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPR3bAny:$Pg, FPR8:$Vn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Vn",
	(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPR3bAny:$Pg, FPR16:$Vn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Vn",
	(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
	def : InstAlias<"mov $Zd, $Pg/m, $Vn",
	(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
	}

	// Instruction format "$Zd, $Pg, $Zn" (predicate printed without a /m or /z
	// qualifier). The single size bit sz is placed in bit 22.
	class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Zd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-23} = 0b000001011;
	let Inst{22} = sz;
	let Inst{21-13} = 0b100001100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Only two element sizes are instantiated: _S (sz = 0, ZPR32) and
	// _D (sz = 1, ZPR64).
	multiclass sve_int_perm_compact<string asm> {
	def _S : sve_int_perm_compact<0b0, asm, ZPR32>;
	def _D : sve_int_perm_compact<0b1, asm, ZPR64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Memory - Contiguous Load Group
	//===----------------------------------------------------------------------===//

	// Load format "$Zt, $Pg/z, [$Rn, $imm4, mul vl]": base register plus a
	// signed 4-bit immediate.
	//   dtype   - 4-bit code placed in bits 24-21.
	//   nf      - placed in bit 20; when 1 the instruction both uses and
	//             defines FFR.
	//   VecList - register operand used for the destination $Zt.
	class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
	RegisterOperand VecList>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
	asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zt;
	bits<4> imm4;
	let Inst{31-25} = 0b1010010;
	let Inst{24-21} = dtype;
	let Inst{20} = nf;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b101;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	// FFR is read and written only by the nf = 1 form.
	let Uses = !if(!eq(nf, 1), [FFR], []);
	let Defs = !if(!eq(nf, 1), [FFR], []);
	}

	// Defines the real instruction (suffix _REAL, list-typed $Zt) plus three
	// aliases:
	//   - plain zprty register, offset omitted (encoded as 0)  - priority 0
	//   - plain zprty register with explicit offset           - priority 0
	//   - list-typed register, offset omitted (encoded as 0)  - priority 1
	// Priority-0 aliases are accepted by the assembler but never printed.
	multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
	RegisterOperand listty, ZPRRegOp zprty> {
	def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// sve_mem_cld_si_base with nf fixed to 0, i.e. the form that neither uses
	// nor defines FFR.
	multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
	ZPRRegOp zprty>
	: sve_mem_cld_si_base<dtype, 0, asm, listty, zprty>;

	// Load format "$Zt, $Pg/z, [$Rn, $imm4, mul vl]" with a 2-bit msz code in
	// bits 24-23 and bits 22-20 fixed to zero.
	// NOTE(review): "nt" in the name presumably stands for non-temporal;
	// confirm against the ISA documentation.
	class sve_mem_cldnt_si_base<bits<2> msz, string asm, RegisterOperand VecList>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4),
	asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<5> Zt;
	bits<3> Pg;
	bits<5> Rn;
	bits<4> imm4;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = msz;
	let Inst{22-20} = 0b000;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction under the defm name itself (def NAME) and the
	// same three aliases as the other scalar+immediate loads: plain-register
	// spellings with and without offset (priority 0, parse-only) and the
	// list-typed, offset-omitted spelling (priority 1, printable).
	multiclass sve_mem_cldnt_si<bits<2> msz, string asm, RegisterOperand listty,
	ZPRRegOp zprty> {
	def NAME : sve_mem_cldnt_si_base<msz, asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Load format "$Zt, $Pg/z, [$Rn, $Rm]": base register plus a register
	// offset of operand type gprty (encoded in bits 20-16). msz occupies bits
	// 24-23; bits 22-21 are fixed to zero.
	class sve_mem_cldnt_ss_base<bits<2> msz, string asm, RegisterOperand VecList,
	RegisterOperand gprty>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	bits<5> Zt;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = msz;
	let Inst{22-21} = 0b00;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b110;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction (list-typed $Zt) and one parse-only alias
	// (priority 0) that accepts a plain zprty register for $Zt.
	multiclass sve_mem_cldnt_ss<bits<2> msz, string asm, RegisterOperand listty,
	ZPRRegOp zprty, RegisterOperand gprty> {
	def NAME : sve_mem_cldnt_ss_base<msz, asm, listty, gprty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
	}

	// Load format "$Zt, $Pg/z, [$Rn, $imm4]". The immediate operand type is
	// simm4s16 and, unlike the "mul vl" forms above, is printed without a
	// "mul vl" suffix. sz occupies bits 24-23; bits 22-20 are zero.
	class sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand VecList>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4),
	asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
	bits<5> Zt;
	bits<5> Rn;
	bits<3> Pg;
	bits<4> imm4;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = sz;
	let Inst{22-20} = 0;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction and three aliases: the printable (priority 1)
	// list-typed spelling with the offset omitted, and two parse-only
	// (priority 0) spellings that take a plain zprty register, without and
	// with an explicit offset.
	multiclass sve_mem_ldqr_si<bits<2> sz, string asm, RegisterOperand listty,
	ZPRRegOp zprty> {
	def NAME : sve_mem_ldqr_si<sz, asm, listty>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s16:$imm4), 0>;
	}

	// Register-offset counterpart of sve_mem_ldqr_si:
	// "$Zt, $Pg/z, [$Rn, $Rm]" with $Rm in bits 20-16 and bits 15-13 zero.
	class sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand VecList,
	RegisterOperand gprty>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
	bits<5> Zt;
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Rm;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = sz;
	let Inst{22-21} = 0;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction (list-typed $Zt) and one parse-only alias
	// (priority 0) that accepts a plain zprty register for $Zt.
	multiclass sve_mem_ldqr_ss<bits<2> sz, string asm, RegisterOperand listty,
	ZPRRegOp zprty, RegisterOperand gprty> {
	def NAME : sve_mem_ldqr_ss<sz, asm, listty, gprty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
	}

	// Load format "$Zt, $Pg/z, [$Rn, $imm6]" with a 6-bit immediate of operand
	// type immtype in bits 21-16. The data type code is split in two:
	// dtypeh goes to bits 24-23 and dtypel to bits 14-13.
	class sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
	RegisterOperand VecList, Operand immtype>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6),
	asm, "\t$Zt, $Pg/z, [$Rn, $imm6]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zt;
	bits<6> imm6;
	let Inst{31-25} = 0b1000010;
	let Inst{24-23} = dtypeh;
	let Inst{22} = 1;
	let Inst{21-16} = imm6;
	let Inst{15} = 0b1;
	let Inst{14-13} = dtypel;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction (list-typed $Zt) and three aliases: plain zprty
	// register without and with offset (priority 0, parse-only), and the
	// list-typed spelling with the offset omitted (priority 1, printable).
	// An omitted offset is encoded as 0.
	multiclass sve_mem_ld_dup<bits<2> dtypeh, bits<2> dtypel, string asm,
	RegisterOperand zlistty, ZPRRegOp zprty, Operand immtype> {
	def NAME : sve_mem_ld_dup<dtypeh, dtypel, asm, zlistty, immtype>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm6]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm6), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) zlistty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Load format "$Zt, $Pg/z, [$Rn, $Rm]" (base plus register offset). The
	// input operand list is supplied by the instantiating multiclass via iops.
	//   dtype - 4-bit code placed in bits 24-21.
	//   ff    - placed in bit 13; when 1 the instruction both uses and
	//           defines FFR.
	class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm,
	RegisterOperand VecList>
	: I<(outs VecList:$Zt), iops,
	asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<5> Zt;
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	let Inst{31-25} = 0b1010010;
	let Inst{24-21} = dtype;
	let Inst{20-16} = Rm;
	let Inst{15-14} = 0b01;
	let Inst{13} = ff;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	// FFR is read and written only by the ff = 1 form.
	let Uses = !if(!eq(ff, 1), [FFR], []);
	let Defs = !if(!eq(ff, 1), [FFR], []);
	}

	// sve_mem_cld_ss_base with ff = 0, plus a parse-only alias (priority 0)
	// that accepts a plain zprty register for $Zt.
	// The instruction is declared with an empty def name; the alias refers to
	// it through NAME, so the record is expected to carry the defm name with
	// no suffix.
	multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty,
	ZPRRegOp zprty, RegisterOperand gprty> {
	def "" : sve_mem_cld_ss_base<dtype, 0, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
	(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
	}

	// sve_mem_cld_ss_base with ff = 1 (uses and defines FFR), defined as
	// NAME # _REAL, plus three aliases:
	//   - plain zprty register with explicit $Rm              - priority 0
	//   - list-typed register, offset omitted (XZR is encoded) - priority 1
	//   - plain zprty register, offset omitted (XZR)           - priority 0
	multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
	ZPRRegOp zprty, RegisterOperand gprty> {
	def _REAL : sve_mem_cld_ss_base<dtype, 1, (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 1>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
	}

	// sve_mem_cld_si_base with nf fixed to 1, i.e. the form that uses and
	// defines FFR.
	multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
	ZPRRegOp zprty>
	: sve_mem_cld_si_base<dtype, 1, asm, listty, zprty>;

	// Load format "$Zt, $Pg/z, [$Rn, $imm4, mul vl]" with an extra 2-bit nregs
	// field in bits 22-21 next to the size code sz in bits 24-23.
	// NOTE(review): nregs presumably encodes the number of registers in the
	// VecList operand; confirm against the instantiations.
	class sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, Operand immtype>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, immtype:$imm4),
	asm, "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<5> Zt;
	bits<3> Pg;
	bits<5> Rn;
	bits<4> imm4;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = sz;
	let Inst{22-21} = nregs;
	let Inst{20} = 0;
	let Inst{19-16} = imm4;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction and a single printable alias (priority 1) that
	// omits the immediate offset, encoding it as 0.
	multiclass sve_mem_eld_si<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, Operand immtype> {
	def NAME : sve_mem_eld_si<sz, nregs, VecList, asm, immtype>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
	(!cast<Instruction>(NAME) VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Register-offset counterpart of sve_mem_eld_si:
	// "$Zt, $Pg/z, [$Rn, $Rm]" with sz in bits 24-23, nregs in bits 22-21 and
	// $Rm in bits 20-16.
	class sve_mem_eld_ss<bits<2> sz, bits<2> nregs, RegisterOperand VecList,
	string asm, RegisterOperand gprty>
	: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$Zt, $Pg/z, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Rn;
	bits<5> Zt;
	let Inst{31-25} = 0b1010010;
	let Inst{24-23} = sz;
	let Inst{22-21} = nregs;
	let Inst{20-16} = Rm;
	let Inst{15-13} = 0b110;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	//===----------------------------------------------------------------------===//
	// SVE Memory - 32-bit Gather and Unsized Contiguous Group
	//===----------------------------------------------------------------------===//

	// Load format "$Zt, $Pg/z, [$Rn, $Zm]": scalar base plus a vector of
	// offsets of operand type zprext. The destination is always Z_s.
	// The 4-bit opc is split: opc{3-2} -> bits 24-23, opc{1-0} -> bits 14-13.
	// When opc{0} is 1 the instruction both uses and defines FFR.
	// bit xs is '1' if offsets are signed
	// bit scaled is '1' if the offsets are scaled
	class sve_mem_32b_gld_sv<bits<4> opc, bit xs, bit scaled, string asm,
	RegisterOperand zprext>
	: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<5> Zt;
	let Inst{31-25} = 0b1000010;
	let Inst{24-23} = opc{3-2};
	let Inst{22} = xs;
	let Inst{21} = scaled;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b0;
	let Inst{14-13} = opc{1-0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	let Defs = !if(!eq(opc{0}, 1), [FFR], []);
	let Uses = !if(!eq(opc{0}, 1), [FFR], []);
	}

	// Scaled-offset instantiations of sve_mem_32b_gld_sv (scaled = 1):
	// _UXTW_SCALED_REAL uses xs = 0 with uxtw_opnd, _SXTW_SCALED_REAL uses
	// xs = 1 with sxtw_opnd. Each gets a parse-only alias (priority 0) that
	// accepts ZPR32 for $Zt.
	multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 0, 1, asm, uxtw_opnd>;
	def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv<opc, 1, 1, asm, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}

	// Unscaled-offset instantiations of sve_mem_32b_gld_sv (scaled = 0):
	// _UXTW_REAL uses xs = 0 with uxtw_opnd, _SXTW_REAL uses xs = 1 with
	// sxtw_opnd. Each gets a parse-only alias (priority 0) that accepts ZPR32
	// for $Zt.
	multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_REAL : sve_mem_32b_gld_sv<opc, 0, 0, asm, uxtw_opnd>;
	def _SXTW_REAL : sve_mem_32b_gld_sv<opc, 1, 0, asm, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}


	// Load format "$Zt, $Pg/z, [$Zn, $imm5]": a ZPR32 vector base plus a 5-bit
	// immediate of operand type imm_ty (bits 20-16). The destination is Z_s.
	// opc is split as opc{3-2} -> bits 24-23 and opc{1-0} -> bits 14-13; when
	// opc{0} is 1 the instruction both uses and defines FFR.
	class sve_mem_32b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
	: I<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
	asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> Zt;
	bits<5> imm5;
	let Inst{31-25} = 0b1000010;
	let Inst{24-23} = opc{3-2};
	let Inst{22-21} = 0b01;
	let Inst{20-16} = imm5;
	let Inst{15} = 0b1;
	let Inst{14-13} = opc{1-0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	let Defs = !if(!eq(opc{0}, 1), [FFR], []);
	let Uses = !if(!eq(opc{0}, 1), [FFR], []);
	}

	// Defines NAME # _IMM_REAL and three aliases: ZPR32-typed $Zt without and
	// with an offset (priority 0, parse-only), and the Z_s-typed spelling with
	// the offset omitted (priority 1, printable). An omitted offset is
	// encoded as 0.
	multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty> {
	def _IMM_REAL : sve_mem_32b_gld_vi<opc, asm, imm_ty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
	(!cast<Instruction>(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
	}

	// Prefetch format "$prfop, $Pg, [$Rn, $imm6, mul vl]". There are no
	// outputs: the low four bits carry the prefetch operation $prfop instead
	// of a destination register, and bit 4 is fixed to zero. msz occupies
	// bits 14-13.
	class sve_mem_prfm_si<bits<2> msz, string asm>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, simm6s1:$imm6),
	asm, "\t$prfop, $Pg, [$Rn, $imm6, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<5> Rn;
	bits<3> Pg;
	bits<6> imm6;
	bits<4> prfop;
	let Inst{31-22} = 0b1000010111;
	let Inst{21-16} = imm6;
	let Inst{15} = 0b0;
	let Inst{14-13} = msz;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	// No outs and no mayLoad: the instruction is explicitly flagged as
	// having side effects.
	let hasSideEffects = 1;
	}

	// Defines the instruction and a printable alias (priority 1) that omits
	// the immediate offset, encoding it as 0.
	multiclass sve_mem_prfm_si<bits<2> msz, string asm> {
	def NAME : sve_mem_prfm_si<msz, asm>;

	def : InstAlias<asm # "\t$prfop, $Pg, [$Rn]",
	(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
	}

	// Prefetch format "$prfop, $Pg, [$Rn, $Rm]" (base plus register offset of
	// operand type gprty). The 3-bit opc is split: opc{2-1} -> bits 24-23 and
	// opc{0} -> bit 14.
	class sve_mem_prfm_ss<bits<3> opc, string asm, RegisterOperand gprty>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
	asm, "\t$prfop, $Pg, [$Rn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<5> Rm;
	bits<5> Rn;
	bits<3> Pg;
	bits<4> prfop;
	let Inst{31-25} = 0b1000010;
	let Inst{24-23} = opc{2-1};
	let Inst{22-21} = 0b00;
	let Inst{20-16} = Rm;
	let Inst{15} = 0b1;
	let Inst{14} = opc{0};
	let Inst{13} = 0b0;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	let hasSideEffects = 1;
	}

	// Prefetch format "$prfop, $Pg, [$Rn, $Zm]": scalar base plus a vector of
	// offsets of operand type zprext. xs goes to bit 22 and msz to bits
	// 14-13; bit 21 is fixed to 1.
	class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
	RegisterOperand zprext>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$prfop, $Pg, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<4> prfop;
	let Inst{31-23} = 0b100001000;
	let Inst{22} = xs;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15} = 0b0;
	let Inst{14-13} = msz;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	let hasSideEffects = 1;
	}

	// Two instantiations of sve_mem_32b_prfm_sv: _UXTW_SCALED (xs = 0,
	// uxtw_opnd) and _SXTW_SCALED (xs = 1, sxtw_opnd). No aliases are defined.
	multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
	def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
	}

	// Prefetch format "$prfop, $Pg, [$Zn, $imm5]": a ZPR32 vector base plus a
	// 5-bit immediate of operand type imm_ty (bits 20-16). There are no
	// outputs; the low four bits carry $prfop and bit 4 is fixed to zero.
	// msz occupies bits 24-23.
	class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5),
	asm, "\t$prfop, $Pg, [$Zn, $imm5]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> imm5;
	bits<4> prfop;
	let Inst{31-25} = 0b1000010;
	let Inst{24-23} = msz;
	let Inst{22-21} = 0b00;
	let Inst{20-16} = imm5;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	// Like every other prefetch format in this file, state the side effect
	// explicitly: with no outs and no mayLoad the flag would otherwise be
	// left to TableGen's inference.
	let hasSideEffects = 1;
	}

	// Defines the instruction and a printable alias (priority 1) that omits
	// the immediate offset, encoding it as 0.
	multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
	def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;

	def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
	(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
	}

	// Unpredicated load format "$Zt, [$Rn, $imm9, mul vl]" into any vector
	// register (ZPRAny). The 9-bit immediate is split across the encoding:
	// its upper six bits go to bits 21-16 and its lower three to bits 12-10.
	class sve_mem_z_fill<string asm>
	: I<(outs ZPRAny:$Zt), (ins GPR64sp:$Rn, simm9:$imm9),
	asm, "\t$Zt, [$Rn, $imm9, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<5> Rn;
	bits<5> Zt;
	bits<9> imm9;
	let Inst{31-22} = 0b1000010110;
	let Inst{21-16} = imm9{8-3};
	let Inst{15-13} = 0b010;
	let Inst{12-10} = imm9{2-0};
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines the instruction and a printable alias (priority 1) that omits
	// the immediate offset, encoding it as 0.
	multiclass sve_mem_z_fill<string asm> {
	def NAME : sve_mem_z_fill<asm>;

	def : InstAlias<asm # "\t$Zt, [$Rn]",
	(!cast<Instruction>(NAME) ZPRAny:$Zt, GPR64sp:$Rn, 0), 1>;
	}

	// Predicate-register counterpart of sve_mem_z_fill:
	// "$Pt, [$Rn, $imm9, mul vl]" loading into any predicate register
	// (PPRAny). $Pt is only four bits wide, so bit 4 is fixed to zero. The
	// 9-bit immediate is split the same way (bits 21-16 and 12-10).
	class sve_mem_p_fill<string asm>
	: I<(outs PPRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9),
	asm, "\t$Pt, [$Rn, $imm9, mul vl]",
	"",
	[]>, Sched<[]> {
	bits<4> Pt;
	bits<5> Rn;
	bits<9> imm9;
	let Inst{31-22} = 0b1000010110;
	let Inst{21-16} = imm9{8-3};
	let Inst{15-13} = 0b000;
	let Inst{12-10} = imm9{2-0};
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pt;

	let mayLoad = 1;
	}

	// Defines the instruction and a printable alias (priority 1) that omits
	// the immediate offset, encoding it as 0.
	multiclass sve_mem_p_fill<string asm> {
	def NAME : sve_mem_p_fill<asm>;

	def : InstAlias<asm # "\t$Pt, [$Rn]",
	(!cast<Instruction>(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>;
	}

	// Load format "$Zt, $Pg/z, [$Zn, $Rm]": vector base $Zn plus scalar
	// offset $Rm, with the input operand list supplied through iops.
	// The 5-bit opc is scattered: opc{4} -> bit 30, opc{3-2} -> bits 24-23,
	// opc{1-0} -> bits 14-13.
	// (This revision renames the class from sve2_mem_cldnt_vs_base to
	// sve2_mem_gldnt_vs_base; the body is unchanged.)
	-class sve2_mem_cldnt_vs_base<bits<5> opc, dag iops, string asm,
	+class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
	RegisterOperand VecList>
	: I<(outs VecList:$Zt), iops,
	asm, "\t$Zt, $Pg/z, [$Zn, $Rm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rm;
	bits<5> Zn;
	bits<5> Zt;
	let Inst{31} = 0b1;
	let Inst{30} = opc{4};
	let Inst{29-25} = 0b00010;
	let Inst{24-23} = opc{3-2};
	let Inst{22-21} = 0b00;
	let Inst{20-16} = Rm;
	let Inst{15} = 0b1;
	let Inst{14-13} = opc{1-0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	}

	// Defines NAME # _REAL (list-typed $Zt, GPR64 offset) plus aliases:
	//   - plain zprty $Zt with explicit $Rm                  - priority 0
	//   - plain zprty $Zt, offset omitted (XZR is encoded)   - priority 0
	//   - list-typed $Zt, offset omitted (XZR is encoded)    - priority 1
	// (This revision renames cldnt -> gldnt and removes the priority-0 alias
	// that took a list-typed $Zt with an explicit $Rm.)
	-multiclass sve2_mem_cldnt_vs<bits<5> opc, string asm,
	+multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
	RegisterOperand listty, ZPRRegOp zprty> {
	- def _REAL : sve2_mem_cldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
	+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
	asm, listty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
	- def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
	- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Memory - 64-bit Gather Group
	//===----------------------------------------------------------------------===//

	// Load format "$Zt, $Pg/z, [$Rn, $Zm]" with a Z_d destination: scalar
	// base plus a vector of offsets of operand type zprext.
	// opc is split as opc{3-2} -> bits 24-23 and opc{1-0} -> bits 14-13; when
	// opc{0} is 1 the instruction both uses and defines FFR.
	// bit xs is '1' if offsets are signed
	// bit scaled is '1' if the offsets are scaled
	// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
	class sve_mem_64b_gld_sv<bits<4> opc, bit xs, bit scaled, bit lsl, string asm,
	RegisterOperand zprext>
	: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$Zt, $Pg/z, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<5> Zt;
	let Inst{31-25} = 0b1100010;
	let Inst{24-23} = opc{3-2};
	let Inst{22} = xs;
	let Inst{21} = scaled;
	let Inst{20-16} = Zm;
	let Inst{15} = lsl;
	let Inst{14-13} = opc{1-0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	let Defs = !if(!eq(opc{0}, 1), [FFR], []);
	let Uses = !if(!eq(opc{0}, 1), [FFR], []);
	}

	// Scaled, extended-offset instantiations (scaled = 1, lsl = 0):
	// _UXTW_SCALED_REAL uses xs = 0 with uxtw_opnd, _SXTW_SCALED_REAL uses
	// xs = 1 with sxtw_opnd. Each gets a parse-only alias (priority 0) that
	// accepts ZPR64 for $Zt.
	multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 0, 1, 0, asm, uxtw_opnd>;
	def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 0, asm, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}

	// Unscaled, extended-offset instantiations (scaled = 0, lsl = 0):
	// _UXTW_REAL uses xs = 0 with uxtw_opnd, _SXTW_REAL uses xs = 1 with
	// sxtw_opnd. Each gets a parse-only alias (priority 0) that accepts ZPR64
	// for $Zt.
	multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_REAL : sve_mem_64b_gld_sv<opc, 0, 0, 0, asm, uxtw_opnd>;
	def _SXTW_REAL : sve_mem_64b_gld_sv<opc, 1, 0, 0, asm, sxtw_opnd>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
	}

	// Scaled, shifted-offset instantiation (xs = 1, scaled = 1, lsl = 1)
	// defined as NAME # _SCALED_REAL, plus a parse-only alias (priority 0)
	// that accepts ZPR64 for $Zt.
	multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
	RegisterOperand zprext> {
	def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
	}

	// Unscaled, shifted-offset instantiation (xs = 1, scaled = 0, lsl = 1)
	// with the offset operand fixed to ZPR64ExtLSL8, plus a parse-only alias
	// (priority 0) that accepts ZPR64 for $Zt.
	multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
	def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
	(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
	}

	// Load format "$Zt, $Pg/z, [$Zn, $imm5]": a ZPR64 vector base plus a 5-bit
	// immediate of operand type imm_ty (bits 20-16). The destination is Z_d.
	// opc is split as opc{3-2} -> bits 24-23 and opc{1-0} -> bits 14-13; when
	// opc{0} is 1 the instruction both uses and defines FFR.
	class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
	: I<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
	asm, "\t$Zt, $Pg/z, [$Zn, $imm5]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> Zt;
	bits<5> imm5;
	let Inst{31-25} = 0b1100010;
	let Inst{24-23} = opc{3-2};
	let Inst{22-21} = 0b01;
	let Inst{20-16} = imm5;
	let Inst{15} = 0b1;
	let Inst{14-13} = opc{1-0};
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zt;

	let mayLoad = 1;
	let Defs = !if(!eq(opc{0}, 1), [FFR], []);
	let Uses = !if(!eq(opc{0}, 1), [FFR], []);
	}

	// Defines NAME # _IMM_REAL and three aliases: ZPR64-typed $Zt without and
	// with an offset (priority 0, parse-only), and the Z_d-typed spelling with
	// the offset omitted (priority 1, printable). An omitted offset is
	// encoded as 0.
	multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty> {
	def _IMM_REAL : sve_mem_64b_gld_vi<opc, asm, imm_ty>;

	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $imm5]",
	(!cast<Instruction>(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
	def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
	(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
	}

	// Prefetch format "$prfop, $Pg, [$Rn, $Zm]": scalar base plus a vector of
	// offsets of operand type zprext. xs goes to bit 22, lsl to bit 15 and
	// msz to bits 14-13; bit 21 is fixed to 1.
	// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
	class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
	RegisterOperand zprext>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm),
	asm, "\t$prfop, $Pg, [$Rn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Rn;
	bits<5> Zm;
	bits<4> prfop;
	let Inst{31-23} = 0b110001000;
	let Inst{22} = xs;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15} = lsl;
	let Inst{14-13} = msz;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Rn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	let hasSideEffects = 1;
	}

	// Extended-offset instantiations (lsl = 0): _UXTW_SCALED uses xs = 0 with
	// uxtw_opnd, _SXTW_SCALED uses xs = 1 with sxtw_opnd. No aliases.
	multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
	RegisterOperand sxtw_opnd,
	RegisterOperand uxtw_opnd> {
	def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
	def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
	}

	// Shifted-offset instantiation (xs = 1, lsl = 1), defined under the defm
	// name itself. No aliases.
	multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
	RegisterOperand zprext> {
	def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
	}


	// Prefetch format "$prfop, $Pg, [$Zn, $imm5]": a ZPR64 vector base plus a
	// 5-bit immediate of operand type imm_ty (bits 20-16). There are no
	// outputs; the low four bits carry $prfop and bit 4 is fixed to zero.
	// msz occupies bits 24-23.
	class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
	: I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5),
	asm, "\t$prfop, $Pg, [$Zn, $imm5]",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zn;
	bits<5> imm5;
	bits<4> prfop;
	let Inst{31-25} = 0b1100010;
	let Inst{24-23} = msz;
	let Inst{22-21} = 0b00;
	let Inst{20-16} = imm5;
	let Inst{15-13} = 0b111;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = 0b0;
	let Inst{3-0} = prfop;

	let hasSideEffects = 1;
	}

	// Defines the instruction and a printable alias (priority 1) that omits
	// the immediate offset, encoding it as 0.
	multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
	def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;

	def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
	(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Compute Vector Address Group
	//===----------------------------------------------------------------------===//

	// Unpredicated format "$Zd, [$Zn, $Zm]" written with address-style
	// brackets; it sets no mayLoad/mayStore flag. $Zm uses the extended
	// operand type zprext.
	//   opc - placed in bits 23-22.
	//   msz - placed in bits 11-10.
	class sve_int_bin_cons_misc_0_a<bits<2> opc, bits<2> msz, string asm,
	ZPRRegOp zprty, RegisterOperand zprext>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprext:$Zm),
	asm, "\t$Zd, [$Zn, $Zm]",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-12} = 0b1010;
	let Inst{11-10} = msz;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Four ZPR64 variants, one per msz value 0..3 (suffix _0.._3), each paired
	// with the matching ZPR64ExtUXTW8/16/32/64 offset operand.
	multiclass sve_int_bin_cons_misc_0_a_uxtw<bits<2> opc, string asm> {
	def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtUXTW8>;
	def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtUXTW16>;
	def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtUXTW32>;
	def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtUXTW64>;
	}

	// Four ZPR64 variants, one per msz value 0..3 (suffix _0.._3), each paired
	// with the matching ZPR64ExtSXTW8/16/32/64 offset operand.
	multiclass sve_int_bin_cons_misc_0_a_sxtw<bits<2> opc, string asm> {
	def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtSXTW8>;
	def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtSXTW16>;
	def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtSXTW32>;
	def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtSXTW64>;
	}

	// Four ZPR32 variants, one per msz value 0..3 (suffix _0.._3), each paired
	// with the matching ZPR32ExtLSL8/16/32/64 offset operand.
	multiclass sve_int_bin_cons_misc_0_a_32_lsl<bits<2> opc, string asm> {
	def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR32, ZPR32ExtLSL8>;
	def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR32, ZPR32ExtLSL16>;
	def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR32, ZPR32ExtLSL32>;
	def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR32, ZPR32ExtLSL64>;
	}

	// Four ZPR64 variants, one per msz value 0..3 (suffix _0.._3), each paired
	// with the matching ZPR64ExtLSL8/16/32/64 offset operand.
	multiclass sve_int_bin_cons_misc_0_a_64_lsl<bits<2> opc, string asm> {
	def _0 : sve_int_bin_cons_misc_0_a<opc, 0b00, asm, ZPR64, ZPR64ExtLSL8>;
	def _1 : sve_int_bin_cons_misc_0_a<opc, 0b01, asm, ZPR64, ZPR64ExtLSL16>;
	def _2 : sve_int_bin_cons_misc_0_a<opc, 0b10, asm, ZPR64, ZPR64ExtLSL32>;
	def _3 : sve_int_bin_cons_misc_0_a<opc, 0b11, asm, ZPR64, ZPR64ExtLSL64>;
	}


	//===----------------------------------------------------------------------===//
	// SVE Integer Misc - Unpredicated Group
	//===----------------------------------------------------------------------===//

	// Unpredicated three-register format "$Zd, $Zn, $Zm" with all operands of
	// the same type zprty. sz occupies bits 23-22.
	class sve_int_bin_cons_misc_0_b<bits<2> sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-10} = 0b101100;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Instantiates _H, _S and _D only; no _B (sz = 0b00) variant is defined.
	multiclass sve_int_bin_cons_misc_0_b<string asm> {
	def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>;
	def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>;
	def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>;
	}

	// Unpredicated two-register format "$Zd, $Zn". The 8-bit opc is scattered
	// over the encoding: opc{7-6} -> bits 23-22, opc{5-1} -> bits 20-16 and
	// opc{0} -> bit 10.
	class sve_int_bin_cons_misc_0_c<bits<8> opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn),
	asm, "\t$Zd, $Zn",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = opc{7-6};
	let Inst{21} = 0b1;
	let Inst{20-16} = opc{5-1};
	let Inst{15-11} = 0b10111;
	let Inst{10} = opc{0};
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	//===----------------------------------------------------------------------===//
	// SVE Integer Reduction Group
	//===----------------------------------------------------------------------===//

	// Reduction format "$Vd, $Pg, $Zn": a predicated vector source producing
	// a scalar result in register class regtype.
	//   sz8_32 - element size code, bits 23-22.
	//   fmt    - bits 20-19.
	//   opc    - bits 18-16.
	class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm,
	ZPRRegOp zprty, RegisterClass regtype>
	: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
	asm, "\t$Vd, $Pg, $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Vd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_32;
	let Inst{21} = 0b0;
	let Inst{20-19} = fmt;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Vd;
	}

	// fmt = 0b00 reductions for _B/_H/_S; the result register is FPR64
	// regardless of element size. No _D variant is defined here.
	multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm> {
	def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
	def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
	def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
	}

	// fmt = 0b00 reductions for all four element sizes; the result register
	// is FPR64 regardless of element size.
	multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm> {
	def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>;
	def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>;
	def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>;
	def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>;
	}

	// fmt = 0b01 reductions for all four element sizes; the result register
	// class has the same width as the element (FPR8/16/32/64).
	multiclass sve_int_reduce_1<bits<3> opc, string asm> {
	def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>;
	def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>;
	def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>;
	def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>;
	}

	// fmt = 0b11 reductions for all four element sizes; the result register
	// class has the same width as the element (FPR8/16/32/64).
	multiclass sve_int_reduce_2<bits<3> opc, string asm> {
	def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>;
	def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>;
	def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>;
	def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>;
	}

	// Predicated format "$Zd, $Pg<pg_suffix>, $Zn". The predicate qualifier
	// text (pg_suffix) and the input operand list (iops) are supplied by the
	// instantiating multiclass, so the same encoding serves both the "/m" and
	// "/z" spellings. sz8_32 occupies bits 23-22 and opc bits 18-16.
	class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm,
	ZPRRegOp zprty, string pg_suffix, dag iops>
	: I<(outs zprty:$Zd), iops,
	asm, "\t$Zd, $Pg"#pg_suffix#", $Zn",
	"",
	[]>, Sched<[]> {
	bits<3> Pg;
	bits<5> Zd;
	bits<5> Zn;
	let Inst{31-24} = 0b00000100;
	let Inst{23-22} = sz8_32;
	let Inst{21-19} = 0b010;
	let Inst{18-16} = opc;
	let Inst{15-13} = 0b001;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;

	let ElementSize = zprty.ElementSize;
	}

	// "/m" spelling of sve_int_movprfx_pred for all four element sizes. Each
	// variant takes an extra $_Zd input that the enclosing `let` ties to the
	// destination $Zd.
	multiclass sve_int_movprfx_pred_merge<bits<3> opc, string asm> {
	let Constraints = "$Zd = $_Zd" in {
	def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/m",
	(ins ZPR8:$_Zd, PPR3bAny:$Pg, ZPR8:$Zn)>;
	def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/m",
	(ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR16:$Zn)>;
	def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/m",
	(ins ZPR32:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn)>;
	def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/m",
	(ins ZPR64:$_Zd, PPR3bAny:$Pg, ZPR64:$Zn)>;
	}
	}

	// "/z" spelling of sve_int_movprfx_pred for all four element sizes. There
	// is no tied input: the operand list is just the predicate and $Zn.
	multiclass sve_int_movprfx_pred_zero<bits<3> opc, string asm> {
	def _B : sve_int_movprfx_pred<0b00, opc, asm, ZPR8, "/z",
	(ins PPR3bAny:$Pg, ZPR8:$Zn)>;
	def _H : sve_int_movprfx_pred<0b01, opc, asm, ZPR16, "/z",
	(ins PPR3bAny:$Pg, ZPR16:$Zn)>;
	def _S : sve_int_movprfx_pred<0b10, opc, asm, ZPR32, "/z",
	(ins PPR3bAny:$Pg, ZPR32:$Zn)>;
	def _D : sve_int_movprfx_pred<0b11, opc, asm, ZPR64, "/z",
	(ins PPR3bAny:$Pg, ZPR64:$Zn)>;
	}

	//===----------------------------------------------------------------------===//
	// SVE Propagate Break Group
	//===----------------------------------------------------------------------===//

	// Predicate-only format "$Pd, $Pg/z, $Pn, $Pm" on PPR8 operands, governed
	// by a 4-bit predicate $Pg (PPRAny). The 2-bit opc is split: opc{1} ->
	// bit 22 and opc{0} -> bit 4. When opc{1} is 1 the instruction also
	// defines NZCV.
	class sve_int_brkp<bits<2> opc, string asm>
	: I<(outs PPR8:$Pd), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$Pm),
	asm, "\t$Pd, $Pg/z, $Pn, $Pm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pg;
	bits<4> Pm;
	bits<4> Pn;
	let Inst{31-24} = 0b00100101;
	let Inst{23} = 0b0;
	let Inst{22} = opc{1};
	let Inst{21-20} = 0b00;
	let Inst{19-16} = Pm;
	let Inst{15-14} = 0b11;
	let Inst{13-10} = Pg;
	let Inst{9} = 0b0;
	let Inst{8-5} = Pn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
	}


	//===----------------------------------------------------------------------===//
	// SVE Partition Break Group
	//===----------------------------------------------------------------------===//

	// Predicate-only format "$Pdm, $Pg/z, $Pn, $_Pdm". The last source is
	// tied to the destination, so $Pdm is encoded only once (bits 3-0) but
	// printed twice. S is placed in bit 22; when it is 1 the instruction also
	// defines NZCV.
	class sve_int_brkn<bit S, string asm>
	: I<(outs PPR8:$Pdm), (ins PPRAny:$Pg, PPR8:$Pn, PPR8:$_Pdm),
	asm, "\t$Pdm, $Pg/z, $Pn, $_Pdm",
	"",
	[]>, Sched<[]> {
	bits<4> Pdm;
	bits<4> Pg;
	bits<4> Pn;
	let Inst{31-23} = 0b001001010;
	let Inst{22} = S;
	let Inst{21-14} = 0b01100001;
	let Inst{13-10} = Pg;
	let Inst{9} = 0b0;
	let Inst{8-5} = Pn;
	let Inst{4} = 0b0;
	let Inst{3-0} = Pdm;

	let Constraints = "$Pdm = $_Pdm";
	let Defs = !if(!eq (S, 0b1), [NZCV], []);
	}

	// Generic encoding for a predicate instruction producing $Pd from $Pn under
	// governing predicate $Pg. The assembly predicate suffix and the input
	// operand list are supplied by the instantiating multiclass.
	//   opc{2-1} -> Inst{23-22}
	//   opc{0}   -> Inst{4}; when it is 1, $Pd is tied to a $_Pd input, so
	//               `iops` must then contain $_Pd (sve_int_break_m passes one).
	//   opc{1}   -> when it is 1, NZCV is added to Defs.
	class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
	: I<(outs PPR8:$Pd), iops,
	asm, "\t$Pd, $Pg"#suffix#", $Pn",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<4> Pg;
	bits<4> Pn;
	let Inst{31-24} = 0b00100101;
	let Inst{23-22} = opc{2-1};
	let Inst{21-14} = 0b01000001;
	let Inst{13-10} = Pg;
	let Inst{9} = 0b0;
	let Inst{8-5} = Pn;
	let Inst{4} = opc{0};
	let Inst{3-0} = Pd;

	// Tie the destination to $_Pd only when opc{0} is set; otherwise no
	// constraint is emitted.
	let Constraints = !if(!eq (opc{0}, 1), "$Pd = $_Pd", "");
	let Defs = !if(!eq (opc{1}, 1), [NZCV], []);

	}

	// "/m" form of sve_int_break: the input list starts with $_Pd, the operand
	// that sve_int_break ties to $Pd when opc{0} is 1.
	multiclass sve_int_break_m<bits<3> opc, string asm> {
	def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
	}

	// "/z" form of sve_int_break: inputs are only the governing predicate $Pg
	// and the source predicate $Pn (no $_Pd operand).
	multiclass sve_int_break_z<bits<3> opc, string asm> {
	def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 String Processing Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated vector-vector instruction that writes a
	// predicate result $Pd from vector sources $Zn and $Zm under $Pg/z.
	//   sz  -> Inst{22}
	//   opc -> Inst{4}
	// Pg is a 3-bit field (PPR3bAny operand) stored in Inst{12-10}.
	// NZCV is listed in Defs unconditionally.
	class sve2_char_match<bit sz, bit opc, string asm,
	PPRRegOp pprty, ZPRRegOp zprty>
	: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
	asm, "\t$Pd, $Pg/z, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<4> Pd;
	bits<3> Pg;
	bits<5> Zm;
	bits<5> Zn;
	let Inst{31-23} = 0b010001010;
	let Inst{22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b100;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4} = opc;
	let Inst{3-0} = Pd;

	let Defs = [NZCV];
	}

	// Instantiates the sve2_char_match class for two operand widths:
	// _B (sz = 0, PPR8/ZPR8) and _H (sz = 1, PPR16/ZPR16).
	multiclass sve2_char_match<bit opc, string asm> {
	def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
	def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Histogram Computation - Segment Group
	//===----------------------------------------------------------------------===//

	// Encoding for an unpredicated three-register instruction on ZPR8 operands:
	// $Zd is produced from $Zn and $Zm. There are no opcode parameters; every
	// bit outside the three 5-bit register fields is fixed.
	class sve2_hist_gen_segment<string asm>
	: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-21} = 0b01000101001;
	let Inst{20-16} = Zm;
	let Inst{15-10} = 0b101000;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Histogram Computation - Vector Group
	//===----------------------------------------------------------------------===//

	// Encoding for a predicated ($Pg/z) three-register vector instruction:
	// $Zd is produced from $Zn and $Zm, all of operand class `zprty`.
	//   sz -> Inst{22}
	// Pg is a 3-bit field (PPR3bAny operand) stored in Inst{12-10}.
	class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Pg/z, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<3> Pg;
	bits<5> Zm;
	let Inst{31-23} = 0b010001011;
	let Inst{22} = sz;
	let Inst{21} = 0b1;
	let Inst{20-16} = Zm;
	let Inst{15-13} = 0b110;
	let Inst{12-10} = Pg;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Instantiates the sve2_hist_gen_vector class for two operand widths:
	// _S (sz = 0, ZPR32) and _D (sz = 1, ZPR64).
	multiclass sve2_hist_gen_vector<string asm> {
	def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
	def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
	}

	//===----------------------------------------------------------------------===//
	// SVE2 Crypto Extensions Group
	//===----------------------------------------------------------------------===//

	// Encoding for a non-destructive binary vector instruction: $Zd is a
	// separate output from the $Zn and $Zm inputs (no tied operands).
	//   opc -> Inst{10}
	class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
	asm, "\t$Zd, $Zn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zd;
	bits<5> Zn;
	bits<5> Zm;
	let Inst{31-21} = 0b01000101001;
	let Inst{20-16} = Zm;
	let Inst{15-11} = 0b11110;
	let Inst{10} = opc;
	let Inst{9-5} = Zn;
	let Inst{4-0} = Zd;
	}

	// Encoding for a destructive binary vector instruction: the result $Zdn is
	// tied to the first input $_Zdn, so only Zdn and Zm are encoded.
	//   opc{1} -> Inst{16}
	//   opc{0} -> Inst{10}
	// Note that Zm occupies Inst{9-5} in this format.
	class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
	: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
	asm, "\t$Zdn, $_Zdn, $Zm",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	bits<5> Zm;
	let Inst{31-17} = 0b010001010010001;
	let Inst{16} = opc{1};
	let Inst{15-11} = 0b11100;
	let Inst{10} = opc{0};
	let Inst{9-5} = Zm;
	let Inst{4-0} = Zdn;

	// The output register is the same register as the $_Zdn input.
	let Constraints = "$Zdn = $_Zdn";
	}

	// Encoding for a destructive unary instruction on a ZPR8 operand: the
	// result $Zdn is tied to the single input $_Zdn, so Zdn is the only
	// register field. Inst{9-5} is fixed to zero.
	//   opc -> Inst{10}
	class sve2_crypto_unary_op<bit opc, string asm>
	: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn),
	asm, "\t$Zdn, $_Zdn",
	"",
	[]>, Sched<[]> {
	bits<5> Zdn;
	let Inst{31-11} = 0b010001010010000011100;
	let Inst{10} = opc;
	let Inst{9-5} = 0b00000;
	let Inst{4-0} = Zdn;

	// The output register is the same register as the $_Zdn input.
	let Constraints = "$Zdn = $_Zdn";
	}
	Index: vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/ARM/ARMISelLowering.cpp (revision 351303)
	@@ -1,15888 +1,15889 @@
	//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that ARM uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "ARMISelLowering.h"
	#include "ARMBaseInstrInfo.h"
	#include "ARMBaseRegisterInfo.h"
	#include "ARMCallingConv.h"
	#include "ARMConstantPoolValue.h"
	#include "ARMMachineFunctionInfo.h"
	#include "ARMPerfectShuffle.h"
	#include "ARMRegisterInfo.h"
	#include "ARMSelectionDAGInfo.h"
	#include "ARMSubtarget.h"
	#include "MCTargetDesc/ARMAddressingModes.h"
	#include "MCTargetDesc/ARMBaseInfo.h"
	#include "Utils/ARMBaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCInstrItineraries.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSchedule.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MachineValueType.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace llvm::PatternMatch;

	#define DEBUG_TYPE "arm-isel"

	// Statistics counters maintained by this file.
	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
	STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
	STATISTIC(NumConstpoolPromoted,
	"Number of constants with their storage promoted into constant pools");

	// Hidden boolean option "arm-interworking"; initialised to true.
	static cl::opt<bool>
	ARMInterworking("arm-interworking", cl::Hidden,
	cl::desc("Enable / disable ARM interworking (for debugging only)"),
	cl::init(true));

	// Hidden boolean option "arm-promote-constant"; initialised to false (see
	// the FIXME on the initialiser).
	static cl::opt<bool> EnableConstpoolPromotion(
	"arm-promote-constant", cl::Hidden,
	cl::desc("Enable / disable promotion of unnamed_addr constants into "
	"constant pools"),
	cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
	// Hidden limit on the size of a single promoted constant; initialised to 64.
	// NOTE(review): the unit is presumably bytes -- confirm at the use site.
	static cl::opt<unsigned> ConstpoolPromotionMaxSize(
	"arm-promote-constant-max-size", cl::Hidden,
	cl::desc("Maximum size of constant to promote into a constant pool"),
	cl::init(64));
	// Hidden limit on the combined size of all promoted constants; initialised
	// to 128.
	static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
	"arm-promote-constant-max-total", cl::Hidden,
	cl::desc("Maximum size of ALL constants to promote into a constant pool"),
	cl::init(128));

	// The APCS parameter registers.
	static const MCPhysReg GPRArgRegs[] = {
	ARM::R0, ARM::R1, ARM::R2, ARM::R3
	};

	// Set up the per-operation legalization actions for the vector type VT.
	//
	// VT                 - the vector type being configured.
	// PromotedLdStVT     - type that LOAD/STORE of VT are promoted to (nothing
	//                      is done for them when VT already is that type).
	// PromotedBitwiseVT  - type that AND/OR/XOR of an integer VT are promoted
	//                      to (nothing is done when VT already is that type).
	void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
	                                       MVT PromotedBitwiseVT) {
	  // Memory accesses go through the promoted load/store type.
	  if (VT != PromotedLdStVT) {
	    for (unsigned MemOpc : {ISD::LOAD, ISD::STORE}) {
	      setOperationAction(MemOpc, VT, Promote);
	      AddPromotedToType(MemOpc, VT, PromotedLdStVT);
	    }
	  }

	  const MVT EltVT = VT.getVectorElementType();
	  const bool IsIntegerVT = VT.isInteger();

	  // Comparisons get custom lowering for every element type except f64.
	  if (EltVT != MVT::f64)
	    setOperationAction(ISD::SETCC, VT, Custom);

	  for (unsigned Opc : {ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT})
	    setOperationAction(Opc, VT, Custom);

	  // int <-> fp conversions: Custom for i32 elements, Expand otherwise.
	  const LegalizeAction ConvAction = (EltVT == MVT::i32) ? Custom : Expand;
	  for (unsigned Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
	                       ISD::FP_TO_UINT})
	    setOperationAction(Opc, VT, ConvAction);

	  for (unsigned Opc : {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE})
	    setOperationAction(Opc, VT, Custom);

	  for (unsigned Opc : {ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR})
	    setOperationAction(Opc, VT, Legal);

	  for (unsigned Opc : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT,
	                       ISD::SIGN_EXTEND_INREG})
	    setOperationAction(Opc, VT, Expand);

	  // Shifts are custom-lowered, but only for integer vectors.
	  if (IsIntegerVT) {
	    for (unsigned Opc : {ISD::SHL, ISD::SRA, ISD::SRL})
	      setOperationAction(Opc, VT, Custom);
	  }

	  // Promote all bit-wise operations.
	  if (IsIntegerVT && VT != PromotedBitwiseVT) {
	    for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR}) {
	      setOperationAction(Opc, VT, Promote);
	      AddPromotedToType(Opc, VT, PromotedBitwiseVT);
	    }
	  }

	  // Neon does not support vector divide/remainder operations.
	  for (unsigned Opc : {ISD::SDIV, ISD::UDIV, ISD::FDIV, ISD::SREM, ISD::UREM,
	                       ISD::FREM})
	    setOperationAction(Opc, VT, Expand);

	  // ABS and integer min/max are Legal for every non-floating-point type
	  // other than v2i64 and v1i64.
	  const bool SkipAbsMinMax =
	      VT.isFloatingPoint() || VT == MVT::v2i64 || VT == MVT::v1i64;
	  if (!SkipAbsMinMax) {
	    for (auto Opc : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
	      setOperationAction(Opc, VT, Legal);
	  }
	}

	// Register VT in ARM::DPRRegClass and apply the common NEON actions, with
	// loads/stores promoted to f64 and bitwise operations promoted to v2i32.
	void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
	  const MVT LdStVT = MVT::f64;
	  const MVT BitwiseVT = MVT::v2i32;

	  addRegisterClass(VT, &ARM::DPRRegClass);
	  addTypeForNEON(VT, LdStVT, BitwiseVT);
	}

	// Register VT in ARM::DPairRegClass and apply the common NEON actions, with
	// loads/stores promoted to v2f64 and bitwise operations promoted to v4i32.
	void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
	  const MVT LdStVT = MVT::v2f64;
	  const MVT BitwiseVT = MVT::v4i32;

	  addRegisterClass(VT, &ARM::DPairRegClass);
	  addTypeForNEON(VT, LdStVT, BitwiseVT);
	}

	// Mark every opcode below ISD::BUILTIN_OP_END as Expand for VT, then
	// re-enable the handful of trivial operations that stay Legal.
	void ARMTargetLowering::setAllExpand(MVT VT) {
	  for (unsigned Op = 0; Op != ISD::BUILTIN_OP_END; ++Op)
	    setOperationAction(Op, VT, Expand);

	  // These really simple operations are supported even on types where all
	  // the actual arithmetic has to be broken down into simpler operations or
	  // turned into library calls. This must run after the blanket Expand
	  // above so that it overrides it.
	  for (unsigned Op : {ISD::BITCAST, ISD::LOAD, ISD::STORE, ISD::UNDEF})
	    setOperationAction(Op, VT, Legal);
	}

	void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
	LegalizeAction Action) {
	setLoadExtAction(ISD::EXTLOAD, From, To, Action);
	setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
	setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
	}

	void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
	const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

	for (auto VT : IntTypes) {
	addRegisterClass(VT, &ARM::QPRRegClass);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);

	// No native support for these.
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);

	if (!HasMVEFP) {
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	}
	}

	const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
	for (auto VT : FloatTypes) {
	addRegisterClass(VT, &ARM::QPRRegClass);
	if (!HasMVEFP)
	setAllExpand(VT);

	// These are legal or custom whether we have MVE.fp or not
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);

	if (HasMVEFP) {
	setOperationAction(ISD::FMINNUM, VT, Legal);
	setOperationAction(ISD::FMAXNUM, VT, Legal);
	setOperationAction(ISD::FROUND, VT, Legal);

	// No native support for these.
	setOperationAction(ISD::FDIV, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FSQRT, VT, Expand);
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	}
	}

	// We 'support' these types up to bitcast/load/store level, regardless of
	// MVE integer-only / float support. Only doing FP data processing on the FP
	// vector types is inhibited at integer-only level.
	const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
	for (auto VT : LongTypes) {
	addRegisterClass(VT, &ARM::QPRRegClass);
	setAllExpand(VT);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	}
	// We can do bitwise operations on v2i64 vectors
	setOperationAction(ISD::AND, MVT::v2i64, Legal);
	setOperationAction(ISD::OR, MVT::v2i64, Legal);
	setOperationAction(ISD::XOR, MVT::v2i64, Legal);

	// It is legal to extload from v4i8 to v4i16 or v4i32.
	addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
	addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
	addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

	// Some truncating stores are legal too.
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}

	ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
	const ARMSubtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	RegInfo = Subtarget->getRegisterInfo();
	Itins = Subtarget->getInstrItineraryData();

	setBooleanContents(ZeroOrOneBooleanContent);
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
	!Subtarget->isTargetWatchOS()) {
	bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
	for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
	setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
	IsHFTarget ? CallingConv::ARM_AAPCS_VFP
	: CallingConv::ARM_AAPCS);
	}

	if (Subtarget->isTargetMachO()) {
	// Uses VFP for Thumb libfuncs if available.
	if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
	Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
	static const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const ISD::CondCode Cond;
	} LibraryCalls[] = {
	// Single-precision floating-point arithmetic.
	{ RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

	// Double-precision floating-point arithmetic.
	{ RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
	{ RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

	// Single-precision comparisons.
	{ RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
	{ RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
	{ RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
	{ RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
	{ RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
	{ RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
	{ RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
	{ RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ },

	// Double-precision comparisons.
	{ RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
	{ RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
	{ RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
	{ RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
	{ RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
	{ RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
	{ RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
	{ RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ },

	// Floating-point to integer conversions.
	// i64 conversions are done via library routines even when generating VFP
	// instructions, so use the same ones.
	{ RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
	{ RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

	// Conversions between floating types.
	{ RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
	{ RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

	// Integer to floating-point conversions.
	// i64 conversions are done via library routines even when generating VFP
	// instructions, so use the same ones.
	// FIXME: There appears to be some naming inconsistency in ARM libgcc:
	// e.g., __floatunsidf vs. __floatunssidfvfp.
	{ RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
	{ RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	if (LC.Cond != ISD::SETCC_INVALID)
	setCmpLibcallCC(LC.Op, LC.Cond);
	}
	}
	}

	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);

	// RTLIB
	if (Subtarget->isAAPCS_ABI() &&
	(Subtarget->isTargetAEABI() \|\| Subtarget->isTargetGNUAEABI() \|\|
	Subtarget->isTargetMuslAEABI() \|\| Subtarget->isTargetAndroid())) {
	static const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const CallingConv::ID CC;
	const ISD::CondCode Cond;
	} LibraryCalls[] = {
	// Double-precision floating-point arithmetic helper functions
	// RTABI chapter 4.1.2, Table 2
	{ RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Double-precision floating-point comparison helper functions
	// RTABI chapter 4.1.2, Table 3
	{ RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
	{ RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

	// Single-precision floating-point arithmetic helper functions
	// RTABI chapter 4.1.2, Table 4
	{ RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Single-precision floating-point comparison helper functions
	// RTABI chapter 4.1.2, Table 5
	{ RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
	{ RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
	{ RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },

	// Floating-point to integer conversions.
	// RTABI chapter 4.1.2, Table 6
	{ RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Conversions between floating types.
	// RTABI chapter 4.1.2, Table 7
	{ RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Integer to floating-point conversions.
	// RTABI chapter 4.1.2, Table 8
	{ RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Long long helper functions
	// RTABI chapter 4.2, Table 9
	{ RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

	// Integer division functions
	// RTABI chapter 4.3.1
	{ RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	if (LC.Cond != ISD::SETCC_INVALID)
	setCmpLibcallCC(LC.Op, LC.Cond);
	}

	// EABI dependent RTLIB
	if (TM.Options.EABIVersion == EABI::EABI4 \|\|
	TM.Options.EABIVersion == EABI::EABI5) {
	static const struct {
	const RTLIB::Libcall Op;
	const char *const Name;
	const CallingConv::ID CC;
	const ISD::CondCode Cond;
	} MemOpsLibraryCalls[] = {
	// Memory operations
	// RTABI chapter 4.3.4
	{ RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	{ RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
	};

	for (const auto &LC : MemOpsLibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	if (LC.Cond != ISD::SETCC_INVALID)
	setCmpLibcallCC(LC.Op, LC.Cond);
	}
	}
	}

	if (Subtarget->isTargetWindows()) {
	static const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const CallingConv::ID CC;
	} LibraryCalls[] = {
	{ RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
	{ RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	}
	}

	// Use divmod compiler-rt calls for iOS 5.0 and later.
	if (Subtarget->isTargetMachO() &&
	!(Subtarget->isTargetIOS() &&
	Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
	setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
	setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
	}

	// The half <-> float conversion functions are always soft-float on
	// non-watchos platforms, but are needed for some targets which use a
	// hard-float calling convention by default.
	if (!Subtarget->isTargetWatchABI()) {
	if (Subtarget->isAAPCS_ABI()) {
	setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
	setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
	setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
	} else {
	setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
	setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
	setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
	}
	}

	// In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
	// a __gnu_ prefix (which is the default).
	if (Subtarget->isTargetAEABI()) {
	static const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const CallingConv::ID CC;
	} LibraryCalls[] = {
	{ RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
	{ RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
	{ RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	}
	}

	if (Subtarget->isThumb1Only())
	addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
	else
	addRegisterClass(MVT::i32, &ARM::GPRRegClass);

	if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
	Subtarget->hasFPRegs()) {
	addRegisterClass(MVT::f32, &ARM::SPRRegClass);
	addRegisterClass(MVT::f64, &ARM::DPRRegClass);
	if (!Subtarget->hasVFP2Base())
	setAllExpand(MVT::f32);
	if (!Subtarget->hasFP64())
	setAllExpand(MVT::f64);
	}

	if (Subtarget->hasFullFP16()) {
	addRegisterClass(MVT::f16, &ARM::HPRRegClass);
	setOperationAction(ISD::BITCAST, MVT::i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::f16, Custom);

	setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
	}

	for (MVT VT : MVT::vector_valuetypes()) {
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	addAllExtLoads(VT, InnerVT, Expand);
	}

	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);
	}

	setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
	setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

	setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
	setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

	if (Subtarget->hasMVEIntegerOps())
	addMVEVectorTypes(Subtarget->hasMVEFloatOps());

	// Combine low-overhead loop intrinsics so that we can lower i1 types.
	if (Subtarget->hasLOB())
	setTargetDAGCombine(ISD::BRCOND);

	if (Subtarget->hasNEON()) {
	addDRTypeForNEON(MVT::v2f32);
	addDRTypeForNEON(MVT::v8i8);
	addDRTypeForNEON(MVT::v4i16);
	addDRTypeForNEON(MVT::v2i32);
	addDRTypeForNEON(MVT::v1i64);

	addQRTypeForNEON(MVT::v4f32);
	addQRTypeForNEON(MVT::v2f64);
	addQRTypeForNEON(MVT::v16i8);
	addQRTypeForNEON(MVT::v8i16);
	addQRTypeForNEON(MVT::v4i32);
	addQRTypeForNEON(MVT::v2i64);

	if (Subtarget->hasFullFP16()) {
	addQRTypeForNEON(MVT::v8f16);
	addDRTypeForNEON(MVT::v4f16);
	}
	}

	if (Subtarget->hasMVEIntegerOps() \|\| Subtarget->hasNEON()) {
	// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
	// none of Neon, MVE or VFP supports any arithmetic operations on it.
	setOperationAction(ISD::FADD, MVT::v2f64, Expand);
	setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
	setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
	// FIXME: Code duplication: FDIV and FREM are expanded always, see
	// ARMTargetLowering::addTypeForNEON method for details.
	setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
	setOperationAction(ISD::FREM, MVT::v2f64, Expand);
	// FIXME: Create unittest.
	// In other words, find a case where "copysign" appears in the DAG with
	// vector operands.
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
	// FIXME: Code duplication: SETCC has custom operation action, see
	// ARMTargetLowering::addTypeForNEON method for details.
	setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
	// FIXME: Create unittest for FNEG and for FABS.
	setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
	setOperationAction(ISD::FABS, MVT::v2f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
	setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
	setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
	setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
	setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
	setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
	setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
	setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
	setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
	// FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
	setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
	setOperationAction(ISD::FMA, MVT::v2f64, Expand);
	}

	if (Subtarget->hasNEON()) {
	// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
	// supported for v4f32.
	setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
	setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
	setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
	setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
	setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
	setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
	setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
	setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

	// Mark v2f32 intrinsics.
	setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
	setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
	setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
	setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
	setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
	setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
	setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
	setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
	setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
	setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
	setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

	// Neon does not support some operations on v1i64 and v2i64 types.
	setOperationAction(ISD::MUL, MVT::v1i64, Expand);
	// Custom handling for some quad-vector types to detect VMULL.
	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	// Custom handling for some vector types to avoid expensive expansions
	setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
	setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
	setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
	setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
	// Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
	// a destination type that is wider than the source, and nor does
	// it have a FP_TO_[SU]INT instruction with a narrower destination than
	// source.
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

	// NEON does not have single instruction CTPOP for vectors with element
	// types wider than 8-bits. However, custom lowering can leverage the
	// v8i8/v16i8 vcnt instruction.
	setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
	setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
	setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
	setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

	setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

	// NEON does not have single instruction CTTZ for vectors.
	setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
	setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
	setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
	setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

	setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

	// NEON only has FMA instructions as of VFP4.
	if (!Subtarget->hasVFP4Base()) {
	setOperationAction(ISD::FMA, MVT::v2f32, Expand);
	setOperationAction(ISD::FMA, MVT::v4f32, Expand);
	}

	setTargetDAGCombine(ISD::INTRINSIC_VOID);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::FP_TO_SINT);
	setTargetDAGCombine(ISD::FP_TO_UINT);
	setTargetDAGCombine(ISD::FDIV);
	setTargetDAGCombine(ISD::LOAD);

	// It is legal to extload from v4i8 to v4i16 or v4i32.
	for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
	MVT::v2i32}) {
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
	setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
	setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
	}
	}
	}

	if (Subtarget->hasNEON() \|\| Subtarget->hasMVEIntegerOps()) {
	setTargetDAGCombine(ISD::BUILD_VECTOR);
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
	}

	if (!Subtarget->hasFP64()) {
	// When targeting a floating-point unit with only single-precision
	// operations, f64 is legal for the few double-precision instructions which
	// are present. However, no double-precision operations other than moves,
	// loads and stores are provided by the hardware.
	setOperationAction(ISD::FADD, MVT::f64, Expand);
	setOperationAction(ISD::FSUB, MVT::f64, Expand);
	setOperationAction(ISD::FMUL, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FDIV, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FNEG, MVT::f64, Expand);
	setOperationAction(ISD::FABS, MVT::f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::f64, Expand);
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FLOG, MVT::f64, Expand);
	setOperationAction(ISD::FLOG2, MVT::f64, Expand);
	setOperationAction(ISD::FLOG10, MVT::f64, Expand);
	setOperationAction(ISD::FEXP, MVT::f64, Expand);
	setOperationAction(ISD::FEXP2, MVT::f64, Expand);
	setOperationAction(ISD::FCEIL, MVT::f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
	setOperationAction(ISD::FRINT, MVT::f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	}

	if (!Subtarget->hasFP64() \|\| !Subtarget->hasFPARMv8Base()){
	setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
	}

	if (!Subtarget->hasFP16())
	setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);

	if (!Subtarget->hasFP64())
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);

	computeRegisterProperties(Subtarget->getRegisterInfo());

	// ARM does not have floating-point extending loads.
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	}

	// ... or truncating stores
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);

	// ARM does not have i1 sign extending load.
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// ARM supports all 4 flavors of integer indexed load / store.
	if (!Subtarget->isThumb1Only()) {
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, MVT::i1, Legal);
	setIndexedLoadAction(im, MVT::i8, Legal);
	setIndexedLoadAction(im, MVT::i16, Legal);
	setIndexedLoadAction(im, MVT::i32, Legal);
	setIndexedStoreAction(im, MVT::i1, Legal);
	setIndexedStoreAction(im, MVT::i8, Legal);
	setIndexedStoreAction(im, MVT::i16, Legal);
	setIndexedStoreAction(im, MVT::i32, Legal);
	}
	} else {
	// Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
	setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
	setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
	}

	setOperationAction(ISD::SADDO, MVT::i32, Custom);
	setOperationAction(ISD::UADDO, MVT::i32, Custom);
	setOperationAction(ISD::SSUBO, MVT::i32, Custom);
	setOperationAction(ISD::USUBO, MVT::i32, Custom);

	setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
	setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);

	// i64 operation support.
	setOperationAction(ISD::MUL, MVT::i64, Expand);
	setOperationAction(ISD::MULHU, MVT::i32, Expand);
	if (Subtarget->isThumb1Only()) {
	setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
	}
	if (Subtarget->isThumb1Only() \|\| !Subtarget->hasV6Ops()
	\|\| (Subtarget->isThumb2() && !Subtarget->hasDSP()))
	setOperationAction(ISD::MULHS, MVT::i32, Expand);

	setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
	setOperationAction(ISD::SRL, MVT::i64, Custom);
	setOperationAction(ISD::SRA, MVT::i64, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);

	// MVE lowers 64 bit shifts to lsll and lsrl
	// assuming that ISD::SRL and SRA of i64 are already marked custom
	if (Subtarget->hasMVEIntegerOps())
	setOperationAction(ISD::SHL, MVT::i64, Custom);

	// Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
	if (Subtarget->isThumb1Only()) {
	setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
	setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
	setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
	}

	if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

	// ARM does not have ROTL.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	}
	setOperationAction(ISD::CTTZ, MVT::i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::i32, Expand);
	if (!Subtarget->hasV5TOps() \|\| Subtarget->isThumb1Only()) {
	setOperationAction(ISD::CTLZ, MVT::i32, Expand);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
	}

	// @llvm.readcyclecounter requires the Performance Monitors extension.
	// Default to the 0 expansion on unsupported platforms.
	// FIXME: Technically there are older ARM CPUs that have
	// implementation-specific ways of obtaining this information.
	if (Subtarget->hasPerfMon())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

	// Only ARMv6 has BSWAP.
	if (!Subtarget->hasV6Ops())
	setOperationAction(ISD::BSWAP, MVT::i32, Expand);

	bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
	: Subtarget->hasDivideInARMMode();
	if (!hasDivide) {
	// These are expanded into libcalls if the cpu doesn't have HW divider.
	setOperationAction(ISD::SDIV, MVT::i32, LibCall);
	setOperationAction(ISD::UDIV, MVT::i32, LibCall);
	}

	if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
	setOperationAction(ISD::SDIV, MVT::i32, Custom);
	setOperationAction(ISD::UDIV, MVT::i32, Custom);

	setOperationAction(ISD::SDIV, MVT::i64, Custom);
	setOperationAction(ISD::UDIV, MVT::i64, Custom);
	}

	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);

	// Register based DivRem for AEABI (RTABI 4.2)
	if (Subtarget->isTargetAEABI() \|\| Subtarget->isTargetAndroid() \|\|
	Subtarget->isTargetGNUAEABI() \|\| Subtarget->isTargetMuslAEABI() \|\|
	Subtarget->isTargetWindows()) {
	setOperationAction(ISD::SREM, MVT::i64, Custom);
	setOperationAction(ISD::UREM, MVT::i64, Custom);
	HasStandaloneRem = false;

	if (Subtarget->isTargetWindows()) {
	const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const CallingConv::ID CC;
	} LibraryCalls[] = {
	{ RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

	{ RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	}
	} else {
	const struct {
	const RTLIB::Libcall Op;
	const char * const Name;
	const CallingConv::ID CC;
	} LibraryCalls[] = {
	{ RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

	{ RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
	{ RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
	};

	for (const auto &LC : LibraryCalls) {
	setLibcallName(LC.Op, LC.Name);
	setLibcallCallingConv(LC.Op, LC.CC);
	}
	}

	setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
	} else {
	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	}

	if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
	for (auto &VT : {MVT::f32, MVT::f64})
	setOperationAction(ISD::FPOWI, VT, Custom);

	setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
	setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
	setOperationAction(ISD::BlockAddress, MVT::i32, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// Use the default implementation.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Expand);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);
	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	if (Subtarget->isTargetWindows())
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
	else
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

	// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
	// the default expansion.
	InsertFencesForAtomic = false;
	if (Subtarget->hasAnyDataBarrier() &&
	(!Subtarget->isThumb() \|\| Subtarget->hasV8MBaselineOps())) {
	// ATOMIC_FENCE needs custom lowering; the others should have been expanded
	// to ldrex/strex loops already.
	setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
	if (!Subtarget->isThumb() \|\| !Subtarget->isMClass())
	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

	// On v8, we have particularly efficient implementations of atomic fences
	// if they can be combined with nearby atomic loads and stores.
	if (!Subtarget->hasAcquireRelease() \|\|
	getTargetMachine().getOptLevel() == 0) {
	// Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
	InsertFencesForAtomic = true;
	}
	} else {
	// If there's anything we can use as a barrier, go through custom lowering
	// for ATOMIC_FENCE.
	// If target has DMB in thumb, Fences can be inserted.
	if (Subtarget->hasDataBarrier())
	InsertFencesForAtomic = true;

	setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
	Subtarget->hasAnyDataBarrier() ? Custom : Expand);

	// Set them all for expansion, which will force libcalls.
	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
	// Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
	// Unordered/Monotonic case.
	if (!InsertFencesForAtomic) {
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
	}
	}

	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	// Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
	if (!Subtarget->hasV6Ops()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
	}
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

	if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
	!Subtarget->isThumb1Only()) {
	// Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
	// iff target supports vfp2.
	setOperationAction(ISD::BITCAST, MVT::i64, Custom);
	setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (Subtarget->useSjLjEH())
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	setOperationAction(ISD::SETCC, MVT::i32, Expand);
	setOperationAction(ISD::SETCC, MVT::f32, Expand);
	setOperationAction(ISD::SETCC, MVT::f64, Expand);
	setOperationAction(ISD::SELECT, MVT::i32, Custom);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::SETCC, MVT::f16, Expand);
	setOperationAction(ISD::SELECT, MVT::f16, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
	}

	setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);

	setOperationAction(ISD::BRCOND, MVT::Other, Custom);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	if (Subtarget->hasFullFP16())
	setOperationAction(ISD::BR_CC, MVT::f16, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_JT, MVT::Other, Custom);

	// We don't support sin/cos/fmod/copysign/pow
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FSIN, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f32, Expand);
	if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
	!Subtarget->isThumb1Only()) {
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
	}
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f32, Expand);

	if (!Subtarget->hasVFP4Base()) {
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);
	}

	// Various VFP goodness
	if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
	// FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
	if (!Subtarget->hasFPARMv8Base() \|\| !Subtarget->hasFP64()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	}

	// fp16 is a special v7 extension that adds f16 <-> f32 conversions.
	if (!Subtarget->hasFP16()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}
	}

	// Use __sincos_stret if available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	// FP-ARMv8 implements a lot of rounding-like FP operations.
	if (Subtarget->hasFPARMv8Base()) {
	setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
	setOperationAction(ISD::FCEIL, MVT::f32, Legal);
	setOperationAction(ISD::FROUND, MVT::f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
	setOperationAction(ISD::FRINT, MVT::f32, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
	if (Subtarget->hasNEON()) {
	setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
	setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
	}

	if (Subtarget->hasFP64()) {
	setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::f64, Legal);
	setOperationAction(ISD::FROUND, MVT::f64, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
	setOperationAction(ISD::FRINT, MVT::f64, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
	}
	}

	// FP16 often needs to be promoted to call lib functions
	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::FREM, MVT::f16, Promote);
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
	setOperationAction(ISD::FPOWI, MVT::f16, Promote);
	setOperationAction(ISD::FPOW, MVT::f16, Promote);
	setOperationAction(ISD::FEXP, MVT::f16, Promote);
	setOperationAction(ISD::FEXP2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG, MVT::f16, Promote);
	setOperationAction(ISD::FLOG10, MVT::f16, Promote);
	setOperationAction(ISD::FLOG2, MVT::f16, Promote);

	setOperationAction(ISD::FROUND, MVT::f16, Legal);
	}

	if (Subtarget->hasNEON()) {
	// vmin and vmax aren't available in a scalar form, so we use
	// a NEON instruction with an undef lane instead.
	setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);

	if (Subtarget->hasFullFP16()) {
	setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
	setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);

	setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
	}
	}

	// We have target-specific dag combine patterns for the following nodes:
	// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::XOR);

	if (Subtarget->hasV6Ops())
	setTargetDAGCombine(ISD::SRL);
	if (Subtarget->isThumb1Only())
	setTargetDAGCombine(ISD::SHL);

	setStackPointerRegisterToSaveRestore(ARM::SP);

	if (Subtarget->useSoftFloat() \|\| Subtarget->isThumb1Only() \|\|
	!Subtarget->hasVFP2Base() \|\| Subtarget->hasMinSize())
	setSchedulingPreference(Sched::RegPressure);
	else
	setSchedulingPreference(Sched::Hybrid);

	//// temporary - rewrite interface to use type
	MaxStoresPerMemset = 8;
	MaxStoresPerMemsetOptSize = 4;
	MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 2;
	MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 2;

	// On ARM arguments smaller than 4 bytes are extended, so all arguments
	// are at least 4 bytes aligned.
	setMinStackArgumentAlignment(4);

	// Prefer likely predicted branches to selects on out-of-order cores.
	PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();

	setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());

	setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);

	if (Subtarget->isThumb() \|\| Subtarget->isThumb2())
	setTargetDAGCombine(ISD::ABS);
	}

	/// Answers the soft-float query by forwarding it to the subtarget this
	/// lowering object was configured with.
	bool ARMTargetLowering::useSoftFloat() const {
	  const bool SoftFloatInUse = Subtarget->useSoftFloat();
	  return SoftFloatInUse;
	}

	// FIXME: It might make sense to define the representative register class as the
	// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
	// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
	// SPR's representative would be DPR_VFP2. This should work well if register
	// pressure tracking were modified such that a register use would increment the
	// pressure of the register class's representative and all of its super
	// classes' representatives transitively. We have not implemented this because
	// of the difficulty prior to coalescing of modeling operand register classes
	// due to the common occurrence of cross class copies and subregister insertions
	// and extractions.
	std::pair<const TargetRegisterClass *, uint8_t>
	ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const {
	const TargetRegisterClass *RRC = nullptr;
	uint8_t Cost = 1;
	switch (VT.SimpleTy) {
	default:
	return TargetLowering::findRepresentativeClass(TRI, VT);
	// Use DPR as representative register class for all floating point
	// and vector types. Since there are 32 SPR registers and 32 DPR registers so
	// the cost is 1 for both f32 and f64.
	case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
	case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
	RRC = &ARM::DPRRegClass;
	// When NEON is used for SP, only half of the register file is available
	// because operations that define both SP and DP results will be constrained
	// to the VFP2 class (D0-D15). We currently model this constraint prior to
	// coalescing by double-counting the SP regs. See the FIXME above.
	if (Subtarget->useNEONForSinglePrecisionFP())
	Cost = 2;
	break;
	case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	case MVT::v4f32: case MVT::v2f64:
	RRC = &ARM::DPRRegClass;
	Cost = 2;
	break;
	case MVT::v4i64:
	RRC = &ARM::DPRRegClass;
	Cost = 4;
	break;
	case MVT::v8i64:
	RRC = &ARM::DPRRegClass;
	Cost = 8;
	break;
	}
	return std::make_pair(RRC, Cost);
	}

	const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((ARMISD::NodeType)Opcode) {
	case ARMISD::FIRST_NUMBER: break;
	case ARMISD::Wrapper: return "ARMISD::Wrapper";
	case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC";
	case ARMISD::WrapperJT: return "ARMISD::WrapperJT";
	case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
	case ARMISD::CALL: return "ARMISD::CALL";
	case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
	case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
	case ARMISD::BRCOND: return "ARMISD::BRCOND";
	case ARMISD::BR_JT: return "ARMISD::BR_JT";
	case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
	case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
	case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
	case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
	case ARMISD::CMP: return "ARMISD::CMP";
	case ARMISD::CMN: return "ARMISD::CMN";
	case ARMISD::CMPZ: return "ARMISD::CMPZ";
	case ARMISD::CMPFP: return "ARMISD::CMPFP";
	case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
	case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
	case ARMISD::FMSTAT: return "ARMISD::FMSTAT";

	case ARMISD::CMOV: return "ARMISD::CMOV";
	case ARMISD::SUBS: return "ARMISD::SUBS";

	case ARMISD::SSAT: return "ARMISD::SSAT";
	case ARMISD::USAT: return "ARMISD::USAT";

	case ARMISD::ASRL: return "ARMISD::ASRL";
	case ARMISD::LSRL: return "ARMISD::LSRL";
	case ARMISD::LSLL: return "ARMISD::LSLL";

	case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG";
	case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
	case ARMISD::RRX: return "ARMISD::RRX";

	case ARMISD::ADDC: return "ARMISD::ADDC";
	case ARMISD::ADDE: return "ARMISD::ADDE";
	case ARMISD::SUBC: return "ARMISD::SUBC";
	case ARMISD::SUBE: return "ARMISD::SUBE";

	case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
	case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
	case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
	case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
	case ARMISD::VMOVSR: return "ARMISD::VMOVSR";

	case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
	case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
	case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";

	case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN";

	case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";

	case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";

	case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";

	case ARMISD::PRELOAD: return "ARMISD::PRELOAD";

	case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
	case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";

	case ARMISD::VCEQ: return "ARMISD::VCEQ";
	case ARMISD::VCEQZ: return "ARMISD::VCEQZ";
	case ARMISD::VCGE: return "ARMISD::VCGE";
	case ARMISD::VCGEZ: return "ARMISD::VCGEZ";
	case ARMISD::VCLEZ: return "ARMISD::VCLEZ";
	case ARMISD::VCGEU: return "ARMISD::VCGEU";
	case ARMISD::VCGT: return "ARMISD::VCGT";
	case ARMISD::VCGTZ: return "ARMISD::VCGTZ";
	case ARMISD::VCLTZ: return "ARMISD::VCLTZ";
	case ARMISD::VCGTU: return "ARMISD::VCGTU";
	case ARMISD::VTST: return "ARMISD::VTST";

	case ARMISD::VSHLs: return "ARMISD::VSHLs";
	case ARMISD::VSHLu: return "ARMISD::VSHLu";
	case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM";
	case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM";
	case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM";
	case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM";
	case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM";
	case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM";
	case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM";
	case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM";
	case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM";
	case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM";
	case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM";
	case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM";
	case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM";
	case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM";
	case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM";
	case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM";
	case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM";
	case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
	case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
	case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM";
	case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM";
	case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM";
	case ARMISD::VDUP: return "ARMISD::VDUP";
	case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE";
	case ARMISD::VEXT: return "ARMISD::VEXT";
	case ARMISD::VREV64: return "ARMISD::VREV64";
	case ARMISD::VREV32: return "ARMISD::VREV32";
	case ARMISD::VREV16: return "ARMISD::VREV16";
	case ARMISD::VZIP: return "ARMISD::VZIP";
	case ARMISD::VUZP: return "ARMISD::VUZP";
	case ARMISD::VTRN: return "ARMISD::VTRN";
	case ARMISD::VTBL1: return "ARMISD::VTBL1";
	case ARMISD::VTBL2: return "ARMISD::VTBL2";
	case ARMISD::VMULLs: return "ARMISD::VMULLs";
	case ARMISD::VMULLu: return "ARMISD::VMULLu";
	case ARMISD::UMAAL: return "ARMISD::UMAAL";
	case ARMISD::UMLAL: return "ARMISD::UMLAL";
	case ARMISD::SMLAL: return "ARMISD::SMLAL";
	case ARMISD::SMLALBB: return "ARMISD::SMLALBB";
	case ARMISD::SMLALBT: return "ARMISD::SMLALBT";
	case ARMISD::SMLALTB: return "ARMISD::SMLALTB";
	case ARMISD::SMLALTT: return "ARMISD::SMLALTT";
	case ARMISD::SMULWB: return "ARMISD::SMULWB";
	case ARMISD::SMULWT: return "ARMISD::SMULWT";
	case ARMISD::SMLALD: return "ARMISD::SMLALD";
	case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
	case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
	case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
	case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
	case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
	case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
	case ARMISD::BFI: return "ARMISD::BFI";
	case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
	case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
	case ARMISD::VBSL: return "ARMISD::VBSL";
	case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
	case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
	case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
	case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
	case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
	case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
	case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
	case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
	case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
	case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
	case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
	case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
	case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
	case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
	case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
	case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
	case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
	case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
	case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
	case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
	case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
	case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
	case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
	case ARMISD::WLS: return "ARMISD::WLS";
	}
	return nullptr;
	}

	/// Return the value type produced by a SETCC on operands of type \p VT:
	/// vectors yield the same vector shape with integer elements, everything
	/// else yields the pointer type for \p DL.
	EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
	                                          EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();
	  return getPointerTy(DL);
	}

	/// getRegClassFor - Return the register class that should be used for the
	/// specified value type.
	///
	/// \p isDivergent is deliberately ignored: it is neither inspected here nor
	/// forwarded to the base-class lookup.
	const TargetRegisterClass *
	ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
	  (void)isDivergent;
	  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
	  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
	  // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
	  // MVE Q registers.
	  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
	    if (VT == MVT::v4i64)
	      return &ARM::QQPRRegClass;
	    if (VT == MVT::v8i64)
	      return &ARM::QQQQPRRegClass;
	  }
	  // Every other type uses the generic mapping.
	  return TargetLowering::getRegClassFor(VT);
	}

	// Memory intrinsics such as memcpy typically try to use LDM/STM when the
	// source/dest is aligned and the copy is large enough, so objects passed to
	// them are worth aligning. Returns true, filling in MinSize and PrefAlign,
	// only when CI is a memory intrinsic; otherwise both outputs are untouched.
	bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
	                                               unsigned &PrefAlign) const {
	  if (isa<MemIntrinsic>(CI)) {
	    MinSize = 8;
	    // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
	    // cycle faster than 4-byte aligned LDM.
	    const bool PrefersWideAlign =
	        Subtarget->hasV6Ops() && !Subtarget->isMClass();
	    PrefAlign = PrefersWideAlign ? 8 : 4;
	    return true;
	  }
	  return false;
	}

	// Build the ARM fast instruction selector by forwarding both arguments to
	// the ARM::createFastISel factory.
	FastISel *
	ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	                                  const TargetLibraryInfo *libInfo) const {
	  FastISel *Selector = ARM::createFastISel(funcInfo, libInfo);
	  return Selector;
	}

	/// Pick the scheduling preference for node \p N.
	///
	/// Returns Sched::ILP when \p N produces a floating-point or vector value,
	/// or when it is a machine node with at least one def whose first operand
	/// cycle in the itinerary exceeds 2. Returns Sched::RegPressure in every
	/// other case.
	Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
	  unsigned NumVals = N->getNumValues();
	  if (!NumVals)
	    return Sched::RegPressure;

	  for (unsigned i = 0; i != NumVals; ++i) {
	    EVT VT = N->getValueType(i);
	    // Glue and chain results say nothing about the data being produced.
	    if (VT == MVT::Glue || VT == MVT::Other)
	      continue;
	    if (VT.isFloatingPoint() || VT.isVector())
	      return Sched::ILP;
	  }

	  if (!N->isMachineOpcode())
	    return Sched::RegPressure;

	  // Loads are scheduled for latency even if their instruction itinerary
	  // is not available.
	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

	  if (MCID.getNumDefs() == 0)
	    return Sched::RegPressure;
	  // Only consult the itinerary when it actually holds data.
	  if (!Itins->isEmpty() &&
	      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
	    return Sched::ILP;

	  return Sched::RegPressure;
	}

	//===----------------------------------------------------------------------===//
	// Lowering Code
	//===----------------------------------------------------------------------===//

	/// True when \p Op is a logical shift right whose shift amount is the
	/// constant 16.
	static bool isSRL16(const SDValue &Op) {
	  if (Op.getOpcode() == ISD::SRL) {
	    auto ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
	  }
	  return false;
	}

	/// True when \p Op is an arithmetic shift right whose shift amount is the
	/// constant 16.
	static bool isSRA16(const SDValue &Op) {
	  if (Op.getOpcode() == ISD::SRA) {
	    auto ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
	  }
	  return false;
	}

	/// True when \p Op is a shift left whose shift amount is the constant 16.
	static bool isSHL16(const SDValue &Op) {
	  if (Op.getOpcode() == ISD::SHL) {
	    auto ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    return ShiftAmt && ShiftAmt->getZExtValue() == 16;
	  }
	  return false;
	}

	// Check for a signed 16-bit value. SRA is special-cased because that keeps
	// things simpler when also looking for SRAs that aren't sign extending a
	// smaller value; without it, some operations would need extra care with
	// the order of their checks.
	static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
	  // Anything that is not "sra x, 16" is judged by its known sign bits.
	  if (!isSRA16(Op))
	    return DAG.ComputeNumSignBits(Op) == 17;
	  // "sra x, 16" only counts when x is itself "shl y, 16".
	  return isSHL16(Op.getOperand(0));
	}

	/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
	/// Any condition code outside the ten handled below is unreachable.
	static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:
	    return ARMCC::EQ;
	  case ISD::SETNE:
	    return ARMCC::NE;
	  // Signed orderings.
	  case ISD::SETLT:
	    return ARMCC::LT;
	  case ISD::SETLE:
	    return ARMCC::LE;
	  case ISD::SETGT:
	    return ARMCC::GT;
	  case ISD::SETGE:
	    return ARMCC::GE;
	  // Unsigned orderings.
	  case ISD::SETULT:
	    return ARMCC::LO;
	  case ISD::SETULE:
	    return ARMCC::LS;
	  case ISD::SETUGT:
	    return ARMCC::HI;
	  case ISD::SETUGE:
	    return ARMCC::HS;
	  default:
	    break;
	  }
	  llvm_unreachable("Unknown condition code!");
	}

	/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
	///
	/// \p CondCode receives the primary ARM condition. \p CondCode2 receives a
	/// second condition for the predicates that need two (SETONE, SETUEQ) and
	/// ARMCC::AL otherwise. \p InvalidOnQNaN is cleared for the equality-style
	/// predicates and set for all others.
	static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
	                        ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
	  // Defaults; the cases below override them where required.
	  CondCode2 = ARMCC::AL;
	  InvalidOnQNaN = true;

	  switch (CC) {
	  // Predicates that clear InvalidOnQNaN.
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    InvalidOnQNaN = false;
	    CondCode = ARMCC::EQ;
	    return;
	  case ISD::SETNE:
	  case ISD::SETUNE:
	    InvalidOnQNaN = false;
	    CondCode = ARMCC::NE;
	    return;
	  case ISD::SETONE:
	    InvalidOnQNaN = false;
	    CondCode = ARMCC::MI;
	    CondCode2 = ARMCC::GT;
	    return;
	  case ISD::SETUEQ:
	    InvalidOnQNaN = false;
	    CondCode = ARMCC::EQ;
	    CondCode2 = ARMCC::VS;
	    return;

	  // Single-condition predicates that keep the defaults.
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    CondCode = ARMCC::GT;
	    return;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    CondCode = ARMCC::GE;
	    return;
	  case ISD::SETOLT:
	    CondCode = ARMCC::MI;
	    return;
	  case ISD::SETOLE:
	    CondCode = ARMCC::LS;
	    return;
	  case ISD::SETO:
	    CondCode = ARMCC::VC;
	    return;
	  case ISD::SETUO:
	    CondCode = ARMCC::VS;
	    return;
	  case ISD::SETUGT:
	    CondCode = ARMCC::HI;
	    return;
	  case ISD::SETUGE:
	    CondCode = ARMCC::PL;
	    return;
	  case ISD::SETLT:
	  case ISD::SETULT:
	    CondCode = ARMCC::LT;
	    return;
	  case ISD::SETLE:
	  case ISD::SETULE:
	    CondCode = ARMCC::LE;
	    return;
	  default:
	    break;
	  }
	  llvm_unreachable("Unknown FP condition!");
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// getEffectiveCallingConv - Get the effective calling convention, taking into
	/// account presence of floating point hardware and calling convention
	/// limitations, such as support for variadic functions.
	///
	/// Conventions not handled below are reported as a fatal error.
	CallingConv::ID
	ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
	                                           bool isVarArg) const {
	  switch (CC) {
	  default:
	    report_fatal_error("Unsupported calling convention");

	  // These conventions are used exactly as requested.
	  case CallingConv::ARM_AAPCS:
	  case CallingConv::ARM_APCS:
	  case CallingConv::GHC:
	  case CallingConv::PreserveMost:
	    return CC;

	  // The VFP variant is not used for variadic calls.
	  case CallingConv::ARM_AAPCS_VFP:
	  case CallingConv::Swift:
	    if (isVarArg)
	      return CallingConv::ARM_AAPCS;
	    return CallingConv::ARM_AAPCS_VFP;

	  case CallingConv::C: {
	    if (!Subtarget->isAAPCS_ABI())
	      return CallingConv::ARM_APCS;
	    // Hard-float AAPCS needs VFP hardware, a non-Thumb1 target, the hard
	    // float ABI and a non-variadic callee.
	    const bool UseHardFloat =
	        Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
	        getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
	        !isVarArg;
	    if (UseHardFloat)
	      return CallingConv::ARM_AAPCS_VFP;
	    return CallingConv::ARM_AAPCS;
	  }

	  case CallingConv::Fast:
	  case CallingConv::CXX_FAST_TLS: {
	    const bool CanUseVFP = Subtarget->hasVFP2Base() &&
	                           !Subtarget->isThumb1Only() && !isVarArg;
	    if (Subtarget->isAAPCS_ABI()) {
	      if (CanUseVFP)
	        return CallingConv::ARM_AAPCS_VFP;
	      return CallingConv::ARM_AAPCS;
	    }
	    if (CanUseVFP)
	      return CallingConv::Fast;
	    return CallingConv::ARM_APCS;
	  }
	  }
	}

	/// Select the argument-assignment function for calls using convention
	/// \p CC; forwards to CCAssignFnForNode with Return = false.
	CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                 bool isVarArg) const {
	  const bool ForReturn = false;
	  return CCAssignFnForNode(CC, ForReturn, isVarArg);
	}

	/// Select the return-value assignment function for convention \p CC;
	/// forwards to CCAssignFnForNode with Return = true.
	CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
	                                                   bool isVarArg) const {
	  const bool ForReturn = true;
	  return CCAssignFnForNode(CC, ForReturn, isVarArg);
	}

	/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
	/// CallingConvention.
	///
	/// \p CC is first mapped through getEffectiveCallingConv. \p Return picks
	/// the return-value table rather than the argument table.
	CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
	                                                 bool Return,
	                                                 bool isVarArg) const {
	  switch (getEffectiveCallingConv(CC, isVarArg)) {
	  default:
	    report_fatal_error("Unsupported calling convention");
	  case CallingConv::ARM_APCS:
	    if (Return)
	      return RetCC_ARM_APCS;
	    return CC_ARM_APCS;
	  // PreserveMost uses the same tables as plain AAPCS.
	  case CallingConv::ARM_AAPCS:
	  case CallingConv::PreserveMost:
	    if (Return)
	      return RetCC_ARM_AAPCS;
	    return CC_ARM_AAPCS;
	  case CallingConv::ARM_AAPCS_VFP:
	    if (Return)
	      return RetCC_ARM_AAPCS_VFP;
	    return CC_ARM_AAPCS_VFP;
	  case CallingConv::Fast:
	    if (Return)
	      return RetFastCC_ARM_APCS;
	    return FastCC_ARM_APCS;
	  case CallingConv::GHC:
	    if (Return)
	      return RetCC_ARM_APCS;
	    return CC_ARM_APCS_GHC;
	  }
	}

	/// LowerCallResult - Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// One value per result is appended to \p InVals and the updated chain is
	/// returned. When \p isThisReturn is set, the first result is taken directly
	/// from \p ThisVal instead of being copied out of a register.
	SDValue ARMTargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
	    SDValue ThisVal) const {
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));

	  // Copy one physical register out, threading chain and glue through it.
	  auto ReadReg = [&](unsigned Reg, MVT Ty) {
	    SDValue Copy = DAG.getCopyFromReg(Chain, dl, Reg, Ty, InFlag);
	    Chain = Copy.getValue(1);
	    InFlag = Copy.getValue(2);
	    return Copy;
	  };

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned i = 0; i != RVLocs.size(); ++i) {
	    CCValAssign VA = RVLocs[i];

	    // Pass 'this' value directly from the argument to return value, to avoid
	    // reg unit interference
	    if (i == 0 && isThisReturn) {
	      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
	             "unexpected return calling convention register assignment");
	      InVals.push_back(ThisVal);
	      continue;
	    }

	    // Build an f64 from the i32 at the current location and the i32 at the
	    // following one; advances both i and VA to that following location.
	    auto ReadF64 = [&]() {
	      SDValue Lo = ReadReg(VA.getLocReg(), MVT::i32);
	      VA = RVLocs[++i]; // skip ahead to next loc
	      SDValue Hi = ReadReg(VA.getLocReg(), MVT::i32);
	      if (!Subtarget->isLittle())
	        std::swap(Lo, Hi);
	      return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
	    };

	    SDValue Val;
	    if (!VA.needsCustom()) {
	      Val = ReadReg(VA.getLocReg(), VA.getLocVT());
	    } else {
	      // Handle f64 or half of a v2f64.
	      Val = ReadF64();

	      if (VA.getLocVT() == MVT::v2f64) {
	        // The first f64 becomes lane 0; a second register pair gives lane 1.
	        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
	        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
	                          DAG.getConstant(0, dl, MVT::i32));

	        VA = RVLocs[++i]; // skip ahead to next loc
	        SDValue Upper = ReadF64();
	        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Upper,
	                          DAG.getConstant(1, dl, MVT::i32));
	      }
	    }

	    // Only Full and BCvt location infos are expected for results.
	    if (VA.getLocInfo() == CCValAssign::BCvt)
	      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
	    else if (VA.getLocInfo() != CCValAssign::Full)
	      llvm_unreachable("Unknown loc info!");

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	/// LowerMemOpCallTo - Store the argument to the stack.
	///
	/// The store address is \p StackPtr plus the memory offset recorded in
	/// \p VA. \p Flags is not consulted.
	SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned SlotOffset = VA.getLocMemOffset();
	  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
	  SDValue SlotAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);
	  MachinePointerInfo SlotInfo =
	      MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset);
	  return DAG.getStore(Chain, dl, Arg, SlotAddr, SlotInfo);
	}

	void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
	SDValue Chain, SDValue &Arg,
	RegsToPassVector &RegsToPass,
	CCValAssign &VA, CCValAssign &NextVA,
	SDValue &StackPtr,
	SmallVectorImpl<SDValue> &MemOpChains,
	ISD::ArgFlagsTy Flags) const {
	SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
	DAG.getVTList(MVT::i32, MVT::i32), Arg);
	unsigned id = Subtarget->isLittle() ? 0 : 1;
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));

	if (NextVA.isRegLoc())
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
	else {
	assert(NextVA.isMemLoc());
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
	getPointerTy(DAG.getDataLayout()));

	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
	dl, DAG, NextVA,
	Flags));
	}
	}

	/// LowerCall - Lowering a call into a callseq_start <-
	/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
	/// nodes.
	///
	/// \param CLI    Description of the call. CLI.IsTailCall is updated in place
	///               when the call turns out not to be eligible for tail call
	///               optimization.
	/// \param InVals Receives the lowered result values (filled in by
	///               LowerCallResult for non-tail calls).
	/// \returns the ARMISD::TC_RETURN node for a tail call, otherwise the chain
	///          produced by LowerCallResult.
	SDValue
	ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	                             SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDLoc &dl = CLI.DL;
	  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	  SDValue Chain = CLI.Chain;
	  SDValue Callee = CLI.Callee;
	  bool &isTailCall = CLI.IsTailCall;
	  CallingConv::ID CallConv = CLI.CallConv;
	  bool doesNotRet = CLI.DoesNotReturn;
	  bool isVarArg = CLI.IsVarArg;

	  MachineFunction &MF = DAG.getMachineFunction();
	  bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
	  bool isThisReturn = false;
	  auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
	  bool PreferIndirect = false;

	  // Disable tail calls if they're not supported.
	  if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
	    isTailCall = false;

	  if (isa<GlobalAddressSDNode>(Callee)) {
	    // If we're optimizing for minimum size and the function is called three or
	    // more times in this block, we can improve codesize by calling indirectly
	    // as BLXr has a 16-bit encoding.
	    auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
	    if (CLI.CS) {
	      auto *BB = CLI.CS.getParent();
	      PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
	                       count_if(GV->users(), [&BB](const User *U) {
	                         return isa<Instruction>(U) &&
	                                cast<Instruction>(U)->getParent() == BB;
	                       }) > 2;
	    }
	  }
	  if (isTailCall) {
	    // Check if it's really possible to do a tail call.
	    isTailCall = IsEligibleForTailCallOptimization(
	        Callee, CallConv, isVarArg, isStructRet,
	        MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
	        PreferIndirect);
	    if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
	      report_fatal_error("failed to perform tail call elimination on a call "
	                         "site marked musttail");
	    // We don't support GuaranteedTailCallOpt for ARM, only automatically
	    // detected sibcalls.
	    if (isTailCall)
	      ++NumTailCalls;
	  }

	  // Analyze operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));

	  // Get a count of how many bytes are to be pushed on the stack.
	  unsigned NumBytes = CCInfo.getNextStackOffset();

	  if (isTailCall) {
	    // For tail calls, memory operands are available in our caller's stack.
	    NumBytes = 0;
	  } else {
	    // Adjust the stack pointer for the new arguments...
	    // These operations are automatically eliminated by the prolog/epilog pass
	    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
	  }

	  SDValue StackPtr =
	      DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));

	  RegsToPassVector RegsToPass;
	  SmallVector<SDValue, 8> MemOpChains;

	  // Walk the register/memloc assignments, inserting copies/loads. In the case
	  // of tail call optimization, arguments are handled later.
	  //
	  // i indexes ArgLocs and realArgIdx indexes Outs/OutVals; they diverge
	  // because a custom-lowered f64/v2f64 argument consumes several locations.
	  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
	       i != e;
	       ++i, ++realArgIdx) {
	    CCValAssign &VA = ArgLocs[i];
	    SDValue Arg = OutVals[realArgIdx];
	    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
	    bool isByVal = Flags.isByVal();

	    // Promote the value if needed.
	    switch (VA.getLocInfo()) {
	    default: llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full: break;
	    case CCValAssign::SExt:
	      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::ZExt:
	      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::AExt:
	      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::BCvt:
	      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
	      break;
	    }

	    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
	    if (VA.needsCustom()) {
	      if (VA.getLocVT() == MVT::v2f64) {
	        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
	                                  DAG.getConstant(0, dl, MVT::i32));
	        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
	                                  DAG.getConstant(1, dl, MVT::i32));

	        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
	                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);

	        // NOTE(review): VA is a reference, so this assignment copies the next
	        // location over the ArgLocs entry VA is bound to rather than
	        // rebinding VA. That entry is not read again in this function, so
	        // this looks harmless - confirm before relying on ArgLocs afterwards.
	        VA = ArgLocs[++i]; // skip ahead to next loc
	        if (VA.isRegLoc()) {
	          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
	                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
	        } else {
	          assert(VA.isMemLoc());

	          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
	                                                 dl, DAG, VA, Flags));
	        }
	      } else {
	        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
	                         StackPtr, MemOpChains, Flags);
	      }
	    } else if (VA.isRegLoc()) {
	      // A first argument marked 'returned' (and not swiftself) of type i32
	      // is treated as a 'this' return; see the register mask selection below.
	      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
	          Outs[0].VT == MVT::i32) {
	        assert(VA.getLocVT() == MVT::i32 &&
	               "unexpected calling convention register assignment");
	        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
	               "unexpected use of 'returned'");
	        isThisReturn = true;
	      }
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	    } else if (isByVal) {
	      assert(VA.isMemLoc());
	      // Number of registers holding the leading part of this aggregate.
	      unsigned offset = 0;

	      // True if this byval aggregate will be split between registers
	      // and memory.
	      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
	      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();

	      if (CurByValIdx < ByValArgsCount) {

	        unsigned RegBegin, RegEnd;
	        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);

	        EVT PtrVT =
	            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	        // Load one word per register in [RegBegin, RegEnd) from the
	        // aggregate. These i/j deliberately shadow the outer loop index.
	        unsigned int i, j;
	        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
	          SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
	          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
	          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
	                                     MachinePointerInfo(),
	                                     DAG.InferPtrAlignment(AddArg));
	          MemOpChains.push_back(Load.getValue(1));
	          RegsToPass.push_back(std::make_pair(j, Load));
	        }

	        // If the parameter extends beyond the register area, "offset"
	        // lets us compute the stack slot for the remaining part properly.
	        offset = RegEnd - RegBegin;

	        CCInfo.nextInRegsParam();
	      }

	      // Copy whatever did not fit in registers to the stack.
	      if (Flags.getByValSize() > 4*offset) {
	        auto PtrVT = getPointerTy(DAG.getDataLayout());
	        unsigned LocMemOffset = VA.getLocMemOffset();
	        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
	        SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
	        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
	        SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
	        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
	                                           MVT::i32);
	        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
	                                            MVT::i32);

	        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
	        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
	                                          Ops));
	      }
	    } else if (!isTailCall) {
	      assert(VA.isMemLoc());

	      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	                                             dl, DAG, VA, Flags));
	    }
	  }

	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	  // Build a sequence of copy-to-reg nodes chained together with token chain
	  // and flag operands which copy the outgoing args into the appropriate regs.
	  SDValue InFlag;
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	                             RegsToPass[i].second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	  // node so that legalize doesn't hack it.
	  bool isDirect = false;

	  const TargetMachine &TM = getTargetMachine();
	  const Module *Mod = MF.getFunction().getParent();
	  const GlobalValue *GV = nullptr;
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
	    GV = G->getGlobal();
	  bool isStub =
	      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();

	  bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
	  bool isLocalARMFunc = false;
	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
	  auto PtrVt = getPointerTy(DAG.getDataLayout());

	  if (Subtarget->genLongCalls()) {
	    assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
	           "long-calls codegen is not position independent!");
	    // Handle a global address or an external symbol. If it's not one of
	    // those, the target's already in a register, so we don't need to do
	    // anything extra.
	    if (isa<GlobalAddressSDNode>(Callee)) {
	      // Create a constant pool entry for the callee address
	      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
	      ARMConstantPoolValue *CPV =
	          ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);

	      // Get the address of the callee into a register
	      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
	      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
	      Callee = DAG.getLoad(
	          PtrVt, dl, DAG.getEntryNode(), CPAddr,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
	      const char *Sym = S->getSymbol();

	      // Create a constant pool entry for the callee address
	      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
	      ARMConstantPoolValue *CPV =
	          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
	                                        ARMPCLabelIndex, 0);
	      // Get the address of the callee into a register
	      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
	      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
	      Callee = DAG.getLoad(
	          PtrVt, dl, DAG.getEntryNode(), CPAddr,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    }
	  } else if (isa<GlobalAddressSDNode>(Callee)) {
	    // When an indirect call is preferred, Callee is left untouched and
	    // isDirect stays false.
	    if (!PreferIndirect) {
	      isDirect = true;
	      bool isDef = GV->isStrongDefinitionForLinker();

	      // ARM call to a local ARM function is predicable.
	      isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
	      // tBX takes a register source operand.
	      if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
	        assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
	        Callee = DAG.getNode(
	            ARMISD::WrapperPIC, dl, PtrVt,
	            DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
	        Callee = DAG.getLoad(
	            PtrVt, dl, DAG.getEntryNode(), Callee,
	            MachinePointerInfo::getGOT(DAG.getMachineFunction()),
	            /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
	                                     MachineMemOperand::MOInvariant);
	      } else if (Subtarget->isTargetCOFF()) {
	        assert(Subtarget->isTargetWindows() &&
	               "Windows is the only supported COFF target");
	        unsigned TargetFlags = GV->hasDLLImportStorageClass()
	                                   ? ARMII::MO_DLLIMPORT
	                                   : ARMII::MO_NO_FLAG;
	        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
	                                            TargetFlags);
	        // dllimport callees are reached through a load of the wrapped
	        // address.
	        if (GV->hasDLLImportStorageClass())
	          Callee =
	              DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
	                          DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
	                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	      } else {
	        Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
	      }
	    }
	  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	    isDirect = true;
	    // tBX takes a register source operand.
	    const char *Sym = S->getSymbol();
	    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
	      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
	      ARMConstantPoolValue *CPV =
	          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
	                                        ARMPCLabelIndex, 4);
	      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
	      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
	      Callee = DAG.getLoad(
	          PtrVt, dl, DAG.getEntryNode(), CPAddr,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
	      Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
	    } else {
	      Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
	    }
	  }

	  // FIXME: handle tail calls differently.
	  unsigned CallOpc;
	  if (Subtarget->isThumb()) {
	    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
	      CallOpc = ARMISD::CALL_NOLINK;
	    else
	      CallOpc = ARMISD::CALL;
	  } else {
	    if (!isDirect && !Subtarget->hasV5TOps())
	      CallOpc = ARMISD::CALL_NOLINK;
	    else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
	             // Emit regular call when code size is the priority
	             !Subtarget->hasMinSize())
	      // "mov lr, pc; b _foo" to avoid confusing the RSP
	      CallOpc = ARMISD::CALL_NOLINK;
	    else
	      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
	  }

	  std::vector<SDValue> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);

	  // Add argument registers to the end of the list so that they are known live
	  // into the call.
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	                                  RegsToPass[i].second.getValueType()));

	  // Add a register mask operand representing the call-preserved registers.
	  if (!isTailCall) {
	    const uint32_t *Mask;
	    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
	    if (isThisReturn) {
	      // For 'this' returns, use the R0-preserving mask if applicable
	      Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
	      if (!Mask) {
	        // Set isThisReturn to false if the calling convention is not one that
	        // allows 'returned' to be modeled in this way, so LowerCallResult does
	        // not try to pass 'this' straight through
	        isThisReturn = false;
	        Mask = ARI->getCallPreservedMask(MF, CallConv);
	      }
	    } else
	      Mask = ARI->getCallPreservedMask(MF, CallConv);

	    assert(Mask && "Missing call preserved mask for calling convention");
	    Ops.push_back(DAG.getRegisterMask(Mask));
	  }

	  if (InFlag.getNode())
	    Ops.push_back(InFlag);

	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  if (isTailCall) {
	    MF.getFrameInfo().setHasTailCall();
	    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
	  }

	  // Returns a chain and a flag for retval copy to use.
	  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
	  InFlag = Chain.getValue(1);

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	  if (!Ins.empty())
	    InFlag = Chain.getValue(1);

	  // Handle result values, copying them out of physregs into vregs that we
	  // return.
	  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	                         InVals, isThisReturn,
	                         isThisReturn ? OutVals[0] : SDValue());
	}

	/// HandleByVal - Once a byval parameter has been seen, every parameter that
	/// follows it is passed on the stack. Decide which argument registers (if
	/// any) carry the byval data, record that range in the CCState, consume the
	/// registers involved, and shrink \p Size to the number of bytes that still
	/// have to live in memory.
	void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
	                                    unsigned Align) const {
	  // Stack slots, byval ones included, are never less than 4-byte aligned.
	  Align = std::max(Align, 4U);

	  unsigned FirstReg = State->AllocateReg(GPRArgRegs);
	  if (!FirstReg)
	    return;

	  // Burn registers until the start register honours the alignment, which is
	  // expressed here in units of 4-byte registers.
	  const unsigned RegAlign = Align / 4;
	  const unsigned NumSkipped = (ARM::R4 - FirstReg) % RegAlign;
	  for (unsigned Skipped = 0; Skipped != NumSkipped; ++Skipped)
	    FirstReg = State->AllocateReg(GPRArgRegs);

	  if (!FirstReg)
	    return;

	  // Number of bytes that fit in the registers from FirstReg up to (but not
	  // including) r4.
	  const unsigned BytesInRegs = 4 * (ARM::R4 - FirstReg);

	  // If part of the argument area is already on the stack (NSAA != SP) and
	  // the parameter is larger than the remaining registers, the parameter may
	  // not be split: it goes to the stack in full, and all remaining registers
	  // are wasted so that NCRN ends up at r4.
	  const unsigned StackBytesSoFar = State->getNextStackOffset();
	  if (StackBytesSoFar != 0 && Size > BytesInRegs) {
	    while (State->AllocateReg(GPRArgRegs)) {
	    }
	    return;
	  }

	  // The byval data starts in FirstReg. The one-past-the-end register is
	  // FirstReg plus the size in registers when everything fits below r4, and
	  // r4 itself when the parameter is split between registers and stack.
	  const unsigned EndReg = std::min<unsigned>(FirstReg + Size / 4, ARM::R4);
	  State->addInRegsParamInfo(FirstReg, EndReg);

	  // FirstReg was already allocated above; claim the rest of the range.
	  for (unsigned R = FirstReg + 1; R != EndReg; ++R)
	    State->AllocateReg(GPRArgRegs);

	  // Report how many bytes remain in memory: the part beyond the registers
	  // for a split parameter, or zero when the whole structure is in registers.
	  Size = std::max<int>(Size - BytesInRegs, 0);
	}

	/// MatchingStackOffset - Tell whether an outgoing stack argument already
	/// sits at the same (relative) position in the caller's incoming argument
	/// area, i.e. it comes from a fixed frame object whose offset equals
	/// \p Offset and whose size equals the size of \p Arg.
	static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
	                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
	                                const MachineRegisterInfo *MRI,
	                                const TargetInstrInfo *TII) {
	  const unsigned ArgBytes = Arg.getValueSizeInBits() / 8;
	  int FI = std::numeric_limits<int>::max();

	  if (Arg.getOpcode() == ISD::CopyFromReg) {
	    // The value must come from a virtual register whose definition is a
	    // load from a stack slot; byval arguments never qualify on this path.
	    const unsigned VReg = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(VReg))
	      return false;
	    MachineInstr *DefMI = MRI->getVRegDef(VReg);
	    if (!DefMI || Flags.isByVal() || !TII->isLoadFromStackSlot(*DefMI, FI))
	      return false;
	  } else if (auto *Load = dyn_cast<LoadSDNode>(Arg)) {
	    // A byval argument is passed in as a pointer but is being dereferenced
	    // here, e.g.
	    //   define @foo(%struct.X* %A) {
	    //     tail call @bar(%struct.X* byval %A)
	    //   }
	    if (Flags.isByVal())
	      return false;
	    auto *FrameNode = dyn_cast<FrameIndexSDNode>(Load->getBasePtr());
	    if (!FrameNode)
	      return false;
	    FI = FrameNode->getIndex();
	  } else {
	    return false;
	  }

	  assert(FI != std::numeric_limits<int>::max());
	  if (!MFI.isFixedObjectIndex(FI))
	    return false;

	  const bool SameOffset = Offset == MFI.getObjectOffset(FI);
	  return SameOffset && ArgBytes == MFI.getObjectSize(FI);
	}

	/// IsEligibleForTailCallOptimization - Check whether the call is eligible
	/// for tail call optimization. Targets which want to do tail call
	/// optimization should implement this function.
	///
	/// The checks below are applied in order; the first one that fails makes
	/// the function return false, and true is returned only when all of them
	/// pass.
	bool ARMTargetLowering::IsEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    bool isCalleeStructRet, bool isCallerStructRet,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
	    const bool isIndirect) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &CallerF = MF.getFunction();
	  CallingConv::ID CallerCC = CallerF.getCallingConv();

	  assert(Subtarget->supportsTailCall());

	  // Indirect tail calls cannot be optimized for Thumb1 if the args
	  // to the call take up r0-r3. The reason is that there are no legal
	  // registers left to hold the pointer to the function to be called.
	  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
	      (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
	    return false;

	  // Look for obvious safe cases to perform tail call optimization that do
	  // not require ABI changes. This is what gcc calls sibcall.

	  // Exception-handling functions need a special set of instructions to
	  // indicate a return to the hardware. Tail-calling another function would
	  // probably break this.
	  if (CallerF.hasFnAttribute("interrupt"))
	    return false;

	  // Also avoid sibcall optimization if either caller or callee uses struct
	  // return semantics.
	  if (isCalleeStructRet || isCallerStructRet)
	    return false;

	  // Externally-defined functions with weak linkage should not be
	  // tail-called on ARM when the OS does not support dynamic
	  // pre-emption of symbols, as the AAELF spec requires normal calls
	  // to undefined weak functions to be replaced with a NOP or jump to the
	  // next instruction. The behaviour of branch instructions in this
	  // situation (as used for tail calls) is implementation-defined, so we
	  // cannot rely on the linker replacing the tail call with a return.
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();
	    const Triple &TT = getTargetMachine().getTargetTriple();
	    if (GV->hasExternalWeakLinkage() &&
	        (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
	         TT.isOSBinFormatMachO()))
	      return false;
	  }

	  // Check that the call results are passed in the same way.
	  LLVMContext &C = *DAG.getContext();
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  CCAssignFnForReturn(CalleeCC, isVarArg),
	                                  CCAssignFnForReturn(CallerCC, isVarArg)))
	    return false;

	  // The callee has to preserve all registers the caller needs to preserve.
	  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (CalleeCC != CallerCC) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // If Caller's vararg or byval argument has been split between registers
	  // and stack, do not perform tail call, since part of the argument is in
	  // caller's local frame.
	  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
	  if (AFI_Caller->getArgRegsSaveSize())
	    return false;

	  // If the callee takes no arguments then go on to check the results of the
	  // call.
	  if (!Outs.empty()) {
	    // Check if stack adjustment is needed. For now, do not do this if any
	    // argument is passed on the stack.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
	    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
	    if (CCInfo.getNextStackOffset()) {
	      // Check if the arguments are already laid out in the right way as
	      // the caller's fixed stack objects.
	      MachineFrameInfo &MFI = MF.getFrameInfo();
	      const MachineRegisterInfo *MRI = &MF.getRegInfo();
	      const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	      // i walks the assigned locations and may advance more than once per
	      // iteration for custom (split) values; realArgIdx advances exactly
	      // once per iteration and indexes Outs/OutVals.
	      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
	           i != e;
	           ++i, ++realArgIdx) {
	        CCValAssign &VA = ArgLocs[i];
	        EVT RegVT = VA.getLocVT();
	        SDValue Arg = OutVals[realArgIdx];
	        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
	        if (VA.getLocInfo() == CCValAssign::Indirect)
	          return false;
	        if (VA.needsCustom()) {
	          // f64 and vector types are split into multiple registers or
	          // register/stack-slot combinations. The types will not match
	          // the registers; give up on memory f64 refs until we figure
	          // out what to do about this.
	          if (!VA.isRegLoc())
	            return false;
	          if (!ArgLocs[++i].isRegLoc())
	            return false;
	          if (RegVT == MVT::v2f64) {
	            if (!ArgLocs[++i].isRegLoc())
	              return false;
	            if (!ArgLocs[++i].isRegLoc())
	              return false;
	          }
	        } else if (!VA.isRegLoc()) {
	          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	                                   MFI, MRI, TII))
	            return false;
	        }
	      }
	    }

	    const MachineRegisterInfo &MRI = MF.getRegInfo();
	    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	      return false;
	  }

	  return true;
	}

	/// Ask the calling-convention machinery whether the return values described
	/// by \p Outs can be returned under \p CallConv; the answer is whatever
	/// CCState::CheckReturn reports for the return assignment function.
	bool ARMTargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
	  auto *AssignFn = CCAssignFnForReturn(CallConv, isVarArg);
	  return ReturnState.CheckReturn(Outs, AssignFn);
	}

	/// Build the return node for a function carrying the "interrupt" attribute.
	///
	/// The attribute's string value selects an LR offset, which is inserted as
	/// operand #1 of \p RetOps (right after the first operand); the operands
	/// are then wrapped in an ARMISD::INTRET_FLAG node, which is returned.
	/// Any attribute value other than "", "IRQ", "FIQ", "ABORT", "SWI" or
	/// "UNDEF" is reported as a fatal error.
	static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
	                                    const SDLoc &DL, SelectionDAG &DAG) {
	  const MachineFunction &MF = DAG.getMachineFunction();
	  const Function &F = MF.getFunction();

	  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

	  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
	  // version of the "preferred return address". These offsets affect the
	  // return instruction if this is a return from PL1 without hypervisor
	  // extensions.
	  //    IRQ/FIQ: +4     "subs pc, lr, #4"
	  //    SWI:     0      "subs pc, lr, #0"
	  //    ABORT:   +4     "subs pc, lr, #4"
	  //    UNDEF:   +4/+2  "subs pc, lr, #0"
	  // UNDEF varies depending on where the exception came from ARM or Thumb
	  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

	  int64_t LROffset;
	  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
	      IntKind == "ABORT")
	    LROffset = 4;
	  else if (IntKind == "SWI" || IntKind == "UNDEF")
	    LROffset = 0;
	  else
	    report_fatal_error("Unsupported interrupt attribute. If present, value "
	                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

	  // The offset goes in right after the first operand of the return.
	  RetOps.insert(RetOps.begin() + 1,
	                DAG.getConstant(LROffset, DL, MVT::i32, false));

	  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
	}

	/// Lower an outgoing return.
	///
	/// Analyzes \p Outs with the return calling convention, copies every value
	/// in \p OutVals into the register assigned to it (gluing the copies
	/// together), appends any callee-saved registers that are preserved via
	/// copy, and returns the resulting return node: ARMISD::RET_FLAG normally,
	/// or the node built by LowerInterruptReturn for "interrupt" functions on
	/// subtargets that are not M-class. The "interrupt" attribute on a
	/// Thumb1-only subtarget is a fatal error.
	SDValue
	ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &dl, SelectionDAG &DAG) const {
	// CCValAssign - represent the assignment of the return value to a location.
	SmallVector<CCValAssign, 16> RVLocs;

	// CCState - Info about the registers and stack slots.
	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());

	// Analyze outgoing return values.
	CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

	// Flag carries the glue from one register copy to the next; RetOps collects
	// the operands of the final return node.
	SDValue Flag;
	SmallVector<SDValue, 4> RetOps;
	RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	bool isLittleEndian = Subtarget->isLittle();

	// Record how many return locations this function uses.
	MachineFunction &MF = DAG.getMachineFunction();
	ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
	AFI->setReturnRegsCount(RVLocs.size());

	// Copy the result values into the output registers.
	// i indexes RVLocs and is advanced additionally inside the loop body for
	// values split over several locations; realRVLocIdx indexes OutVals and
	// advances exactly once per iteration.
	for (unsigned i = 0, realRVLocIdx = 0;
	i != RVLocs.size();
	++i, ++realRVLocIdx) {
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");

	SDValue Arg = OutVals[realRVLocIdx];
	bool ReturnF16 = false;

	if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
	// Half-precision return values can be returned like this:
	//
	// t11: f16 = fadd ...
	// t12: i16 = bitcast t11
	// t13: i32 = zero_extend t12
	// t14: f32 = bitcast t13 <~~~~~~~ Arg
	//
	// to avoid code generation for bitcasts, we simply set Arg to the node
	// that produces the f16 value, t11 in this case.
	//
	if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
	SDValue ZE = Arg.getOperand(0);
	if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
	SDValue BC = ZE.getOperand(0);
	if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
	Arg = BC.getOperand(0);
	ReturnF16 = true;
	}
	}
	}
	}

	// Apply the conversion the location asks for. When the f16 pattern above
	// matched, Arg is used as-is and no bitcast is emitted.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::BCvt:
	if (!ReturnF16)
	Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
	break;
	}

	if (VA.needsCustom()) {
	if (VA.getLocVT() == MVT::v2f64) {
	// Extract the first half and return it in two registers.
	SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
	DAG.getConstant(0, dl, MVT::i32));
	SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
	DAG.getVTList(MVT::i32, MVT::i32), Half);

	// Which VMOVRRD result goes to which register depends on endianness.
	Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
	HalfGPRs.getValue(isLittleEndian ? 0 : 1),
	Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	// NOTE(review): VA is a reference to an RVLocs element, so this assignment
	// copies the next location over the element VA refers to rather than
	// rebinding VA. The same applies to the other "skip ahead" assignments.
	VA = RVLocs[++i]; // skip ahead to next loc
	Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
	HalfGPRs.getValue(isLittleEndian ? 1 : 0),
	Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	VA = RVLocs[++i]; // skip ahead to next loc

	// Extract the 2nd half and fall through to handle it as an f64 value.
	Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
	DAG.getConstant(1, dl, MVT::i32));
	}
	// Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
	// available.
	SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
	DAG.getVTList(MVT::i32, MVT::i32), Arg);
	Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
	fmrrd.getValue(isLittleEndian ? 0 : 1),
	Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	VA = RVLocs[++i]; // skip ahead to next loc
	// The glue and return operand for this last copy are handled by the code
	// shared with the non-custom case below.
	Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
	fmrrd.getValue(isLittleEndian ? 1 : 0),
	Flag);
	} else
	Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

	// Guarantee that all emitted copies are
	// stuck together, avoiding something bad.
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(),
	ReturnF16 ? MVT::f16 : VA.getLocVT()));
	}
	// Add the callee-saved registers that are preserved via copy (if the
	// target reports any) as operands of the return. The list is terminated
	// by a zero entry; only GPR and DPR registers are expected.
	const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
	const MCPhysReg *I =
	TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	if (I) {
	for (; *I; ++I) {
	if (ARM::GPRRegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::i32));
	else if (ARM::DPRRegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
	else
	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	}
	}

	// Update chain and glue.
	RetOps[0] = Chain;
	if (Flag.getNode())
	RetOps.push_back(Flag);

	// CPUs which aren't M-class use a special sequence to return from
	// exceptions (roughly, any instruction setting pc and cpsr simultaneously,
	// though we use "subs pc, lr, #N").
	//
	// M-class CPUs actually use a normal return sequence with a special
	// (hardware-provided) value in LR, so the normal code path works.
	if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
	!Subtarget->isMClass()) {
	if (Subtarget->isThumb1Only())
	report_fatal_error("interrupt attribute is not supported in Thumb1");
	return LowerInterruptReturn(RetOps, dl, DAG);
	}

	return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
	}

	/// Determine whether the single result of \p N is consumed only by a return
	/// node, looking through the nodes used to move it into return registers.
	///
	/// Three shapes are accepted for the sole user of N: a CopyToReg, an
	/// ARMISD::VMOVRRD whose users are all CopyToReg nodes (f64 returned in a
	/// pair of GPRs), or a BITCAST with a single CopyToReg user (f32 returned
	/// in a single GPR). A copy that carries a glue operand is rejected
	/// conservatively. Every user of the final copy must be ARMISD::RET_FLAG or
	/// ARMISD::INTRET_FLAG, and there must be at least one.
	///
	/// \param N     node whose value is being inspected; must produce exactly
	///              one value with exactly one use.
	/// \param Chain on success, set to the chain operand of the first copy.
	/// \returns true if the value is only used by a return.
	bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1)
	    return false;
	  if (!N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // The use iterator has to be dereferenced to obtain the user node itself.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe
	    // to perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
	    SDNode *VMov = Copy;
	    // f64 returned in a pair of GPRs.
	    SmallPtrSet<SDNode*, 2> Copies;
	    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
	         UI != UE; ++UI) {
	      if (UI->getOpcode() != ISD::CopyToReg)
	        return false;
	      Copies.insert(*UI);
	    }
	    if (Copies.size() > 2)
	      return false;

	    // The copy whose chain operand is the other copy is the second one; the
	    // other is the top of the chain and supplies TCChain.
	    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
	         UI != UE; ++UI) {
	      SDValue UseChain = UI->getOperand(0);
	      if (Copies.count(UseChain.getNode()))
	        // Second CopyToReg
	        Copy = *UI;
	      else {
	        // We are at the top of this chain.
	        // If the copy has a glue operand, we conservatively assume it
	        // isn't safe to perform a tail call.
	        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
	          return false;
	        // First CopyToReg
	        TCChain = UseChain;
	      }
	    }
	  } else if (Copy->getOpcode() == ISD::BITCAST) {
	    // f32 returned in a single GPR.
	    if (!Copy->hasOneUse())
	      return false;
	    Copy = *Copy->use_begin();
	    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
	      return false;
	    // If the copy has a glue operand, we conservatively assume it isn't safe
	    // to perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else {
	    return false;
	  }

	  // Every user of the (last) copy must be a return node.
	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != ARMISD::RET_FLAG &&
	        UI->getOpcode() != ARMISD::INTRET_FLAG)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Return true if \p CI may be emitted as a tail call: the subtarget must
	/// support tail calls, the call must be marked as a tail call, and the
	/// enclosing function must not have "disable-tail-calls" set to "true".
	bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  if (!Subtarget->supportsTailCall())
	    return false;

	  // The attribute is looked up on the function that contains the call.
	  auto Attr =
	      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
	    return false;

	  return true;
	}

	// A 64-bit value cannot be written in one piece: pull out its low and high
	// 32-bit halves and hand both to a new WRITE_REGISTER node, keeping the
	// first two operands of the original node unchanged.
	static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  SDValue WriteValue = Op->getOperand(2);

	  // Only i64 writes are expected to reach this function.
	  assert(WriteValue.getValueType() == MVT::i64
	  && "LowerWRITE_REGISTER called for non-i64 type argument.");

	  // Element 0 is the low half, element 1 the high half.
	  SDValue LoIndex = DAG.getConstant(0, Loc, MVT::i32);
	  SDValue LoHalf =
	      DAG.getNode(ISD::EXTRACT_ELEMENT, Loc, MVT::i32, WriteValue, LoIndex);
	  SDValue HiIndex = DAG.getConstant(1, Loc, MVT::i32);
	  SDValue HiHalf =
	      DAG.getNode(ISD::EXTRACT_ELEMENT, Loc, MVT::i32, WriteValue, HiIndex);

	  SDValue NewOperands[] = { Op->getOperand(0), Op->getOperand(1), LoHalf,
	                            HiHalf };
	  return DAG.getNode(ISD::WRITE_REGISTER, Loc, MVT::Other, NewOperands);
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
	// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
	// one of the above mentioned nodes. It has to be wrapped because otherwise
	// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
	// be used to form addressing mode. These wrapped nodes will be selected
	// into MOVi.
	//
	// For execute-only code the constant is instead turned into an internal
	// constant global variable and lowered through LowerGlobalAddress.
	SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  EVT PtrVT = Op.getValueType();
	  // FIXME there is no actual debug info here
	  SDLoc dl(Op);
	  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
	  SDValue Res;

	  // When generating execute-only code Constant Pools must be promoted to the
	  // global data section. It's a bit ugly that we can't share them across
	  // basic blocks, but this way we guarantee that execute-only behaves
	  // correctly with position-independent addressing modes.
	  if (Subtarget->genExecuteOnly()) {
	    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
	    auto T = const_cast<Type*>(CP->getType());
	    auto C = const_cast<Constant*>(CP->getConstVal());
	    auto M = const_cast<Module*>(DAG.getMachineFunction().
	                                 getFunction().getParent());
	    // The new global is named <private prefix>CP<function number>_<unique
	    // id> and is created in the module that owns the current function.
	    auto GV = new GlobalVariable(
	        *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
	        Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
	        Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
	        Twine(AFI->createPICLabelUId())
	      );
	    SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
	                                            dl, PtrVT);
	    return LowerGlobalAddress(GA, DAG);
	  }

	  // Otherwise emit a target constant pool entry, using whichever kind of
	  // value (machine-specific or plain constant) the node carries.
	  if (CP->isMachineConstantPoolEntry())
	    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
	                                    CP->getAlignment());
	  else
	    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
	                                    CP->getAlignment());
	  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
	}

	/// Report the jump table entry kind used by this target; it is always
	/// MachineJumpTableInfo::EK_Inline.
	unsigned ARMTargetLowering::getJumpTableEncoding() const {
	  const unsigned Encoding = MachineJumpTableInfo::EK_Inline;
	  return Encoding;
	}

	/// Lower a BlockAddress node to a load of the address from the constant
	/// pool. When the code is position independent (or ROPI) the pool entry is
	/// PC-relative and the loaded value is completed with an ARMISD::PIC_ADD
	/// against a freshly created PIC label.
	SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
	  unsigned ARMPCLabelIndex = 0;
	  SDLoc DL(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
	  SDValue CPAddr;
	  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
	  if (!IsPositionIndependent) {
	    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
	  } else {
	    // The PC adjustment passed to the pool entry is 4 in Thumb mode and 8
	    // otherwise.
	    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
	    ARMPCLabelIndex = AFI->createPICLabelUId();
	    ARMConstantPoolValue *CPV =
	        ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
	                                        ARMCP::CPBlockAddress, PCAdj);
	    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
	  }
	  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
	  SDValue Result = DAG.getLoad(
	      PtrVT, DL, DAG.getEntryNode(), CPAddr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	  if (!IsPositionIndependent)
	    return Result;
	  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
	  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
	}

	/// Convert a TLS address reference into the correct sequence of loads
	/// and calls to compute the variable's address for Darwin, and return an
	/// SDValue containing the final node.
	///
	/// Darwin only has one TLS scheme which must be capable of dealing with the
	/// fully general situation, in the worst case. This means:
	///     + "extern __thread" declaration.
	///     + Defined in a possibly unknown dynamic library.
	///
	/// The general system is that each __thread variable has a [3 x i32]
	/// descriptor which contains information used by the runtime to calculate
	/// the address. The only part of this the compiler needs to know about is
	/// the first word, which contains a function pointer that must be called
	/// with the address of the entire descriptor in "r0".
	///
	/// Since this descriptor may be in a different unit, in general access must
	/// proceed along the usual ARM rules. A common sequence to produce is:
	///
	///     movw rT1, :lower16:_var$non_lazy_ptr
	///     movt rT1, :upper16:_var$non_lazy_ptr
	///     ldr r0, [rT1]
	///     ldr rT2, [r0]
	///     blx rT2
	///     [...address now in r0...]
	SDValue
	ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "This function expects a Darwin target");
	  SDLoc DL(Op);

	  // First step is to get the address of the actual global symbol. This is
	  // where the TLS descriptor lives.
	  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

	  // The first entry in the descriptor is a function pointer that we must
	  // call to obtain the address of the variable.
	  SDValue Chain = DAG.getEntryNode();
	  SDValue FuncTLVGet = DAG.getLoad(
	      MVT::i32, DL, Chain, DescAddr,
	      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
	      /* Alignment = */ 4,
	      MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
	          MachineMemOperand::MOInvariant);
	  Chain = FuncTLVGet.getValue(1);

	  // A call is emitted below, so mark the frame as adjusting the stack.
	  MachineFunction &F = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = F.getFrameInfo();
	  MFI.setAdjustsStack(true);

	  // TLS calls preserve all registers except those that absolutely must be
	  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not
	  // be silly).
	  auto TRI =
	      getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
	  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
	  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

	  // Finally, we can make the call. This is just a degenerate version of a
	  // normal ARM call node: r0 takes the address of the descriptor, and
	  // returns the address of the variable in this thread.
	  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
	  Chain =
	      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
	                  DAG.getRegisterMask(Mask), Chain.getValue(1));
	  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
	}

	/// Compute the address of a thread-local variable on Windows: read the TEB
	/// through an arm_mrc intrinsic, load the TLS array pointer from it, index
	/// that array with _tls_index, and add the variable's section-relative
	/// offset to the resulting TLS block address.
	SDValue
	ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

	  SDValue Chain = DAG.getEntryNode();
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());
	  SDLoc Loc(Op);

	  // Shorthand for the i32 constants used throughout this function.
	  auto ConstI32 = [&](uint64_t Value) {
	    return DAG.getConstant(Value, Loc, MVT::i32);
	  };

	  // Read the current TEB (thread environment block) with an MRC intrinsic.
	  SDValue MRCOperands[] = {Chain,
	                           ConstI32(Intrinsic::arm_mrc),
	                           ConstI32(15),
	                           ConstI32(0),
	                           ConstI32(13),
	                           ConstI32(0),
	                           ConstI32(2)};
	  SDValue ReadTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, Loc,
	                                DAG.getVTList(MVT::i32, MVT::Other),
	                                MRCOperands);

	  SDValue TEB = ReadTEB.getValue(0);
	  Chain = ReadTEB.getValue(1);

	  // The ThreadLocalStoragePointer, a pointer to the TLS array, is stored at
	  // offset 0x2c from the TEB.
	  SDValue TLSArrayAddr =
	      DAG.getNode(ISD::ADD, Loc, PtrTy, TEB, DAG.getIntPtrConstant(0x2c, Loc));
	  SDValue TLSArray =
	      DAG.getLoad(PtrTy, Loc, Chain, TLSArrayAddr, MachinePointerInfo());

	  // The pointer to this thread's TLS data area lives in the TLS array at the
	  // TLS index scaled by 4. The index itself is loaded from the C runtime
	  // symbol _tls_index.
	  SDValue IndexSymbol =
	      DAG.getTargetExternalSymbol("_tls_index", PtrTy, ARMII::MO_NO_FLAG);
	  SDValue IndexAddr = DAG.getNode(ARMISD::Wrapper, Loc, PtrTy, IndexSymbol);
	  SDValue TLSIndex =
	      DAG.getLoad(PtrTy, Loc, Chain, IndexAddr, MachinePointerInfo());

	  SDValue Slot = DAG.getNode(ISD::SHL, Loc, PtrTy, TLSIndex, ConstI32(2));
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, Loc, PtrTy, TLSArray, Slot);
	  SDValue TLS = DAG.getLoad(PtrTy, Loc, Chain, SlotAddr, MachinePointerInfo());

	  // Load the variable's offset from the start of the .tls section (section
	  // base) out of a SECREL constant pool entry.
	  const auto *GA = cast<GlobalAddressSDNode>(Op);
	  auto *SecRelEntry =
	      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
	  SDValue OffsetAddr =
	      DAG.getNode(ARMISD::Wrapper, Loc, MVT::i32,
	                  DAG.getTargetConstantPool(SecRelEntry, PtrTy, 4));
	  SDValue Offset = DAG.getLoad(
	      PtrTy, Loc, Chain, OffsetAddr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));

	  return DAG.getNode(ISD::ADD, Loc, PtrTy, TLS, Offset);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model.
	//
	// Loads a PC-relative TLSGD constant pool entry for the global, completes
	// it with ARMISD::PIC_ADD, and passes the result as the single i32 argument
	// of a call to __tls_get_addr. The value returned by that call is the
	// result of the lowering.
	SDValue
	ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
	                                                 SelectionDAG &DAG) const {
	  SDLoc dl(GA);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  // The PC adjustment passed to the pool entry is 4 in Thumb mode and 8
	  // otherwise.
	  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
	  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
	  ARMConstantPoolValue *CPV =
	      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
	                                      ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
	  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
	  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
	  Argument = DAG.getLoad(
	      PtrVT, dl, DAG.getEntryNode(), Argument,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	  SDValue Chain = Argument.getValue(1);

	  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
	  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

	  // call __tls_get_addr.
	  ArgListTy Args;
	  ArgListEntry Entry;
	  Entry.Node = Argument;
	  // getContext() returns a pointer, so it must be dereferenced here, exactly
	  // as in the setLibCallee call below.
	  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
	  Args.push_back(Entry);

	  // FIXME: is there useful debug info available here?
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
	      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
	      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

	  // Only the call's result value is needed; the second element of the pair
	  // is not used.
	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.first;
	}

	// Lower ISD::GlobalTLSAddress for the two "exec" TLS models. Both produce
	// the thread pointer plus an offset; they differ in how the offset is
	// obtained:
	//  - local exec:   a single load of a TPOFF constant pool entry;
	//  - initial exec: a load of a PC-relative GOTTPOFF constant pool entry,
	//                  completed with PIC_ADD, followed by a second load
	//                  through the resulting address.
	SDValue
	ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
	                                        SelectionDAG &DAG,
	                                        TLSModel::Model model) const {
	  const GlobalValue *Global = GA->getGlobal();
	  SDLoc Loc(GA);
	  SDValue Offset;
	  SDValue Chain = DAG.getEntryNode();
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());

	  // Node yielding the thread pointer.
	  SDValue ThreadPtr = DAG.getNode(ARMISD::THREAD_POINTER, Loc, PtrTy);

	  if (model != TLSModel::InitialExec) {
	    // Local exec model.
	    assert(model == TLSModel::LocalExec);
	    ARMConstantPoolValue *PoolEntry =
	        ARMConstantPoolConstant::Create(Global, ARMCP::TPOFF);
	    SDValue PoolAddr = DAG.getTargetConstantPool(PoolEntry, PtrTy, 4);
	    PoolAddr = DAG.getNode(ARMISD::Wrapper, Loc, MVT::i32, PoolAddr);
	    Offset = DAG.getLoad(
	        PtrTy, Loc, Chain, PoolAddr,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	  } else {
	    // Initial exec model.
	    MachineFunction &MF = DAG.getMachineFunction();
	    ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
	    unsigned LabelId = FuncInfo->createPICLabelUId();
	    // The PC adjustment passed to the pool entry is 4 in Thumb mode and 8
	    // otherwise.
	    unsigned char PCAdjust = Subtarget->isThumb() ? 4 : 8;
	    ARMConstantPoolValue *PoolEntry =
	        ARMConstantPoolConstant::Create(GA->getGlobal(), LabelId,
	                                        ARMCP::CPValue, PCAdjust,
	                                        ARMCP::GOTTPOFF, true);
	    SDValue PoolAddr = DAG.getTargetConstantPool(PoolEntry, PtrTy, 4);
	    PoolAddr = DAG.getNode(ARMISD::Wrapper, Loc, MVT::i32, PoolAddr);
	    SDValue PoolLoad = DAG.getLoad(
	        PtrTy, Loc, Chain, PoolAddr,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    Chain = PoolLoad.getValue(1);

	    SDValue LabelConst = DAG.getConstant(LabelId, Loc, MVT::i32);
	    SDValue SlotAddr =
	        DAG.getNode(ARMISD::PIC_ADD, Loc, PtrTy, PoolLoad, LabelConst);

	    // Second load, through the address formed above, yields the offset.
	    Offset = DAG.getLoad(
	        PtrTy, Loc, Chain, SlotAddr,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	  }

	  // The variable's address is the thread pointer plus the variable's
	  // offset.
	  return DAG.getNode(ISD::ADD, Loc, PtrTy, ThreadPtr, Offset);
	}

	/// Entry point for lowering a thread-local global address. Dispatches, in
	/// this order, to the emulated-TLS, Darwin and Windows lowerings; what is
	/// left must be ELF and is handled according to the variable's TLS model.
	SDValue
	ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
	  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);

	  if (DAG.getTarget().useEmulatedTLS())
	    return LowerToTLSEmulatedModel(GlobalNode, DAG);
	  if (Subtarget->isTargetDarwin())
	    return LowerGlobalTLSAddressDarwin(Op, DAG);
	  if (Subtarget->isTargetWindows())
	    return LowerGlobalTLSAddressWindows(Op, DAG);

	  // TODO: implement the "local dynamic" model
	  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
	  const TLSModel::Model Selected =
	      getTargetMachine().getTLSModel(GlobalNode->getGlobal());

	  // Local dynamic shares the general dynamic lowering (see TODO above).
	  if (Selected == TLSModel::GeneralDynamic ||
	      Selected == TLSModel::LocalDynamic)
	    return LowerToTLSGeneralDynamicModel(GlobalNode, DAG);

	  if (Selected == TLSModel::InitialExec || Selected == TLSModel::LocalExec)
	    return LowerToTLSExecModels(GlobalNode, DAG, Selected);

	  llvm_unreachable("bogus TLS model");
	}

	/// Return true if all users of V are within function F, looking through
	/// ConstantExprs.
	static bool allUsersAreInFunction(const Value V, const Function F) {
	SmallVector<const User*,4> Worklist;
	for (auto *U : V->users())
	Worklist.push_back(U);
	while (!Worklist.empty()) {
	auto *U = Worklist.pop_back_val();
	if (isa<ConstantExpr>(U)) {
	for (auto *UU : U->users())
	Worklist.push_back(UU);
	continue;
	}

	auto *I = dyn_cast<Instruction>(U);
	if (!I \|\| I->getParent()->getParent() != F)
	return false;
	}
	return true;
	}

	/// Try to emit the initializer of \p GV inline into the constant pool instead
	/// of referencing the global through a pool entry.
	///
	/// \returns an ARMISD::Wrapper node around the new constant-pool entry, or an
	/// empty SDValue when any of the eligibility checks below fails.
	static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
	                                     const GlobalValue *GV, SelectionDAG &DAG,
	                                     EVT PtrVT, const SDLoc &dl) {
	  // If we're creating a pool entry for a constant global with unnamed address,
	  // and the global is small enough, we can emit it inline into the constant
	  // pool to save ourselves an indirection.
	  //
	  // This is a win if the constant is only used in one function (so it doesn't
	  // need to be duplicated) or duplicating the constant wouldn't increase code
	  // size (implying the constant is no larger than 4 bytes).
	  const Function &F = DAG.getMachineFunction().getFunction();

	  // We rely on this decision to inline being idempotent and unrelated to the
	  // use-site. We know that if we inline a variable at one use site, we'll
	  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
	  // doesn't know about this optimization, so bail out if it's enabled else
	  // we could decide to inline here (and thus never emit the GV) but require
	  // the GV from fast-isel generated code.
	  if (!EnableConstpoolPromotion ||
	      DAG.getMachineFunction().getTarget().Options.EnableFastISel)
	    return SDValue();

	  auto *GVar = dyn_cast<GlobalVariable>(GV);
	  if (!GVar || !GVar->hasInitializer() ||
	      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
	      !GVar->hasLocalLinkage())
	    return SDValue();

	  // If we inline a value that contains relocations, we move the relocations
	  // from .data to .text. This is not allowed in position-independent code.
	  auto *Init = GVar->getInitializer();
	  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
	      Init->needsRelocation())
	    return SDValue();

	  // The constant islands pass can only really deal with alignment requests
	  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
	  // any type wanting greater alignment requirements than 4 bytes. We also
	  // can only promote constants that are multiples of 4 bytes in size or
	  // are paddable to a multiple of 4. Currently we only try and pad constants
	  // that are strings for simplicity.
	  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
	  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
	  unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
	  // RequiredPadding == 4 means Size is already a multiple of 4 (no padding).
	  unsigned RequiredPadding = 4 - (Size % 4);
	  bool PaddingPossible =
	      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
	  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
	      Size == 0)
	    return SDValue();

	  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

	  // Nothing between here and the bookkeeping at the end modifies the set of
	  // promoted globals, so a single lookup serves both uses.
	  const bool AlreadyPromoted =
	      AFI->getGlobalsPromotedToConstantPool().count(GVar);

	  // We can't bloat the constant pool too much, else the ConstantIslands pass
	  // may fail to converge. If we haven't promoted this global yet (it may have
	  // multiple uses), and promoting it would increase the constant pool size (Sz
	  // > 4), ensure we have space to do so up to MaxTotal.
	  if (!AlreadyPromoted && Size > 4)
	    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
	        ConstpoolPromotionMaxTotal)
	      return SDValue();

	  // This is only valid if all users are in a single function; we can't clone
	  // the constant in general. The LLVM IR unnamed_addr allows merging
	  // constants, but not cloning them.
	  //
	  // We could potentially allow cloning if we could prove all uses of the
	  // constant in the current function don't care about the address, like
	  // printf format strings. But that isn't implemented for now.
	  if (!allUsersAreInFunction(GVar, &F))
	    return SDValue();

	  // We're going to inline this global. Pad it out if needed.
	  if (RequiredPadding != 4) {
	    // PaddingPossible above only holds with padding when the initializer is a
	    // string ConstantDataArray, so CDAInit cannot be null here.
	    assert(CDAInit && "padding requires a string initializer");
	    StringRef S = CDAInit->getAsString();

	    SmallVector<uint8_t,16> V(S.size());
	    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
	    while (RequiredPadding--)
	      V.push_back(0);
	    Init = ConstantDataArray::get(*DAG.getContext(), V);
	  }

	  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
	  SDValue CPAddr =
	      DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
	  // Account for the growth only once per global; later uses reuse the entry.
	  if (!AlreadyPromoted) {
	    AFI->markGlobalAsPromotedToConstantPool(GVar);
	    AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
	                                      PaddedSize - 4);
	  }
	  ++NumConstpoolPromoted;
	  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
	}

	bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
	if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
	if (!(GV = GA->getBaseObject()))
	return false;
	if (const auto *V = dyn_cast<GlobalVariable>(GV))
	return V->isConstant();
	return isa<Function>(GV);
	}

	/// Lower a global address by forwarding to the lowering routine for the
	/// target triple's object format (ELF, Mach-O or COFF).
	SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const auto Format = Subtarget->getTargetTriple().getObjectFormat();
	  switch (Format) {
	  case Triple::ELF:
	    return LowerGlobalAddressELF(Op, DAG);
	  case Triple::MachO:
	    return LowerGlobalAddressDarwin(Op, DAG);
	  case Triple::COFF:
	    return LowerGlobalAddressWindows(Op, DAG);
	  default:
	    llvm_unreachable("unknown object format");
	  }
	}

	/// Lower a global address for ELF targets.
	///
	/// Strategies are tried in this order: inline promotion into the constant
	/// pool, position-independent addressing (optionally through the GOT),
	/// PC-relative addressing for read-only data under ROPI, SB-relative (R9)
	/// addressing for writable data under RWPI, and finally an absolute address
	/// built with movw/movt or loaded from the literal pool.
	SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  EVT AddrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc dl(Op);
	  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();
	  const TargetMachine &Machine = getTargetMachine();
	  const bool ReadOnly = isReadOnly(Global);

	  // promoteToConstantPool only if not generating XO text section
	  if (Machine.shouldAssumeDSOLocal(*Global->getParent(), Global) &&
	      !Subtarget->genExecuteOnly()) {
	    SDValue Promoted = promoteToConstantPool(this, Global, DAG, AddrVT, dl);
	    if (Promoted)
	      return Promoted;
	  }

	  if (isPositionIndependent()) {
	    // Symbols that may live in another DSO are reached through the GOT.
	    const bool ViaGOT =
	        !Machine.shouldAssumeDSOLocal(*Global->getParent(), Global);
	    SDValue Sym = DAG.getTargetGlobalAddress(Global, dl, AddrVT, 0,
	                                             ViaGOT ? ARMII::MO_GOT : 0);
	    SDValue Addr = DAG.getNode(ARMISD::WrapperPIC, dl, AddrVT, Sym);
	    if (!ViaGOT)
	      return Addr;
	    return DAG.getLoad(AddrVT, dl, DAG.getEntryNode(), Addr,
	                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  if (Subtarget->isROPI() && ReadOnly) {
	    // PC-relative.
	    SDValue Sym = DAG.getTargetGlobalAddress(Global, dl, AddrVT);
	    return DAG.getNode(ARMISD::WrapperPIC, dl, AddrVT, Sym);
	  }

	  if (Subtarget->isRWPI() && !ReadOnly) {
	    // SB-relative.
	    SDValue SBOffset;
	    if (Subtarget->useMovt()) {
	      ++NumMovwMovt;
	      SDValue Sym =
	          DAG.getTargetGlobalAddress(Global, dl, AddrVT, 0, ARMII::MO_SBREL);
	      SBOffset = DAG.getNode(ARMISD::Wrapper, dl, AddrVT, Sym);
	    } else {
	      // use literal pool for address constant
	      ARMConstantPoolValue *PoolVal =
	          ARMConstantPoolConstant::Create(Global, ARMCP::SBREL);
	      SDValue PoolAddr = DAG.getTargetConstantPool(PoolVal, AddrVT, 4);
	      PoolAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolAddr);
	      SBOffset = DAG.getLoad(
	          AddrVT, dl, DAG.getEntryNode(), PoolAddr,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    }
	    // The static base is read from R9 and added to the relative offset.
	    SDValue StaticBase =
	        DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, AddrVT);
	    return DAG.getNode(ISD::ADD, dl, AddrVT, StaticBase, SBOffset);
	  }

	  // If we have T2 ops, we can materialize the address directly via movt/movw
	  // pair. This is always cheaper.
	  if (Subtarget->useMovt()) {
	    ++NumMovwMovt;
	    // FIXME: Once remat is capable of dealing with instructions with register
	    // operands, expand this into two nodes.
	    SDValue Sym = DAG.getTargetGlobalAddress(Global, dl, AddrVT);
	    return DAG.getNode(ARMISD::Wrapper, dl, AddrVT, Sym);
	  }

	  // Otherwise load the absolute address from the literal pool.
	  SDValue PoolAddr = DAG.getTargetConstantPool(Global, AddrVT, 4);
	  PoolAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolAddr);
	  return DAG.getLoad(
	      AddrVT, dl, DAG.getEntryNode(), PoolAddr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	}

	/// Lower a global address for Mach-O targets. The address is wrapped with
	/// the PIC or non-PIC wrapper node and, for symbols the subtarget reports as
	/// indirect, loaded through the GOT.
	SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
	         "ROPI/RWPI not currently supported for Darwin");
	  EVT AddrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc dl(Op);
	  const GlobalValue *Global = cast<GlobalAddressSDNode>(Op)->getGlobal();

	  if (Subtarget->useMovt())
	    ++NumMovwMovt;

	  // FIXME: Once remat is capable of dealing with instructions with register
	  // operands, expand this into multiple nodes
	  unsigned WrapperOpc;
	  if (isPositionIndependent())
	    WrapperOpc = ARMISD::WrapperPIC;
	  else
	    WrapperOpc = ARMISD::Wrapper;

	  SDValue Sym =
	      DAG.getTargetGlobalAddress(Global, dl, AddrVT, 0, ARMII::MO_NONLAZY);
	  SDValue Addr = DAG.getNode(WrapperOpc, dl, AddrVT, Sym);

	  if (!Subtarget->isGVIndirectSymbol(Global))
	    return Addr;

	  // Indirect symbols need one extra load to reach the real address.
	  return DAG.getLoad(AddrVT, dl, DAG.getEntryNode(), Addr,
	                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}

	/// Lower a global address for Windows (COFF) targets.
	///
	/// The address is always materialized with a movw/movt wrapper. dllimport
	/// symbols and symbols not assumed DSO-local carry a target flag and need an
	/// additional load through the resulting pointer.
	SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
	  assert(Subtarget->useMovt() &&
	         "Windows on ARM expects to use movw/movt");
	  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
	         "ROPI/RWPI not currently supported for Windows");

	  const TargetMachine &TM = getTargetMachine();
	  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

	  // Pick the relocation flavour: import table, COFF stub, or direct.
	  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
	  if (GV->hasDLLImportStorageClass())
	    TargetFlags = ARMII::MO_DLLIMPORT;
	  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
	    TargetFlags = ARMII::MO_COFFSTUB;
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Result;
	  SDLoc DL(Op);

	  ++NumMovwMovt;

	  // FIXME: Once remat is capable of dealing with instructions with register
	  // operands, expand this into two nodes.
	  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
	                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
	                                                  TargetFlags));
	  // Flagged symbols resolve to a pointer slot, so dereference it once.
	  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
	    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  return Result;
	}

	/// Lower the SjLj setjmp node to ARMISD::EH_SJLJ_SETJMP, forwarding the two
	/// incoming operands and appending a constant zero operand. The node yields
	/// an i32 value and a chain.
	SDValue
	ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, ResultTys, First, Second,
	                     Zero);
	}

	/// Lower the SjLj longjmp node to ARMISD::EH_SJLJ_LONGJMP, forwarding the
	/// two incoming operands and appending a constant zero operand. Only a chain
	/// is produced.
	SDValue
	ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, First, Second,
	                     Zero);
	}

	/// Lower the SjLj setup-dispatch node to ARMISD::EH_SJLJ_SETUP_DISPATCH,
	/// carrying over only the first operand of the original node.
	SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue First = Op.getOperand(0);
	  return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, First);
	}

	/// Custom-lower the chainless intrinsics handled here. Operand 0 of \p Op is
	/// the intrinsic ID and the intrinsic arguments follow from operand 1. An
	/// empty SDValue is returned for intrinsics that are not custom lowered.
	SDValue
	ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
	                                           const ARMSubtarget *Subtarget) const {
	  const unsigned IntNo =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);

	  // Builds a node of the given opcode from intrinsic arguments 1 and 2,
	  // producing the same value type as the intrinsic itself.
	  auto LowerBinary = [&](unsigned Opcode) {
	    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
	                       Op.getOperand(2));
	  };

	  switch (IntNo) {
	  case Intrinsic::thread_pointer: {
	    EVT AddrVT = getPointerTy(DAG.getDataLayout());
	    return DAG.getNode(ARMISD::THREAD_POINTER, dl, AddrVT);
	  }

	  case Intrinsic::eh_sjlj_lsda: {
	    MachineFunction &MF = DAG.getMachineFunction();
	    ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
	    const unsigned LabelId = FuncInfo->createPICLabelUId();
	    EVT AddrVT = getPointerTy(DAG.getDataLayout());
	    const bool IsPIC = isPositionIndependent();

	    // PC adjustment is 4 for Thumb and 8 for ARM; zero without PIC.
	    unsigned PCAdj = 0;
	    if (IsPIC)
	      PCAdj = Subtarget->isThumb() ? 4 : 8;

	    ARMConstantPoolValue *PoolVal = ARMConstantPoolConstant::Create(
	        &MF.getFunction(), LabelId, ARMCP::CPLSDA, PCAdj);
	    SDValue PoolAddr = DAG.getTargetConstantPool(PoolVal, AddrVT, 4);
	    PoolAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolAddr);
	    SDValue Loaded = DAG.getLoad(
	        AddrVT, dl, DAG.getEntryNode(), PoolAddr,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	    if (!IsPIC)
	      return Loaded;

	    SDValue LabelOp = DAG.getConstant(LabelId, dl, MVT::i32);
	    return DAG.getNode(ARMISD::PIC_ADD, dl, AddrVT, Loaded, LabelOp);
	  }

	  case Intrinsic::arm_neon_vabs:
	    return DAG.getNode(ISD::ABS, dl, Op.getValueType(), Op.getOperand(1));

	  case Intrinsic::arm_neon_vmulls:
	    return LowerBinary(ARMISD::VMULLs);
	  case Intrinsic::arm_neon_vmullu:
	    return LowerBinary(ARMISD::VMULLu);

	  case Intrinsic::arm_neon_vminnm:
	    return LowerBinary(ISD::FMINNUM);
	  case Intrinsic::arm_neon_vmaxnm:
	    return LowerBinary(ISD::FMAXNUM);

	  case Intrinsic::arm_neon_vminu:
	  case Intrinsic::arm_neon_vmaxu:
	    // Floating-point result types are not custom lowered here.
	    if (Op.getValueType().isFloatingPoint())
	      return SDValue();
	    if (IntNo == Intrinsic::arm_neon_vminu)
	      return LowerBinary(ISD::UMIN);
	    return LowerBinary(ISD::UMAX);

	  case Intrinsic::arm_neon_vmins:
	  case Intrinsic::arm_neon_vmaxs: {
	    // v{min,max}s is overloaded between signed integers and floats.
	    const bool IsMin = IntNo == Intrinsic::arm_neon_vmins;
	    if (Op.getValueType().isFloatingPoint())
	      return LowerBinary(IsMin ? ISD::FMINIMUM : ISD::FMAXIMUM);
	    return LowerBinary(IsMin ? ISD::SMIN : ISD::SMAX);
	  }

	  case Intrinsic::arm_neon_vtbl1:
	    return LowerBinary(ARMISD::VTBL1);
	  case Intrinsic::arm_neon_vtbl2:
	    return DAG.getNode(ARMISD::VTBL2, dl, Op.getValueType(),
	                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

	  default:
	    return SDValue(); // Don't custom lower most intrinsics.
	  }
	}

	/// Lower ISD::ATOMIC_FENCE. Operand 0 is the chain, operand 1 the atomic
	/// ordering and operand 2 the synchronization scope. Single-thread fences
	/// are returned unchanged; otherwise an MCR barrier or a DMB intrinsic with
	/// a suitable domain is emitted.
	static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
	                                 const ARMSubtarget *Subtarget) {
	  SDLoc dl(Op);
	  SDValue InChain = Op.getOperand(0);

	  const auto Scope = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
	  if (Scope == SyncScope::SingleThread)
	    return Op;

	  if (!Subtarget->hasDataBarrier()) {
	    // Some ARMv6 cpus can support data barriers with an mcr instruction.
	    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
	    // here.
	    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
	           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
	    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, InChain, Zero);
	  }

	  const auto Ordering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());

	  ARM_MB::MemBOpt Domain;
	  if (Subtarget->isMClass()) {
	    // Only a full system barrier exists in the M-class architectures.
	    Domain = ARM_MB::SY;
	  } else if (Subtarget->preferISHSTBarriers() &&
	             Ordering == AtomicOrdering::Release) {
	    // Swift happens to implement ISHST barriers in a way that's compatible
	    // with Release semantics but weaker than ISH so we'd be fools not to use
	    // it. Beware: other processors probably don't!
	    Domain = ARM_MB::ISHST;
	  } else {
	    Domain = ARM_MB::ISH;
	  }

	  SDValue DmbId = DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32);
	  SDValue DomainOp = DAG.getConstant(Domain, dl, MVT::i32);
	  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, InChain, DmbId,
	                     DomainOp);
	}

	/// Lower ISD::PREFETCH to ARMISD::PRELOAD.
	///
	/// Operand 0 is the chain and operand 1 is forwarded unchanged to the
	/// PRELOAD node. The low bit of operand 2 is inverted to form the "is read"
	/// flag and operand 4 supplies the "is data" flag. When the subtarget has no
	/// suitable preload instruction the prefetch is dropped and only the chain
	/// is returned.
	static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
	                             const ARMSubtarget *Subtarget) {
	  // ARM pre v5TE and Thumb1 does not have preload instructions.
	  if (!(Subtarget->isThumb2() ||
	        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
	    // Just preserve the chain.
	    return Op.getOperand(0);

	  SDLoc dl(Op);
	  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
	  if (!isRead &&
	      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
	    // ARMv7 with MP extension has PLDW.
	    return Op.getOperand(0);

	  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
	  if (Subtarget->isThumb()) {
	    // Invert the bits.
	    isRead = ~isRead & 1;
	    isData = ~isData & 1;
	  }

	  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
	                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
	                     DAG.getConstant(isData, dl, MVT::i32));
	}

	/// Lower va_start: store the address of the function's VarArgsFrameIndex
	/// slot to the memory location given as operand 1, chained on operand 0.
	/// Operand 2 supplies the source value used for the store's pointer info.
	static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

	  SDLoc dl(Op);
	  EVT AddrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue VarArgsSlot = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), AddrVT);

	  const Value *DestValue = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDValue InChain = Op.getOperand(0);
	  SDValue DestPtr = Op.getOperand(1);
	  return DAG.getStore(InChain, dl, VarArgsSlot, DestPtr,
	                      MachinePointerInfo(DestValue));
	}

	/// Reassemble an f64 formal argument that arrives as two i32 halves.
	///
	/// \p VA locates the first half, always in a register. \p NextVA locates the
	/// second half, either in a register or in a 4-byte fixed stack slot. The
	/// halves are swapped on non-little-endian subtargets and then combined with
	/// ARMISD::VMOVDRR.
	SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
	                                                CCValAssign &NextVA,
	                                                SDValue &Root,
	                                                SelectionDAG &DAG,
	                                                const SDLoc &dl) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

	  const TargetRegisterClass *HalfRC = FuncInfo->isThumb1OnlyFunction()
	                                          ? &ARM::tGPRRegClass
	                                          : &ARM::GPRRegClass;

	  // Transform the arguments stored in physical registers into virtual ones.
	  unsigned VReg = MF.addLiveIn(VA.getLocReg(), HalfRC);
	  SDValue FirstHalf = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);

	  SDValue SecondHalf;
	  if (!NextVA.isMemLoc()) {
	    VReg = MF.addLiveIn(NextVA.getLocReg(), HalfRC);
	    SecondHalf = DAG.getCopyFromReg(Root, dl, VReg, MVT::i32);
	  } else {
	    MachineFrameInfo &FrameInfo = MF.getFrameInfo();
	    int SlotFI =
	        FrameInfo.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

	    // Create load node to retrieve arguments from the stack.
	    SDValue SlotAddr =
	        DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
	    SecondHalf = DAG.getLoad(
	        MVT::i32, dl, Root, SlotAddr,
	        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));
	  }

	  if (!Subtarget->isLittle())
	    std::swap(FirstHalf, SecondHalf);
	  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, FirstHalf, SecondHalf);
	}

	// The remaining GPRs hold either the beginning of variable-argument
	// data, or the beginning of an aggregate passed by value (usually
	// byval). Either way, we allocate stack slots adjacent to the data
	// provided by our caller, and store the unallocated registers there.
	// If this is a variadic function, the va_list pointer will begin with
	// these values; otherwise, this reassembles a (byval) structure that
	// was split between registers and memory.
	//
	// Parameters:
	// CCInfo - calling-convention state queried for the in-register byval
	// records and the first unallocated GPR.
	// Chain - in/out; replaced by a TokenFactor over the emitted stores when
	// at least one register is stored.
	// OrigArg - IR value used for the stores' MachinePointerInfo (callers in
	// this file pass nullptr for the varargs case).
	// InRegsParamRecordIdx - index of the byval record to use; an index past
	// the recorded count selects "first unallocated GPR up to R4".
	// ArgOffset - offset of the fixed object; overridden when registers are
	// stored (see below).
	// ArgSize - size of the fixed object to create.
	// Return: The frame index registers were stored into.
	int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
	const SDLoc &dl, SDValue &Chain,
	const Value *OrigArg,
	unsigned InRegsParamRecordIdx,
	int ArgOffset, unsigned ArgSize) const {
	// Currently, two use-cases possible:
	// Case #1. Non-var-args function, and we meet first byval parameter.
	// Setup first unallocated register as first byval register;
	// eat all remained registers
	// (these two actions are performed by HandleByVal method).
	// Then, here, we initialize stack frame with
	// "store-reg" instructions.
	// Case #2. Var-args function, that doesn't contain byval parameters.
	// The same: eat all remained unallocated registers,
	// initialize stack frame.

	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
	// [RBegin, REnd) is the half-open range of registers to spill.
	unsigned RBegin, REnd;
	if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
	CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
	} else {
	// No byval record: use every still-unallocated argument GPR. An index of 4
	// means none is left, which makes the range empty (RBegin == REnd == R4).
	unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
	RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
	REnd = ARM::R4;
	}

	// When registers are stored, place the object 4 bytes per register (counted
	// from RBegin up to R4) below offset zero, replacing the caller's ArgOffset.
	if (REnd != RBegin)
	ArgOffset = -4 * (ARM::R4 - RBegin);

	auto PtrVT = getPointerTy(DAG.getDataLayout());
	int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
	SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);

	SmallVector<SDValue, 4> MemOps;
	const TargetRegisterClass *RC =
	AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;

	// Store each register to consecutive 4-byte slots starting at the object.
	for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
	unsigned VReg = MF.addLiveIn(Reg, RC);
	SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
	// The store is chained on the register copy (result 1 of Val).
	SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo(OrigArg, 4 * i));
	MemOps.push_back(Store);
	FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
	}

	// Merge all stores into a single chain; Chain is untouched if none exist.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	return FrameIndex;
	}

	// Set up the stack frame area that the va_list pointer will start from and
	// record its frame index in the function info.
	//
	// Note: the ArgOffset and ForceMutable parameters are not read by this
	// body; the stack offset is taken from CCInfo directly.
	void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
	                                             const SDLoc &dl, SDValue &Chain,
	                                             unsigned ArgOffset,
	                                             unsigned TotalArgRegsSaveSize,
	                                             bool ForceMutable) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

	  // Try to store any remaining integer argument regs
	  // to their spots on the stack so that they may be loaded by dereferencing
	  // the result of va_next.
	  // If there is no regs to be stored, just point address after last
	  // argument passed via stack.
	  const unsigned RecordIdx = CCInfo.getInRegsParamsCount();
	  const unsigned StackOffset = CCInfo.getNextStackOffset();
	  const unsigned SaveAreaSize = std::max(4U, TotalArgRegsSaveSize);
	  const int VarArgsFI = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
	                                       RecordIdx, StackOffset, SaveAreaSize);
	  FuncInfo->setVarArgsFrameIndex(VarArgsFI);
	}

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Locations are assigned to every entry of \p Ins through the calling
	/// convention, then one SDValue per incoming argument is appended to
	/// \p InVals: register arguments become copies from live-in virtual
	/// registers, stack arguments become loads from fixed stack objects, and
	/// byval arguments become frame indices produced by StoreByValRegs. For
	/// variadic functions that use va_start the register save area is set up as
	/// well.
	///
	/// \returns the (possibly updated) chain.
	SDValue ARMTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));

	  SDValue ArgValue;
	  Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
	  unsigned CurArgIdx = 0;

	  // Initially ArgRegsSaveSize is zero.
	  // Then we increase this value each time we meet byval parameter.
	  // We also increase this value in case of varargs function.
	  AFI->setArgRegsSaveSize(0);

	  // Calculate the amount of stack space that we need to allocate to store
	  // byval and variadic arguments that are passed in registers.
	  // We need to know this before we allocate the first byval or variadic
	  // argument, as they will be allocated a stack slot below the CFA (Canonical
	  // Frame Address, the stack pointer at entry to the function).
	  unsigned ArgRegBegin = ARM::R4;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
	      break;

	    CCValAssign &VA = ArgLocs[i];
	    unsigned Index = VA.getValNo();
	    ISD::ArgFlagsTy Flags = Ins[Index].Flags;
	    if (!Flags.isByVal())
	      continue;

	    assert(VA.isMemLoc() && "unexpected byval pointer in reg");
	    unsigned RBegin, REnd;
	    CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
	    ArgRegBegin = std::min(ArgRegBegin, RBegin);

	    CCInfo.nextInRegsParam();
	  }
	  // The scan above advanced the byval cursor; reset it for the main loop.
	  CCInfo.rewindByValRegsInfo();

	  int lastInsIndex = -1;
	  if (isVarArg && MFI.hasVAStart()) {
	    unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
	    if (RegIdx != array_lengthof(GPRArgRegs))
	      ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
	  }

	  unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
	  AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];
	    // Keep CurOrigArg in step with the IR argument this location belongs to.
	    if (Ins[VA.getValNo()].isOrigArg()) {
	      std::advance(CurOrigArg,
	                   Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
	      CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
	    }
	    // Arguments stored in registers.
	    if (VA.isRegLoc()) {
	      EVT RegVT = VA.getLocVT();

	      if (VA.needsCustom()) {
	        // f64 and vector types are split up into multiple registers or
	        // combinations of registers and stack slots.
	        if (VA.getLocVT() == MVT::v2f64) {
	          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
	                                                   Chain, DAG, dl);
	          // NOTE: VA is a reference into ArgLocs, so this assignment copies
	          // the next location over the slot VA refers to.
	          VA = ArgLocs[++i]; // skip ahead to next loc
	          SDValue ArgValue2;
	          if (VA.isMemLoc()) {
	            int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
	            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
	                                    MachinePointerInfo::getFixedStack(
	                                        DAG.getMachineFunction(), FI));
	          } else {
	            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
	                                             Chain, DAG, dl);
	          }
	          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
	          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
	                                 ArgValue, ArgValue1,
	                                 DAG.getIntPtrConstant(0, dl));
	          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
	                                 ArgValue, ArgValue2,
	                                 DAG.getIntPtrConstant(1, dl));
	        } else
	          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
	      } else {
	        const TargetRegisterClass *RC;

	        // Pick the register class that matches the location type.
	        if (RegVT == MVT::f16)
	          RC = &ARM::HPRRegClass;
	        else if (RegVT == MVT::f32)
	          RC = &ARM::SPRRegClass;
	        else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
	          RC = &ARM::DPRRegClass;
	        else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
	          RC = &ARM::QPRRegClass;
	        else if (RegVT == MVT::i32)
	          RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
	                                           : &ARM::GPRRegClass;
	        else
	          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

	        // Transform the arguments in physical registers into virtual ones.
	        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	      }

	      // If this is an 8 or 16-bit value, it is really passed promoted
	      // to 32 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default: llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full: break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::SExt:
	        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::ZExt:
	        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	        break;
	      }

	      InVals.push_back(ArgValue);
	    } else { // !VA.isRegLoc()
	      // sanity check
	      assert(VA.isMemLoc());
	      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");

	      int index = VA.getValNo();

	      // Some Ins[] entries become multiple ArgLoc[] entries.
	      // Process them only once.
	      if (index != lastInsIndex) {
	        ISD::ArgFlagsTy Flags = Ins[index].Flags;
	        // FIXME: For now, all byval parameter objects are marked mutable.
	        // This can be changed with more analysis.
	        // In case of tail call optimization mark all arguments mutable.
	        // Since they could be overwritten by lowering of arguments in case of
	        // a tail call.
	        if (Flags.isByVal()) {
	          assert(Ins[index].isOrigArg() &&
	                 "Byval arguments cannot be implicit");
	          unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();

	          int FrameIndex = StoreByValRegs(
	              CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
	              VA.getLocMemOffset(), Flags.getByValSize());
	          InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
	          CCInfo.nextInRegsParam();
	        } else {
	          unsigned FIOffset = VA.getLocMemOffset();
	          int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
	                                         FIOffset, true);

	          // Create load nodes to retrieve arguments from the stack.
	          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	          InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
	                                       MachinePointerInfo::getFixedStack(
	                                           DAG.getMachineFunction(), FI)));
	        }
	        lastInsIndex = index;
	      }
	    }
	  }

	  // varargs
	  if (isVarArg && MFI.hasVAStart())
	    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
	                         CCInfo.getNextStackOffset(),
	                         TotalArgRegsSaveSize);

	  AFI->setArgumentStackSize(CCInfo.getNextStackOffset());

	  return Chain;
	}

	/// isFloatingPointZero - Return true if this is +0.0.
	///
	/// Three shapes are recognized: a ConstantFP node, a load (extending or not)
	/// whose address is an ARMISD::Wrapper around a constant-pool entry holding
	/// a ConstantFP, and an f64 BITCAST of an ARMISD::VMOVIMM whose operand is
	/// the constant zero.
	static bool isFloatingPointZero(SDValue Op) {
	  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
	    return CFP->getValueAPF().isPosZero();

	  if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
	    // Maybe this has already been legalized into the constant pool?
	    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
	      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
	      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
	        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
	          return CFP->getValueAPF().isPosZero();
	    }
	  } else if (Op->getOpcode() == ISD::BITCAST &&
	             Op->getValueType(0) == MVT::f64) {
	    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
	    // created by LowerConstantFP().
	    SDValue BitcastOp = Op->getOperand(0);
	    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
	        isNullConstant(BitcastOp->getOperand(0)))
	      return true;
	  }
	  return false;
	}

	/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
	/// the given operands.
	///
	/// \p LHS, \p RHS and \p CC are taken by value and may be rewritten locally
	/// (constant adjusted by one, operands swapped) before the compare is built.
	/// \p ARMcc receives the resulting ARM condition code as an i32 constant.
	/// The returned node is ARMISD::CMPZ for EQ/NE and ARMISD::CMP otherwise,
	/// and produces glue.
	SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	SDValue &ARMcc, SelectionDAG &DAG,
	const SDLoc &dl) const {
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
	unsigned C = RHSC->getZExtValue();
	if (!isLegalICmpImmediate((int32_t)C)) {
	// Constant does not fit, try adjusting it by one.
	// Each case swaps between the strict and non-strict form of the predicate
	// so that comparing against C-1 or C+1 is equivalent; the guard constants
	// exclude the value at which that adjustment would wrap around.
	switch (CC) {
	default: break;
	case ISD::SETLT:
	case ISD::SETGE:
	if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
	CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
	RHS = DAG.getConstant(C - 1, dl, MVT::i32);
	}
	break;
	case ISD::SETULT:
	case ISD::SETUGE:
	if (C != 0 && isLegalICmpImmediate(C-1)) {
	CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
	RHS = DAG.getConstant(C - 1, dl, MVT::i32);
	}
	break;
	case ISD::SETLE:
	case ISD::SETGT:
	if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
	CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
	RHS = DAG.getConstant(C + 1, dl, MVT::i32);
	}
	break;
	case ISD::SETULE:
	case ISD::SETUGT:
	if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
	CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	RHS = DAG.getConstant(C + 1, dl, MVT::i32);
	}
	break;
	}
	}
	} else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
	(ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
	// In ARM and Thumb-2, the compare instructions can shift their second
	// operand.
	// Only LHS is a shift here, so swap operands (and the predicate) to put the
	// shift in second position.
	CC = ISD::getSetCCSwappedOperands(CC);
	std::swap(LHS, RHS);
	}

	ARMCC::CondCodes CondCode = IntCCToARMCC(CC);

	// If the RHS is a constant zero then the V (overflow) flag will never be
	// set. This can allow us to simplify GE to PL or LT to MI, which can be
	// simpler for other passes (like the peephole optimiser) to deal with.
	if (isNullConstant(RHS)) {
	switch (CondCode) {
	default: break;
	case ARMCC::GE:
	CondCode = ARMCC::PL;
	break;
	case ARMCC::LT:
	CondCode = ARMCC::MI;
	break;
	}
	}

	// Choose the compare node: equality tests get the CMPZ form.
	ARMISD::NodeType CompareType;
	switch (CondCode) {
	default:
	CompareType = ARMISD::CMP;
	break;
	case ARMCC::EQ:
	case ARMCC::NE:
	// Uses only Z Flag
	CompareType = ARMISD::CMPZ;
	break;
	}
	ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
	return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
	}

	/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
	///
	/// When \p RHS is recognized as +0.0 the compare-with-zero node
	/// (ARMISD::CMPFPw0) is used and \p RHS is dropped; otherwise ARMISD::CMPFP
	/// compares both operands. \p InvalidOnQNaN is passed through as an i32
	/// constant operand. The compare is wrapped in ARMISD::FMSTAT.
	SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
	                                     SelectionDAG &DAG, const SDLoc &dl,
	                                     bool InvalidOnQNaN) const {
	  assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
	  SDValue Cmp;
	  SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
	  if (!isFloatingPointZero(RHS))
	    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
	  else
	    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
	  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
	}

	/// duplicateCmp - Glue values can have only one use, so this function
	/// duplicates a comparison node.
	///
	/// Accepted inputs are ARMISD::CMP / ARMISD::CMPZ nodes, or an ARMISD::FMSTAT
	/// whose operand is ARMISD::CMPFP or ARMISD::CMPFPw0; anything else trips an
	/// assertion. The copy is created with the same opcode and operands as the
	/// original node.
	SDValue
	ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
	  unsigned Opc = Cmp.getOpcode();
	  SDLoc DL(Cmp);

	  // Integer compares are a single node: clone it directly.
	  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
	    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
	                       Cmp.getOperand(1));

	  // Floating-point compares are FMSTAT(CMPFP...) pairs: clone the inner
	  // compare first, then wrap the clone in a fresh FMSTAT.
	  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
	  Cmp = Cmp.getOperand(0);
	  Opc = Cmp.getOpcode();
	  if (Opc == ARMISD::CMPFP)
	    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
	                      Cmp.getOperand(1), Cmp.getOperand(2));
	  else {
	    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
	    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
	                      Cmp.getOperand(1));
	  }
	  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
	}

	// Lowers an i32 overflow-checking arithmetic node into three pieces: the
	// arithmetic result and a flag-setting comparison (returned together as a
	// pair), plus a condition code stored in ARMcc. The condition code, applied
	// to the comparison, describes the case in which the arithmetic computation
	// does not overflow.
	std::pair<SDValue, SDValue>
	ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
	                                 SDValue &ARMcc) const {
	  const EVT VT = Op.getValueType();
	  assert(VT == MVT::i32 && "Unsupported value type");

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDLoc dl(Op);

	  // Small builders shared by every case below.
	  auto CondConst = [&](ARMCC::CondCodes Code) {
	    return DAG.getConstant(Code, dl, MVT::i32);
	  };
	  auto EmitCmp = [&](SDValue A, SDValue B) {
	    return DAG.getNode(ARMISD::CMP, dl, MVT::Glue, A, B);
	  };

	  SDValue Result, Check;

	  // FIXME: We are currently always generating CMPs because we don't support
	  // generating CMN through the backend. This is not as good as the natural
	  // CMP case because it causes a register dependency and cannot be folded
	  // later.

	  switch (Op.getOpcode()) {
	  case ISD::SADDO:
	    ARMcc = CondConst(ARMCC::VC);
	    Result = DAG.getNode(ISD::ADD, dl, VT, LHS, RHS);
	    Check = EmitCmp(Result, LHS);
	    break;
	  case ISD::UADDO:
	    ARMcc = CondConst(ARMCC::HS);
	    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
	    // We do not use it in the USUBO case as Value may not be used.
	    Result = DAG.getNode(ARMISD::ADDC, dl, DAG.getVTList(VT, MVT::i32), LHS,
	                         RHS)
	                 .getValue(0);
	    Check = EmitCmp(Result, LHS);
	    break;
	  case ISD::SSUBO:
	    ARMcc = CondConst(ARMCC::VC);
	    Result = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
	    Check = EmitCmp(LHS, RHS);
	    break;
	  case ISD::USUBO:
	    ARMcc = CondConst(ARMCC::HS);
	    Result = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
	    Check = EmitCmp(LHS, RHS);
	    break;
	  case ISD::UMULO: {
	    // We generate a UMUL_LOHI and then check if the high word is 0.
	    ARMcc = CondConst(ARMCC::EQ);
	    SDValue Product =
	        DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
	    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	    Check = EmitCmp(Product.getValue(1), Zero);
	    // We only want the low 32 bits for the result.
	    Result = Product.getValue(0);
	    break;
	  }
	  case ISD::SMULO: {
	    // We generate a SMUL_LOHI and then check if all the bits of the high word
	    // are the same as the sign bit of the low word.
	    ARMcc = CondConst(ARMCC::EQ);
	    SDValue Product =
	        DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
	    SDValue ShiftAmt = DAG.getConstant(31, dl, MVT::i32);
	    SDValue SignFill =
	        DAG.getNode(ISD::SRA, dl, VT, Product.getValue(0), ShiftAmt);
	    Check = EmitCmp(Product.getValue(1), SignFill);
	    // We only want the low 32 bits for the result.
	    Result = Product.getValue(0);
	    break;
	  }
	  default:
	    llvm_unreachable("Unknown overflow instruction!");
	  } // switch (...)

	  return std::make_pair(Result, Check);
	}

	/// Lowers an overflow-checking arithmetic node via getARMXALUOOp() into a
	/// MERGE_VALUES of (arithmetic result, overflow flag as 0/1). Returns an
	/// empty SDValue when the value type is not legal yet, so that legalization
	/// can expand the node instead.
	SDValue
	ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDValue ARMcc;
	  std::pair<SDValue, SDValue> Lowered = getARMXALUOOp(Op, DAG, ARMcc);
	  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	  SDLoc dl(Op);

	  // We use 0 and 1 as false and true values.
	  SDValue One = DAG.getConstant(1, dl, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);

	  // ARMcc describes the no-overflow case, hence the (1, 0) operand order.
	  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, One, Zero, ARMcc, CCR,
	                                 Lowered.second);

	  return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, MVT::i32),
	                     Lowered.first, Overflow);
	}

	/// Turns a boolean carry value into a carry-flag value by emitting
	/// "ARMISD::SUBC BoolCarry, 1" and returning that node's second result.
	static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
	                                              SelectionDAG &DAG) {
	  SDLoc DL(BoolCarry);
	  EVT CarryVT = BoolCarry.getValueType();

	  SDVTList ResultTys = DAG.getVTList(CarryVT, MVT::i32);
	  SDValue One = DAG.getConstant(1, DL, CarryVT);
	  SDValue Sub = DAG.getNode(ARMISD::SUBC, DL, ResultTys, BoolCarry, One);

	  // Result #1 of SUBC is the flag output; result #0 is discarded.
	  return Sub.getValue(1);
	}

	/// Turns a carry-flag value into a boolean carry of type \p VT by emitting
	/// "ARMISD::ADDE 0, 0, Flags".
	static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
	                                              SelectionDAG &DAG) {
	  SDLoc DL(Flags);

	  // Both addends are zero, so the sum is just the incoming carry.
	  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
	  SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);
	  return DAG.getNode(ARMISD::ADDE, DL, ResultTys, Zero, Zero, Flags);
	}

	/// Lowers UADDO / USUBO into ARMISD::ADDC / ARMISD::SUBC plus a boolean
	/// overflow value, returned together as MERGE_VALUES. Returns an empty
	/// SDValue when the value type is not legal yet, so that legalization can
	/// expand the node instead.
	SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDLoc dl(Op);
	  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	  SDValue Value;
	  SDValue Overflow;
	  const unsigned Opcode = Op.getOpcode();
	  if (Opcode == ISD::UADDO) {
	    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
	    // Convert the carry flag into a boolean value.
	    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
	  } else if (Opcode == ISD::USUBO) {
	    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
	    // Convert the carry flag into a boolean value.
	    SDValue Carry = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
	    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
	    // value. So compute 1 - C.
	    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
	                           DAG.getConstant(1, dl, MVT::i32), Carry);
	  } else {
	    llvm_unreachable("Unknown overflow instruction!");
	  }

	  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
	}

	/// Custom lowering for a select node (condition, true value, false value).
	///
	/// Three strategies are tried in order:
	///  1. If the condition is result #1 of an {s|u}{add|sub}.with.overflow node,
	///     select directly on the comparison produced by getARMXALUOOp(). An
	///     empty SDValue is returned when that node's value type is not legal.
	///  2. If the condition is a single-use ARMISD::CMOV choosing between the
	///     constants 0 and 1, fold it into one CMOV over the select operands.
	///  3. Otherwise mask the condition down to bit 0 and emit a select-cc that
	///     compares it against zero.
	SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Cond = Op.getOperand(0);
	  SDValue SelectTrue = Op.getOperand(1);
	  SDValue SelectFalse = Op.getOperand(2);
	  SDLoc dl(Op);
	  unsigned Opc = Cond.getOpcode();

	  if (Cond.getResNo() == 1 &&
	      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
	       Opc == ISD::USUBO)) {
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
	      return SDValue();

	    SDValue Value, OverflowCmp;
	    SDValue ARMcc;
	    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
	    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	    EVT VT = Op.getValueType();

	    // ARMcc describes the no-overflow case, so SelectTrue (wanted on
	    // overflow) goes in getCMOV's FalseVal slot.
	    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
	                   OverflowCmp, DAG);
	  }

	  // Convert:
	  //
	  // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
	  // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
	  //
	  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
	    const ConstantSDNode *CMOVTrue =
	        dyn_cast<ConstantSDNode>(Cond.getOperand(0));
	    const ConstantSDNode *CMOVFalse =
	        dyn_cast<ConstantSDNode>(Cond.getOperand(1));

	    if (CMOVTrue && CMOVFalse) {
	      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
	      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

	      // True/False stay empty unless the constants are exactly (1, 0) or
	      // (0, 1); the getNode() test below relies on that.
	      SDValue True;
	      SDValue False;
	      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
	        True = SelectTrue;
	        False = SelectFalse;
	      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
	        True = SelectFalse;
	        False = SelectTrue;
	      }

	      if (True.getNode() && False.getNode()) {
	        EVT VT = Op.getValueType();
	        SDValue ARMcc = Cond.getOperand(2);
	        SDValue CCR = Cond.getOperand(3);
	        // The original compare's glue is still consumed by Cond, so the new
	        // CMOV gets its own copy.
	        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
	        assert(True.getValueType() == VT);
	        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
	      }
	    }
	  }

	  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
	  // undefined bits before doing a full-word comparison with zero.
	  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
	                     DAG.getConstant(1, dl, Cond.getValueType()));

	  return DAG.getSelectCC(dl, Cond,
	                         DAG.getConstant(0, dl, Cond.getValueType()),
	                         SelectTrue, SelectFalse, ISD::SETNE);
	}

	/// Maps the condition \p CC onto one of the condition codes assigned below
	/// (GE, GT, VS or EQ) and reports which operand swaps are needed to make
	/// that mapping correct.
	///
	/// \param CC         The condition being lowered.
	/// \param CondCode   Overwritten when \p CC is one of the handled conditions;
	///                   left untouched otherwise.
	/// \param swpCmpOps  Set/toggled when the compare operands must be swapped.
	/// \param swpVselOps Set/toggled when the VSEL operands must be swapped.
	///
	/// The swap flags are only ever set or toggled here, never cleared, so the
	/// caller must initialise them before the call.
	static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
	                                 bool &swpCmpOps, bool &swpVselOps) {
	  // Start by selecting the GE condition code for opcodes that return true for
	  // 'equality'
	  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
	      CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
	    CondCode = ARMCC::GE;

	  // and GT for opcodes that return false for 'equality'.
	  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
	           CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
	    CondCode = ARMCC::GT;

	  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
	  // to swap the compare operands.
	  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
	      CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
	    swpCmpOps = true;

	  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
	  // If we have an unordered opcode, we need to swap the operands to the VSEL
	  // instruction (effectively negating the condition).
	  //
	  // This also has the effect of swapping which one of 'less' or 'greater'
	  // returns true, so we also swap the compare operands. It also switches
	  // whether we return true for 'equality', so we compensate by picking the
	  // opposite condition code to our original choice.
	  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
	      CC == ISD::SETUGT) {
	    swpCmpOps = !swpCmpOps;
	    swpVselOps = !swpVselOps;
	    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
	  }

	  // 'ordered' is 'anything but unordered', so use the VS condition code and
	  // swap the VSEL operands.
	  if (CC == ISD::SETO) {
	    CondCode = ARMCC::VS;
	    swpVselOps = true;
	  }

	  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
	  // code and swap the VSEL operands. Also do this if we don't care about the
	  // unordered case.
	  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
	    CondCode = ARMCC::EQ;
	    swpVselOps = true;
	  }
	}

	/// Builds an ARMISD::CMOV of type \p VT over (\p FalseVal, \p TrueVal)
	/// controlled by \p ARMcc / \p CCR / \p Cmp.
	///
	/// When \p VT is f64 and the subtarget lacks FP64, both inputs are split
	/// into two i32 values with VMOVRRD, each part is selected with its own i32
	/// CMOV (the second one on a duplicate of \p Cmp), and the parts are
	/// recombined with VMOVDRR.
	SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
	                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
	                                   SDValue Cmp, SelectionDAG &DAG) const {
	  // Common case: a single conditional move of the requested type.
	  if (Subtarget->hasFP64() || VT != MVT::f64)
	    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
	                       Cmp);

	  // f64 without FP64: operate on the two i32 parts.
	  SDVTList PartTys = DAG.getVTList(MVT::i32, MVT::i32);
	  SDValue FalseParts = DAG.getNode(ARMISD::VMOVRRD, dl, PartTys, FalseVal);
	  SDValue TrueParts = DAG.getNode(ARMISD::VMOVRRD, dl, PartTys, TrueVal);

	  SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
	                            FalseParts.getValue(0), TrueParts.getValue(0),
	                            ARMcc, CCR, Cmp);

	  // Cmp's glue result is already consumed by the first CMOV, so the second
	  // one needs its own copy of the comparison.
	  SDValue CmpCopy = duplicateCmp(Cmp, DAG);
	  SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
	                             FalseParts.getValue(1), TrueParts.getValue(1),
	                             ARMcc, CCR, CmpCopy);

	  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
	}

	/// Returns true iff \p CC is ISD::SETGT or ISD::SETGE.
	static bool isGTorGE(ISD::CondCode CC) {
	  switch (CC) {
	  case ISD::SETGT:
	  case ISD::SETGE:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Returns true iff \p CC is ISD::SETLT or ISD::SETLE.
	static bool isLTorLE(ISD::CondCode CC) {
	  switch (CC) {
	  case ISD::SETLT:
	  case ISD::SETLE:
	    return true;
	  default:
	    return false;
	  }
	}

	// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
	// All of these conditions (and their <= and >= counterparts) will do:
	// x < k ? k : x
	// x > k ? x : k
	// k < x ? x : k
	// k > x ? k : x
	// K is the value playing the role of k; it must be one compare operand and
	// the select result on the matching side.
	static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
	                            const SDValue TrueVal, const SDValue FalseVal,
	                            const ISD::CondCode CC, const SDValue K) {
	  // The two condition families are mutually exclusive, so each can be
	  // decided on its own.
	  if (isGTorGE(CC))
	    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
	  if (isLTorLE(CC))
	    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
	  return false;
	}

	// Checks whether a conditional (LHS CC RHS ? TrueVal : FalseVal) is
	// upper-saturating with respect to K, i.e. K is one compare operand and is
	// selected on the side where the other operand is the larger one. This is
	// the mirror image of the lower-saturating check: the operand positions of
	// K are swapped for each condition family.
	static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
	                            const SDValue TrueVal, const SDValue FalseVal,
	                            const ISD::CondCode CC, const SDValue K) {
	  // The two condition families are mutually exclusive, so each can be
	  // decided on its own.
	  if (isGTorGE(CC))
	    return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
	  if (isLTorLE(CC))
	    return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
	  return false;
	}

	// Check if two chained conditionals could be converted into SSAT or USAT.
	//
	// SSAT can replace a set of two conditional selectors that bound a number to an
	// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
	//
	// x < -k ? -k : (x > k ? k : x)
	// x < -k ? -k : (x < k ? x : k)
	// x > -k ? (x > k ? k : x) : -k
	// x < k ? (x < -k ? -k : x) : k
	// etc.
	//
	// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is
	// a power of 2.
	//
	// It returns true if the conversion can be done, false otherwise.
	// Additionally, the variable is returned in parameter V, the constant in K and
	// usat is set to true if the conditional represents an unsigned saturation
	static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
	uint64_t &K, bool &usat) {
	SDValue LHS1 = Op.getOperand(0);
	SDValue RHS1 = Op.getOperand(1);
	SDValue TrueVal1 = Op.getOperand(2);
	SDValue FalseVal1 = Op.getOperand(3);
	ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

	const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
	if (Op2.getOpcode() != ISD::SELECT_CC)
	return false;

	SDValue LHS2 = Op2.getOperand(0);
	SDValue RHS2 = Op2.getOperand(1);
	SDValue TrueVal2 = Op2.getOperand(2);
	SDValue FalseVal2 = Op2.getOperand(3);
	ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

	// Find out which are the constants and which are the variables
	// in each conditional
	SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
	? &RHS1
	: nullptr;
	SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
	? &RHS2
	: nullptr;
	SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
	SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
	SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
	SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;

	// We must detect cases where the original operations worked with 16- or
	// 8-bit values. In such case, V2Tmp != V2 because the comparison operations
	// must work with sign-extended values but the select operations return
	// the original non-extended value.
	SDValue V2TmpReg = V2Tmp;
	if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
	V2TmpReg = V2Tmp->getOperand(0);

	// Check that the registers and the constants have the correct values
	// in both conditionals
	if (!K1 \|\| !K2 \|\| K1 == Op2 \|\| K2 != K2Tmp \|\| V1Tmp != V2Tmp \|\|
	V2TmpReg != V2)
	return false;

	// Figure out which conditional is saturating the lower/upper bound.
	const SDValue *LowerCheckOp =
	isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
	? &Op
	: isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
	? &Op2
	: nullptr;
	const SDValue *UpperCheckOp =
	isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
	? &Op
	: isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
	? &Op2
	: nullptr;

	if (!UpperCheckOp \|\| !LowerCheckOp \|\| LowerCheckOp == UpperCheckOp)
	return false;

	// Check that the constant in the lower-bound check is
	// the opposite of the constant in the upper-bound check
	// in 1's complement.
	int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
	int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
	int64_t PosVal = std::max(Val1, Val2);
	int64_t NegVal = std::min(Val1, Val2);

	if (((Val1 > Val2 && UpperCheckOp == &Op) \|\|
	(Val1 < Val2 && UpperCheckOp == &Op2)) &&
	isPowerOf2_64(PosVal + 1)) {

	// Handle the difference between USAT (unsigned) and SSAT (signed) saturation
	if (Val1 == ~Val2)
	usat = false;
	else if (NegVal == 0)
	usat = true;
	else
	return false;

	V = V2;
	K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive

	return true;
	}

	return false;
	}

	// Check if a condition of the type x < k ? k : x can be converted into a
	// bit operation instead of conditional moves.
	// Currently this is allowed given:
	// - The conditions and values match up
	// - k is 0 or -1 (all ones)
	// This function will not check the last condition, that's up to the caller.
	// It returns true if the transformation can be made, and in such case
	// returns x in V, and k in SatK.
	// Note that V may be overwritten even when the function returns false;
	// SatK is only written on success.
	static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
	                                         SDValue &SatK)
	{
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  SDValue TrueVal = Op.getOperand(2);
	  SDValue FalseVal = Op.getOperand(3);

	  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS
	               : isa<ConstantSDNode>(RHS) ? &RHS
	                                          : nullptr;

	  // No constant operation in comparison, early out
	  if (!K)
	    return false;

	  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
	  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
	  // K is known to be non-null here; pick the compare operand that is not K.
	  SDValue VTmp = (*K == LHS) ? RHS : LHS;

	  // If the constant on left and right side, or variable on left and right,
	  // does not match, early out
	  if (*K != KTmp || V != VTmp)
	    return false;

	  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
	    SatK = *K;
	    return true;
	  }

	  return false;
	}

	/// Returns true when \p VT is f16, f32 or f64 and the subtarget lacks the
	/// matching feature (FullFP16, VFP2Base or FP64 respectively). Any other
	/// type yields false.
	bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
	  bool Unsupported = false;
	  if (VT == MVT::f16)
	    Unsupported = !Subtarget->hasFullFP16();
	  else if (VT == MVT::f32)
	    Unsupported = !Subtarget->hasVFP2Base();
	  else if (VT == MVT::f64)
	    Unsupported = !Subtarget->hasFP64();
	  return Unsupported;
	}

	/// Custom lowering for a select-cc node with operands
	/// (LHS, RHS, TrueVal, FalseVal, CondCode).
	///
	/// In order, this tries to:
	///  - fold two chained saturating selects into ARMISD::SSAT / ARMISD::USAT
	///    (ARM mode with v6 ops, or Thumb-2);
	///  - turn an i32 "x < k ? k : x" with k == 0 or k == -1 into shift/bit
	///    operations;
	///  - soften the compare operands when their floating-point type is not
	///    supported by the subtarget;
	///  - emit an integer compare + CMOV when the compare operands are i32;
	///  - otherwise emit a VFP compare + CMOV, with a second CMOV when the
	///    condition needs two ARM condition codes.
	SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  SDLoc dl(Op);

	  // Try to convert two saturating conditional selects into a single SSAT
	  // SatConstant and SatUSat are only read after isSaturatingConditional()
	  // has returned true, which is when it fills them in.
	  SDValue SatValue;
	  uint64_t SatConstant;
	  bool SatUSat;
	  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
	      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
	    if (SatUSat)
	      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
	                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
	    else
	      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
	                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
	  }

	  // Try to convert expressions of the form x < k ? k : x (and similar forms)
	  // into more efficient bit operations, which is possible when k is 0 or -1
	  // On ARM and Thumb-2 which have flexible operand 2 this will result in
	  // single instructions. On Thumb the shift and the bit operation will be two
	  // instructions.
	  // Only allow this transformation on full-width (32-bit) operations
	  SDValue LowerSatConstant;
	  if (VT == MVT::i32 &&
	      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
	    // ShiftV is all-ones when SatValue is negative and zero otherwise.
	    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
	                                 DAG.getConstant(31, dl, VT));
	    if (isNullConstant(LowerSatConstant)) {
	      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
	                                      DAG.getAllOnesConstant(dl, VT));
	      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
	    } else if (isAllOnesConstant(LowerSatConstant))
	      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
	  }

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  SDValue TrueVal = Op.getOperand(2);
	  SDValue FalseVal = Op.getOperand(3);

	  if (isUnsupportedFloatingType(LHS.getValueType())) {
	    DAG.getTargetLoweringInfo().softenSetCCOperands(
	        DAG, LHS.getValueType(), LHS, RHS, CC, dl);

	    // If softenSetCCOperands only returned one value, we should compare it to
	    // zero.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  if (LHS.getValueType() == MVT::i32) {
	    // Try to generate VSEL on ARMv8.
	    // The VSEL instruction can't use all the usual ARM condition
	    // codes: it only has two bits to select the condition code, so it's
	    // constrained to use only GE, GT, VS and EQ.
	    //
	    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
	    // swap the operands of the previous compare instruction (effectively
	    // inverting the compare condition, swapping 'less' and 'greater') and
	    // sometimes need to swap the operands to the VSEL (which inverts the
	    // condition in the sense of firing whenever the previous condition didn't)
	    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
	                                        TrueVal.getValueType() == MVT::f32 ||
	                                        TrueVal.getValueType() == MVT::f64)) {
	      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
	      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
	          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
	        CC = ISD::getSetCCInverse(CC, true);
	        std::swap(TrueVal, FalseVal);
	      }
	    }

	    SDValue ARMcc;
	    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
	    // Choose GE over PL, which vsel does not support
	    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
	      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
	    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
	  }

	  ARMCC::CondCodes CondCode, CondCode2;
	  bool InvalidOnQNaN;
	  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

	  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
	  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
	  // must use VSEL (limited condition codes), due to not having conditional f16
	  // moves.
	  if (Subtarget->hasFPARMv8Base() &&
	      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
	      (TrueVal.getValueType() == MVT::f16 ||
	       TrueVal.getValueType() == MVT::f32 ||
	       TrueVal.getValueType() == MVT::f64)) {
	    bool swpCmpOps = false;
	    bool swpVselOps = false;
	    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

	    // Only apply the swaps when the resulting code is one VSEL can encode.
	    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
	        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
	      if (swpCmpOps)
	        std::swap(LHS, RHS);
	      if (swpVselOps)
	        std::swap(TrueVal, FalseVal);
	    }
	  }

	  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
	  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
	  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
	  if (CondCode2 != ARMCC::AL) {
	    // A second condition code was requested: chain another CMOV that picks
	    // TrueVal when CondCode2 holds and the first result otherwise.
	    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
	    // FIXME: Needs another CMP because flag can have but one use.
	    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
	    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
	  }
	  return Result;
	}

	/// canChangeToInt - Decide whether the floating-point compare operand \p Op
	/// may be morphed into an integer compare sequence. \p SeenZero is set to
	/// true (and never cleared) when \p Op is a floating-point zero.
	static bool canChangeToInt(SDValue Op, bool &SeenZero,
	                           const ARMSubtarget *Subtarget) {
	  SDNode *Node = Op.getNode();

	  // With more than one use the value would have to be moved from fp to
	  // integer registers; a node without results cannot be used either.
	  if (!Node->hasOneUse() || Node->getNumValues() == 0)
	    return false;

	  // f32 case is generally profitable. f64 case only makes sense when vcmpe +
	  // vmrs are very slow, e.g. cortex-a8.
	  const EVT OpVT = Op.getValueType();
	  const bool Profitable = OpVT == MVT::f32 || Subtarget->isFPBrccSlow();
	  if (!Profitable)
	    return false;

	  // Anything that is not a zero must be a normal load to qualify.
	  if (!isFloatingPointZero(Op))
	    return ISD::isNormalLoad(Node);

	  SeenZero = true;
	  return true;
	}

	/// Produces an i32 counterpart for an f32 compare operand: the constant 0
	/// for a floating-point zero, or an i32 load from the same address for a
	/// load node. Any other operand is unreachable.
	static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);

	  if (isFloatingPointZero(Op))
	    return DAG.getConstant(0, dl, MVT::i32);

	  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    llvm_unreachable("Unknown VFP cmp argument!");

	  // Re-issue the load with an integer result type, keeping chain, address,
	  // pointer info, alignment and memory-operand flags.
	  return DAG.getLoad(MVT::i32, dl, Load->getChain(), Load->getBasePtr(),
	                     Load->getPointerInfo(), Load->getAlignment(),
	                     Load->getMemOperand()->getFlags());
	}

	/// Produces two i32 values for an f64 compare operand. A floating-point
	/// zero yields two zero constants; a load node is re-issued as two i32
	/// loads, \p RetVal1 from the base address and \p RetVal2 from base + 4.
	/// Any other operand is unreachable.
	static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
	                           SDValue &RetVal1, SDValue &RetVal2) {
	  SDLoc dl(Op);

	  if (isFloatingPointZero(Op)) {
	    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	    RetVal1 = Zero;
	    RetVal2 = Zero;
	    return;
	  }

	  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    llvm_unreachable("Unknown VFP cmp argument!");

	  SDValue BasePtr = Load->getBasePtr();
	  const auto MemFlags = Load->getMemOperand()->getFlags();

	  // First word: same address and alignment as the original load.
	  RetVal1 = DAG.getLoad(MVT::i32, dl, Load->getChain(), BasePtr,
	                        Load->getPointerInfo(), Load->getAlignment(),
	                        MemFlags);

	  // Second word: four bytes further on, with the alignment adjusted to
	  // account for the offset.
	  EVT PtrType = BasePtr.getValueType();
	  unsigned SecondAlign = MinAlign(Load->getAlignment(), 4);
	  SDValue Four = DAG.getConstant(4, dl, PtrType);
	  SDValue SecondPtr = DAG.getNode(ISD::ADD, dl, PtrType, BasePtr, Four);
	  RetVal2 = DAG.getLoad(MVT::i32, dl, Load->getChain(), SecondPtr,
	                        Load->getPointerInfo().getWithOffset(4), SecondAlign,
	                        MemFlags);
	}

	/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
	/// f32 and even f64 comparisons to integer ones.
	///
	/// \p Op carries (Chain, CondCode, LHS, RHS, Dest). The rewrite is only
	/// performed when both compare operands pass canChangeToInt() and at least
	/// one of them is a floating-point zero; otherwise an empty SDValue is
	/// returned. This function itself does not test the unsafe-fp-math option;
	/// that is left to the caller.
	SDValue
	ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Dest = Op.getOperand(4);
	  SDLoc dl(Op);

	  bool LHSSeenZero = false;
	  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
	  bool RHSSeenZero = false;
	  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
	  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
	    // If unsafe fp math optimization is enabled and there are no other uses of
	    // the CMP operands, and the condition code is EQ or NE, we can optimize it
	    // to an integer comparison.
	    if (CC == ISD::SETOEQ)
	      CC = ISD::SETEQ;
	    else if (CC == ISD::SETUNE)
	      CC = ISD::SETNE;

	    // The mask clears the top (sign) bit of the word it is applied to.
	    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
	    SDValue ARMcc;
	    if (LHS.getValueType() == MVT::f32) {
	      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
	                        bitcastf32Toi32(LHS, DAG), Mask);
	      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
	                        bitcastf32Toi32(RHS, DAG), Mask);
	      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
	      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
	                         Chain, Dest, ARMcc, CCR, Cmp);
	    }

	    // Otherwise split each operand into two i32 words and mask only the
	    // second word of each (the one loaded from offset 4).
	    SDValue LHS1, LHS2;
	    SDValue RHS1, RHS2;
	    expandf64Toi32(LHS, DAG, LHS1, LHS2);
	    expandf64Toi32(RHS, DAG, RHS1, RHS2);
	    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
	    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
	    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
	    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
	    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
	    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
	    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
	  }

	  return SDValue();
	}

	/// Custom lowering for a conditional branch with operands
	/// (Chain, Cond, Dest).
	///
	/// Only the case where Cond is the overflow result (result #1) of an
	/// {s|u}{add|sub|mul}.with.overflow node is handled; the multiply forms are
	/// skipped on Thumb1-only subtargets. Every other case, and an overflow
	/// node whose value type is not legal, returns an empty SDValue.
	SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
	  // instruction.
	  unsigned Opc = Cond.getOpcode();
	  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
	                     !Subtarget->isThumb1Only();
	  if (Cond.getResNo() == 1 &&
	      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
	       Opc == ISD::USUBO || OptimizeMul)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
	      return SDValue();

	    // The actual operation with overflow check.
	    SDValue Value, OverflowCmp;
	    SDValue ARMcc;
	    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

	    // Reverse the condition code: getARMXALUOOp() reports the no-overflow
	    // condition, while the branch is taken on overflow.
	    ARMCC::CondCodes CondCode =
	        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
	    CondCode = ARMCC::getOppositeCondition(CondCode);
	    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
	    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

	    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
	                       OverflowCmp);
	  }

	  return SDValue();
	}

	/// Custom lowering for a compare-and-branch node with operands
	/// (Chain, CondCode, LHS, RHS, Dest).
	///
	/// In order, this:
	///  - softens the compare operands when their floating-point type is not
	///    supported by the subtarget;
	///  - branches directly on the flags of an {s|u}{add|sub|mul}.with.overflow
	///    node whose overflow result is compared (EQ/NE) against 0 or 1;
	///  - emits an integer compare + BRCOND for i32 operands;
	///  - tries OptimizeVFPBrcond() for equality tests under unsafe-fp-math;
	///  - otherwise emits a VFP compare + BRCOND, followed by a second BRCOND
	///    when the condition needs two ARM condition codes.
	SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Dest = Op.getOperand(4);
	  SDLoc dl(Op);

	  if (isUnsupportedFloatingType(LHS.getValueType())) {
	    DAG.getTargetLoweringInfo().softenSetCCOperands(
	        DAG, LHS.getValueType(), LHS, RHS, CC, dl);

	    // If softenSetCCOperands only returned one value, we should compare it to
	    // zero.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
	  // instruction.
	  unsigned Opc = LHS.getOpcode();
	  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
	                     !Subtarget->isThumb1Only();
	  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
	      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
	       Opc == ISD::USUBO || OptimizeMul) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
	      return SDValue();

	    // The actual operation with overflow check.
	    SDValue Value, OverflowCmp;
	    SDValue ARMcc;
	    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

	    // ARMcc describes "no overflow". The branch wants "overflow" for
	    // (NE, 0) and (EQ, 1), which is exactly when the two tests differ.
	    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
	      // Reverse the condition code.
	      ARMCC::CondCodes CondCode =
	          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
	      CondCode = ARMCC::getOppositeCondition(CondCode);
	      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
	    }
	    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

	    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
	                       OverflowCmp);
	  }

	  if (LHS.getValueType() == MVT::i32) {
	    SDValue ARMcc;
	    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
	    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
	                       Chain, Dest, ARMcc, CCR, Cmp);
	  }

	  if (getTargetMachine().Options.UnsafeFPMath &&
	      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
	       CC == ISD::SETNE || CC == ISD::SETUNE)) {
	    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
	      return Result;
	  }

	  ARMCC::CondCodes CondCode, CondCode2;
	  bool InvalidOnQNaN;
	  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);

	  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
	  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
	  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
	  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
	  if (CondCode2 != ARMCC::AL) {
	    // Second branch to the same destination, chained after the first and
	    // reusing its glue result instead of a fresh compare.
	    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
	    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
	    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
	  }
	  return Res;
	}

	/// Custom lowering for a jump-table branch with operands
	/// (Chain, Table, Index).
	///
	/// The entry address is computed as Table + Index * 4. Depending on the
	/// subtarget and relocation model this emits either a BR2_JT on that
	/// address, or loads the table entry and emits a BR_JT (adding the table
	/// base to the loaded value when position independent or ROPI).
	SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Table = Op.getOperand(1);
	  SDValue Index = Op.getOperand(2);
	  SDLoc dl(Op);

	  EVT PTy = getPointerTy(DAG.getDataLayout());
	  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
	  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
	  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
	  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
	  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
	  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
	    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
	    // which does another jump to the destination. This also makes it easier
	    // to translate it to TBB / TBH later (Thumb2 only).
	    // FIXME: This might not work if the function is extremely large.
	    // Note that the original, unscaled index operand is passed along here.
	    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
	                       Addr, Op.getOperand(2), JTI);
	  }
	  if (isPositionIndependent() || Subtarget->isROPI()) {
	    // The loaded entry is added to the table base to form the target.
	    Addr =
	        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
	                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
	    Chain = Addr.getValue(1);
	    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
	    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
	  } else {
	    // The loaded entry is used as the branch target directly.
	    Addr =
	        DAG.getLoad(PTy, dl, Chain, Addr,
	                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
	    Chain = Addr.getValue(1);
	    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
	  }
	}

	/// Custom lowering for vector FP_TO_SINT / FP_TO_UINT.
	///
	/// Results with i32 elements are returned untouched when the source has f32
	/// elements and are unrolled otherwise. For the remaining cases the source
	/// must be v4f32, or v4f16 / v8f16 with full FP16 support; v4i16 and v8i16
	/// results are produced by converting to the matching integer vector and
	/// truncating, and every other result type is unrolled.
	static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
	  const EVT ResultVT = Op.getValueType();
	  SDLoc dl(Op);
	  const EVT SourceVT = Op.getOperand(0).getValueType();

	  if (ResultVT.getVectorElementType() == MVT::i32) {
	    // Only the f32 -> i32 element conversion is kept as a vector operation.
	    if (SourceVT.getVectorElementType() != MVT::f32)
	      return DAG.UnrollVectorOp(Op.getNode());
	    return Op;
	  }

	  const ARMSubtarget &ST =
	      static_cast<const ARMSubtarget&>(DAG.getSubtarget());
	  const bool FullFP16 = ST.hasFullFP16();

	  // Pick the integer vector type the conversion is first performed in.
	  EVT WideIntVT;
	  if (SourceVT == MVT::v4f32) {
	    WideIntVT = MVT::v4i32;
	  } else if (SourceVT == MVT::v4f16 && FullFP16) {
	    WideIntVT = MVT::v4i16;
	  } else if (SourceVT == MVT::v8f16 && FullFP16) {
	    WideIntVT = MVT::v8i16;
	  } else {
	    llvm_unreachable("Invalid type for custom lowering!");
	  }

	  const bool IsI16Result = ResultVT == MVT::v4i16 || ResultVT == MVT::v8i16;
	  if (!IsI16Result)
	    return DAG.UnrollVectorOp(Op.getNode());

	  // Convert in the wider integer type, then truncate to the requested type.
	  SDValue Converted =
	      DAG.getNode(Op.getOpcode(), dl, WideIntVT, Op.getOperand(0));
	  return DAG.getNode(ISD::TRUNCATE, dl, ResultVT, Converted);
	}

	/// Lower FP_TO_SINT / FP_TO_UINT.
	///
	/// Vector results are forwarded to LowerVectorFP_TO_INT. When the source
	/// floating-point type is reported as unsupported by
	/// isUnsupportedFloatingType, the conversion becomes a runtime library call
	/// and the first element of the pair returned by makeLibCall is returned.
	/// Otherwise the node is returned unchanged.
	SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  if (VT.isVector())
	    return LowerVectorFP_TO_INT(Op, DAG);
	  if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
	    // Select the signed or unsigned conversion routine from the opcode.
	    RTLIB::Libcall LC;
	    if (Op.getOpcode() == ISD::FP_TO_SINT)
	      LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
	                              Op.getValueType());
	    else
	      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
	                              Op.getValueType());
	    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
	                       /*isSigned*/ false, SDLoc(Op)).first;
	  }

	  return Op;
	}

	/// Custom lowering for vector SINT_TO_FP / UINT_TO_FP.
	///
	/// Sources with i32 elements are returned untouched when the result has f32
	/// elements and are unrolled otherwise. The remaining sources must be v4i16
	/// or v8i16: they are sign- or zero-extended (depending on the opcode) to an
	/// integer vector chosen from the result type and then converted. Result
	/// types other than v4f32, or v4f16 / v8f16 with full FP16 support, are
	/// unrolled.
	static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();
	  SDLoc dl(Op);

	  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
	    if (VT.getVectorElementType() == MVT::f32)
	      return Op;
	    return DAG.UnrollVectorOp(Op.getNode());
	  }

	  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
	          Op.getOperand(0).getValueType() == MVT::v8i16) &&
	         "Invalid type for custom lowering!");

	  const bool HasFullFP16 =
	      static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

	  // Integer vector type the source is extended to before the conversion.
	  EVT DestVecType;
	  if (VT == MVT::v4f32)
	    DestVecType = MVT::v4i32;
	  else if (VT == MVT::v4f16 && HasFullFP16)
	    DestVecType = MVT::v4i16;
	  else if (VT == MVT::v8f16 && HasFullFP16)
	    DestVecType = MVT::v8i16;
	  else
	    return DAG.UnrollVectorOp(Op.getNode());

	  // The extension kind has to match the signedness of the conversion.
	  unsigned CastOpc;
	  unsigned Opc;
	  switch (Op.getOpcode()) {
	  default: llvm_unreachable("Invalid opcode!");
	  case ISD::SINT_TO_FP:
	    CastOpc = ISD::SIGN_EXTEND;
	    Opc = ISD::SINT_TO_FP;
	    break;
	  case ISD::UINT_TO_FP:
	    CastOpc = ISD::ZERO_EXTEND;
	    Opc = ISD::UINT_TO_FP;
	    break;
	  }

	  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
	  return DAG.getNode(Opc, dl, VT, Op);
	}

	/// Lower SINT_TO_FP / UINT_TO_FP.
	///
	/// Vector results are forwarded to LowerVectorINT_TO_FP. When the result
	/// floating-point type is reported as unsupported by
	/// isUnsupportedFloatingType, the conversion becomes a runtime library call
	/// and the first element of the pair returned by makeLibCall is returned.
	/// Otherwise the node is returned unchanged.
	SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  if (VT.isVector())
	    return LowerVectorINT_TO_FP(Op, DAG);
	  if (isUnsupportedFloatingType(VT)) {
	    // Select the signed or unsigned conversion routine from the opcode.
	    RTLIB::Libcall LC;
	    if (Op.getOpcode() == ISD::SINT_TO_FP)
	      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
	                              Op.getValueType());
	    else
	      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
	                              Op.getValueType());
	    return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
	                       /*isSigned*/ false, SDLoc(Op)).first;
	  }

	  return Op;
	}

	/// Lower FCOPYSIGN(Tmp0, Tmp1): the result takes its magnitude from operand 0
	/// and its sign from operand 1.
	///
	/// Two strategies are used:
	///  - With NEON, and when operand 0 is not produced by a BITCAST or VMOVDRR
	///    (treated here as "already in GPRs"), both operands are moved into
	///    64-bit vectors and combined as (Tmp1 & Mask) | (Tmp0 & ~Mask).
	///  - Otherwise the sign bit is merged with 32-bit integer AND/OR; f64 values
	///    are split with VMOVRRD and rebuilt with VMOVDRR.
	/// The result type and the sign operand type may each be f32 or f64.
	SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
	  // Implement fcopysign with a fabs and a conditional fneg.
	  SDValue Tmp0 = Op.getOperand(0);
	  SDValue Tmp1 = Op.getOperand(1);
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  EVT SrcVT = Tmp1.getValueType();
	  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
	               Tmp0.getOpcode() == ARMISD::VMOVDRR;
	  bool UseNEON = !InGPR && Subtarget->hasNEON();

	  if (UseNEON) {
	    // Use VBSL to copy the sign bit.
	    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
	    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
	                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
	    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
	    if (VT == MVT::f64)
	      // For f64 the mask is shifted up by 32 bits within the 64-bit element.
	      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
	                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
	                         DAG.getConstant(32, dl, MVT::i32));
	    else /*if (VT == MVT::f32)*/
	      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
	    if (SrcVT == MVT::f32) {
	      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
	      if (VT == MVT::f64)
	        // f32 sign source, f64 result: shift the source up by 32 bits.
	        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
	                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
	                           DAG.getConstant(32, dl, MVT::i32));
	    } else if (VT == MVT::f32)
	      // f64 sign source, f32 result: shift the source down by 32 bits.
	      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
	                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
	                         DAG.getConstant(32, dl, MVT::i32));
	    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
	    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

	    // Build ~Mask as Mask XOR all-ones.
	    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
	                                            dl, MVT::i32);
	    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
	    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
	                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

	    // Masked bits come from Tmp1, everything else from Tmp0.
	    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
	                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
	                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
	    if (VT == MVT::f32) {
	      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
	      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
	                        DAG.getConstant(0, dl, MVT::i32));
	    } else {
	      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
	    }

	    return Res;
	  }

	  // Bitcast operand 1 to i32.
	  if (SrcVT == MVT::f64)
	    // Only the second VMOVRRD result is needed for an f64 sign source.
	    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
	                       Tmp1).getValue(1);
	  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

	  // Or in the signbit with integer operations.
	  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
	  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
	  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
	  if (VT == MVT::f32) {
	    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
	                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
	    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
	                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
	  }

	  // f64: Or the high part with signbit and then combine two parts.
	  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
	                     Tmp0);
	  SDValue Lo = Tmp0.getValue(0);
	  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
	  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
	  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
	}

	/// Lower RETURNADDR. Operand 0 is the constant frame depth.
	///
	/// The function is marked as having its return address taken. If
	/// verifyReturnAddressArgumentIsConstant reports a problem, an empty SDValue
	/// is returned. For depth 0 the value is copied out of LR, which is added as
	/// a live-in; for any other depth it is loaded from offset 4 of the frame
	/// address produced by LowerFRAMEADDR.
	SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  EVT VT = Op.getValueType();
	  SDLoc dl(Op);
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  if (Depth == 0) {
	    // Return LR, which contains the return address. Mark it an implicit
	    // live-in.
	    unsigned LinkReg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
	    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, LinkReg, VT);
	  }

	  // Outer frames: load from 4 bytes past the frame address.
	  SDValue FrameBase = LowerFRAMEADDR(Op, DAG);
	  SDValue SlotOffset = DAG.getConstant(4, dl, MVT::i32);
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, VT, FrameBase, SlotOffset);
	  return DAG.getLoad(VT, dl, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// Lower FRAMEADDR. Operand 0 is the constant frame depth.
	///
	/// The function is marked as having its frame address taken. The result
	/// starts as a copy of the frame register reported by ARMBaseRegisterInfo
	/// and is then replaced by a load through the current value once per level
	/// of depth.
	SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  const ARMBaseRegisterInfo &ARI =
	      *static_cast<const ARMBaseRegisterInfo *>(RegInfo);
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  MFI.setFrameAddressIsTaken(true);

	  EVT VT = Op.getValueType();
	  SDLoc dl(Op); // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  unsigned FrameReg = ARI.getFrameRegister(MF);
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Follow the chain of saved frame addresses, one load per level.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	/// Map a register name to its register number. Only "sp" is recognised; any
	/// other name is reported through report_fatal_error. \p VT and \p DAG are
	/// not used.
	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const unsigned Found =
	      StringSwitch<unsigned>(RegName).Case("sp", ARM::SP).Default(0);
	  // Zero is the "no match" default of the switch above.
	  if (Found == 0)
	    report_fatal_error(Twine("Invalid register name \""
	                             + StringRef(RegName) + "\"."));
	  return Found;
	}

	// Expand an i64 READ_REGISTER: re-issue the read with two i32 results plus a
	// chain, then append to Results the i64 BUILD_PAIR of the two halves followed
	// by a chain value.
	static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                SelectionDAG &DAG) {
	  SDLoc DL(N);

	  // Callers are expected to use this for i64 results only.
	  assert(N->getValueType(0) == MVT::i64
	         && "ExpandREAD_REGISTER called for non-i64 type result.");

	  SDVTList HalvesAndChain = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
	  SDValue SplitRead = DAG.getNode(ISD::READ_REGISTER, DL, HalvesAndChain,
	                                  N->getOperand(0), N->getOperand(1));

	  SDValue Wide = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
	                             SplitRead.getValue(0), SplitRead.getValue(1));
	  Results.push_back(Wide);
	  // NOTE(review): this pushes operand 0 of the new read (its incoming chain),
	  // not the read's own chain result (value 2) - confirm this is intended.
	  Results.push_back(SplitRead.getOperand(0));
	}

	/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
	/// When \p DstVT, the destination type of \p BC, is on the vector
	/// register bank and the source of bitcast, \p Op, operates on the same bank,
	/// it might be possible to combine them, such that everything stays on the
	/// vector register bank.
	/// \return The node that would replace \p BC, if the combine
	/// is possible; an empty SDValue otherwise.
	static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
	                                                SelectionDAG &DAG) {
	  SDValue Op = BC->getOperand(0);
	  EVT DstVT = BC->getValueType(0);

	  // The only vector instruction that can produce a scalar (remember,
	  // since the bitcast was about to be turned into VMOVDRR, the source
	  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
	  // Moreover, we can do this combine only if there is one use.
	  // Finally, if the destination type is not a vector, there is not
	  // much point on forcing everything on the vector bank.
	  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !Op.hasOneUse())
	    return SDValue();

	  // If the index is not constant, we will introduce an additional
	  // multiply that will stick.
	  // Give up in that case.
	  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!Index)
	    return SDValue();
	  unsigned DstNumElt = DstVT.getVectorNumElements();

	  // Compute the new index.
	  const APInt &APIntIndex = Index->getAPIntValue();
	  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
	  NewIndex *= APIntIndex;
	  // Check if the new constant index fits into i32.
	  // Note that this compares the declared bit width of the index constant.
	  if (NewIndex.getBitWidth() > 32)
	    return SDValue();

	  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
	  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
	  SDLoc dl(Op);
	  SDValue ExtractSrc = Op.getOperand(0);
	  EVT VecVT = EVT::getVectorVT(
	      *DAG.getContext(), DstVT.getScalarType(),
	      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
	  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
	                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
	}

	/// ExpandBITCAST - If the target supports VFP, this function is called to
	/// expand a bit convert where either the source or destination type is i64 to
	/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
	/// operand type is illegal (e.g., v2f32 for a target that doesn't support
	/// vectors), since the legalizer won't know what to do with that.
	///
	/// Besides the i64 cases, three half-precision related patterns are handled
	/// first: f32 -> i32 of a CopyFromReg that feeds a VMOVhr, i16 -> f16 of a
	/// value with an i32 operand, and f16 -> i16 feeding a zero-extended return
	/// value. The last two require full FP16 support.
	/// \return The replacement value, or an empty SDValue when nothing was done.
	static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
	                             const ARMSubtarget *Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDLoc dl(N);
	  SDValue Op = N->getOperand(0);

	  // This function is only supposed to be called for i64 types, either as the
	  // source or destination of the bit convert.
	  EVT SrcVT = Op.getValueType();
	  EVT DstVT = N->getValueType(0);
	  const bool HasFullFP16 = Subtarget->hasFullFP16();

	  if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
	    // FullFP16: half values are passed in S-registers, and we don't
	    // need any of the bitcast and moves:
	    //
	    // t2: f32,ch = CopyFromReg t0, Register:f32 %0
	    // t5: i32 = bitcast t2
	    // t18: f16 = ARMISD::VMOVhr t5
	    if (Op.getOpcode() != ISD::CopyFromReg ||
	        Op.getValueType() != MVT::f32)
	      return SDValue();

	    // Only the first user of the bitcast is inspected.
	    auto Move = N->use_begin();
	    if (Move->getOpcode() != ARMISD::VMOVhr)
	      return SDValue();

	    // Re-read the register directly as f16 and replace the VMOVhr with it.
	    SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
	    SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
	    DAG.ReplaceAllUsesWith(*Move, &Copy);
	    return Copy;
	  }

	  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
	    if (!HasFullFP16)
	      return SDValue();
	    // SoftFP: read half-precision arguments:
	    //
	    // t2: i32,ch = ...
	    // t7: i16 = truncate t2 <~~~~ Op
	    // t8: f16 = bitcast t7 <~~~~ N
	    //
	    if (Op.getOperand(0).getValueType() == MVT::i32)
	      return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
	                         MVT::f16, Op.getOperand(0));

	    return SDValue();
	  }

	  // Half-precision return values
	  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
	    if (!HasFullFP16)
	      return SDValue();
	    //
	    // t11: f16 = fadd t8, t10
	    // t12: i16 = bitcast t11 <~~~ SDNode N
	    // t13: i32 = zero_extend t12
	    // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
	    // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
	    //
	    // transform this into:
	    //
	    // t20: i32 = ARMISD::VMOVrh t11
	    // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
	    //
	    auto ZeroExtend = N->use_begin();
	    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
	        ZeroExtend->getValueType(0) != MVT::i32)
	      return SDValue();

	    // Only the first user of the zero-extend, and the first user of that
	    // copy, are inspected.
	    auto Copy = ZeroExtend->use_begin();
	    if (Copy->getOpcode() == ISD::CopyToReg &&
	        Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
	      SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
	      DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
	      return Cvt;
	    }
	    return SDValue();
	  }

	  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
	    return SDValue();

	  // Turn i64->f64 into VMOVDRR.
	  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
	    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
	    // if we can combine the bitcast with its source.
	    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
	      return Val;

	    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
	                             DAG.getConstant(0, dl, MVT::i32));
	    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
	                             DAG.getConstant(1, dl, MVT::i32));
	    return DAG.getNode(ISD::BITCAST, dl, DstVT,
	                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
	  }

	  // Turn f64->i64 into VMOVRRD.
	  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
	    SDValue Cvt;
	    // On big-endian targets a multi-element vector source goes through
	    // VREV64 before being split.
	    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
	        SrcVT.getVectorNumElements() > 1)
	      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
	                        DAG.getVTList(MVT::i32, MVT::i32),
	                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
	    else
	      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
	                        DAG.getVTList(MVT::i32, MVT::i32), Op);
	    // Merge the pieces into a single i64 value.
	    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
	  }

	  return SDValue();
	}

	/// getZeroVector - Build an all-zero vector of type \p VT.
	/// Zero vectors are used to represent vector negation and in those cases
	/// will be implemented with the NEON VNEG instruction. However, VNEG does
	/// not support i64 elements, so sometimes the zero vectors will need to be
	/// explicitly constructed. In every case the zero is materialised as a
	/// canonical VMOVIMM (v4i32 for 128-bit types, v2i32 otherwise) and bitcast
	/// to \p VT.
	static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(VT.isVector() && "Expected a vector type");

	  // The canonical modified immediate encoding of a zero vector is....0!
	  SDValue ZeroImm = DAG.getTargetConstant(0, dl, MVT::i32);

	  EVT CarrierVT = MVT::v2i32;
	  if (VT.is128BitVector())
	    CarrierVT = MVT::v4i32;

	  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, CarrierVT, ZeroImm);
	  return DAG.getNode(ISD::BITCAST, dl, VT, ZeroVec);
	}

	/// LowerShiftRightParts - Lower SRA_PARTS / SRL_PARTS, which return two
	/// i32 values and take a 2 x i32 value to shift plus a shift amount.
	///
	/// Each result half is computed twice, once for a "small" shift and once for
	/// a "big" shift, and an ARMISD::CMOV keyed on (ShAmt - VTBits) >= 0 picks
	/// between them. The two halves are returned as merged values (Lo, Hi).
	SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  EVT VT = Op.getValueType();
	  unsigned VTBits = VT.getSizeInBits();
	  SDLoc dl(Op);
	  SDValue ShOpLo = Op.getOperand(0);
	  SDValue ShOpHi = Op.getOperand(1);
	  SDValue ShAmt = Op.getOperand(2);
	  SDValue ARMcc;
	  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
	  // Arithmetic or logical shift, depending on the node being lowered.
	  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

	  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

	  // Low half, small shift: (Lo >> ShAmt) | (Hi << (VTBits - ShAmt)).
	  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
	                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
	  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
	  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
	                                   DAG.getConstant(VTBits, dl, MVT::i32));
	  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
	  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
	  // Low half, big shift: Hi shifted right by (ShAmt - VTBits).
	  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
	  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
	                            ISD::SETGE, ARMcc, DAG, dl);
	  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
	                           ARMcc, CCR, CmpLo);

	  // High half: Hi >> ShAmt for a small shift; for a big shift either the
	  // sign fill (SRA by VTBits - 1) or zero (SRL).
	  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
	  SDValue HiBigShift = Opc == ISD::SRA
	                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
	                                         DAG.getConstant(VTBits - 1, dl, VT))
	                           : DAG.getConstant(0, dl, VT);
	  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
	                            ISD::SETGE, ARMcc, DAG, dl);
	  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
	                           ARMcc, CCR, CmpHi);

	  SDValue Ops[2] = { Lo, Hi };
	  return DAG.getMergeValues(Ops, dl);
	}

	/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
	/// i32 values and take a 2 x i32 value to shift plus a shift amount.
	///
	/// Both result halves are formed as an ARMISD::CMOV between a "small shift"
	/// and a "big shift" candidate, keyed on the comparison (ShAmt - VTBits) >= 0.
	/// The halves are returned as merged values in the order (Lo, Hi).
	SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  EVT PartVT = Op.getValueType();
	  unsigned PartBits = PartVT.getSizeInBits();
	  SDLoc dl(Op);
	  SDValue InLo = Op.getOperand(0);
	  SDValue InHi = Op.getOperand(1);
	  SDValue Amount = Op.getOperand(2);
	  SDValue CondCode; // Filled in by getARMCmp.
	  SDValue Flags = DAG.getRegister(ARM::CPSR, MVT::i32);

	  assert(Op.getOpcode() == ISD::SHL_PARTS);

	  // High half, small shift: (Lo >> (PartBits - Amount)) | (Hi << Amount).
	  SDValue ComplementAmt =
	      DAG.getNode(ISD::SUB, dl, MVT::i32,
	                  DAG.getConstant(PartBits, dl, MVT::i32), Amount);
	  SDValue CarriedBits = DAG.getNode(ISD::SRL, dl, PartVT, InLo, ComplementAmt);
	  SDValue ShiftedHi = DAG.getNode(ISD::SHL, dl, PartVT, InHi, Amount);
	  SDValue HiIfSmall =
	      DAG.getNode(ISD::OR, dl, PartVT, CarriedBits, ShiftedHi);

	  // High half, big shift: Lo << (Amount - PartBits).
	  SDValue OverflowAmt =
	      DAG.getNode(ISD::SUB, dl, MVT::i32, Amount,
	                  DAG.getConstant(PartBits, dl, MVT::i32));
	  SDValue HiIfBig = DAG.getNode(ISD::SHL, dl, PartVT, InLo, OverflowAmt);
	  SDValue HiTest = getARMCmp(OverflowAmt, DAG.getConstant(0, dl, MVT::i32),
	                             ISD::SETGE, CondCode, DAG, dl);
	  SDValue OutHi = DAG.getNode(ARMISD::CMOV, dl, PartVT, HiIfSmall, HiIfBig,
	                              CondCode, Flags, HiTest);

	  // Low half: Lo << Amount for a small shift, zero for a big one.
	  SDValue LoTest = getARMCmp(OverflowAmt, DAG.getConstant(0, dl, MVT::i32),
	                             ISD::SETGE, CondCode, DAG, dl);
	  SDValue LoIfSmall = DAG.getNode(ISD::SHL, dl, PartVT, InLo, Amount);
	  SDValue OutLo = DAG.getNode(ARMISD::CMOV, dl, PartVT, LoIfSmall,
	                              DAG.getConstant(0, dl, PartVT), CondCode, Flags,
	                              LoTest);

	  SDValue Halves[2] = { OutLo, OutHi };
	  return DAG.getMergeValues(Halves, dl);
	}

	/// Lower FLT_ROUNDS_ by reading the FPSCR through the arm_get_fpscr
	/// intrinsic and remapping its rounding-mode field.
	SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // The rounding mode is in bits 23:22 of the FPSCR.
	  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
	  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
	  // so that the shift + and get folded into a bitfield extract.
	  const unsigned RModeShift = 22;
	  SDLoc dl(Op);

	  SDValue ReadOps[] = { DAG.getEntryNode(),
	                        DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
	                                        MVT::i32) };
	  SDValue Status = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, ReadOps);

	  // Adding one at the bottom of the field rotates the two-bit mode value.
	  SDValue Bias = DAG.getConstant(1U << RModeShift, dl, MVT::i32);
	  SDValue Rotated = DAG.getNode(ISD::ADD, dl, MVT::i32, Status, Bias);

	  SDValue ShiftBy = DAG.getConstant(RModeShift, dl, MVT::i32);
	  SDValue Field = DAG.getNode(ISD::SRL, dl, MVT::i32, Rotated, ShiftBy);

	  SDValue TwoBits = DAG.getConstant(3, dl, MVT::i32);
	  return DAG.getNode(ISD::AND, dl, MVT::i32, Field, TwoBits);
	}

	/// Custom lowering for CTTZ / CTTZ_ZERO_UNDEF.
	///
	/// Vector types (NEON is asserted) isolate the lowest set bit with X & -X
	/// and then use either ctpop(lsb - 1) or, for i16/i32 elements of the
	/// ZERO_UNDEF form, (width - 1) - ctlz(lsb). Scalars are lowered to
	/// ctlz(bitreverse(x)) when v6T2 ops are available; otherwise an empty
	/// SDValue is returned.
	static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
	                         const ARMSubtarget *ST) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  if (VT.isVector()) {
	    assert(ST->hasNEON());

	    // Compute the least significant set bit: LSB = X & -X
	    SDValue X = N->getOperand(0);
	    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
	    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

	    EVT ElemTy = VT.getVectorElementType();

	    if (ElemTy == MVT::i8) {
	      // Compute with: cttz(x) = ctpop(lsb - 1)
	      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
	                                DAG.getTargetConstant(1, dl, ElemTy));
	      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
	      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
	    }

	    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
	        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
	      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
	      unsigned NumBits = ElemTy.getSizeInBits();
	      SDValue WidthMinus1 =
	          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
	                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
	      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
	      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
	    }

	    // Compute with: cttz(x) = ctpop(lsb - 1)

	    // Compute LSB - 1.
	    SDValue Bits;
	    if (ElemTy == MVT::i64) {
	      // Load constant 0xffff'ffff'ffff'ffff to register.
	      // Adding all-ones is the same as subtracting one.
	      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
	                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
	      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
	    } else {
	      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
	                                DAG.getTargetConstant(1, dl, ElemTy));
	      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
	    }
	    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
	  }

	  if (!ST->hasV6T2Ops())
	    return SDValue();

	  // Scalar: count leading zeros of the bit-reversed input.
	  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
	  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
	}

	/// Custom lowering for vector CTPOP on element types wider than i8.
	///
	/// The operand is bitcast to v8i8 / v16i8 and a byte-wise CTPOP is taken. The
	/// byte counts are then combined with the arm_neon_vpaddlu intrinsic,
	/// doubling the element size and halving the element count on each step,
	/// until the element size of \p VT is reached. NEON is asserted.
	static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
	                          const ARMSubtarget *ST) {
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
	  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
	          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
	         "Unexpected type for custom ctpop lowering");

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
	  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
	  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

	  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
	  unsigned EltSize = 8;
	  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
	  while (EltSize != VT.getScalarSizeInBits()) {
	    // Operands of the intrinsic node: the intrinsic ID, then the vector.
	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
	                                  TLI.getPointerTy(DAG.getDataLayout())));
	    Ops.push_back(Res);

	    EltSize *= 2;
	    NumElts /= 2;
	    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
	    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
	  }

	  return Res;
	}

	/// getVShiftImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift operation, where all the elements of the
	/// build_vector must have the same constant integer value.
	///
	/// Bitcasts wrapped around \p Op are looked through. On success \p Cnt is
	/// set to the sign-extended splat value and true is returned. False is
	/// returned when \p Op is not a build_vector, is not a constant splat, or its
	/// splat size exceeds \p ElementBits; \p Cnt is left untouched in that case.
	static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
	  // Ignore bit_converts.
	  while (Op.getOpcode() == ISD::BITCAST)
	    Op = Op.getOperand(0);
	  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN ||
	      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
	                            ElementBits) ||
	      SplatBitSize > ElementBits)
	    return false;
	  Cnt = SplatBits.getSExtValue();
	  return true;
	}

	/// isVShiftLImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift left operation. The splat value, which is
	/// written to \p Cnt by getVShiftImm, must satisfy:
	///   0 <= Value <  ElementBits for a left shift; or
	///   0 <= Value <= ElementBits for a long left shift.
	static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  if (Cnt < 0)
	    return false;
	  // A long shift additionally accepts a count equal to the element width.
	  const int64_t LargestCnt = isLong ? EltBits : EltBits - 1;
	  return Cnt <= LargestCnt;
	}

	/// isVShiftRImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift right operation. For a shift opcode, the value
	/// is positive, but for an intrinsic the value count must be negative. The
	/// absolute value must be in the range:
	///   1 <= |Value| <= ElementBits for a right shift; or
	///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
	/// On success for an intrinsic, \p Cnt is negated so that it is returned as
	/// a positive count.
	static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
	                         int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;

	  // Largest magnitude the count may have.
	  const int64_t MaxMagnitude = isNarrow ? EltBits / 2 : EltBits;

	  if (isIntrinsic) {
	    // Intrinsics encode the right shift as a negative count.
	    if (Cnt < -MaxMagnitude || Cnt > -1)
	      return false;
	    Cnt = -Cnt;
	    return true;
	  }

	  return Cnt >= 1 && Cnt <= MaxMagnitude;
	}

	/// Custom lowering for vector SHL / SRA / SRL.
	///
	/// Non-vector types return an empty SDValue. Shifts by a valid splat
	/// immediate become VSHLIMM / VSHRsIMM / VSHRuIMM. A left shift by any other
	/// amount becomes VSHLu, and a right shift by any other amount becomes
	/// VSHLs / VSHLu with the shift vector negated.
	static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
	                          const ARMSubtarget *ST) {
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);
	  int64_t Cnt;

	  if (!VT.isVector())
	    return SDValue();

	  // We essentially have two forms here. Shift by an immediate and shift by a
	  // vector register (there are also shift by a gpr, but that is just handled
	  // with a tablegen pattern). We cannot easily match shift by an immediate in
	  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
	  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
	  // signed or unsigned, and a negative shift indicates a shift right).
	  if (N->getOpcode() == ISD::SHL) {
	    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
	      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
	                         DAG.getConstant(Cnt, dl, MVT::i32));
	    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
	                       N->getOperand(1));
	  }

	  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
	         "unexpected vector shift opcode");

	  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
	    unsigned VShiftOpc =
	        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
	    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
	                       DAG.getConstant(Cnt, dl, MVT::i32));
	  }

	  // Other right shifts we don't have operations for (we use a shift left by a
	  // negative number).
	  EVT ShiftVT = N->getOperand(1).getValueType();
	  SDValue NegatedCount = DAG.getNode(
	      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
	  unsigned VShiftOpc =
	      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
	  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
	}

	/// Custom expansion of i64 SHL / SRL / SRA.
	///
	/// Results that are not i64 return an empty SDValue. With MVE integer ops
	/// the shift is performed on the two i32 halves with ARMISD::LSLL, LSRL or
	/// ASRL, unless the amount is a constant greater than 32. Without MVE, only
	/// SRL / SRA by exactly one is handled (outside Thumb1), using
	/// SRL_FLAG / SRA_FLAG on the high half followed by RRX on the low half.
	/// Every other case returns an empty SDValue.
	static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
	                                const ARMSubtarget *ST) {
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);

	  // We can get here for a node like i32 = ISD::SHL i32, i64
	  if (VT != MVT::i64)
	    return SDValue();

	  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
	          N->getOpcode() == ISD::SHL) &&
	         "Unknown shift to lower!");

	  unsigned ShOpc = N->getOpcode();
	  if (ST->hasMVEIntegerOps()) {
	    SDValue ShAmt = N->getOperand(1);
	    unsigned ShPartsOpc = ARMISD::LSLL;
	    ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);

	    // If the shift amount is greater than 32 then do the default optimisation
	    if (Con && Con->getZExtValue() > 32)
	      return SDValue();

	    // Extract the lower 32 bits of the shift amount if it's an i64
	    if (ShAmt->getValueType(0) == MVT::i64)
	      ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
	                          DAG.getConstant(0, dl, MVT::i32));

	    if (ShOpc == ISD::SRL) {
	      if (!Con)
	        // There is no t2LSRLr instruction so negate and perform an lsll if the
	        // shift amount is in a register, emulating a right shift.
	        ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
	                            DAG.getConstant(0, dl, MVT::i32), ShAmt);
	      else
	        // Else generate an lsrl on the immediate shift amount
	        ShPartsOpc = ARMISD::LSRL;
	    } else if (ShOpc == ISD::SRA)
	      ShPartsOpc = ARMISD::ASRL;

	    // Lower 32 bits of the destination/source
	    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
	                             DAG.getConstant(0, dl, MVT::i32));
	    // Upper 32 bits of the destination/source
	    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
	                             DAG.getConstant(1, dl, MVT::i32));

	    // Generate the shift operation as computed above
	    Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
	                     ShAmt);
	    // The upper 32 bits come from the second return value of lsll
	    Hi = SDValue(Lo.getNode(), 1);
	    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
	  }

	  // We only lower SRA, SRL of 1 here, all others use generic lowering.
	  if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
	    return SDValue();

	  // If we are in thumb mode, we don't have RRX.
	  if (ST->isThumb1Only())
	    return SDValue();

	  // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
	  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
	                           DAG.getConstant(0, dl, MVT::i32));
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
	                           DAG.getConstant(1, dl, MVT::i32));

	  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
	  // captures the result into a carry flag.
	  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
	  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);

	  // The low part is an ARMISD::RRX operand, which shifts the carry in.
	  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

	  // Merge the pieces into a single i64 value.
	  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
	}

	/// Lower a vector SETCC node (operands: LHS, RHS, condition code) to ARMISD
	/// vector-compare nodes.
	///
	/// The compare is emitted in CmpVT (the LHS type with its elements changed to
	/// integers) and the result is then sign-extended or truncated to the node's
	/// result type. Two flags steer the final form: Swap exchanges the operands
	/// before the compare is emitted, and Invert applies a NOT to the result.
	///
	/// \returns the lowered value, or an empty SDValue when CmpVT has i64 elements
	/// and the integer EQ/NE special case below does not apply.
	static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
	SDValue TmpOp0, TmpOp1;
	bool Invert = false;
	bool Swap = false;
	unsigned Opc = 0;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
	EVT VT = Op.getValueType();
	ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
	SDLoc dl(Op);

	if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
	(SetCCOpcode == ISD::SETEQ \|\| SetCCOpcode == ISD::SETNE)) {
	// Special-case integer 64-bit equality comparisons. They aren't legal,
	// but they can be lowered with a few vector instructions.
	// Both operands are reinterpreted as vectors with twice as many i32 lanes
	// and compared for equality lane by lane.
	unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
	EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
	SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
	SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
	SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
	DAG.getCondCode(ISD::SETEQ));
	// AND each i32 compare result with its VREV64-reversed counterpart so that a
	// 64-bit lane is only "true" when both of its halves compared equal.
	SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
	SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
	Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
	// SETNE is simply the bitwise complement of the equality mask.
	if (SetCCOpcode == ISD::SETNE)
	Merged = DAG.getNOT(dl, Merged, CmpVT);
	Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
	return Merged;
	}

	if (CmpVT.getVectorElementType() == MVT::i64)
	// 64-bit comparisons are not legal in general.
	return SDValue();

	if (Op1.getValueType().isFloatingPoint()) {
	// Floating-point comparisons. Each case selects a base opcode; predicates
	// without a direct opcode are expressed through Swap (exchange operands)
	// and/or Invert (complement the result), relying on case fall-through.
	switch (SetCCOpcode) {
	default: llvm_unreachable("Illegal FP comparison");
	case ISD::SETUNE:
	case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETOEQ:
	case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
	case ISD::SETOLT:
	case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETGT: Opc = ARMISD::VCGT; break;
	case ISD::SETOLE:
	case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETOGE:
	case ISD::SETGE: Opc = ARMISD::VCGE; break;
	// The unordered-or-X predicates below are built as the complement of the
	// opposite compare (e.g. ULE is emitted as NOT(VCGT)).
	case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
	case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
	case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETONE:
	// Expand this to (OLT \| OGT).
	// Op0/Op1 are replaced by the two partial compares and Opc becomes the OR
	// that combines them; SETUEQ reaches here with Invert already set.
	TmpOp0 = Op0;
	TmpOp1 = Op1;
	Opc = ISD::OR;
	Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
	Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
	break;
	case ISD::SETUO:
	Invert = true;
	LLVM_FALLTHROUGH;
	case ISD::SETO:
	// Expand this to (OLT \| OGE).
	// Same scheme as above; SETUO is the complement of the ordered test.
	TmpOp0 = Op0;
	TmpOp1 = Op1;
	Opc = ISD::OR;
	Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
	Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
	break;
	}
	} else {
	// Integer comparisons.
	// "Less" predicates are mapped onto the matching "greater" opcode by
	// swapping the operands; unsigned predicates use the VCGTU/VCGEU opcodes.
	switch (SetCCOpcode) {
	default: llvm_unreachable("Illegal integer comparison");
	case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
	case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETGT: Opc = ARMISD::VCGT; break;
	case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETGE: Opc = ARMISD::VCGE; break;
	case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
	case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
	}

	// Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
	if (Opc == ARMISD::VCEQ) {
	// AndOp stays null unless one side of the compare is an all-zeros vector.
	SDValue AndOp;
	if (ISD::isBuildVectorAllZeros(Op1.getNode()))
	AndOp = Op0;
	else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
	AndOp = Op1;

	// Ignore bitconvert.
	if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
	AndOp = AndOp.getOperand(0);

	if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
	Opc = ARMISD::VTST;
	Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
	Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
	// VTST has the "not equal to zero" sense, the opposite of VCEQ, so the
	// pending inversion is flipped.
	Invert = !Invert;
	}
	}
	}

	if (Swap)
	std::swap(Op0, Op1);

	// If one of the operands is a constant vector zero, attempt to fold the
	// comparison to a specialized compare-against-zero form.
	SDValue SingleOp;
	if (ISD::isBuildVectorAllZeros(Op1.getNode()))
	SingleOp = Op0;
	else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
	// Zero is on the left: (0 >= x) becomes (x <= 0) and (0 > x) becomes (x < 0).
	if (Opc == ARMISD::VCGE)
	Opc = ARMISD::VCLEZ;
	else if (Opc == ARMISD::VCGT)
	Opc = ARMISD::VCLTZ;
	SingleOp = Op1;
	}

	SDValue Result;
	if (SingleOp.getNode()) {
	switch (Opc) {
	case ARMISD::VCEQ:
	Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
	case ARMISD::VCGE:
	Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
	case ARMISD::VCLEZ:
	Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
	case ARMISD::VCGT:
	Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
	case ARMISD::VCLTZ:
	Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
	default:
	// No compare-against-zero form is selected for this opcode; emit the
	// ordinary two-operand node.
	Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
	}
	} else {
	Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
	}

	// Convert from the compare type to the type the SETCC node must produce.
	Result = DAG.getSExtOrTrunc(Result, dl, VT);

	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	return Result;
	}

	/// Lower a SETCCCARRY node (operands: LHS, RHS, incoming borrow, condition
	/// code) into an ARMISD::SUBE that sets the flags followed by an ARMISD::CMOV
	/// that selects 1 or 0 according to the condition.
	static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDValue BorrowIn = Op.getOperand(2);
	  SDValue CondCode = Op.getOperand(3);

	  assert(Lhs.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");

	  // ISD::SUBCARRY works with a borrow whereas ARMISD::SUBE consumes a carry,
	  // so compute (1 - borrow) before turning the boolean into the carry flag.
	  SDValue One = DAG.getConstant(1, Loc, MVT::i32);
	  SDValue CarryBool = DAG.getNode(ISD::SUB, Loc, MVT::i32, One, BorrowIn);
	  SDValue CarryFlag = ConvertBooleanCarryToCarryFlag(CarryBool, DAG);

	  // Result 0 is the difference; result 1 is the flags value consumed below.
	  SDVTList ResultTys = DAG.getVTList(Lhs.getValueType(), MVT::i32);
	  SDValue Sub = DAG.getNode(ARMISD::SUBE, Loc, ResultTys, Lhs, Rhs, CarryFlag);

	  SDValue IfFalse = DAG.getConstant(0, Loc, MVT::i32);
	  SDValue IfTrue = DAG.getConstant(1, Loc, MVT::i32);
	  ISD::CondCode CC = cast<CondCodeSDNode>(CondCode)->get();
	  SDValue ARMcc = DAG.getConstant(IntCCToARMCC(CC), Loc, MVT::i32);
	  SDValue CPSR = DAG.getRegister(ARM::CPSR, MVT::i32);

	  // Copy the flags into CPSR and feed the resulting glue to the CMOV.
	  SDValue FlagsCopy = DAG.getCopyToReg(DAG.getEntryNode(), Loc, ARM::CPSR,
	                                       Sub.getValue(1), SDValue());
	  return DAG.getNode(ARMISD::CMOV, Loc, Op.getValueType(), IfFalse, IfTrue,
	                     ARMcc, CPSR, FlagsCopy.getValue(1));
	}

	/// isNEONModifiedImm - Check if the specified splat value corresponds to a
	/// valid vector constant for a NEON or MVE instruction with a "modified
	/// immediate" operand (e.g., VMOV). If so, return the encoded value.
	///
	/// \param SplatBits    the splatted constant.
	/// \param SplatUndef   mask of undefined bits; in the 0x..ff forms and in the
	///                     64-bit form an undefined bit is allowed to count as 1.
	/// \param SplatBitSize width of the splat in bits: 8, 16, 32 or 64.
	/// \param VT           out-parameter receiving the vector type of the
	///                     immediate move. Note that it may already have been
	///                     overwritten when the function fails for a 16- or
	///                     32-bit splat.
	/// \param is128Bits    selects the 128-bit rather than the 64-bit vector type.
	/// \param type         kind of instruction the immediate is intended for;
	///                     restricts which encodings are accepted.
	/// \returns an i32 target constant holding
	///          ARM_AM::createNEONModImm(OpCmode, Imm), or an empty SDValue if
	///          the splat cannot be encoded.
	static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
	                                 unsigned SplatBitSize, SelectionDAG &DAG,
	                                 const SDLoc &dl, EVT &VT, bool is128Bits,
	                                 NEONModImmType type) {
	  unsigned OpCmode, Imm;

	  // SplatBitSize is set to the smallest size that splats the vector, so a
	  // zero vector will always have SplatBitSize == 8. However, NEON modified
	  // immediate instructions other than VMOV do not support the 8-bit encoding
	  // of a zero vector, and the default encoding of zero is supposed to be the
	  // 32-bit version.
	  if (SplatBits == 0)
	    SplatBitSize = 32;

	  switch (SplatBitSize) {
	  case 8:
	    if (type != VMOVModImm)
	      return SDValue();
	    // Any 1-byte value is OK. Op=0, Cmode=1110.
	    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
	    OpCmode = 0xe;
	    Imm = SplatBits;
	    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
	    break;

	  case 16:
	    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
	    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
	    if ((SplatBits & ~0xff) == 0) {
	      // Value = 0x00nn: Op=x, Cmode=100x.
	      OpCmode = 0x8;
	      Imm = SplatBits;
	      break;
	    }
	    if ((SplatBits & ~0xff00) == 0) {
	      // Value = 0xnn00: Op=x, Cmode=101x.
	      OpCmode = 0xa;
	      Imm = SplatBits >> 8;
	      break;
	    }
	    return SDValue();

	  case 32:
	    // NEON's 32-bit VMOV supports splat values where:
	    // * only one byte is nonzero, or
	    // * the least significant byte is 0xff and the second byte is nonzero, or
	    // * the least significant 2 bytes are 0xff and the third is nonzero.
	    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
	    if ((SplatBits & ~0xff) == 0) {
	      // Value = 0x000000nn: Op=x, Cmode=000x.
	      OpCmode = 0;
	      Imm = SplatBits;
	      break;
	    }
	    if ((SplatBits & ~0xff00) == 0) {
	      // Value = 0x0000nn00: Op=x, Cmode=001x.
	      OpCmode = 0x2;
	      Imm = SplatBits >> 8;
	      break;
	    }
	    if ((SplatBits & ~0xff0000) == 0) {
	      // Value = 0x00nn0000: Op=x, Cmode=010x.
	      OpCmode = 0x4;
	      Imm = SplatBits >> 16;
	      break;
	    }
	    if ((SplatBits & ~0xff000000) == 0) {
	      // Value = 0xnn000000: Op=x, Cmode=011x.
	      OpCmode = 0x6;
	      Imm = SplatBits >> 24;
	      break;
	    }

	    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
	    if (type == OtherModImm)
	      return SDValue();

	    if ((SplatBits & ~0xffff) == 0 &&
	        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
	      // Value = 0x0000nnff: Op=x, Cmode=1100.
	      OpCmode = 0xc;
	      Imm = SplatBits >> 8;
	      break;
	    }

	    // cmode == 0b1101 is not supported for MVE VMVN
	    if (type == MVEVMVNModImm)
	      return SDValue();

	    if ((SplatBits & ~0xffffff) == 0 &&
	        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
	      // Value = 0x00nnffff: Op=x, Cmode=1101.
	      OpCmode = 0xd;
	      Imm = SplatBits >> 16;
	      break;
	    }

	    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
	    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
	    // VMOV.I32. A (very) minor optimization would be to replicate the value
	    // and fall through here to test for a valid 64-bit splat. But, then the
	    // caller would also need to check and handle the change in size.
	    return SDValue();

	  case 64: {
	    if (type != VMOVModImm)
	      return SDValue();
	    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
	    // Walk the eight bytes from least to most significant, setting one bit
	    // of Imm for every byte that is (or may be treated as) 0xff.
	    uint64_t BitMask = 0xff;
	    unsigned ImmMask = 1;
	    Imm = 0;
	    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
	      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
	        Imm |= ImmMask;
	      } else if ((SplatBits & BitMask) != 0) {
	        // A byte that is neither all-ones nor all-zeros cannot be encoded.
	        return SDValue();
	      }
	      BitMask <<= 8;
	      ImmMask <<= 1;
	    }

	    if (DAG.getDataLayout().isBigEndian())
	      // swap higher and lower 32 bit word
	      Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);

	    // Op=1, Cmode=1110.
	    OpCmode = 0x1e;
	    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
	    break;
	  }

	  default:
	    llvm_unreachable("unexpected size for isNEONModifiedImm");
	  }

	  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
	  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
	}

	/// Custom lowering for a scalar floating-point constant.
	///
	/// Returns \p Op unchanged when the constant can be selected as it is, a
	/// replacement node when a cheaper materialisation was found, or an empty
	/// SDValue to request the default lowering.
	SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
	                                           const ARMSubtarget *ST) const {
	  EVT VT = Op.getValueType();
	  const bool IsDouble = (VT == MVT::f64);
	  ConstantFPSDNode *ConstNode = cast<ConstantFPSDNode>(Op);
	  const APFloat &APVal = ConstNode->getValueAPF();

	  // Reads lane 0 of a vector as an f32 scalar.
	  auto ExtractLaneZero = [&DAG](SDValue Vec, const SDLoc &DL) -> SDValue {
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vec,
	                       DAG.getConstant(0, DL, MVT::i32));
	  };

	  // Prevent floating-point constants from using literal loads
	  // when execute-only is enabled.
	  if (ST->genExecuteOnly()) {
	    // A constant that is legal as an immediate needs no lowering.
	    if (isFPImmLegal(APVal, VT))
	      return Op;

	    // Otherwise build the bit pattern in integer registers and move it over.
	    APInt Bits = APVal.bitcastToAPInt();
	    SDLoc DL(ConstNode);
	    const auto SimpleTy = VT.getSimpleVT().SimpleTy;
	    if (SimpleTy == MVT::f64) {
	      SDValue Lo = DAG.getConstant(Bits.trunc(32), DL, MVT::i32);
	      SDValue Hi = DAG.getConstant(Bits.lshr(32).trunc(32), DL, MVT::i32);
	      if (!ST->isLittle())
	        std::swap(Lo, Hi);
	      return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
	    }
	    if (SimpleTy == MVT::f32)
	      return DAG.getNode(ARMISD::VMOVSR, DL, VT,
	                         DAG.getConstant(Bits, DL, MVT::i32));
	    llvm_unreachable("Unknown floating point type!");
	  }

	  if (!ST->hasVFP3Base())
	    return SDValue();

	  // Use the default (constant pool) lowering for double constants when we have
	  // an SP-only FPU
	  if (IsDouble && !Subtarget->hasFP64())
	    return SDValue();

	  // Try splatting with a VMOV.f32...
	  const int ImmVal =
	      IsDouble ? ARM_AM::getFP64Imm(APVal) : ARM_AM::getFP32Imm(APVal);
	  if (ImmVal != -1) {
	    // A valid ConstantFP is already selectable, so leave it alone unless this
	    // is a float and NEON is preferred for single precision.
	    if (IsDouble || !ST->useNEONForSinglePrecisionFP())
	      return Op;

	    // Float with NEON preferred: splat the immediate and extract lane 0.
	    SDLoc DL(Op);
	    SDValue Encoded = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
	    SDValue Splat = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, Encoded);
	    return ExtractLaneZero(Splat, DL);
	  }

	  // The rest of our options are NEON only, make sure that's allowed before
	  // proceeding..
	  const bool NEONUsable =
	      ST->hasNEON() && (IsDouble || ST->useNEONForSinglePrecisionFP());
	  if (!NEONUsable)
	    return SDValue();

	  EVT VMovVT;
	  const uint64_t RawBits = APVal.bitcastToAPInt().getZExtValue();

	  // It wouldn't really be worth bothering for doubles except for one very
	  // important value, which does happen to match: 0.0. So only continue for a
	  // double whose two 32-bit halves are identical.
	  if (IsDouble && (RawBits & 0xffffffff) != (RawBits >> 32))
	    return SDValue();

	  // Wraps an encoded modified immediate in the given vector-immediate node
	  // and converts the integer vector back to the requested scalar FP type.
	  auto FromIntVector = [&](unsigned VecOpc, SDValue Encoded) -> SDValue {
	    SDLoc DL(Op);
	    SDValue IntVec = DAG.getNode(VecOpc, DL, VMovVT, Encoded);
	    if (IsDouble)
	      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, IntVec);

	    // It's a float: cast and extract a vector element.
	    SDValue FPVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, IntVec);
	    return ExtractLaneZero(FPVec, DL);
	  };

	  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
	  SDValue NewVal = isNEONModifiedImm(RawBits & 0xffffffffU, 0, 32, DAG,
	                                     SDLoc(Op), VMovVT, false, VMOVModImm);
	  if (NewVal != SDValue())
	    return FromIntVector(ARMISD::VMOVIMM, NewVal);

	  // Finally, try a VMVN.i32
	  NewVal = isNEONModifiedImm(~RawBits & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
	                             VMovVT, false, VMVNModImm);
	  if (NewVal != SDValue())
	    return FromIntVector(ARMISD::VMVNIMM, NewVal);

	  return SDValue();
	}

	// Returns true if a VEXT instruction can implement the shuffle mask M when
	// both shuffle sources are the same vector. On success Imm holds the index of
	// the first element. Imm is also written when the match fails after the first
	// index has been accepted.
	static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  // The first index seeds the rotation amount, so it must not be UNDEF.
	  const int Head = M[0];
	  if (Head < 0)
	    return false;
	  Imm = Head;

	  // Every later defined index has to be the successor of the previous
	  // expected one, wrapping back to zero on reaching the lane count.
	  unsigned Want = Imm;
	  for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
	    if (++Want == LaneCount)
	      Want = 0;

	    const int Idx = M[Lane];
	    if (Idx >= 0 && static_cast<unsigned>(Idx) != Want)
	      return false; // defined index that breaks the sequence
	  }

	  return true;
	}

	// Returns true if a VEXT instruction can implement the two-source shuffle
	// mask M. On success Imm is the extraction index and ReverseVEXT tells the
	// caller that the index sequence wrapped, i.e. the two sources must be
	// swapped.
	static bool isVEXTMask(ArrayRef<int> M, EVT VT,
	                       bool &ReverseVEXT, unsigned &Imm) {
	  const unsigned LaneCount = VT.getVectorNumElements();
	  ReverseVEXT = false;

	  // The first index seeds the extraction offset, so it must not be UNDEF.
	  const int Head = M[0];
	  if (Head < 0)
	    return false;
	  Imm = Head;

	  // Later defined indices must follow on consecutively. Running past the end
	  // of the concatenated sources (2 * LaneCount) restarts at zero, which is
	  // still a VEXT provided the operands are exchanged.
	  const unsigned WrapAt = LaneCount * 2;
	  unsigned Want = Imm;
	  for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
	    if (++Want == WrapAt) {
	      Want = 0;
	      ReverseVEXT = true;
	    }

	    const int Idx = M[Lane];
	    if (Idx >= 0 && static_cast<unsigned>(Idx) != Want)
	      return false; // defined index that breaks the sequence
	  }

	  // With swapped sources the index is relative to the other vector.
	  if (ReverseVEXT)
	    Imm -= LaneCount;

	  return true;
	}

	/// isVREVMask - Returns true if the shuffle mask \p M reverses the order of
	/// the elements inside every block of \p BlockSize bits, which is what a VREV
	/// instruction of that block size does. Vectors with 64-bit elements never
	/// match.
	static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
	  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
	         "Only possible block sizes for VREV are: 16, 32, 64");

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned LaneCount = VT.getVectorNumElements();

	  // The first index names the last element of the first block, so it fixes
	  // the number of elements per block. If it is UNDEF, optimistically assume
	  // the block holds as many elements as BlockSize allows.
	  const unsigned PerBlock = M[0] < 0 ? BlockSize / EltBits : M[0] + 1;

	  // The block must be wider than one element and exactly PerBlock elements.
	  if (BlockSize <= EltBits || BlockSize != PerBlock * EltBits)
	    return false;

	  for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
	    const int Idx = M[Lane];
	    if (Idx < 0)
	      continue; // ignore UNDEF indices

	    // Mirror the lane's position within its own block.
	    const unsigned Offset = Lane % PerBlock;
	    const unsigned BlockStart = Lane - Offset;
	    if (static_cast<unsigned>(Idx) != BlockStart + (PerBlock - 1 - Offset))
	      return false;
	  }

	  return true;
	}

	// Returns true if the shuffle can be done with VTBL. Only <8 x i8> shuffles
	// are handled; an out-of-range mask index yields 0 in the result, so any
	// 8-element mask is acceptable.
	static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
	  if (VT != MVT::v8i8)
	    return false;
	  return M.size() == 8;
	}

	// Picks which of the two results of a two-result shuffle the mask section
	// starting at Index belongs to. For a double-length mask (both results in one
	// vector) this is the half that Index falls into; otherwise it is 0 when the
	// mask element at Index is 0 and 1 for any other value.
	static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
	                               unsigned Index) {
	  const bool HoldsBothResults = Mask.size() == Elements * 2;
	  if (!HoldsBothResults)
	    return Mask[Index] != 0 ? 1u : 0u;
	  return Index / Elements;
	}

	// Returns true if the shuffle mask describes a vector transpose (VTRN): each
	// pair of mask elements must name the same index in the two source vectors,
	// with that index advancing by 2 from one pair to the next.
	// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
	// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
	// v2={e,f,g,h}
	// WhichResult is the offset applied to every expected index, depending on
	// which of the two results is being matched.
	//
	// Two mask layouts are accepted:
	//  * one result per shuffle, where the mask has as many elements as v1/v2
	//    and WhichResult reports which result the mask produces;
	//  * both results in a single shuffle, where the mask is twice as long as
	//    v1/v2. The low and high halves are then checked as if they were the two
	//    separate masks, and WhichResult is always 0 on success.
	static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  // Check the mask one source-vector-sized section at a time.
	  // FIXME: A mask with only even values will be rejected in case the first
	  // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
	  // M[0] is used to determine WhichResult
	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    for (unsigned Pair = 0; Pair < NumElts; Pair += 2) {
	      // First element of the pair comes from v1 ...
	      const int FromV1 = M[Base + Pair];
	      if (FromV1 >= 0 && static_cast<unsigned>(FromV1) != Pair + WhichResult)
	        return false;

	      // ... and the second is the same lane of v2.
	      const int FromV2 = M[Base + Pair + 1];
	      if (FromV2 >= 0 &&
	          static_cast<unsigned>(FromV2) != Pair + NumElts + WhichResult)
	        return false;
	    }
	  }

	  if (BothResults)
	    WhichResult = 0;

	  return true;
	}

	/// isVTRN_v_undef_Mask - Variant of isVTRNMask for the canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both elements
	/// of a pair refer to the first source.
	/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
	static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    for (unsigned Pair = 0; Pair < NumElts; Pair += 2) {
	      // Both members of the pair must name the same lane of the one source.
	      const unsigned Want = Pair + WhichResult;

	      const int First = M[Base + Pair];
	      if (First >= 0 && static_cast<unsigned>(First) != Want)
	        return false;

	      const int Second = M[Base + Pair + 1];
	      if (Second >= 0 && static_cast<unsigned>(Second) != Want)
	        return false;
	    }
	  }

	  // A double-length mask carries both results, so there is nothing to select.
	  if (BothResults)
	    WhichResult = 0;

	  return true;
	}

	// Returns true if the shuffle mask describes a vector unzip (VUZP): the mask
	// elements are either all even or all odd and advance in steps of 2.
	// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
	// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
	// v2={e,f,g,h}
	// As with isVTRNMask, the mask may hold one result or both results; in the
	// latter case WhichResult is 0 on success.
	static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
	      const int Idx = M[Base + Lane];
	      if (Idx < 0)
	        continue; // UNDEF matches anything
	      if (static_cast<unsigned>(Idx) != 2 * Lane + WhichResult)
	        return false;
	    }
	  }

	  if (BothResults)
	    WhichResult = 0;

	  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
	  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
	  return !IsVTRNAlias;
	}

	/// isVUZP_v_undef_Mask - Variant of isVUZPMask for the canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where each half of
	/// the mask repeats the same even (or odd) sequence.
	/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
	static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  const unsigned HalfLen = NumElts / 2;
	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    // Each half of the section restarts the sequence at WhichResult.
	    for (unsigned HalfStart = 0; HalfStart < NumElts; HalfStart += HalfLen) {
	      for (unsigned Step = 0; Step < HalfLen; ++Step) {
	        const int Idx = M[Base + HalfStart + Step];
	        if (Idx >= 0 && static_cast<unsigned>(Idx) != WhichResult + 2 * Step)
	          return false;
	      }
	    }
	  }

	  if (BothResults)
	    WhichResult = 0;

	  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
	  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
	  return !IsVTRNAlias;
	}

	// Returns true if the shuffle mask describes a vector zip (VZIP): each pair
	// of mask elements names the same index in the two source vectors, and that
	// index advances by one from pair to pair.
	// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
	// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
	// v2={e,f,g,h}
	// As with isVTRNMask, the mask may hold one result or both results; in the
	// latter case WhichResult is 0 on success.
	static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    // The second result starts half-way through the source vectors.
	    unsigned SrcLane = WhichResult * NumElts / 2;
	    for (unsigned Pair = 0; Pair < NumElts; Pair += 2, ++SrcLane) {
	      const int FromV1 = M[Base + Pair];
	      if (FromV1 >= 0 && static_cast<unsigned>(FromV1) != SrcLane)
	        return false;

	      const int FromV2 = M[Base + Pair + 1];
	      if (FromV2 >= 0 && static_cast<unsigned>(FromV2) != SrcLane + NumElts)
	        return false;
	    }
	  }

	  if (BothResults)
	    WhichResult = 0;

	  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
	  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
	  return !IsVTRNAlias;
	}

	/// isVZIP_v_undef_Mask - Variant of isVZIPMask for the canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both elements
	/// of a pair refer to the first source.
	/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
	static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits == 64)
	    return false;

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool OneResult = M.size() == NumElts;
	  const bool BothResults = M.size() == NumElts*2;
	  if (!OneResult && !BothResults)
	    return false;

	  for (unsigned Base = 0; Base < M.size(); Base += NumElts) {
	    WhichResult = SelectPairHalf(NumElts, M, Base);
	    // The second result starts half-way through the source vector.
	    unsigned SrcLane = WhichResult * NumElts / 2;
	    for (unsigned Pair = 0; Pair < NumElts; Pair += 2, ++SrcLane) {
	      const int First = M[Base + Pair];
	      if (First >= 0 && static_cast<unsigned>(First) != SrcLane)
	        return false;

	      const int Second = M[Base + Pair + 1];
	      if (Second >= 0 && static_cast<unsigned>(Second) != SrcLane)
	        return false;
	    }
	  }

	  if (BothResults)
	    WhichResult = 0;

	  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
	  const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
	  return !IsVTRNAlias;
	}

	/// Classify \p ShuffleMask as one of the NEON two-result shuffles (VTRN, VUZP,
	/// VZIP). The two-source forms are tried first, then the "v, undef" forms, in
	/// that order within each group.
	///
	/// \returns the matching ARMISD opcode, or 0 if nothing matches. \p isV_UNDEF
	/// is false when a two-source form matched and true otherwise (including the
	/// no-match case); \p WhichResult is whatever the last matcher tried left in
	/// it.
	static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
	                                           unsigned &WhichResult,
	                                           bool &isV_UNDEF) {
	  using MaskMatcher = bool (*)(ArrayRef<int>, EVT, unsigned &);
	  struct Candidate {
	    MaskMatcher Matches;
	    unsigned Opcode;
	    bool IsVUndefForm;
	  };

	  // Listed in the order in which the matchers have to be tried.
	  static const Candidate Candidates[] = {
	      {isVTRNMask, ARMISD::VTRN, false},
	      {isVUZPMask, ARMISD::VUZP, false},
	      {isVZIPMask, ARMISD::VZIP, false},
	      {isVTRN_v_undef_Mask, ARMISD::VTRN, true},
	      {isVUZP_v_undef_Mask, ARMISD::VUZP, true},
	      {isVZIP_v_undef_Mask, ARMISD::VZIP, true},
	  };

	  for (const Candidate &C : Candidates) {
	    isV_UNDEF = C.IsVUndefForm;
	    if (C.Matches(ShuffleMask, VT, WhichResult))
	      return C.Opcode;
	  }

	  return 0;
	}

	/// \return true if the mask \p M reverses the whole vector, i.e. every defined
	/// index at position i equals NumElts - 1 - i. UNDEF indices match anything,
	/// e.g. <15, ..., 3, -1, 1, 0>.
	static bool isReverseMask(ArrayRef<int> M, EVT VT) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  // The mask has to cover exactly one vector.
	  if (M.size() != LaneCount)
	    return false;

	  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Idx = M[Lane];
	    if (Idx < 0)
	      continue; // ignore UNDEF indices
	    if (Idx != static_cast<int>(LaneCount - 1 - Lane))
	      return false;
	  }

	  return true;
	}

	// If N is an integer constant that can be moved into a register in one
	// instruction, return it as an i32 constant node (it will become a MOV
	// instruction). Otherwise return an empty SDValue.
	static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
	                                     const ARMSubtarget *ST, const SDLoc &dl) {
	  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
	  if (!Const)
	    return SDValue();

	  const uint64_t Val = Const->getZExtValue();

	  // Thumb1 accepts a value (or its complement) that fits in 8 bits; the other
	  // modes accept whatever ARM_AM::getSOImmVal can encode, again either
	  // directly or complemented.
	  // NOTE(review): Val is 64 bits wide here, so "~Val <= 255" can only hold
	  // when the upper 32 bits of Val are all ones -- confirm this is intended.
	  const bool SingleInstr =
	      ST->isThumb1Only()
	          ? (Val <= 255 || ~Val <= 255)
	          : (ARM_AM::getSOImmVal(Val) != -1 ||
	             ARM_AM::getSOImmVal(~Val) != -1);

	  return SingleInstr ? DAG.getConstant(Val, dl, MVT::i32) : SDValue();
	}

	// If this is a case we can't handle, return null and let the default
	// expansion code take care of it.
	SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
	const ARMSubtarget *ST) const {
	BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
	SDLoc dl(Op);
	EVT VT = Op.getValueType();

	APInt SplatBits, SplatUndef;
	unsigned SplatBitSize;
	bool HasAnyUndefs;
	if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
	if (SplatUndef.isAllOnesValue())
	return DAG.getUNDEF(VT);

	if ((ST->hasNEON() && SplatBitSize <= 64) \|\|
	(ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
	// Check if an immediate VMOV works.
	EVT VmovVT;
	SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
	SplatUndef.getZExtValue(), SplatBitSize,
	DAG, dl, VmovVT, VT.is128BitVector(),
	VMOVModImm);

	if (Val.getNode()) {
	SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
	return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
	}

	// Try an immediate VMVN.
	uint64_t NegatedImm = (~SplatBits).getZExtValue();
	Val = isNEONModifiedImm(
	NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
	DAG, dl, VmovVT, VT.is128BitVector(),
	ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
	if (Val.getNode()) {
	SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
	return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
	}

	// Use vmov.f32 to materialize other v2f32 and v4f32 splats.
	if ((VT == MVT::v2f32 \|\| VT == MVT::v4f32) && SplatBitSize == 32) {
	int ImmVal = ARM_AM::getFP32Imm(SplatBits);
	if (ImmVal != -1) {
	SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
	return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
	}
	}
	}
	}

	// Scan through the operands to see if only one value is used.
	//
	// As an optimisation, even if more than one value is used it may be more
	// profitable to splat with one value then change some lanes.
	//
	// Heuristically we decide to do this if the vector has a "dominant" value,
	// defined as splatted to more than half of the lanes.
	unsigned NumElts = VT.getVectorNumElements();
	bool isOnlyLowElement = true;
	bool usesOnlyOneValue = true;
	bool hasDominantValue = false;
	bool isConstant = true;

	// Map of the number of times a particular SDValue appears in the
	// element list.
	DenseMap<SDValue, unsigned> ValueCounts;
	SDValue Value;
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	if (i > 0)
	isOnlyLowElement = false;
	if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	isConstant = false;

	ValueCounts.insert(std::make_pair(V, 0));
	unsigned &Count = ValueCounts[V];

	// Is this value dominant? (takes up more than half of the lanes)
	if (++Count > (NumElts / 2)) {
	hasDominantValue = true;
	Value = V;
	}
	}
	if (ValueCounts.size() != 1)
	usesOnlyOneValue = false;
	if (!Value.getNode() && !ValueCounts.empty())
	Value = ValueCounts.begin()->first;

	if (ValueCounts.empty())
	return DAG.getUNDEF(VT);

	// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
	// Keep going if we are hitting this case.
	if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

	unsigned EltSize = VT.getScalarSizeInBits();

	// Use VDUP for non-constant splats. For f32 constant splats, reduce to
	// i32 and try again.
	if (hasDominantValue && EltSize <= 32) {
	if (!isConstant) {
	SDValue N;

	// If we are VDUPing a value that comes directly from a vector, that will
	// cause an unnecessary move to and from a GPR, where instead we could
	// just use VDUPLANE. We can only do this if the lane being extracted
	// is at a constant index, as the VDUP from lane instructions only have
	// constant-index forms.
	ConstantSDNode *constIndex;
	if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	(constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
	// We need to create a new undef vector to use for the VDUPLANE if the
	// size of the vector from which we get the value is different than the
	// size of the vector that we need to create. We will insert the element
	// such that the register coalescer will remove unnecessary copies.
	if (VT != Value->getOperand(0).getValueType()) {
	unsigned index = constIndex->getAPIntValue().getLimitedValue() %
	VT.getVectorNumElements();
	N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
	DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
	Value, DAG.getConstant(index, dl, MVT::i32)),
	DAG.getConstant(index, dl, MVT::i32));
	} else
	N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
	Value->getOperand(0), Value->getOperand(1));
	} else
	N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

	if (!usesOnlyOneValue) {
	// The dominant value was splatted as 'N', but we now have to insert
	// all differing elements.
	for (unsigned I = 0; I < NumElts; ++I) {
	if (Op.getOperand(I) == Value)
	continue;
	SmallVector<SDValue, 3> Ops;
	Ops.push_back(N);
	Ops.push_back(Op.getOperand(I));
	Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
	N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
	}
	}
	return N;
	}
	if (VT.getVectorElementType().isFloatingPoint()) {
	SmallVector<SDValue, 8> Ops;
	MVT FVT = VT.getVectorElementType().getSimpleVT();
	assert(FVT == MVT::f32 \|\| FVT == MVT::f16);
	MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
	for (unsigned i = 0; i < NumElts; ++i)
	Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
	Op.getOperand(i)));
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
	SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
	Val = LowerBUILD_VECTOR(Val, DAG, ST);
	if (Val.getNode())
	return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	}
	if (usesOnlyOneValue) {
	SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
	if (isConstant && Val.getNode())
	return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
	}
	}

	// If all elements are constants and the case above didn't get hit, fall back
	// to the default expansion, which will generate a load from the constant
	// pool.
	if (isConstant)
	return SDValue();

	// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
	if (NumElts >= 4) {
	SDValue shuffle = ReconstructShuffle(Op, DAG);
	if (shuffle != SDValue())
	return shuffle;
	}

	if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
	// If we haven't found an efficient lowering, try splitting a 128-bit vector
	// into two 64-bit vectors; we might discover a better way to lower it.
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
	EVT ExtVT = VT.getVectorElementType();
	EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
	SDValue Lower =
	DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
	if (Lower.getOpcode() == ISD::BUILD_VECTOR)
	Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
	SDValue Upper = DAG.getBuildVector(
	HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
	if (Upper.getOpcode() == ISD::BUILD_VECTOR)
	Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
	if (Lower && Upper)
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
	}

	// Vectors with 32- or 64-bit elements can be built by directly assigning
	// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
	// will be legalized.
	if (EltSize >= 32) {
	// Do the expansion with floating-point types, since that is what the VFP
	// registers are defined to use, and since i64 is not legal.
	EVT EltVT = EVT::getFloatingPointVT(EltSize);
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
	SmallVector<SDValue, 8> Ops;
	for (unsigned i = 0; i < NumElts; ++i)
	Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
	SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
	return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	}

	// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
	// know the default expansion would otherwise fall back on something even
	// worse. For a vector with one or two non-undef values, that's
	// scalar_to_vector for the elements followed by a shuffle (provided the
	// shuffle is valid for the target) and materialization element by element
	// on the stack followed by a load for everything else.
	if (!isConstant && !usesOnlyOneValue) {
	SDValue Vec = DAG.getUNDEF(VT);
	for (unsigned i = 0 ; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
	Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
	}
	return Vec;
	}

	return SDValue();
	}

	// Gather data to see if the operation can be modelled as a
	// shuffle in combination with VEXTs.
	//
	// Op must be a BUILD_VECTOR (asserted). Every non-undef operand has to be an
	// EXTRACT_VECTOR_ELT with a constant lane index, and at most two distinct
	// source vectors may be referenced; otherwise an empty SDValue is returned.
	// On success the result is a VECTOR_SHUFFLE over the (possibly padded,
	// narrowed or bitcast) sources, bitcast back to Op's value type. An empty
	// SDValue is also returned when a source cannot be resized to the output
	// width or when the resulting mask is rejected by isShuffleMaskLegal.
	SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  unsigned NumElts = VT.getVectorNumElements();

	  struct ShuffleSourceInfo {
	    SDValue Vec;
	    // Smallest and largest lane of Vec referenced by the BUILD_VECTOR.
	    unsigned MinElt = std::numeric_limits<unsigned>::max();
	    unsigned MaxElt = 0;

	    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
	    // be compatible with the shuffle we intend to construct. As a result
	    // ShuffleVec will be some sliding window into the original Vec.
	    SDValue ShuffleVec;

	    // Code should guarantee that element i in Vec starts at element "WindowBase
	    // + i * WindowScale in ShuffleVec".
	    int WindowBase = 0;
	    int WindowScale = 1;

	    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

	    // Lets llvm::find locate an entry by its original source vector.
	    bool operator ==(SDValue OtherVec) const { return Vec == OtherVec; }
	  };

	  // First gather all vectors used as an immediate source for this BUILD_VECTOR
	  // node.
	  SmallVector<ShuffleSourceInfo, 2> Sources;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    SDValue V = Op.getOperand(i);
	    if (V.isUndef())
	      continue;
	    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
	      // A shuffle can only come from building a vector from various
	      // elements of other vectors.
	      return SDValue();
	    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
	      // Furthermore, shuffles require a constant mask, whereas extractelts
	      // accept variable indices.
	      return SDValue();
	    }

	    // Add this element source to the list if it's not already there.
	    SDValue SourceVec = V.getOperand(0);
	    auto Source = llvm::find(Sources, SourceVec);
	    if (Source == Sources.end())
	      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

	    // Update the minimum and maximum lane number seen.
	    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
	    Source->MinElt = std::min(Source->MinElt, EltNo);
	    Source->MaxElt = std::max(Source->MaxElt, EltNo);
	  }

	  // Currently only do something sane when at most two source vectors
	  // are involved.
	  if (Sources.size() > 2)
	    return SDValue();

	  // Find out the smallest element size among result and two sources, and use
	  // it as element size to build the shuffle_vector.
	  EVT SmallestEltTy = VT.getVectorElementType();
	  for (auto &Source : Sources) {
	    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
	    if (SrcEltTy.bitsLT(SmallestEltTy))
	      SmallestEltTy = SrcEltTy;
	  }
	  // Number of shuffle lanes covered by one element of the result type.
	  unsigned ResMultiplier =
	      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
	  // From here on NumElts counts lanes of the shuffle type, not of VT.
	  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
	  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

	  // If the source vector is too wide or too narrow, we may nevertheless be able
	  // to construct a compatible shuffle either by concatenating it with UNDEF or
	  // extracting a suitable range of elements.
	  for (auto &Src : Sources) {
	    EVT SrcVT = Src.ShuffleVec.getValueType();

	    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
	      continue;

	    // This stage of the search produces a source with the same element type as
	    // the original, but with a total width matching the BUILD_VECTOR output.
	    EVT EltVT = SrcVT.getVectorElementType();
	    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
	    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

	    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
	      // Only a source of exactly half the output width is handled.
	      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
	        return SDValue();
	      // We can pad out the smaller vector for free, so if it's part of a
	      // shuffle...
	      Src.ShuffleVec =
	          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
	                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
	      continue;
	    }

	    // Only a source of exactly twice the output width is handled.
	    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
	      return SDValue();

	    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
	      // Span too large for a VEXT to cope
	      return SDValue();
	    }

	    if (Src.MinElt >= NumSrcElts) {
	      // The extraction can just take the second half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
	      Src.WindowBase = -static_cast<int>(NumSrcElts);
	    } else if (Src.MaxElt < NumSrcElts) {
	      // The extraction can just take the first half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i32));
	    } else {
	      // An actual VEXT is needed
	      SDValue VEXTSrc1 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i32));
	      SDValue VEXTSrc2 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

	      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
	                                   VEXTSrc2,
	                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
	      Src.WindowBase = -static_cast<int>(Src.MinElt);
	    }
	  }

	  // Another possible incompatibility occurs from the vector element types. We
	  // can fix this by bitcasting the source vectors to the same type we intend
	  // for the shuffle.
	  for (auto &Src : Sources) {
	    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
	    if (SrcEltTy == SmallestEltTy)
	      continue;
	    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
	    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
	    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
	    Src.WindowBase *= Src.WindowScale;
	  }

	  // Final sanity check before we try to actually produce a shuffle.
	  LLVM_DEBUG(for (const auto &Src
	                  : Sources)
	                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

	  // The stars all align, our next step is to produce the mask for the shuffle.
	  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
	  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
	  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	    SDValue Entry = Op.getOperand(i);
	    if (Entry.isUndef())
	      continue;

	    auto Src = llvm::find(Sources, Entry.getOperand(0));
	    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

	    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
	    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
	    // segment.
	    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
	    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
	                               VT.getScalarSizeInBits());
	    int LanesDefined = BitsDefined / BitsPerShuffleLane;

	    // This source is expected to fill ResMultiplier lanes of the final shuffle,
	    // starting at the appropriate offset.
	    int *LaneMask = &Mask[i * ResMultiplier];

	    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
	    // Lanes of the second source are numbered after those of the first.
	    ExtractBase += NumElts * (Src - Sources.begin());
	    for (int j = 0; j < LanesDefined; ++j)
	      LaneMask[j] = ExtractBase + j;
	  }

	  // Final check before we try to produce nonsense...
	  if (!isShuffleMaskLegal(Mask, ShuffleVT))
	    return SDValue();

	  // We can't handle more than two sources. This should have already
	  // been checked before this point.
	  assert(Sources.size() <= 2 && "Too many sources!");

	  // Unused shuffle operands stay UNDEF.
	  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
	  for (unsigned i = 0; i < Sources.size(); ++i)
	    ShuffleOps[i] = Sources[i].ShuffleVec;

	  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
	                                         ShuffleOps[1], Mask);
	  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
	}

	// Operation selector stored in each perfect-shuffle table entry. The numeric
	// values are spelled out because they are compared against a field extracted
	// from the table entries.
	enum ShuffleOpCodes {
	  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	  OP_COPY = 0,
	  OP_VREV = 1,
	  OP_VDUP0 = 2,
	  OP_VDUP1 = 3,
	  OP_VDUP2 = 4,
	  OP_VDUP3 = 5,
	  OP_VEXT1 = 6,
	  OP_VEXT2 = 7,
	  OP_VEXT3 = 8,
	  // VUZP, left and right result
	  OP_VUZPL = 9,
	  OP_VUZPR = 10,
	  // VZIP, left and right result
	  OP_VZIPL = 11,
	  OP_VZIPR = 12,
	  // VTRN, left and right result
	  OP_VTRNL = 13,
	  OP_VTRNR = 14
	};

	// Returns true when the operation encoded in bits 26-29 of the given
	// perfect-shuffle table entry is a copy, a VREV or one of the four VDUPs.
	static bool isLegalMVEShuffleOp(unsigned PFEntry) {
	  const unsigned Opcode = (PFEntry >> 26) & 0x0F;
	  return Opcode == OP_COPY || Opcode == OP_VREV || Opcode == OP_VDUP0 ||
	         Opcode == OP_VDUP1 || Opcode == OP_VDUP2 || Opcode == OP_VDUP3;
	}

	/// isShuffleMaskLegal - Targets can use this to indicate that they only
	/// support some VECTOR_SHUFFLE operations, those with specific masks.
	/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
	/// are assumed to be legal.
	///
	/// \param M  shuffle mask; negative entries denote undefined lanes.
	/// \param VT vector type the shuffle produces.
	/// \returns true if the mask matches one of the patterns checked below.
	bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  if (VT.getVectorNumElements() == 4 &&
	      (VT.is128BitVector() || VT.is64BitVector())) {
	    // Each mask entry becomes one base-9 digit: 0-7 select a lane, 8 stands
	    // for an undefined lane.
	    unsigned PFIndexes[4];
	    for (unsigned i = 0; i != 4; ++i) {
	      if (M[i] < 0)
	        PFIndexes[i] = 8;
	      else
	        PFIndexes[i] = M[i];
	    }

	    // Compute the index in the perfect shuffle table.
	    unsigned PFTableIndex =
	        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
	    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	    // The cost is kept in the topmost bits of the entry.
	    unsigned Cost = (PFEntry >> 30);

	    if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
	      return true;
	  }

	  // Out-parameters of the mask predicates; their values are not used here.
	  bool ReverseVEXT, isV_UNDEF;
	  unsigned Imm, WhichResult;

	  unsigned EltSize = VT.getScalarSizeInBits();
	  if (EltSize >= 32 ||
	      ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
	      isVREVMask(M, VT, 64) ||
	      isVREVMask(M, VT, 32) ||
	      isVREVMask(M, VT, 16))
	    return true;

	  // The remaining patterns are only accepted when NEON is available.
	  if (Subtarget->hasNEON() &&
	      (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
	       isVTBLMask(M, VT) ||
	       isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
	    return true;

	  if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
	      isReverseMask(M, VT))
	    return true;

	  return false;
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// The entry is decoded as: bits 26-29 the operation (a ShuffleOpCodes value),
	/// bits 13-25 the table index producing the left operand, and bits 0-12 the
	/// table index producing the right operand. Both operands are generated
	/// recursively; OP_COPY terminates the recursion by returning LHS or RHS.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
	  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

	  if (OpNum == OP_COPY) {
	    // Base-9 encoding of the mask <0,1,2,3>, i.e. the unmodified LHS.
	    if (LHSID == (1*9+2)*9+3) return LHS;
	    // Base-9 encoding of the mask <4,5,6,7>, i.e. the unmodified RHS.
	    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
	  EVT VT = OpLHS.getValueType();

	  switch (OpNum) {
	  default: llvm_unreachable("Unknown shuffle opcode!");
	  case OP_VREV:
	    // VREV divides the vector in half and swaps within the half.
	    if (VT.getVectorElementType() == MVT::i32 ||
	        VT.getVectorElementType() == MVT::f32)
	      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
	    // vrev <4 x i16> -> VREV32
	    if (VT.getVectorElementType() == MVT::i16)
	      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
	    // vrev <4 x i8> -> VREV16
	    assert(VT.getVectorElementType() == MVT::i8);
	    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
	  case OP_VDUP0:
	  case OP_VDUP1:
	  case OP_VDUP2:
	  case OP_VDUP3:
	    // The lane to duplicate is the distance from OP_VDUP0.
	    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
	                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
	  case OP_VEXT1:
	  case OP_VEXT2:
	  case OP_VEXT3:
	    // The immediate is 1, 2 or 3 for OP_VEXT1, OP_VEXT2 and OP_VEXT3.
	    return DAG.getNode(ARMISD::VEXT, dl, VT,
	                       OpLHS, OpRHS,
	                       DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
	  case OP_VUZPL:
	  case OP_VUZPR:
	    // Two-result node: result 0 for the "L" opcode, result 1 for the "R" one.
	    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
	                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
	  case OP_VZIPL:
	  case OP_VZIPR:
	    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
	                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
	  case OP_VTRNL:
	  case OP_VTRNR:
	    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
	                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
	  }
	}

	// Lowers a v8i8 shuffle to a table lookup: the shuffle mask is materialized
	// as a v8i8 BUILD_VECTOR of i32 constants and used as the index operand of
	// VTBL1 (second shuffle operand undef) or VTBL2 (both operands used).
	static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
	                                       ArrayRef<int> ShuffleMask,
	                                       SelectionDAG &DAG) {
	  SDValue Table0 = Op.getOperand(0);
	  SDValue Table1 = Op.getOperand(1);
	  SDLoc DL(Op);

	  // One constant per mask entry, in mask order.
	  SmallVector<SDValue, 8> Indices;
	  for (int MaskElt : ShuffleMask)
	    Indices.push_back(DAG.getConstant(MaskElt, DL, MVT::i32));

	  SDValue IndexVec = DAG.getBuildVector(MVT::v8i8, DL, Indices);

	  const bool SingleTable = Table1.getNode()->isUndef();
	  if (!SingleTable)
	    return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, Table0, Table1, IndexVec);

	  return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, Table0, IndexVec);
	}

	// Lowers a full lane reversal of a v16i8 or v8i16 vector (taken from operand
	// 0 of Op) as a VREV64 followed by a VEXT of the result with itself.
	static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
	                                                      SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  SDValue OpLHS = Op.getOperand(0);
	  EVT VT = OpLHS.getValueType();

	  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
	         "Expect an v8i16/v16i8 type");
	  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
	  // For a v16i8 type: after the VREV64 the lanes are <7, ..., 0, 15, ..., 8>,
	  // i.e. each double word is reversed in place. The VEXT below, applied to
	  // the vector and itself with an offset of half the lane count, exchanges
	  // the two double words. The v8i16 case is similar.
	  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
	  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
	                     DAG.getConstant(ExtractNum, DL, MVT::i32));
	}

	// Custom lowering for VECTOR_SHUFFLE. Tries, in order: splats (VDUP /
	// VDUPLANE), VEXT, VREV, the two-result NEON shuffles (directly and through a
	// CONCAT_VECTORS operand), the perfect-shuffle table for 4-element vectors,
	// an element-wise ARMISD::BUILD_VECTOR for elements of at least 32 bits, a
	// full reversal of v8i16/v16i8, and finally VTBL for v8i8. Returns an empty
	// SDValue when none of these applies.
	static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
	                                   const ARMSubtarget *ST) {
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

	  // Convert shuffles that are directly supported on NEON to target-specific
	  // DAG nodes, instead of keeping them as shuffles and matching them again
	  // during code selection. This is more efficient and avoids the possibility
	  // of inconsistencies between legalization and selection.
	  // FIXME: floating-point vectors should be canonicalized to integer vectors
	  // of the same time so that they get CSEd properly.
	  ArrayRef<int> ShuffleMask = SVN->getMask();

	  unsigned EltSize = VT.getScalarSizeInBits();
	  if (EltSize <= 32) {
	    if (SVN->isSplat()) {
	      int Lane = SVN->getSplatIndex();
	      // If this is undef splat, generate it via "just" vdup, if possible.
	      if (Lane == -1) Lane = 0;

	      // Test if V1 is a SCALAR_TO_VECTOR.
	      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
	      }
	      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
	      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
	      // reaches it).
	      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
	          !isa<ConstantSDNode>(V1.getOperand(0))) {
	        bool IsScalarToVector = true;
	        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
	          if (!V1.getOperand(i).isUndef()) {
	            IsScalarToVector = false;
	            break;
	          }
	        if (IsScalarToVector)
	          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
	      }
	      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
	                         DAG.getConstant(Lane, dl, MVT::i32));
	    }

	    bool ReverseVEXT = false;
	    unsigned Imm = 0;
	    if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
	      if (ReverseVEXT)
	        std::swap(V1, V2);
	      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
	                         DAG.getConstant(Imm, dl, MVT::i32));
	    }

	    if (isVREVMask(ShuffleMask, VT, 64))
	      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
	    if (isVREVMask(ShuffleMask, VT, 32))
	      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
	    if (isVREVMask(ShuffleMask, VT, 16))
	      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

	    if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
	      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
	                         DAG.getConstant(Imm, dl, MVT::i32));
	    }

	    // Check for Neon shuffles that modify both input vectors in place.
	    // If both results are used, i.e., if there are two shuffles with the same
	    // source operands and with masks corresponding to both results of one of
	    // these operations, DAG memoization will ensure that a single node is
	    // used for both shuffles.
	    unsigned WhichResult = 0;
	    bool isV_UNDEF = false;
	    if (ST->hasNEON()) {
	      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
	              ShuffleMask, VT, WhichResult, isV_UNDEF)) {
	        if (isV_UNDEF)
	          V2 = V1;
	        return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
	            .getValue(WhichResult);
	      }
	    }

	    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
	    // shuffles that produce a result larger than their operands with:
	    // shuffle(concat(v1, undef), concat(v2, undef))
	    // ->
	    // shuffle(concat(v1, v2), undef)
	    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
	    //
	    // This is useful in the general case, but there are special cases where
	    // native shuffles produce larger results: the two-result ops.
	    //
	    // Look through the concat when lowering them:
	    // shuffle(concat(v1, v2), undef)
	    // ->
	    // concat(VZIP(v1, v2):0, :1)
	    //
	    if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
	      SDValue SubV1 = V1->getOperand(0);
	      SDValue SubV2 = V1->getOperand(1);
	      EVT SubVT = SubV1.getValueType();

	      // We expect these to have been canonicalized to -1.
	      assert(llvm::all_of(ShuffleMask, [&](int i) {
	        return i < (int)VT.getVectorNumElements();
	      }) && "Unexpected shuffle index into UNDEF operand!");

	      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
	              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
	        if (isV_UNDEF)
	          SubV2 = SubV1;
	        assert((WhichResult == 0) &&
	               "In-place shuffle of concat can only have one result!");
	        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
	                                  SubV1, SubV2);
	        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
	                           Res.getValue(1));
	      }
	    }
	  }

	  // If the shuffle is not directly supported and it has 4 elements, use
	  // the PerfectShuffle-generated table to synthesize it from other shuffles.
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts == 4) {
	    // Each mask entry becomes one base-9 digit: 0-7 select a lane, 8 stands
	    // for an undefined lane.
	    unsigned PFIndexes[4];
	    for (unsigned i = 0; i != 4; ++i) {
	      if (ShuffleMask[i] < 0)
	        PFIndexes[i] = 8;
	      else
	        PFIndexes[i] = ShuffleMask[i];
	    }

	    // Compute the index in the perfect shuffle table.
	    unsigned PFTableIndex =
	        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
	    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	    unsigned Cost = (PFEntry >> 30);

	    if (Cost <= 4) {
	      if (ST->hasNEON())
	        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	      else if (isLegalMVEShuffleOp(PFEntry)) {
	        // Without NEON, the entry and both entries it refers to must use
	        // operations accepted by isLegalMVEShuffleOp.
	        unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
	        unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
	        unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
	        unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
	        if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
	          return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	      }
	    }
	  }

	  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
	  if (EltSize >= 32) {
	    // Do the expansion with floating-point types, since that is what the VFP
	    // registers are defined to use, and since i64 is not legal.
	    EVT EltVT = EVT::getFloatingPointVT(EltSize);
	    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
	    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
	    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
	    SmallVector<SDValue, 8> Ops;
	    for (unsigned i = 0; i < NumElts; ++i) {
	      if (ShuffleMask[i] < 0)
	        Ops.push_back(DAG.getUNDEF(EltVT));
	      else
	        // Mask values below NumElts index V1, the rest index V2; the lane
	        // within the chosen vector is the mask value masked by NumElts-1.
	        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
	                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
	                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
	                                                  dl, MVT::i32)));
	    }
	    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
	    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	  }

	  if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
	    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);

	  if (ST->hasNEON() && VT == MVT::v8i8)
	    if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
	      return NewOp;

	  return SDValue();
	}

	// Custom lowering for INSERT_VECTOR_ELT. Returns an empty SDValue for a
	// non-constant lane, Op itself when nothing needs changing, or an
	// integer-typed re-expression of the insertion when the element type would
	// otherwise be float-promoted.
	SDValue ARMTargetLowering::
	LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
	  // INSERT_VECTOR_ELT is legal only for immediate indexes.
	  SDValue LaneIdx = Op.getOperand(2);
	  if (!isa<ConstantSDNode>(LaneIdx))
	    return SDValue();

	  SDValue Scalar = Op.getOperand(1);
	  EVT ScalarVT = Scalar.getValueType();

	  // Anything other than a float-promoted element type is left untouched.
	  if (getTypeAction(*DAG.getContext(), ScalarVT) !=
	      TargetLowering::TypePromoteFloat)
	    return Op;

	  // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
	  // but the type system will try to do that if we don't intervene.
	  // Reinterpret any such vector-element insertion as one with the
	  // corresponding integer types.
	  SDLoc DL(Op);

	  EVT IntScalarVT = MVT::getIntegerVT(ScalarVT.getScalarSizeInBits());
	  assert(getTypeAction(*DAG.getContext(), IntScalarVT) !=
	         TargetLowering::TypePromoteFloat);

	  SDValue SrcVec = Op.getOperand(0);
	  EVT SrcVecVT = SrcVec.getValueType();
	  EVT IntVecVT = EVT::getVectorVT(*DAG.getContext(), IntScalarVT,
	                                  SrcVecVT.getVectorNumElements());

	  // Bitcast both inputs to integer form, insert, then cast the result back.
	  SDValue IntScalar = DAG.getNode(ISD::BITCAST, DL, IntScalarVT, Scalar);
	  SDValue IntSrcVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, SrcVec);
	  SDValue IntResult = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
	                                  IntSrcVec, IntScalar, LaneIdx);
	  return DAG.getNode(ISD::BITCAST, DL, SrcVecVT, IntResult);
	}

	// Custom lowering for EXTRACT_VECTOR_ELT. Returns an empty SDValue for a
	// non-constant lane, a VGETLANEu node when an i32 result is read from a
	// vector with elements narrower than 32 bits, and Op itself otherwise.
	static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
	  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
	  SDValue LaneIdx = Op.getOperand(1);
	  if (!isa<ConstantSDNode>(LaneIdx))
	    return SDValue();

	  SDValue SrcVec = Op.getOperand(0);
	  const bool WidensToI32 = Op.getValueType() == MVT::i32 &&
	                           SrcVec.getScalarValueSizeInBits() < 32;
	  if (!WidensToI32)
	    return Op;

	  return DAG.getNode(ARMISD::VGETLANEu, SDLoc(Op), MVT::i32, SrcVec, LaneIdx);
	}

	// Lowers a two-operand CONCAT_VECTORS producing a 128-bit vector by treating
	// each operand as an f64 and inserting it into a v2f64, which is finally
	// bitcast to the requested type. Undef operands are left undefined.
	static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
	  // The only time a CONCAT_VECTORS operation can have legal types is when
	  // two 64-bit vectors are concatenated to a 128-bit vector.
	  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
	         "unexpected CONCAT_VECTORS");
	  SDLoc DL(Op);
	  SDValue Wide = DAG.getUNDEF(MVT::v2f64);

	  // Operand 0 goes to element 0, operand 1 to element 1.
	  for (unsigned Half = 0; Half != 2; ++Half) {
	    SDValue Part = Op.getOperand(Half);
	    if (Part.isUndef())
	      continue;
	    Wide = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64, Wide,
	                       DAG.getNode(ISD::BITCAST, DL, MVT::f64, Part),
	                       DAG.getIntPtrConstant(Half, DL));
	  }

	  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Wide);
	}

	/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
	/// element has been zero/sign-extended, depending on the isSigned parameter,
	/// from an integer type half its size.
	///
	/// A v2i64 value is also accepted in the form of a BITCAST of a constant
	/// v4i32 BUILD_VECTOR; in that case the high 32-bit word of each 64-bit
	/// element must be the sign (or zero) extension of its low word.
	static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
	                                   bool isSigned) {
	  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
	  EVT VT = N->getValueType(0);
	  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
	    SDNode *BVN = N->getOperand(0).getNode();
	    if (BVN->getValueType(0) != MVT::v4i32 ||
	        BVN->getOpcode() != ISD::BUILD_VECTOR)
	      return false;
	    // Which word of each pair holds the low half depends on endianness.
	    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
	    unsigned HiElt = 1 - LoElt;
	    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
	    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
	    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
	    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
	    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
	      return false;
	    if (isSigned) {
	      // The high word must replicate the sign of the low word.
	      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
	          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
	        return true;
	    } else {
	      if (Hi0->isNullValue() && Hi1->isNullValue())
	        return true;
	    }
	    return false;
	  }

	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  // Every operand must be a constant that fits in half the element width.
	  const unsigned EltSize = VT.getScalarSizeInBits();
	  const unsigned HalfSize = EltSize / 2;
	  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	    SDNode *Elt = N->getOperand(i).getNode();
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
	    if (!C)
	      return false;
	    if (isSigned) {
	      if (!isIntN(HalfSize, C->getSExtValue()))
	        return false;
	    } else {
	      if (!isUIntN(HalfSize, C->getZExtValue()))
	        return false;
	    }
	  }

	  return true;
	}

	/// isSignExtended - Check if a node is a vector value that is sign-extended
	/// or a constant BUILD_VECTOR with sign-extended elements.
	///
	/// True for a SIGN_EXTEND node, a sign-extending load, or a node accepted by
	/// isExtendedBUILD_VECTOR in signed mode.
	static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
	    return true;
	  return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
	}

	/// isZeroExtended - Check if a node is a vector value that is zero-extended
	/// or a constant BUILD_VECTOR with zero-extended elements.
	///
	/// True for a ZERO_EXTEND node, a zero-extending load, or a node accepted by
	/// isExtendedBUILD_VECTOR in unsigned mode.
	static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
	    return true;
	  return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
	}

	// Maps a vector type narrower than 64 bits to the 64-bit vector type with
	// the same number of elements. Types that are already 64 bits or wider are
	// returned unchanged; only v2i8, v2i16 and v4i8 are handled below that.
	static EVT getExtensionTo64Bits(const EVT &OrigVT) {
	  const bool AlreadyWideEnough = OrigVT.getSizeInBits() >= 64;
	  if (AlreadyWideEnough)
	    return OrigVT;

	  assert(OrigVT.isSimple() && "Expecting a simple value type");

	  switch (OrigVT.getSimpleVT().SimpleTy) {
	  case MVT::v2i8:
	  case MVT::v2i16:
	    return MVT::v2i32;
	  case MVT::v4i8:
	    return MVT::v4i16;
	  default:
	    llvm_unreachable("Unexpected Vector Type");
	  }
	}

	/// AddRequiredExtensionForVMULL - Widen N with the given extension opcode so
	/// that it occupies 64 bits, the size of the D register VMULL takes as an
	/// operand. N is returned unchanged when its original type already has at
	/// least 64 bits.
	///
	/// \param OrigTy    type of the value before it was extended.
	/// \param ExtTy     type it was extended to; must be a 128-bit vector.
	/// \param ExtOpcode node opcode used to create the new extension.
	static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
	                                            const EVT &OrigTy,
	                                            const EVT &ExtTy,
	                                            unsigned ExtOpcode) {
	  // The value started out as OrigTy and was extended to ExtTy, which we
	  // expect to be 128 bits in total.
	  assert(ExtTy.is128BitVector() && "Unexpected extension size");

	  const bool FillsDRegister = OrigTy.getSizeInBits() >= 64;
	  if (FillsDRegister)
	    return N;

	  // Too narrow for a VMULL operand: extend to the matching 64-bit type.
	  return DAG.getNode(ExtOpcode, SDLoc(N), getExtensionTo64Bits(OrigTy), N);
	}

	/// SkipLoadExtensionForVMULL - Re-create the load LD so that it yields its
	/// in-memory vector type without any sign/zero extension. When that type is
	/// narrower than 64 bits, an extending load to the corresponding 64-bit type
	/// is created instead, using LD's own extension kind. The chain, base
	/// pointer, pointer info, alignment and memory-operand flags are taken from
	/// LD.
	static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
	  const EVT MemVT = LD->getMemoryVT();
	  const EVT WideVT = getExtensionTo64Bits(MemVT);

	  if (WideVT != MemVT) {
	    // We need to create a zextload/sextload. We cannot just create a load
	    // followed by a sext/zext node because LowerMUL is also run during normal
	    // operation legalization where we can't create illegal types.
	    return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), WideVT,
	                          LD->getChain(), LD->getBasePtr(),
	                          LD->getPointerInfo(), MemVT, LD->getAlignment(),
	                          LD->getMemOperand()->getFlags());
	  }

	  // The memory type is already wide enough: a plain load suffices.
	  return DAG.getLoad(MemVT, SDLoc(LD), LD->getChain(), LD->getBasePtr(),
	                     LD->getPointerInfo(), LD->getAlignment(),
	                     LD->getMemOperand()->getFlags());
	}

	/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
	/// extending load, or BUILD_VECTOR with extended elements, return the
	/// unextended value. The unextended vector should be 64 bits so that it can
	/// be used as an operand to a VMULL instruction. If the original vector size
	/// before extension is less than 64 bits we add an extension to resize
	/// the vector to 64 bits.
	///
	/// For an extending load this also rewrites the DAG: all users of the old
	/// load's chain and value are redirected to the new load and to an explicit
	/// extension of it.
	static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
	    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
	                                        N->getOperand(0)->getValueType(0),
	                                        N->getValueType(0),
	                                        N->getOpcode());

	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
	           "Expected extending load");

	    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
	    // Users of the old load's chain (result 1) now hang off the new load.
	    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
	    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	    SDValue extLoad =
	        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
	    // Users of the old loaded value (result 0) get the explicit extension.
	    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

	    return newLoad;
	  }

	  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
	  // have been legalized as a BITCAST from v4i32.
	  if (N->getOpcode() == ISD::BITCAST) {
	    SDNode *BVN = N->getOperand(0).getNode();
	    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
	           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
	    // Keep only the low 32-bit word of each 64-bit element.
	    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
	    return DAG.getBuildVector(
	        MVT::v2i32, SDLoc(N),
	        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
	  }
	  // Construct a new BUILD_VECTOR with elements truncated to half the size.
	  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
	  EVT VT = N->getValueType(0);
	  unsigned EltSize = VT.getScalarSizeInBits() / 2;
	  unsigned NumElts = VT.getVectorNumElements();
	  MVT TruncVT = MVT::getIntegerVT(EltSize);
	  SmallVector<SDValue, 8> Ops;
	  SDLoc dl(N);
	  for (unsigned i = 0; i != NumElts; ++i) {
	    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
	    const APInt &CInt = C->getAPIntValue();
	    // Element types smaller than 32 bits are not legal, so use i32 elements.
	    // The values are implicitly truncated so sext vs. zext doesn't matter.
	    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
	  }
	  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
	}

	// Returns true if N is an ADD or SUB whose two operands each have a single
	// use and are both sign-extended (as determined by isSignExtended).
	static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return false;

	  SDNode *N0 = N->getOperand(0).getNode();
	  SDNode *N1 = N->getOperand(1).getNode();
	  return N0->hasOneUse() && N1->hasOneUse() &&
	         isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
	}

	/// Return true if N is an ISD::ADD or ISD::SUB whose two operands each have
	/// exactly one use and are both accepted by isZeroExtended().
	static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
	    SDNode *N0 = N->getOperand(0).getNode();
	    SDNode *N1 = N->getOperand(1).getNode();
	    // Both operands must be single-use and zero-extended.
	    return N0->hasOneUse() && N1->hasOneUse() &&
	           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
	  }
	  return false;
	}

	/// Custom-lower a 128-bit integer vector ISD::MUL so that a widening multiply
	/// (ARMISD::VMULLs / ARMISD::VMULLu) is emitted when the operands are
	/// extensions of narrower values.
	///
	/// Result:
	///  - a VMULL node when both operands are sign- or both zero-extended;
	///  - an ADD/SUB of two VMULL nodes when one operand is an add/sub of
	///    extended values and the other is extended (the "MLA" form);
	///  - an empty SDValue for an unmatched v2i64 multiply;
	///  - Op itself for any other unmatched multiply.
	static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
	  // Multiplications are only custom-lowered for 128-bit vectors so that
	  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
	  EVT VT = Op.getValueType();
	  assert(VT.is128BitVector() && VT.isInteger() &&
	         "unexpected type for custom-lowering ISD::MUL");
	  SDNode *N0 = Op.getOperand(0).getNode();
	  SDNode *N1 = Op.getOperand(1).getNode();
	  unsigned NewOpc = 0; // Stays 0 until a VMULL pattern has been matched.
	  bool isMLA = false;
	  bool isN0SExt = isSignExtended(N0, DAG);
	  bool isN1SExt = isSignExtended(N1, DAG);
	  if (isN0SExt && isN1SExt)
	    NewOpc = ARMISD::VMULLs;
	  else {
	    bool isN0ZExt = isZeroExtended(N0, DAG);
	    bool isN1ZExt = isZeroExtended(N1, DAG);
	    if (isN0ZExt && isN1ZExt)
	      NewOpc = ARMISD::VMULLu;
	    else if (isN1SExt || isN1ZExt) {
	      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
	      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
	      if (isN1SExt && isAddSubSExt(N0, DAG)) {
	        NewOpc = ARMISD::VMULLs;
	        isMLA = true;
	      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
	        NewOpc = ARMISD::VMULLu;
	        isMLA = true;
	      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
	        // Same pattern with the operands reversed: canonicalize so that N0
	        // is the add/sub and N1 the plain extension.
	        std::swap(N0, N1);
	        NewOpc = ARMISD::VMULLu;
	        isMLA = true;
	      }
	    }

	    if (!NewOpc) {
	      if (VT == MVT::v2i64)
	        // Fall through to expand this. It is not legal.
	        return SDValue();
	      else
	        // Other vector multiplications are legal.
	        return Op;
	    }
	  }

	  // Legalize to a VMULL instruction.
	  SDLoc DL(Op);
	  SDValue Op0;
	  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
	  if (!isMLA) {
	    Op0 = SkipExtensionForVMULL(N0, DAG);
	    assert(Op0.getValueType().is64BitVector() &&
	           Op1.getValueType().is64BitVector() &&
	           "unexpected types for extended operands to VMULL");
	    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
	  }

	  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
	  // isel lowering to take advantage of no-stall back to back vmul + vmla.
	  //   vmull q0, d4, d6
	  //   vmlal q0, d5, d6
	  // is faster than
	  //   vaddl q0, d4, d5
	  //   vmovl q1, d6
	  //   vmul  q0, q0, q1
	  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
	  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
	  EVT Op1VT = Op1.getValueType();
	  // The add/sub operands are bitcast to Op1's type before each VMULL; the
	  // outer node reuses N0's own opcode (ADD or SUB).
	  return DAG.getNode(N0->getOpcode(), DL, VT,
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
	}

	/// Divide X by Y lane-wise by going through v4f32: both inputs are
	/// sign-extended to v4i32 and converted to float, X is multiplied by a
	/// reciprocal estimate of Y, a fixed bias is added to the raw bits, and the
	/// result is converted back and truncated to v4i16.
	static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
	                              SelectionDAG &DAG) {
	  // TODO: Should this propagate fast-math-flags?

	  // Convert to float
	  //   float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
	  //   float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
	  SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
	  SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
	  SDValue NumF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
	  SDValue DenF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);

	  // Get reciprocal estimate.
	  //   float4 recip = vrecpeq_f32(yf);
	  SDValue RecpeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
	  SDValue Recip =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpeID, DenF);

	  // Because char has a smaller range than uchar, we can actually get away
	  // without any newton steps. This requires that we use a weird bias
	  // of 0xb000, however (again, this has been exhaustively tested).
	  //   float4 result = as_float4(as_int4(xf*recip) + 0xb000);
	  SDValue Product = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumF, Recip);
	  SDValue ProductBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Product);
	  SDValue Bias = DAG.getConstant(0xb000, dl, MVT::v4i32);
	  SDValue BiasedBits =
	      DAG.getNode(ISD::ADD, dl, MVT::v4i32, ProductBits, Bias);
	  SDValue BiasedF = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);

	  // Convert back to short.
	  SDValue QuotWide = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, BiasedF);
	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, QuotWide);
	}

	/// Divide N0 by N1 lane-wise by going through v4f32: both inputs are
	/// sign-extended to v4i32 and converted to float, N0 is multiplied by a
	/// once-refined reciprocal estimate of N1, a fixed bias is added to the raw
	/// bits, and the result is converted back and truncated to v4i16.
	static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
	                               SelectionDAG &DAG) {
	  // TODO: Should this propagate fast-math-flags?

	  // Convert to float.
	  //   float4 yf = vcvt_f32_s32(vmovl_s16(y));
	  //   float4 xf = vcvt_f32_s32(vmovl_s16(x));
	  SDValue NumWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
	  SDValue DenWide = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
	  SDValue NumF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, NumWide);
	  SDValue DenF = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, DenWide);

	  // Use reciprocal estimate and one refinement step.
	  //   float4 recip = vrecpeq_f32(yf);
	  //   recip *= vrecpsq_f32(yf, recip);
	  SDValue RecpeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
	  SDValue Estimate =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpeID, DenF);
	  SDValue RecpsID = DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32);
	  SDValue Step = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, RecpsID,
	                             DenF, Estimate);
	  SDValue Recip = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Step, Estimate);

	  // Because short has a smaller range than ushort, we can actually get away
	  // with only a single newton step. This requires that we use a weird bias
	  // of 89, however (again, this has been exhaustively tested).
	  //   float4 result = as_float4(as_int4(xf*recip) + 0x89);
	  SDValue Product = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, NumF, Recip);
	  SDValue ProductBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Product);
	  SDValue Bias = DAG.getConstant(0x89, dl, MVT::v4i32);
	  SDValue BiasedBits =
	      DAG.getNode(ISD::ADD, dl, MVT::v4i32, ProductBits, Bias);
	  SDValue BiasedF = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);

	  // Convert back to integer and return.
	  //   return vmovn_s32(vcvt_s32_f32(result));
	  SDValue QuotWide = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, BiasedF);
	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, QuotWide);
	}

	/// Custom-lower a v4i16 or v8i8 ISD::SDIV.
	///
	/// v4i16 is handed straight to LowerSDIV_v4i16. v8i8 is sign-extended to
	/// v8i16, split into two v4i16 halves that are each divided with
	/// LowerSDIV_v4i8, then concatenated and truncated back to v8i8.
	static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
	  EVT VT = Op.getValueType();
	  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
	         "unexpected type for custom-lowering ISD::SDIV");

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2, N3;

	  if (VT == MVT::v8i8) {
	    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
	    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

	    // N2/N3 take the subvectors starting at element 4; N0/N1 are narrowed to
	    // the subvectors starting at element 0.
	    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
	                     DAG.getIntPtrConstant(4, dl));
	    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
	                     DAG.getIntPtrConstant(4, dl));
	    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
	                     DAG.getIntPtrConstant(0, dl));
	    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
	                     DAG.getIntPtrConstant(0, dl));

	    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
	    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

	    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
	    N0 = LowerCONCAT_VECTORS(N0, DAG);

	    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
	    return N0;
	  }
	  return LowerSDIV_v4i16(N0, N1, dl, DAG);
	}

	/// Custom-lower a v4i16 or v8i8 ISD::UDIV.
	///
	/// v8i8 is zero-extended to v8i16, split into two v4i16 halves that are each
	/// divided with LowerSDIV_v4i16, concatenated, and narrowed back to v8i8 with
	/// the arm_neon_vqmovnsu intrinsic. v4i16 is divided in v4f32 using a
	/// reciprocal estimate with two refinement steps.
	static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
	  // TODO: Should this propagate fast-math-flags?
	  EVT VT = Op.getValueType();
	  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
	         "unexpected type for custom-lowering ISD::UDIV");

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2, N3;

	  if (VT == MVT::v8i8) {
	    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
	    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

	    // N2/N3 take the subvectors starting at element 4; N0/N1 are narrowed to
	    // the subvectors starting at element 0.
	    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
	                     DAG.getIntPtrConstant(4, dl));
	    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
	                     DAG.getIntPtrConstant(4, dl));
	    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
	                     DAG.getIntPtrConstant(0, dl));
	    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
	                     DAG.getIntPtrConstant(0, dl));

	    // The signed v4i16 helper is used here on zero-extended 8-bit inputs.
	    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
	    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

	    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
	    N0 = LowerCONCAT_VECTORS(N0, DAG);

	    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
	                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
	                                     MVT::i32),
	                     N0);
	    return N0;
	  }

	  // v4i16 udiv ... Convert to float.
	  //   float4 yf = vcvt_f32_s32(vmovl_u16(y));
	  //   float4 xf = vcvt_f32_s32(vmovl_u16(x));
	  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
	  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
	  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
	  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

	  // Use reciprocal estimate and two refinement steps.
	  //   float4 recip = vrecpeq_f32(yf);
	  //   recip *= vrecpsq_f32(yf, recip);
	  //   recip *= vrecpsq_f32(yf, recip);
	  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
	                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
	                   BN1);
	  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
	                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
	                   BN1, N2);
	  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
	  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
	                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
	                   BN1, N2);
	  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
	  // Simply multiplying by the reciprocal estimate can leave us a few ulps
	  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
	  // and that it will never cause us to return an answer too large).
	  //   float4 result = as_float4(as_int4(xf*recip) + 2);
	  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
	  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
	  N1 = DAG.getConstant(2, dl, MVT::v4i32);
	  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
	  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
	  // Convert back to integer and return.
	  //   return vmovn_u32(vcvt_s32_f32(result));
	  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
	  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
	  return N0;
	}

	/// Lower ISD::ADDCARRY / ISD::SUBCARRY to ARMISD::ADDE / ARMISD::SUBE.
	///
	/// The incoming boolean carry (operand 2) is converted into the carry flag,
	/// the flag-consuming add/subtract is emitted, and the flag it produces is
	/// converted back to a boolean. For the subtract form the boolean is replaced
	/// by (1 - value) both on the way in and on the way out, because
	/// ARMISD::SUBE works with a carry whereas ISD::SUBCARRY works with a borrow.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  EVT VT = N->getValueType(0);
	  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	  SDLoc DL(Op);

	  // Builds (1 - Bool) as an i32 node.
	  auto OneMinus = [&DAG, &DL](SDValue Bool) {
	    SDValue One = DAG.getConstant(1, DL, MVT::i32);
	    return DAG.getNode(ISD::SUB, DL, MVT::i32, One, Bool);
	  };

	  const bool IsAdd = Op.getOpcode() == ISD::ADDCARRY;

	  SDValue CarryIn = Op.getOperand(2);
	  if (!IsAdd) {
	    // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
	    // have to invert the carry first.
	    CarryIn = OneMinus(CarryIn);
	  }

	  // This converts the boolean value carry into the carry flag.
	  SDValue CarryFlag = ConvertBooleanCarryToCarryFlag(CarryIn, DAG);

	  // Do the addition/subtraction proper using the carry flag we wanted.
	  SDValue Result =
	      IsAdd ? DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
	                          Op.getOperand(1), CarryFlag)
	            : DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
	                          Op.getOperand(1), CarryFlag);

	  // Now convert the carry flag into a boolean value.
	  SDValue CarryOut = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
	  if (!IsAdd) {
	    // But the carry returned by ARMISD::SUBE is not a borrow as expected
	    // by ISD::SUBCARRY, so compute 1 - C.
	    CarryOut = OneMinus(CarryOut);
	  }

	  // Return both values.
	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, CarryOut);
	}

	/// Lower ISD::FSINCOS to a call to the SINCOS_STRET libcall (Darwin targets
	/// only; asserted).
	///
	/// When the subtarget uses the APCS ABI the pair of results is returned
	/// through an sret stack slot: the call's own result is discarded, and the
	/// two values are loaded back from the slot and merged. Otherwise the call's
	/// return value is returned directly.
	SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin());

	  // For iOS, we want to call an alternative entry point: __sincos_stret,
	  // return values are passed via sret.
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Pair of floats / doubles used to pass the result.
	  Type *RetTy = StructType::get(ArgTy, ArgTy);
	  auto &DL = DAG.getDataLayout();

	  ArgListTy Args;
	  bool ShouldUseSRet = Subtarget->isAPCS_ABI();
	  SDValue SRet;
	  if (ShouldUseSRet) {
	    // Create stack object for sret.
	    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
	    const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
	    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
	    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));

	    // The sret pointer goes first in the argument list.
	    ArgListEntry Entry;
	    Entry.Node = SRet;
	    Entry.Ty = RetTy->getPointerTo();
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Entry.IsSRet = true;
	    Args.push_back(Entry);
	    // With sret the callee itself returns void.
	    RetTy = Type::getVoidTy(*DAG.getContext());
	  }

	  ArgListEntry Entry;
	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  RTLIB::Libcall LC =
	      (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = getLibcallName(LC);
	  CallingConv::ID CC = getLibcallCallingConv(LC);
	  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setCallee(CC, RetTy, Callee, std::move(Args))
	      .setDiscardResult(ShouldUseSRet);
	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

	  if (!ShouldUseSRet)
	    return CallResult.first;

	  // Load the first value from the start of the sret slot, chained on the call.
	  SDValue LoadSin =
	      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());

	  // Address of cos field.
	  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
	                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
	  SDValue LoadCos =
	      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());

	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
	                     LoadSin.getValue(0), LoadCos.getValue(0));
	}

	/// Emit a call to one of the Windows runtime division helpers
	/// (__rt_sdiv, __rt_sdiv64, __rt_udiv, __rt_udiv64), chosen by Signed and by
	/// whether Op is i32 or i64.
	///
	/// Operand 1 of Op is passed as the first argument and operand 0 as the
	/// second. Chain is used as the call's incoming chain. Returns the first
	/// element of the pair produced by LowerCallTo.
	SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
	                                                  bool Signed,
	                                                  SDValue &Chain) const {
	  EVT VT = Op.getValueType();
	  assert((VT == MVT::i32 || VT == MVT::i64) &&
	         "unexpected type for custom lowering DIV");
	  SDLoc dl(Op);

	  const auto &DL = DAG.getDataLayout();
	  const auto &TLI = DAG.getTargetLoweringInfo();

	  const char *Name = nullptr;
	  if (Signed)
	    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
	  else
	    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";

	  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));

	  ARMTargetLowering::ArgListTy Args;

	  // Note the order: operand 1 first, then operand 0.
	  for (auto AI : {1, 0}) {
	    ArgListEntry Arg;
	    Arg.Node = Op.getOperand(AI);
	    Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
	    Args.push_back(Arg);
	  }

	  CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(Chain)
	      .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
	                 ES, std::move(Args));

	  return LowerCallTo(CLI).first;
	}

	// Code-size hook for SDIV by a power of two. Handing the original SDIV node
	// back to DAGCombiner keeps the divide instruction; handing back an empty
	// node lets DAGCombine expand the SDIV into a sequence of instructions.
	SDValue
	ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                 SelectionDAG &DAG,
	                                 SmallVectorImpl<SDNode *> &Created) const {
	  const SDValue Expand;         // Empty: let DAGCombine expand.
	  const SDValue KeepSDIV(N, 0); // The node itself: keep the divide.

	  // TODO: Support SREM
	  if (N->getOpcode() != ISD::SDIV)
	    return Expand;

	  // Don't touch vector types; rewriting this may lead to scalarizing
	  // the int divs.
	  if (N->getOperand(0).getValueType().isVector())
	    return Expand;

	  const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
	  const bool InThumb = ST.isThumb();
	  const bool HasDivide =
	      InThumb ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode();

	  // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
	  // hwdiv support for this to be really profitable.
	  if (!ST.hasMinSize() || !HasDivide)
	    return Expand;

	  // ARM mode is a bit simpler than Thumb: we can handle large power
	  // of 2 immediates with 1 mov instruction; no further checks required,
	  // so the sdiv node is always kept there.
	  //
	  // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
	  // and thus lose the code size benefits of a MOVS that requires only 2.
	  // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
	  // but as it's doing exactly this, it's not worth the trouble to get TTI.
	  if (InThumb && Divisor.sgt(128))
	    return Expand;

	  return KeepSDIV;
	}

	/// Lower an i32 division on Windows: emit an ARMISD::WIN__DBZCHK node on
	/// operand 1, chained on the entry node, and use it as the chain of the
	/// runtime division call.
	SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
	                                            bool Signed) const {
	  assert(Op.getValueType() == MVT::i32 &&
	         "unexpected type for custom lowering DIV");
	  SDLoc dl(Op);

	  // The check node doubles as the incoming chain of the library call.
	  SDValue CheckChain = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
	                                   DAG.getEntryNode(), Op.getOperand(1));
	  return LowerWindowsDIVLibCall(Op, DAG, Signed, CheckChain);
	}

	/// Build an ARMISD::WIN__DBZCHK node, chained on InChain, for operand 1 of N.
	/// When N's result is i32 the operand is checked directly; otherwise its two
	/// i32 elements (indices 0 and 1) are OR-ed together and that value is
	/// checked.
	static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
	  SDLoc DL(N);
	  SDValue Checked = N->getOperand(1);

	  if (N->getValueType(0) != MVT::i32) {
	    SDValue Idx0 = DAG.getConstant(0, DL, MVT::i32);
	    SDValue Lo =
	        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Checked, Idx0);
	    SDValue Idx1 = DAG.getConstant(1, DL, MVT::i32);
	    SDValue Hi =
	        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Checked, Idx1);
	    Checked = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
	  }

	  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Checked);
	}

	/// Expand an i64 division on Windows: emit the divide-by-zero check, call the
	/// runtime division helper, and append the low and then the high 32 bits of
	/// the i64 result to Results.
	void ARMTargetLowering::ExpandDIV_Windows(
	    SDValue Op, SelectionDAG &DAG, bool Signed,
	    SmallVectorImpl<SDValue> &Results) const {
	  const auto &Layout = DAG.getDataLayout();
	  const auto &Lowering = DAG.getTargetLoweringInfo();

	  assert(Op.getValueType() == MVT::i64 &&
	         "unexpected type for custom lowering DIV");
	  SDLoc dl(Op);

	  SDValue CheckChain =
	      WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
	  SDValue Quotient = LowerWindowsDIVLibCall(Op, DAG, Signed, CheckChain);

	  // Low half: plain truncation.
	  SDValue LoHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Quotient);

	  // High half: shift right by 32, then truncate.
	  SDValue ShiftAmt = DAG.getConstant(32, dl, Lowering.getPointerTy(Layout));
	  SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, Quotient, ShiftAmt);
	  SDValue HiHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Shifted);

	  Results.push_back(LoHalf);
	  Results.push_back(HiHalf);
	}

	/// Decide whether an atomic load/store node can be kept as is. Returns Op
	/// unchanged unless its ordering is stronger than monotonic, in which case
	/// an empty SDValue is returned.
	static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
	  const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();

	  // Monotonic load/store is legal for all targets. Acquire/Release
	  // load/store is not legal for targets without a dmb or equivalent
	  // available.
	  return isStrongerThanMonotonic(Ordering) ? SDValue() : Op;
	}

	/// Replace a READCYCLECOUNTER node with an arm_mrc intrinsic read. Two values
	/// are appended to Results: an i64 BUILD_PAIR of the 32-bit read and a zero,
	/// followed by the intrinsic's output chain.
	static void ReplaceREADCYCLECOUNTER(SDNode *N,
	                                    SmallVectorImpl<SDValue> &Results,
	                                    SelectionDAG &DAG,
	                                    const ARMSubtarget *Subtarget) {
	  SDLoc DL(N);
	  auto I32 = [&DAG, &DL](uint64_t Val) {
	    return DAG.getConstant(Val, DL, MVT::i32);
	  };

	  // Under Power Management extensions, the cycle-count is:
	  //   mrc p15, #0, <Rt>, c9, c13, #0
	  SDValue Ops[] = {
	      N->getOperand(0), // Chain
	      I32(Intrinsic::arm_mrc),
	      I32(15),
	      I32(0),
	      I32(9),
	      I32(13),
	      I32(0),
	  };

	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, ResultTys, Ops);

	  // Pair the 32-bit read with a zero to form the i64 result.
	  SDValue Zero = I32(0);
	  Results.push_back(
	      DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, Zero));
	  Results.push_back(Cycles32.getValue(1));
	}

	/// Build an untyped REG_SEQUENCE in the GPRPair register class from the two
	/// 32-bit halves of V. On little-endian targets the low half goes into
	/// gsub_0 and the high half into gsub_1; on big-endian targets the halves
	/// are exchanged.
	static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
	  SDLoc dl(V.getNode());

	  // Low half of V, then the high half (V >> 32), each as i32.
	  SDValue Low32 = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
	  SDValue ShiftAmt = DAG.getConstant(32, dl, MVT::i32);
	  SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, V, ShiftAmt);
	  SDValue High32 = DAG.getAnyExtOrTrunc(Shifted, dl, MVT::i32);

	  const bool BigEndian = DAG.getDataLayout().isBigEndian();
	  SDValue ForSub0 = BigEndian ? High32 : Low32;
	  SDValue ForSub1 = BigEndian ? Low32 : High32;

	  SDValue RegClass =
	      DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
	  SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
	  SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);

	  const SDValue Ops[] = { RegClass, ForSub0, SubReg0, ForSub1, SubReg1 };
	  SDNode *Seq =
	      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops);
	  return SDValue(Seq, 0);
	}

	/// Replace a 64-bit AtomicCmpSwap node with an ARM::CMP_SWAP_64 machine node.
	/// Operands 2 and 3 of N are each packed into a GPR pair. Three values are
	/// appended to Results: the two i32 sub-registers of the machine node's first
	/// result (order depends on endianness) and the machine node's third result.
	static void ReplaceCMP_SWAP_64Results(SDNode *N,
	                                      SmallVectorImpl<SDValue> & Results,
	                                      SelectionDAG &DAG) {
	  assert(N->getValueType(0) == MVT::i64 &&
	         "AtomicCmpSwap on types less than 64 should be legal");

	  // Operand order of the machine node: N's operand 1, the two register
	  // pairs built from operands 2 and 3, and finally N's operand 0.
	  SDValue Ops[] = {
	      N->getOperand(1),
	      createGPRPairNode(DAG, N->getOperand(2)),
	      createGPRPairNode(DAG, N->getOperand(3)),
	      N->getOperand(0),
	  };
	  SDVTList ResultTys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
	  SDNode *CmpSwap =
	      DAG.getMachineNode(ARM::CMP_SWAP_64, SDLoc(N), ResultTys, Ops);

	  // Carry the original node's memory operand over to the machine node.
	  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
	  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

	  const bool BigEndian = DAG.getDataLayout().isBigEndian();
	  const SDValue Pair(CmpSwap, 0);

	  Results.push_back(DAG.getTargetExtractSubreg(
	      BigEndian ? ARM::gsub_1 : ARM::gsub_0, SDLoc(N), MVT::i32, Pair));
	  Results.push_back(DAG.getTargetExtractSubreg(
	      BigEndian ? ARM::gsub_0 : ARM::gsub_1, SDLoc(N), MVT::i32, Pair));
	  Results.push_back(SDValue(CmpSwap, 2));
	}

	/// Lower ISD::FPOWI for MSVCRT targets (asserted) to a call to "powf" (f32)
	/// or "pow" (otherwise). The integer exponent (operand 1) is first converted
	/// to the floating-point type of the base with SINT_TO_FP.
	///
	/// The call is emitted as a tail call when the node is in tail-call position
	/// and the enclosing function's return type equals the libcall's return
	/// type. If the lowered call has no chain result, the DAG root is returned;
	/// otherwise the call's value is returned.
	static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  const auto &TLI = DAG.getTargetLoweringInfo();

	  assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
	         "Custom lowering is MSVCRT specific!");

	  SDLoc dl(Op);
	  SDValue Val = Op.getOperand(0);
	  MVT Ty = Val->getSimpleValueType(0);
	  SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
	  SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
	                                         TLI.getPointerTy(DAG.getDataLayout()));

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  // First argument: the base.
	  Entry.Node = Val;
	  Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
	  Entry.IsZExt = true;
	  Args.push_back(Entry);

	  // Second argument: the exponent, already converted to floating point.
	  Entry.Node = Exponent;
	  Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
	  Entry.IsZExt = true;
	  Args.push_back(Entry);

	  // The libcall returns the same type as the base operand.
	  Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());

	  // The in-chain to the call is the entry node. If we are emitting a
	  // tailcall, the chain will be mutated if the node has a non-entry input
	  // chain.
	  SDValue InChain = DAG.getEntryNode();
	  SDValue TCChain = InChain;

	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
	              F.getReturnType() == LCRTy;
	  if (IsTC)
	    InChain = TCChain;

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
	      .setTailCall(IsTC);
	  std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);

	  // Return the chain (the DAG root) if it is a tail call
	  return !CI.second.getNode() ? DAG.getRoot() : CI.first;
	}

	/// Dispatch a node to its ARM-specific custom lowering routine, selected by
	/// opcode. Opcodes without an entry here reach llvm_unreachable.
	SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	  LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
	  switch (Op.getOpcode()) {
	  // Registers, addresses and constants.
	  case ISD::WRITE_REGISTER:
	    return LowerWRITE_REGISTER(Op, DAG);
	  case ISD::ConstantPool:
	    return LowerConstantPool(Op, DAG);
	  case ISD::BlockAddress:
	    return LowerBlockAddress(Op, DAG);
	  case ISD::GlobalAddress:
	    return LowerGlobalAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:
	    return LowerGlobalTLSAddress(Op, DAG);

	  // Selects and branches.
	  case ISD::SELECT:
	    return LowerSELECT(Op, DAG);
	  case ISD::SELECT_CC:
	    return LowerSELECT_CC(Op, DAG);
	  case ISD::BRCOND:
	    return LowerBRCOND(Op, DAG);
	  case ISD::BR_CC:
	    return LowerBR_CC(Op, DAG);
	  case ISD::BR_JT:
	    return LowerBR_JT(Op, DAG);

	  case ISD::VASTART:
	    return LowerVASTART(Op, DAG);
	  case ISD::ATOMIC_FENCE:
	    return LowerATOMIC_FENCE(Op, DAG, Subtarget);
	  case ISD::PREFETCH:
	    return LowerPREFETCH(Op, DAG, Subtarget);

	  // Integer <-> floating-point conversions.
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	    return LowerINT_TO_FP(Op, DAG);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    return LowerFP_TO_INT(Op, DAG);
	  case ISD::FCOPYSIGN:
	    return LowerFCOPYSIGN(Op, DAG);

	  // Frame and exception-handling support.
	  case ISD::RETURNADDR:
	    return LowerRETURNADDR(Op, DAG);
	  case ISD::FRAMEADDR:
	    return LowerFRAMEADDR(Op, DAG);
	  case ISD::EH_SJLJ_SETJMP:
	    return LowerEH_SJLJ_SETJMP(Op, DAG);
	  case ISD::EH_SJLJ_LONGJMP:
	    return LowerEH_SJLJ_LONGJMP(Op, DAG);
	  case ISD::EH_SJLJ_SETUP_DISPATCH:
	    return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);

	  case ISD::INTRINSIC_WO_CHAIN:
	    return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget);
	  case ISD::BITCAST:
	    return ExpandBITCAST(Op.getNode(), DAG, Subtarget);

	  // Shifts and remainders.
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SRA:
	    return LowerShift(Op.getNode(), DAG, Subtarget);
	  case ISD::SREM:
	  case ISD::UREM:
	    return LowerREM(Op.getNode(), DAG);
	  case ISD::SHL_PARTS:
	    return LowerShiftLeftParts(Op, DAG);
	  case ISD::SRL_PARTS:
	  case ISD::SRA_PARTS:
	    return LowerShiftRightParts(Op, DAG);

	  // Bit counting and comparisons.
	  case ISD::CTTZ:
	  case ISD::CTTZ_ZERO_UNDEF:
	    return LowerCTTZ(Op.getNode(), DAG, Subtarget);
	  case ISD::CTPOP:
	    return LowerCTPOP(Op.getNode(), DAG, Subtarget);
	  case ISD::SETCC:
	    return LowerVSETCC(Op, DAG);
	  case ISD::SETCCCARRY:
	    return LowerSETCCCARRY(Op, DAG);

	  // Constants and vector construction.
	  case ISD::ConstantFP:
	    return LowerConstantFP(Op, DAG, Subtarget);
	  case ISD::BUILD_VECTOR:
	    return LowerBUILD_VECTOR(Op, DAG, Subtarget);
	  case ISD::VECTOR_SHUFFLE:
	    return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
	  case ISD::INSERT_VECTOR_ELT:
	    return LowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT:
	    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::CONCAT_VECTORS:
	    return LowerCONCAT_VECTORS(Op, DAG);

	  case ISD::FLT_ROUNDS_:
	    return LowerFLT_ROUNDS_(Op, DAG);

	  // Multiplication and division. Non-vector divisions on Windows go to the
	  // runtime-library path; everything else uses the vector lowering.
	  case ISD::MUL:
	    return LowerMUL(Op, DAG);
	  case ISD::SDIV:
	    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
	      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
	    return LowerSDIV(Op, DAG);
	  case ISD::UDIV:
	    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
	      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
	    return LowerUDIV(Op, DAG);

	  // Carry and overflow arithmetic.
	  case ISD::ADDCARRY:
	  case ISD::SUBCARRY:
	    return LowerADDSUBCARRY(Op, DAG);
	  case ISD::SADDO:
	  case ISD::SSUBO:
	    return LowerSignedALUO(Op, DAG);
	  case ISD::UADDO:
	  case ISD::USUBO:
	    return LowerUnsignedALUO(Op, DAG);

	  case ISD::ATOMIC_LOAD:
	  case ISD::ATOMIC_STORE:
	    return LowerAtomicLoadStore(Op, DAG);
	  case ISD::FSINCOS:
	    return LowerFSINCOS(Op, DAG);
	  case ISD::SDIVREM:
	  case ISD::UDIVREM:
	    return LowerDivRem(Op, DAG);

	  // Only custom-lowered on Windows.
	  case ISD::DYNAMIC_STACKALLOC:
	    if (Subtarget->isTargetWindows())
	      return LowerDYNAMIC_STACKALLOC(Op, DAG);
	    llvm_unreachable("Don't know how to custom lower this!");

	  case ISD::FP_ROUND:
	    return LowerFP_ROUND(Op, DAG);
	  case ISD::FP_EXTEND:
	    return LowerFP_EXTEND(Op, DAG);
	  case ISD::FPOWI:
	    return LowerFPOWI(Op, *Subtarget, DAG);
	  case ARMISD::WIN__DBZCHK:
	    return SDValue();

	  default:
	    llvm_unreachable("Don't know how to custom lower this!");
	  }
	}

	/// Replace an arm_smlald / arm_smlaldx / arm_smlsld / arm_smlsldx intrinsic
	/// node with the matching ARMISD node. Operand 3 is split into its two i32
	/// elements, and the two i32 results of the new node are appended to Results.
	/// Any other intrinsic is left alone and nothing is appended.
	static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                 SelectionDAG &DAG) {
	  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();

	  unsigned Opc = 0;
	  switch (IntNo) {
	  case Intrinsic::arm_smlald:
	    Opc = ARMISD::SMLALD;
	    break;
	  case Intrinsic::arm_smlaldx:
	    Opc = ARMISD::SMLALDX;
	    break;
	  case Intrinsic::arm_smlsld:
	    Opc = ARMISD::SMLSLD;
	    break;
	  case Intrinsic::arm_smlsldx:
	    Opc = ARMISD::SMLSLDX;
	    break;
	  default:
	    return;
	  }

	  SDLoc dl(N);
	  SDValue Wide = N->getOperand(3);

	  // Split the wide operand into element 0 and element 1.
	  SDValue Idx0 = DAG.getConstant(0, dl, MVT::i32);
	  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide, Idx0);
	  SDValue Idx1 = DAG.getConstant(1, dl, MVT::i32);
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide, Idx1);

	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::i32);
	  SDValue LongMul = DAG.getNode(Opc, dl, ResultTys, N->getOperand(1),
	                                N->getOperand(2), Lo, Hi);

	  Results.push_back(LongMul.getValue(0));
	  Results.push_back(LongMul.getValue(1));
	}

	/// ReplaceNodeResults - Provide replacement values, built by custom code, for
	/// the results of a node whose result type is illegal. The replacements are
	/// appended to Results; a helper that yields an empty value contributes
	/// nothing.
	void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
	                                           SmallVectorImpl<SDValue> &Results,
	                                           SelectionDAG &DAG) const {
	  // Append V only when it actually refers to a node.
	  auto AppendIfSet = [&Results](SDValue V) {
	    if (V.getNode())
	      Results.push_back(V);
	  };

	  switch (N->getOpcode()) {
	  case ISD::READ_REGISTER:
	    ExpandREAD_REGISTER(N, Results, DAG);
	    return;

	  case ISD::BITCAST:
	    AppendIfSet(ExpandBITCAST(N, DAG, Subtarget));
	    return;

	  case ISD::SRL:
	  case ISD::SRA:
	  case ISD::SHL:
	    AppendIfSet(Expand64BitShift(N, DAG, Subtarget));
	    return;

	  case ISD::SREM:
	  case ISD::UREM:
	    AppendIfSet(LowerREM(N, DAG));
	    return;

	  case ISD::SDIVREM:
	  case ISD::UDIVREM: {
	    SDValue DivRem = LowerDivRem(SDValue(N, 0), DAG);
	    assert(DivRem.getNumOperands() == 2 && "DivRem needs two values");
	    Results.push_back(DivRem.getValue(0));
	    Results.push_back(DivRem.getValue(1));
	    return;
	  }

	  case ISD::READCYCLECOUNTER:
	    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
	    return;

	  case ISD::UDIV:
	  case ISD::SDIV:
	    assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
	    ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
	                      Results);
	    return;

	  case ISD::ATOMIC_CMP_SWAP:
	    ReplaceCMP_SWAP_64Results(N, Results, DAG);
	    return;

	  case ISD::INTRINSIC_WO_CHAIN:
	    ReplaceLongIntrinsic(N, Results, DAG);
	    return;

	  case ISD::ABS:
	    lowerABS(N, Results, DAG);
	    return;

	  default:
	    llvm_unreachable("Don't know how to custom expand this!");
	  }
	}

	//===----------------------------------------------------------------------===//
	// ARM Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
	/// registers the function context.
	///
	/// The instructions are inserted into MBB before MI. They load the address of
	/// DispatchBB from a constant-pool entry, make it PC-relative (setting the
	/// low bit in the Thumb variants), and store it at offset 36 of frame index
	/// FI. One of three instruction sequences is used: Thumb2, Thumb1 or ARM.
	/// ROPI/RWPI subtargets are not supported (asserted).
	void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
	         "ROPI/RWPI not currently supported with SjLj");
	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  DebugLoc dl = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  MachineConstantPool *MCP = MF->getConstantPool();
	  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
	  const Function &F = MF->getFunction();

	  bool isThumb = Subtarget->isThumb();
	  bool isThumb2 = Subtarget->isThumb2();

	  // Constant-pool entry holding the dispatch block's address; the PC
	  // adjustment is 4 in Thumb modes and 8 in ARM mode.
	  unsigned PCLabelId = AFI->createPICLabelUId();
	  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
	  ARMConstantPoolValue *CPV =
	      ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
	  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);

	  const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
	                                           : &ARM::GPRRegClass;

	  // Grab constant pool and fixed stack memory operands.
	  MachineMemOperand *CPMMO =
	      MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
	                               MachineMemOperand::MOLoad, 4, 4);

	  MachineMemOperand *FIMMOSt =
	      MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
	                               MachineMemOperand::MOStore, 4, 4);

	  // NOTE(review): the assembly sketches below show "#+4" for the store while
	  // the emitted immediate is 36 - confirm which one the comments should show.

	  // Load the address of the dispatch MBB into the jump buffer.
	  if (isThumb2) {
	    // Incoming value: jbuf
	    //   ldr.n  r5, LCPI1_1
	    //   orr    r5, r5, #1
	    //   add    r5, pc
	    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
	        .addConstantPoolIndex(CPI)
	        .addMemOperand(CPMMO)
	        .add(predOps(ARMCC::AL));
	    // Set the low bit because of thumb mode.
	    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
	        .addReg(NewVReg1, RegState::Kill)
	        .addImm(0x01)
	        .add(predOps(ARMCC::AL))
	        .add(condCodeOp());
	    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
	        .addReg(NewVReg2, RegState::Kill)
	        .addImm(PCLabelId);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
	        .addReg(NewVReg3, RegState::Kill)
	        .addFrameIndex(FI)
	        .addImm(36) // &jbuf[1] :: pc
	        .addMemOperand(FIMMOSt)
	        .add(predOps(ARMCC::AL));
	  } else if (isThumb) {
	    // Incoming value: jbuf
	    //   ldr.n  r1, LCPI1_4
	    //   add    r1, pc
	    //   mov    r2, #1
	    //   orrs   r1, r2
	    //   add    r2, $jbuf, #+4 ; &jbuf[1]
	    //   str    r1, [r2]
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
	        .addConstantPoolIndex(CPI)
	        .addMemOperand(CPMMO)
	        .add(predOps(ARMCC::AL));
	    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
	        .addReg(NewVReg1, RegState::Kill)
	        .addImm(PCLabelId);
	    // Set the low bit because of thumb mode.
	    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
	        .addReg(ARM::CPSR, RegState::Define)
	        .addImm(1)
	        .add(predOps(ARMCC::AL));
	    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
	        .addReg(ARM::CPSR, RegState::Define)
	        .addReg(NewVReg2, RegState::Kill)
	        .addReg(NewVReg3, RegState::Kill)
	        .add(predOps(ARMCC::AL));
	    // The store address is materialized separately in this variant.
	    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
	        .addFrameIndex(FI)
	        .addImm(36); // &jbuf[1] :: pc
	    BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
	        .addReg(NewVReg4, RegState::Kill)
	        .addReg(NewVReg5, RegState::Kill)
	        .addImm(0)
	        .addMemOperand(FIMMOSt)
	        .add(predOps(ARMCC::AL));
	  } else {
	    // Incoming value: jbuf
	    //   ldr  r1, LCPI1_1
	    //   add  r1, pc, r1
	    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
	        .addConstantPoolIndex(CPI)
	        .addImm(0)
	        .addMemOperand(CPMMO)
	        .add(predOps(ARMCC::AL));
	    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
	    BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
	        .addReg(NewVReg1, RegState::Kill)
	        .addImm(PCLabelId)
	        .add(predOps(ARMCC::AL));
	    BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
	        .addReg(NewVReg2, RegState::Kill)
	        .addFrameIndex(FI)
	        .addImm(36) // &jbuf[1] :: pc
	        .addMemOperand(FIMMOSt)
	        .add(predOps(ARMCC::AL));
	  }
	}

	/// Build the SjLj exception dispatch machinery for the function containing
	/// \p MI and erase \p MI once it is in place.
	///
	/// The function:
	///  - walks every EH pad, maps call-site numbers to their landing pads, and
	///    builds an ordered landing-pad list that backs an inline jump table;
	///  - creates a dispatch block (marked as an EH pad), a continuation block
	///    that performs the jump-table branch, and a trap block that is taken
	///    when the loaded call-site value compares HI against the number of
	///    landing pads;
	///  - calls SetupEntryBlockForSjLj() to store the dispatch address into the
	///    function context;
	///  - rewires every invoke block so that its EH-pad successors are replaced
	///    by the dispatch block, and marks callee-saved registers that the call
	///    does not already mention as implicit dead definitions on that call;
	///  - clears the EH-pad flag on all former landing pads.
	///
	/// \param MI  The pseudo instruction being expanded; erased before return.
	/// \param MBB The block containing \p MI.
	void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                              MachineBasicBlock *MBB) const {
	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  DebugLoc dl = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  MachineFrameInfo &MFI = MF->getFrameInfo();
	  int FI = MFI.getFunctionContextIndex();

	  const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
	                                                        : &ARM::GPRnopcRegClass;

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
	       ++BB) {
	    if (!BB->isEHPad()) continue;

	    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
	    // pad.
	    for (MachineBasicBlock::iterator
	           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
	      if (!II->isEHLabel()) continue;

	      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
	      if (!MF->hasCallSiteLandingPad(Sym)) continue;

	      SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
	      for (SmallVectorImpl<unsigned>::iterator
	             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
	           CSI != CSE; ++CSI) {
	        // Key on the call-site number itself and record the block (not the
	        // iterator) as one of its landing pads.
	        CallSiteNumToLPad[*CSI].push_back(&*BB);
	        MaxCSNum = std::max(MaxCSNum, *CSI);
	      }
	      // Only the first EH_LABEL with call-site information is considered.
	      break;
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock*> LPadList;
	  SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());
	  for (unsigned I = 1; I <= MaxCSNum; ++I) {
	    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
	    for (SmallVectorImpl<MachineBasicBlock*>::iterator
	           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
	      LPadList.push_back(*II);
	      // Every predecessor of a landing pad is treated as an invoke block.
	      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the jump table and associated information.
	  MachineJumpTableInfo *JTI =
	    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad();

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  unsigned trap_opcode;
	  if (Subtarget->isThumb())
	    trap_opcode = ARM::tTRAP;
	  else
	    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;

	  BuildMI(TrapBB, dl, TII->get(trap_opcode));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert the new MBBs at the end of the function.
	  MF->insert(MF->end(), DispatchBB);
	  MF->insert(MF->end(), DispContBB);
	  MF->insert(MF->end(), TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);

	  MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(*MF, FI),
	      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);

	  MachineInstrBuilder MIB;
	  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));

	  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
	  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();

	  // Add a register mask with no preserved registers.  This results in all
	  // registers being marked as clobbered. This can't work if the dispatch block
	  // is in a Thumb1 function and is linked with ARM code which uses the FP
	  // registers, as there is no way to preserve the FP registers in Thumb1 mode.
	  MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));

	  bool IsPositionIndependent = isPositionIndependent();
	  unsigned NumLPads = LPadList.size();
	  if (Subtarget->isThumb2()) {
	    // Load the call-site value from offset 4 of the function context.
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
	        .addFrameIndex(FI)
	        .addImm(4)
	        .addMemOperand(FIMMOLd)
	        .add(predOps(ARMCC::AL));

	    // Compare it against the number of landing pads; small counts fit in the
	    // compare immediate, larger ones are materialized with MOVW/MOVT.
	    if (NumLPads < 256) {
	      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
	          .addReg(NewVReg1)
	          .addImm(NumLPads)
	          .add(predOps(ARMCC::AL));
	    } else {
	      unsigned VReg1 = MRI->createVirtualRegister(TRC);
	      BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
	          .addImm(NumLPads & 0xFFFF)
	          .add(predOps(ARMCC::AL));

	      unsigned VReg2 = VReg1;
	      if ((NumLPads & 0xFFFF0000) != 0) {
	        VReg2 = MRI->createVirtualRegister(TRC);
	        BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
	            .addReg(VReg1)
	            .addImm(NumLPads >> 16)
	            .add(predOps(ARMCC::AL));
	      }

	      BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
	          .addReg(NewVReg1)
	          .addReg(VReg2)
	          .add(predOps(ARMCC::AL));
	    }

	    // Out-of-range values go to the trap block.
	    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
	        .addMBB(TrapBB)
	        .addImm(ARMCC::HI)
	        .addReg(ARM::CPSR);

	    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
	        .addJumpTableIndex(MJTI)
	        .add(predOps(ARMCC::AL));

	    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
	        .addReg(NewVReg3, RegState::Kill)
	        .addReg(NewVReg1)
	        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
	        .add(predOps(ARMCC::AL))
	        .add(condCodeOp());

	    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
	        .addReg(NewVReg4, RegState::Kill)
	        .addReg(NewVReg1)
	        .addJumpTableIndex(MJTI);
	  } else if (Subtarget->isThumb()) {
	    // NOTE(review): the immediate here is 1 where the other modes use 4;
	    // presumably tLDRspi takes a word-scaled offset -- confirm.
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
	        .addFrameIndex(FI)
	        .addImm(1)
	        .addMemOperand(FIMMOLd)
	        .add(predOps(ARMCC::AL));

	    if (NumLPads < 256) {
	      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
	          .addReg(NewVReg1)
	          .addImm(NumLPads)
	          .add(predOps(ARMCC::AL));
	    } else {
	      // Too large for the compare immediate: load the bound from the constant
	      // pool instead.
	      MachineConstantPool *ConstantPool = MF->getConstantPool();
	      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
	      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

	      // MachineConstantPool wants an explicit alignment.
	      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
	      if (Align == 0)
	        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
	      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

	      unsigned VReg1 = MRI->createVirtualRegister(TRC);
	      BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
	          .addReg(VReg1, RegState::Define)
	          .addConstantPoolIndex(Idx)
	          .add(predOps(ARMCC::AL));
	      BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
	          .addReg(NewVReg1)
	          .addReg(VReg1)
	          .add(predOps(ARMCC::AL));
	    }

	    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
	        .addMBB(TrapBB)
	        .addImm(ARMCC::HI)
	        .addReg(ARM::CPSR);

	    // Scale the index by 4 and add the jump table base to form the address
	    // of the table entry.
	    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
	        .addReg(ARM::CPSR, RegState::Define)
	        .addReg(NewVReg1)
	        .addImm(2)
	        .add(predOps(ARMCC::AL));

	    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
	        .addJumpTableIndex(MJTI)
	        .add(predOps(ARMCC::AL));

	    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
	        .addReg(ARM::CPSR, RegState::Define)
	        .addReg(NewVReg2, RegState::Kill)
	        .addReg(NewVReg3)
	        .add(predOps(ARMCC::AL));

	    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
	        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);

	    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
	        .addReg(NewVReg4, RegState::Kill)
	        .addImm(0)
	        .addMemOperand(JTMMOLd)
	        .add(predOps(ARMCC::AL));

	    // In position-independent code the loaded entry is added to the jump
	    // table base before branching.
	    unsigned NewVReg6 = NewVReg5;
	    if (IsPositionIndependent) {
	      NewVReg6 = MRI->createVirtualRegister(TRC);
	      BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
	          .addReg(ARM::CPSR, RegState::Define)
	          .addReg(NewVReg5, RegState::Kill)
	          .addReg(NewVReg3)
	          .add(predOps(ARMCC::AL));
	    }

	    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
	        .addReg(NewVReg6, RegState::Kill)
	        .addJumpTableIndex(MJTI);
	  } else {
	    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
	        .addFrameIndex(FI)
	        .addImm(4)
	        .addMemOperand(FIMMOLd)
	        .add(predOps(ARMCC::AL));

	    if (NumLPads < 256) {
	      BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
	          .addReg(NewVReg1)
	          .addImm(NumLPads)
	          .add(predOps(ARMCC::AL));
	    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
	      unsigned VReg1 = MRI->createVirtualRegister(TRC);
	      BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
	          .addImm(NumLPads & 0xFFFF)
	          .add(predOps(ARMCC::AL));

	      // NOTE(review): isUInt<16>(NumLPads) holds on this path, so the upper
	      // half tested below is always zero and the MOVT is never emitted.
	      unsigned VReg2 = VReg1;
	      if ((NumLPads & 0xFFFF0000) != 0) {
	        VReg2 = MRI->createVirtualRegister(TRC);
	        BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
	            .addReg(VReg1)
	            .addImm(NumLPads >> 16)
	            .add(predOps(ARMCC::AL));
	      }

	      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
	          .addReg(NewVReg1)
	          .addReg(VReg2)
	          .add(predOps(ARMCC::AL));
	    } else {
	      MachineConstantPool *ConstantPool = MF->getConstantPool();
	      Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
	      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);

	      // MachineConstantPool wants an explicit alignment.
	      unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
	      if (Align == 0)
	        Align = MF->getDataLayout().getTypeAllocSize(C->getType());
	      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);

	      unsigned VReg1 = MRI->createVirtualRegister(TRC);
	      BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
	          .addReg(VReg1, RegState::Define)
	          .addConstantPoolIndex(Idx)
	          .addImm(0)
	          .add(predOps(ARMCC::AL));
	      BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
	          .addReg(NewVReg1)
	          .addReg(VReg1, RegState::Kill)
	          .add(predOps(ARMCC::AL));
	    }

	    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
	        .addMBB(TrapBB)
	        .addImm(ARMCC::HI)
	        .addReg(ARM::CPSR);

	    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
	        .addReg(NewVReg1)
	        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
	        .add(predOps(ARMCC::AL))
	        .add(condCodeOp());
	    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
	        .addJumpTableIndex(MJTI)
	        .add(predOps(ARMCC::AL));

	    MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
	        MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
	    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
	    BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
	        .addReg(NewVReg3, RegState::Kill)
	        .addReg(NewVReg4)
	        .addImm(0)
	        .addMemOperand(JTMMOLd)
	        .add(predOps(ARMCC::AL));

	    if (IsPositionIndependent) {
	      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
	          .addReg(NewVReg5, RegState::Kill)
	          .addReg(NewVReg4)
	          .addJumpTableIndex(MJTI);
	    } else {
	      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
	          .addReg(NewVReg5, RegState::Kill)
	          .addJumpTableIndex(MJTI);
	    }
	  }

	  // Add the jump table entries as successors to the MBB. A landing pad may
	  // appear more than once in the list, so only add each block once.
	  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
	  for (std::vector<MachineBasicBlock*>::iterator
	         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
	    MachineBasicBlock *CurMBB = *I;
	    if (SeenMBBs.insert(CurMBB).second)
	      DispContBB->addSuccessor(CurMBB);
	  }

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
	  SmallVector<MachineBasicBlock*, 64> MBBLPads;
	  for (MachineBasicBlock *BB : InvokeBBs) {

	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block. Iterate over a copy because the successor
	    // list is modified inside the loop.
	    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
	                                                  BB->succ_end());
	    while (!Successors.empty()) {
	      MachineBasicBlock *SMBB = Successors.pop_back_val();
	      if (SMBB->isEHPad()) {
	        BB->removeSuccessor(SMBB);
	        MBBLPads.push_back(SMBB);
	      }
	    }

	    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
	    BB->normalizeSuccProbs();

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled. This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (MachineBasicBlock::reverse_iterator
	           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
	      if (!II->isCall()) continue;

	      // Registers already mentioned by the call must not be added again.
	      DenseMap<unsigned, bool> DefRegs;
	      for (MachineInstr::mop_iterator
	             OI = II->operands_begin(), OE = II->operands_end();
	           OI != OE; ++OI) {
	        if (!OI->isReg()) continue;
	        DefRegs[OI->getReg()] = true;
	      }

	      MachineInstrBuilder MIB(*MF, &*II);

	      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
	        unsigned Reg = SavedRegs[i];
	        if (Subtarget->isThumb2() &&
	            !ARM::tGPRRegClass.contains(Reg) &&
	            !ARM::hGPRRegClass.contains(Reg))
	          continue;
	        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
	          continue;
	        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
	          continue;
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      // Only the last call in the block (first in reverse order) is handled.
	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads. The dispatch is the only
	  // landing pad now.
	  for (SmallVectorImpl<MachineBasicBlock*>::iterator
	         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
	    (*I)->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	}

	/// Return the first successor of \p MBB that is not \p Succ.
	///
	/// Reaching the end of the successor list without finding one is treated as
	/// unreachable; per the message below, callers are expected to pass a block
	/// with two successors.
	static
	MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
	  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
	         E = MBB->succ_end(); I != E; ++I)
	    if (*I != Succ)
	      return *I;
	  llvm_unreachable("Expecting a BB with two successors!");
	}

	/// Pick the load opcode matching a load width of \p LdSize bytes.
	///
	/// Widths of 8 and 16 bytes select the NEON write-back loads regardless of
	/// the instruction-set flags. Widths of 4, 2 and 1 select the Thumb1, Thumb2
	/// or ARM opcode, with \p IsThumb1 taking precedence over \p IsThumb2.
	/// Any other width yields 0.
	static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
	  // Vector-sized accesses: only exactly 8 or 16 bytes have an opcode.
	  if (LdSize >= 8) {
	    if (LdSize == 16)
	      return ARM::VLD1q32wb_fixed;
	    if (LdSize == 8)
	      return ARM::VLD1d32wb_fixed;
	    return 0;
	  }

	  // Scalar accesses: dispatch on the width first, then on the ISA.
	  switch (LdSize) {
	  case 4:
	    if (IsThumb1)
	      return ARM::tLDRi;
	    if (IsThumb2)
	      return ARM::t2LDR_POST;
	    return ARM::LDR_POST_IMM;
	  case 2:
	    if (IsThumb1)
	      return ARM::tLDRHi;
	    if (IsThumb2)
	      return ARM::t2LDRH_POST;
	    return ARM::LDRH_POST;
	  case 1:
	    if (IsThumb1)
	      return ARM::tLDRBi;
	    if (IsThumb2)
	      return ARM::t2LDRB_POST;
	    return ARM::LDRB_POST_IMM;
	  default:
	    return 0;
	  }
	}

	/// Pick the store opcode matching a store width of \p StSize bytes.
	///
	/// Widths of 8 and 16 bytes select the NEON write-back stores regardless of
	/// the instruction-set flags. Widths of 4, 2 and 1 select the Thumb1, Thumb2
	/// or ARM opcode, with \p IsThumb1 taking precedence over \p IsThumb2.
	/// Any other width yields 0.
	static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
	  // Vector-sized accesses: only exactly 8 or 16 bytes have an opcode.
	  if (StSize >= 8) {
	    if (StSize == 16)
	      return ARM::VST1q32wb_fixed;
	    if (StSize == 8)
	      return ARM::VST1d32wb_fixed;
	    return 0;
	  }

	  // Scalar accesses: dispatch on the width first, then on the ISA.
	  switch (StSize) {
	  case 4:
	    if (IsThumb1)
	      return ARM::tSTRi;
	    if (IsThumb2)
	      return ARM::t2STR_POST;
	    return ARM::STR_POST_IMM;
	  case 2:
	    if (IsThumb1)
	      return ARM::tSTRHi;
	    if (IsThumb2)
	      return ARM::t2STRH_POST;
	    return ARM::STRH_POST;
	  case 1:
	    if (IsThumb1)
	      return ARM::tSTRBi;
	    if (IsThumb2)
	      return ARM::t2STRB_POST;
	    return ARM::STRB_POST_IMM;
	  default:
	    return 0;
	  }
	}

	/// Insert, before \p Pos in \p BB, a load of \p LdSize bytes from \p AddrIn
	/// into \p Data, leaving the advanced address in \p AddrOut.
	///
	/// For NEON, Thumb2 and ARM a single write-back load is emitted. For Thumb1
	/// scalar loads the opcode chosen by getLdOpcode() is followed by a separate
	/// tADDi8 that produces \p AddrOut.
	static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
	                       const TargetInstrInfo *TII, const DebugLoc &dl,
	                       unsigned LdSize, unsigned Data, unsigned AddrIn,
	                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
	  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
	  assert(LdOpc != 0 && "Should have a load opcode");

	  const bool IsVector = LdSize >= 8;

	  // Thumb1 scalar case: plain load, then bump the address explicitly.
	  if (!IsVector && IsThumb1) {
	    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
	        .addReg(AddrIn)
	        .addImm(0)
	        .add(predOps(ARMCC::AL));
	    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
	        .add(t1CondCodeOp())
	        .addReg(AddrIn)
	        .addImm(LdSize)
	        .add(predOps(ARMCC::AL));
	    return;
	  }

	  // Every remaining form defines Data and AddrOut and reads AddrIn; they
	  // differ only in the trailing offset operands.
	  auto Load = BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
	                  .addReg(AddrOut, RegState::Define)
	                  .addReg(AddrIn);
	  if (IsVector) {
	    Load.addImm(0);
	  } else if (IsThumb2) {
	    Load.addImm(LdSize);
	  } else { // arm
	    Load.addReg(0).addImm(LdSize);
	  }
	  Load.add(predOps(ARMCC::AL));
	}

	/// Insert, before \p Pos in \p BB, a store of \p StSize bytes of \p Data to
	/// \p AddrIn, leaving the advanced address in \p AddrOut.
	///
	/// For NEON, Thumb2 and ARM a single write-back store is emitted. For Thumb1
	/// scalar stores the opcode chosen by getStOpcode() is followed by a separate
	/// tADDi8 that produces \p AddrOut.
	static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
	                       const TargetInstrInfo *TII, const DebugLoc &dl,
	                       unsigned StSize, unsigned Data, unsigned AddrIn,
	                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
	  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
	  assert(StOpc != 0 && "Should have a store opcode");

	  const bool IsVector = StSize >= 8;

	  // Thumb1 scalar case: plain store, then bump the address explicitly.
	  if (!IsVector && IsThumb1) {
	    BuildMI(*BB, Pos, dl, TII->get(StOpc))
	        .addReg(Data)
	        .addReg(AddrIn)
	        .addImm(0)
	        .add(predOps(ARMCC::AL));
	    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
	        .add(t1CondCodeOp())
	        .addReg(AddrIn)
	        .addImm(StSize)
	        .add(predOps(ARMCC::AL));
	    return;
	  }

	  // Every remaining form defines AddrOut; operand order after that depends
	  // on the instruction family.
	  auto Store = BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut);
	  if (IsVector) {
	    Store.addReg(AddrIn).addImm(0).addReg(Data);
	  } else if (IsThumb2) {
	    Store.addReg(Data).addReg(AddrIn).addImm(StSize);
	  } else { // arm
	    Store.addReg(Data).addReg(AddrIn).addReg(0).addImm(StSize);
	  }
	  Store.add(predOps(ARMCC::AL));
	}

	/// Expand the struct-byval copy pseudo \p MI into real loads and stores.
	///
	/// The copy unit is chosen from the alignment operand: 1 or 2 bytes for
	/// odd / 2-byte alignments, otherwise 16 or 8 bytes via NEON when available
	/// and permitted, otherwise 4 bytes. Copies no larger than
	/// Subtarget->getMaxInlineSizeThreshold() are fully unrolled in \p BB and
	/// \p BB is returned; larger copies become a loop and the returned block is
	/// the newly created exit block holding the remainder of \p BB.
	/// \p MI is erased in both cases.
	MachineBasicBlock *
	ARMTargetLowering::EmitStructByval(MachineInstr &MI,
	MachineBasicBlock *BB) const {
	// This pseudo instruction's operands read below are: dst, src, size and
	// alignment.
	// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
	// Otherwise, we will generate unrolled scalar copies.
	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineFunction::iterator It = ++BB->getIterator();

	unsigned dest = MI.getOperand(0).getReg();
	unsigned src = MI.getOperand(1).getReg();
	unsigned SizeVal = MI.getOperand(2).getImm();
	unsigned Align = MI.getOperand(3).getImm();
	DebugLoc dl = MI.getDebugLoc();

	MachineFunction *MF = BB->getParent();
	MachineRegisterInfo &MRI = MF->getRegInfo();
	unsigned UnitSize = 0;
	const TargetRegisterClass *TRC = nullptr;
	const TargetRegisterClass *VecTRC = nullptr;

	bool IsThumb1 = Subtarget->isThumb1Only();
	bool IsThumb2 = Subtarget->isThumb2();
	bool IsThumb = Subtarget->isThumb();

	// Pick the widest copy unit the alignment allows.
	if (Align & 1) {
	UnitSize = 1;
	} else if (Align & 2) {
	UnitSize = 2;
	} else {
	// Check whether we can use NEON instructions.
	if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
	Subtarget->hasNEON()) {
	if ((Align % 16 == 0) && SizeVal >= 16)
	UnitSize = 16;
	else if ((Align % 8 == 0) && SizeVal >= 8)
	UnitSize = 8;
	}
	// Can't use NEON instructions.
	if (UnitSize == 0)
	UnitSize = 4;
	}

	// Select the correct opcode and register class for unit size load/store
	bool IsNeon = UnitSize >= 8;
	TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
	if (IsNeon)
	VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
	: UnitSize == 8 ? &ARM::DPRRegClass
	: nullptr;

	// LoopSize is the part copied in UnitSize chunks; BytesLeft is the tail
	// that is copied one byte at a time.
	unsigned BytesLeft = SizeVal % UnitSize;
	unsigned LoopSize = SizeVal - BytesLeft;

	if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
	// Use LDR and STR to copy.
	// [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
	// [destOut] = STR_POST(scratch, destIn, UnitSize)
	unsigned srcIn = src;
	unsigned destIn = dest;
	for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
	unsigned srcOut = MRI.createVirtualRegister(TRC);
	unsigned destOut = MRI.createVirtualRegister(TRC);
	unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
	emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
	IsThumb1, IsThumb2);
	emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
	IsThumb1, IsThumb2);
	// Chain the updated addresses into the next copy.
	srcIn = srcOut;
	destIn = destOut;
	}

	// Handle the leftover bytes with LDRB and STRB.
	// [scratch, srcOut] = LDRB_POST(srcIn, 1)
	// [destOut] = STRB_POST(scratch, destIn, 1)
	for (unsigned i = 0; i < BytesLeft; i++) {
	unsigned srcOut = MRI.createVirtualRegister(TRC);
	unsigned destOut = MRI.createVirtualRegister(TRC);
	unsigned scratch = MRI.createVirtualRegister(TRC);
	emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
	IsThumb1, IsThumb2);
	emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
	IsThumb1, IsThumb2);
	srcIn = srcOut;
	destIn = destOut;
	}
	MI.eraseFromParent(); // The instruction is gone now.
	return BB;
	}

	// Expand the pseudo op to a loop.
	// thisMBB:
	// ...
	// movw varEnd, # --> with thumb2
	// movt varEnd, #
	// ldrcp varEnd, idx --> without thumb2
	// fallthrough --> loopMBB
	// loopMBB:
	// PHI varPhi, varEnd, varLoop
	// PHI srcPhi, src, srcLoop
	// PHI destPhi, dst, destLoop
	// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
	// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
	// subs varLoop, varPhi, #UnitSize
	// bne loopMBB
	// fallthrough --> exitMBB
	// exitMBB:
	// epilogue to handle left-over bytes
	// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
	// [destOut] = STRB_POST(scratch, destLoop, 1)
	MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MF->insert(It, loopMBB);
	MF->insert(It, exitMBB);

	// Transfer the remainder of BB and its successor edges to exitMBB.
	exitMBB->splice(exitMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	exitMBB->transferSuccessorsAndUpdatePHIs(BB);

	// Load an immediate to varEnd.
	unsigned varEnd = MRI.createVirtualRegister(TRC);
	if (Subtarget->useMovt()) {
	// When the upper half is non-zero the low half goes into a temporary and
	// the MOVT below produces varEnd; otherwise the MOVW writes varEnd directly.
	unsigned Vtmp = varEnd;
	if ((LoopSize & 0xFFFF0000) != 0)
	Vtmp = MRI.createVirtualRegister(TRC);
	BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
	.addImm(LoopSize & 0xFFFF)
	.add(predOps(ARMCC::AL));

	if ((LoopSize & 0xFFFF0000) != 0)
	BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
	.addReg(Vtmp)
	.addImm(LoopSize >> 16)
	.add(predOps(ARMCC::AL));
	} else {
	// No MOVW/MOVT: place LoopSize in the constant pool and load it.
	MachineConstantPool *ConstantPool = MF->getConstantPool();
	Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
	const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

	// MachineConstantPool wants an explicit alignment.
	// Note: this Align shadows the byval alignment read from MI above.
	unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
	if (Align == 0)
	Align = MF->getDataLayout().getTypeAllocSize(C->getType());
	unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
	MachineMemOperand *CPMMO =
	MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
	MachineMemOperand::MOLoad, 4, 4);

	if (IsThumb)
	BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
	.addReg(varEnd, RegState::Define)
	.addConstantPoolIndex(Idx)
	.add(predOps(ARMCC::AL))
	.addMemOperand(CPMMO);
	else
	BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
	.addReg(varEnd, RegState::Define)
	.addConstantPoolIndex(Idx)
	.addImm(0)
	.add(predOps(ARMCC::AL))
	.addMemOperand(CPMMO);
	}
	BB->addSuccessor(loopMBB);

	// Generate the loop body:
	// varPhi = PHI(varLoop, varEnd)
	// srcPhi = PHI(srcLoop, src)
	// destPhi = PHI(destLoop, dst)
	MachineBasicBlock *entryBB = BB;
	BB = loopMBB;
	unsigned varLoop = MRI.createVirtualRegister(TRC);
	unsigned varPhi = MRI.createVirtualRegister(TRC);
	unsigned srcLoop = MRI.createVirtualRegister(TRC);
	unsigned srcPhi = MRI.createVirtualRegister(TRC);
	unsigned destLoop = MRI.createVirtualRegister(TRC);
	unsigned destPhi = MRI.createVirtualRegister(TRC);

	BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
	.addReg(varLoop).addMBB(loopMBB)
	.addReg(varEnd).addMBB(entryBB);
	BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
	.addReg(srcLoop).addMBB(loopMBB)
	.addReg(src).addMBB(entryBB);
	BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
	.addReg(destLoop).addMBB(loopMBB)
	.addReg(dest).addMBB(entryBB);

	// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
	// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
	unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
	emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
	IsThumb1, IsThumb2);
	emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
	IsThumb1, IsThumb2);

	// Decrement loop variable by UnitSize.
	if (IsThumb1) {
	BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
	.add(t1CondCodeOp())
	.addReg(varPhi)
	.addImm(UnitSize)
	.add(predOps(ARMCC::AL));
	} else {
	MachineInstrBuilder MIB =
	BuildMI(*BB, BB->end(), dl,
	TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
	MIB.addReg(varPhi)
	.addImm(UnitSize)
	.add(predOps(ARMCC::AL))
	.add(condCodeOp());
	// Operand 5 is the operand added by condCodeOp() above; turn it into a
	// CPSR definition so the conditional branch below can read the flags.
	MIB->getOperand(5).setReg(ARM::CPSR);
	MIB->getOperand(5).setIsDef(true);
	}
	BuildMI(*BB, BB->end(), dl,
	TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
	.addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

	// loopMBB can loop back to loopMBB or fall through to exitMBB.
	BB->addSuccessor(loopMBB);
	BB->addSuccessor(exitMBB);

	// Add epilogue to handle BytesLeft.
	BB = exitMBB;
	auto StartOfExit = exitMBB->begin();

	// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
	// [destOut] = STRB_POST(scratch, destLoop, 1)
	unsigned srcIn = srcLoop;
	unsigned destIn = destLoop;
	for (unsigned i = 0; i < BytesLeft; i++) {
	unsigned srcOut = MRI.createVirtualRegister(TRC);
	unsigned destOut = MRI.createVirtualRegister(TRC);
	unsigned scratch = MRI.createVirtualRegister(TRC);
	emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
	IsThumb1, IsThumb2);
	emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
	IsThumb1, IsThumb2);
	srcIn = srcOut;
	destIn = destOut;
	}

	MI.eraseFromParent(); // The instruction is gone now.
	return BB;
	}

	/// Expand the Windows stack-probe pseudo \p MI into a call to __chkstk
	/// followed by the stack pointer adjustment.
	///
	/// For the small, medium and kernel code models a direct tBL to the external
	/// symbol is emitted; for the large code model the symbol address is first
	/// materialized with t2MOVi32imm and called through tBLXr. In both cases R4
	/// is an implicit use and def of the call, and R12 and CPSR are marked as
	/// dead implicit defs. Afterwards SP is decremented by R4 with a FrameSetup
	/// flagged t2SUBrr, and \p MI is erased.
	///
	/// \returns \p MBB; no new blocks are created.
	MachineBasicBlock *
	ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
	                                       MachineBasicBlock *MBB) const {
	  const TargetMachine &TM = getTargetMachine();
	  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget->isTargetWindows() &&
	         "__chkstk is only supported on Windows");
	  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

	  // __chkstk takes the number of words to allocate on the stack in R4, and
	  // returns the stack adjustment in number of bytes in R4.  This will not
	  // clobber any other registers (other than the obvious lr).
	  //
	  // Although, technically, IP should be considered a register which may be
	  // clobbered, the call itself will not touch it.  Windows on ARM is a pure
	  // thumb-2 environment, so there is no interworking required.  As a result, we
	  // do not expect a veneer to be emitted by the linker, clobbering IP.
	  //
	  // Each module receives its own copy of __chkstk, so no import thunk is
	  // required, again, ensuring that IP is not clobbered.
	  //
	  // Finally, although some linkers may theoretically provide a trampoline for
	  // out of range calls (which is quite common due to a 32M range limitation of
	  // branches for Thumb), we can generate the long-call version via
	  // -mcmodel=large, alleviating the need for the trampoline which may clobber
	  // IP.

	  switch (TM.getCodeModel()) {
	  case CodeModel::Tiny:
	    llvm_unreachable("Tiny code model not available on ARM.");
	  case CodeModel::Small:
	  case CodeModel::Medium:
	  case CodeModel::Kernel:
	    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
	        .add(predOps(ARMCC::AL))
	        .addExternalSymbol("__chkstk")
	        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
	        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
	        .addReg(ARM::R12,
	                RegState::Implicit | RegState::Define | RegState::Dead)
	        .addReg(ARM::CPSR,
	                RegState::Implicit | RegState::Define | RegState::Dead);
	    break;
	  case CodeModel::Large: {
	    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	    unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

	    // Materialize the callee address, then call through the register.
	    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
	      .addExternalSymbol("__chkstk");
	    BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
	        .add(predOps(ARMCC::AL))
	        .addReg(Reg, RegState::Kill)
	        .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
	        .addReg(ARM::R4, RegState::Implicit | RegState::Define)
	        .addReg(ARM::R12,
	                RegState::Implicit | RegState::Define | RegState::Dead)
	        .addReg(ARM::CPSR,
	                RegState::Implicit | RegState::Define | RegState::Dead);
	    break;
	  }
	  }

	  // SP -= R4 (the byte count returned by __chkstk).
	  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
	      .addReg(ARM::SP, RegState::Kill)
	      .addReg(ARM::R4, RegState::Kill)
	      .setMIFlags(MachineInstr::FrameSetup)
	      .add(predOps(ARMCC::AL))
	      .add(condCodeOp());

	  MI.eraseFromParent();
	  return MBB;
	}

	/// Expand the divide-by-zero check pseudo \p MI.
	///
	/// Everything after \p MI is moved into a new continuation block placed
	/// directly after \p MBB, a trap block containing t__brkdiv0 is appended to
	/// the function, and \p MBB gets a compare of operand 0 against zero plus a
	/// conditional branch (EQ) to the trap block. \p MI is erased.
	///
	/// \returns the continuation block.
	MachineBasicBlock *
	ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
	                                       MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

	  // Split MBB right after MI; the tail becomes the fall-through path.
	  MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock();
	  MachineFunction::iterator AfterMBB = std::next(MBB->getIterator());
	  MF->insert(AfterMBB, TailBB);
	  MachineBasicBlock::iterator FirstMoved =
	      std::next(MachineBasicBlock::iterator(MI));
	  TailBB->splice(TailBB->begin(), MBB, FirstMoved, MBB->end());
	  TailBB->transferSuccessorsAndUpdatePHIs(MBB);
	  MBB->addSuccessor(TailBB);

	  // The block that is entered when the checked value is zero.
	  MachineBasicBlock *DivZeroBB = MF->CreateMachineBasicBlock();
	  BuildMI(DivZeroBB, DL, TII->get(ARM::t__brkdiv0));
	  MF->push_back(DivZeroBB);
	  MBB->addSuccessor(DivZeroBB);

	  // cmp <operand 0>, #0 ; beq DivZeroBB
	  const unsigned CheckedReg = MI.getOperand(0).getReg();
	  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
	      .addReg(CheckedReg)
	      .addImm(0)
	      .add(predOps(ARMCC::AL));
	  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
	      .addMBB(DivZeroBB)
	      .addImm(ARMCC::EQ)
	      .addReg(ARM::CPSR);

	  MI.eraseFromParent();
	  return TailBB;
	}

	// ISel may leave the CPSR use on SelectItr without a kill flag when CPSR has
	// several uses and it could not tell which one is last. Decide here whether
	// SelectItr is in fact the last reader: if a later instruction in BB reads
	// CPSR, or BB ends without redefining CPSR and some successor has it
	// live-in, the flag must stay clear and false is returned. Otherwise the
	// kill flag is added to SelectItr and true is returned.
	static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
	                                   MachineBasicBlock* BB,
	                                   const TargetRegisterInfo* TRI) {
	  // Walk the instructions after SelectItr until CPSR is read or redefined.
	  bool RedefinedInBlock = false;
	  for (MachineBasicBlock::iterator Cur = std::next(SelectItr),
	                                   End = BB->end();
	       Cur != End && !RedefinedInBlock; ++Cur) {
	    if (Cur->readsRegister(ARM::CPSR))
	      return false;
	    RedefinedInBlock = Cur->definesRegister(ARM::CPSR);
	  }

	  // No redefinition before the end of BB: CPSR may still be live into a
	  // successor, in which case SelectItr does not kill it.
	  if (!RedefinedInBlock) {
	    MachineBasicBlock::succ_iterator SuccIt = BB->succ_begin();
	    const MachineBasicBlock::succ_iterator SuccEnd = BB->succ_end();
	    while (SuccIt != SuccEnd) {
	      if ((*SuccIt)->isLiveIn(ARM::CPSR))
	        return false;
	      ++SuccIt;
	    }
	  }

	  // Either a later def was found or CPSR is not live out of BB, so the use
	  // on SelectItr is the last one.
	  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
	  return true;
	}

	/// Expand the pseudo instruction \p MI, located in \p BB, into real machine
	/// instructions, dispatching on MI's opcode.
	///
	/// For the cases expanded inline here the return value is \p BB when the
	/// block was not split, and the newly created sink block when it was
	/// (tMOVCCr_pseudo, ABS/t2ABS). The remaining cases return whatever their
	/// dedicated helper returns. An opcode with no case prints the instruction
	/// and reaches llvm_unreachable.
	MachineBasicBlock *
	ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  DebugLoc dl = MI.getDebugLoc();
	  bool isThumb2 = Subtarget->isThumb2();
	  switch (MI.getOpcode()) {
	  default: {
	    MI.print(errs());
	    llvm_unreachable("Unexpected instr type to insert");
	  }

	  // Thumb1 post-indexed loads are really just single-register LDMs.
	  case ARM::tLDR_postidx: {
	    MachineOperand Def(MI.getOperand(1));
	    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
	        .add(Def)              // Rn_wb
	        .add(MI.getOperand(2)) // Rn
	        .add(MI.getOperand(3)) // PredImm
	        .add(MI.getOperand(4)) // PredReg
	        .add(MI.getOperand(0)) // Rt
	        .cloneMemRefs(MI);
	    MI.eraseFromParent();
	    return BB;
	  }

	  // The Thumb2 pre-indexed stores have the same MI operands, they just
	  // define them differently in the .td files from the isel patterns, so
	  // they need pseudos.
	  case ARM::t2STR_preidx:
	    MI.setDesc(TII->get(ARM::t2STR_PRE));
	    return BB;
	  case ARM::t2STRB_preidx:
	    MI.setDesc(TII->get(ARM::t2STRB_PRE));
	    return BB;
	  case ARM::t2STRH_preidx:
	    MI.setDesc(TII->get(ARM::t2STRH_PRE));
	    return BB;

	  case ARM::STRi_preidx:
	  case ARM::STRBi_preidx: {
	    unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
	                                                         : ARM::STRB_PRE_IMM;
	    // Decode the offset.
	    unsigned Offset = MI.getOperand(4).getImm();
	    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
	    Offset = ARM_AM::getAM2Offset(Offset);
	    if (isSub)
	      Offset = -Offset;

	    // The first memory operand of the pseudo is carried over to the real
	    // store. memoperands_begin() yields an iterator over MachineMemOperand
	    // pointers, hence the dereference.
	    MachineMemOperand *MMO = *MI.memoperands_begin();
	    BuildMI(*BB, MI, dl, TII->get(NewOpc))
	        .add(MI.getOperand(0)) // Rn_wb
	        .add(MI.getOperand(1)) // Rt
	        .add(MI.getOperand(2)) // Rn
	        .addImm(Offset)        // offset (skip GPR==zero_reg)
	        .add(MI.getOperand(5)) // pred
	        .add(MI.getOperand(6))
	        .addMemOperand(MMO);
	    MI.eraseFromParent();
	    return BB;
	  }
	  case ARM::STRr_preidx:
	  case ARM::STRBr_preidx:
	  case ARM::STRH_preidx: {
	    unsigned NewOpc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("unexpected opcode!");
	    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
	    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
	    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
	    }
	    // Same operand list, different opcode: copy every operand across.
	    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
	    for (unsigned i = 0; i < MI.getNumOperands(); ++i)
	      MIB.add(MI.getOperand(i));
	    MI.eraseFromParent();
	    return BB;
	  }

	  case ARM::tMOVCCr_pseudo: {
	    // To "insert" a SELECT_CC instruction, we actually have to insert the
	    // diamond control-flow pattern. The incoming instruction knows the
	    // destination vreg to set, the condition code register to branch on, the
	    // true/false values to select between, and a branch opcode to use.
	    const BasicBlock *LLVM_BB = BB->getBasicBlock();
	    MachineFunction::iterator It = ++BB->getIterator();

	    //  thisMBB:
	    //  ...
	    //   TrueVal = ...
	    //   cmpTY ccX, r1, r2
	    //   bCC copy1MBB
	    //   fallthrough --> copy0MBB
	    MachineBasicBlock *thisMBB = BB;
	    MachineFunction *F = BB->getParent();
	    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
	    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
	    F->insert(It, copy0MBB);
	    F->insert(It, sinkMBB);

	    // Check whether CPSR is live past the tMOVCCr_pseudo.
	    const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
	    if (!MI.killsRegister(ARM::CPSR) &&
	        !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
	      copy0MBB->addLiveIn(ARM::CPSR);
	      sinkMBB->addLiveIn(ARM::CPSR);
	    }

	    // Transfer the remainder of BB and its successor edges to sinkMBB.
	    sinkMBB->splice(sinkMBB->begin(), BB,
	                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
	    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

	    BB->addSuccessor(copy0MBB);
	    BB->addSuccessor(sinkMBB);

	    BuildMI(BB, dl, TII->get(ARM::tBcc))
	        .addMBB(sinkMBB)
	        .addImm(MI.getOperand(3).getImm())
	        .addReg(MI.getOperand(4).getReg());

	    //  copy0MBB:
	    //   %FalseValue = ...
	    //   # fallthrough to sinkMBB
	    BB = copy0MBB;

	    // Update machine-CFG edges
	    BB->addSuccessor(sinkMBB);

	    //  sinkMBB:
	    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
	    //  ...
	    BB = sinkMBB;
	    BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
	        .addReg(MI.getOperand(1).getReg())
	        .addMBB(copy0MBB)
	        .addReg(MI.getOperand(2).getReg())
	        .addMBB(thisMBB);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }

	  case ARM::BCCi64:
	  case ARM::BCCZi64: {
	    // If there is an unconditional branch to the other successor, remove it.
	    BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());

	    // Compare both parts that make up the double comparison separately for
	    // equality.
	    bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;

	    unsigned LHS1 = MI.getOperand(1).getReg();
	    unsigned LHS2 = MI.getOperand(2).getReg();
	    if (RHSisZero) {
	      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
	          .addReg(LHS1)
	          .addImm(0)
	          .add(predOps(ARMCC::AL));
	      // The second compare is predicated on EQ, so it only runs when the
	      // first halves matched.
	      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
	          .addReg(LHS2).addImm(0)
	          .addImm(ARMCC::EQ).addReg(ARM::CPSR);
	    } else {
	      unsigned RHS1 = MI.getOperand(3).getReg();
	      unsigned RHS2 = MI.getOperand(4).getReg();
	      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
	          .addReg(LHS1)
	          .addReg(RHS1)
	          .add(predOps(ARMCC::AL));
	      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
	          .addReg(LHS2).addReg(RHS2)
	          .addImm(ARMCC::EQ).addReg(ARM::CPSR);
	    }

	    MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
	    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
	    // The branch emitted below is always on EQ; for an NE pseudo the two
	    // targets trade places instead.
	    if (MI.getOperand(0).getImm() == ARMCC::NE)
	      std::swap(destMBB, exitMBB);

	    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
	        .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
	    if (isThumb2)
	      BuildMI(BB, dl, TII->get(ARM::t2B))
	          .addMBB(exitMBB)
	          .add(predOps(ARMCC::AL));
	    else
	      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }

	  // The setjmp pseudos are left in place unchanged.
	  case ARM::Int_eh_sjlj_setjmp:
	  case ARM::Int_eh_sjlj_setjmp_nofp:
	  case ARM::tInt_eh_sjlj_setjmp:
	  case ARM::t2Int_eh_sjlj_setjmp:
	  case ARM::t2Int_eh_sjlj_setjmp_nofp:
	    return BB;

	  case ARM::Int_eh_sjlj_setup_dispatch:
	    EmitSjLjDispatchBlock(MI, BB);
	    return BB;

	  case ARM::ABS:
	  case ARM::t2ABS: {
	    // To insert an ABS instruction, we have to insert the
	    // diamond control-flow pattern.  The incoming instruction knows the
	    // source vreg to test against 0, the destination vreg to set,
	    // the condition code register to branch on, the
	    // true/false values to select between, and a branch opcode to use.
	    // It transforms
	    //     V1 = ABS V0
	    // into
	    //     V2 = MOVS V0
	    //     BCC                      (branch to SinkBB if V0 >= 0)
	    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
	    //     SinkBB: V1 = PHI(V2, V3)
	    const BasicBlock *LLVM_BB = BB->getBasicBlock();
	    MachineFunction::iterator BBI = ++BB->getIterator();
	    MachineFunction *Fn = BB->getParent();
	    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
	    MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
	    Fn->insert(BBI, RSBBB);
	    Fn->insert(BBI, SinkBB);

	    unsigned int ABSSrcReg = MI.getOperand(1).getReg();
	    unsigned int ABSDstReg = MI.getOperand(0).getReg();
	    bool ABSSrcKIll = MI.getOperand(1).isKill();
	    // isThumb2 from the top of the function is reused here; the value is the
	    // same Subtarget->isThumb2() query.
	    MachineRegisterInfo &MRI = Fn->getRegInfo();
	    // In Thumb mode S must not be specified if source register is the SP or
	    // PC and if destination register is the SP, so restrict register class
	    unsigned NewRsbDstReg =
	        MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);

	    // Transfer the remainder of BB and its successor edges to sinkMBB.
	    SinkBB->splice(SinkBB->begin(), BB,
	                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
	    SinkBB->transferSuccessorsAndUpdatePHIs(BB);

	    BB->addSuccessor(RSBBB);
	    BB->addSuccessor(SinkBB);

	    // fall through to SinkMBB
	    RSBBB->addSuccessor(SinkBB);

	    // insert a cmp at the end of BB
	    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
	        .addReg(ABSSrcReg)
	        .addImm(0)
	        .add(predOps(ARMCC::AL));

	    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
	    BuildMI(BB, dl,
	            TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
	        .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);

	    // insert rsbri in RSBBB
	    // Note: BCC and rsbri will be converted into predicated rsbmi
	    // by if-conversion pass
	    BuildMI(*RSBBB, RSBBB->begin(), dl,
	            TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
	        .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
	        .addImm(0)
	        .add(predOps(ARMCC::AL))
	        .add(condCodeOp());

	    // insert PHI in SinkBB,
	    // reuse ABSDstReg to not change uses of ABS instruction
	    BuildMI(*SinkBB, SinkBB->begin(), dl,
	            TII->get(ARM::PHI), ABSDstReg)
	        .addReg(NewRsbDstReg).addMBB(RSBBB)
	        .addReg(ABSSrcReg).addMBB(BB);

	    // remove ABS instruction
	    MI.eraseFromParent();

	    // return last added BB
	    return SinkBB;
	  }
	  case ARM::COPY_STRUCT_BYVAL_I32:
	    ++NumLoopByVals;
	    return EmitStructByval(MI, BB);
	  case ARM::WIN__CHKSTK:
	    return EmitLowered__chkstk(MI, BB);
	  case ARM::WIN__DBZCHK:
	    return EmitLowered__dbzchk(MI, BB);
	  }
	}

	/// Attaches vregs to MEMCPY that it will use as scratch registers
	/// when it is expanded into LDM/STM. This is done as a post-isel lowering
	/// instead of as a custom inserter because we need the use list from the SDNode.
	///
	/// \param Subtarget Selects the register class for the scratch registers:
	///                  tGPR on Thumb1-only targets, GPR otherwise.
	/// \param MI        The MEMCPY instruction; operands 0 and 1 may be marked
	///                  dead and new def operands are appended to it.
	/// \param Node      The DAG node MI was selected from; consulted only for
	///                  whether its results 0 and 1 have any use.
	static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
	                                    MachineInstr &MI, const SDNode *Node) {
	  bool isThumb1 = Subtarget->isThumb1Only();

	  MachineFunction *MF = MI.getParent()->getParent();
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  MachineInstrBuilder MIB(*MF, MI);

	  // If the new dst/src is unused mark it as dead.
	  if (!Node->hasAnyUseOfValue(0)) {
	    MI.getOperand(0).setIsDead(true);
	  }
	  if (!Node->hasAnyUseOfValue(1)) {
	    MI.getOperand(1).setIsDead(true);
	  }

	  // The MEMCPY both defines and kills the scratch registers.
	  // Operand 4 holds the number of scratch registers to attach.
	  for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
	    unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
	                                                         : &ARM::GPRRegClass);
	    MIB.addReg(TmpReg, RegState::Define | RegState::Dead);
	  }
	}

	/// Post-isel fix-up for \p MI, the machine instruction selected for \p Node.
	///
	/// MEMCPY only gets its scratch registers attached. For any other opcode:
	///  1. If convertAddSubFlagsOpcode returns a non-zero opcode, MI is switched
	///     to that opcode and given a cc_out operand (plus, on Thumb1, reordered
	///     inputs and a predicate).
	///  2. If the instruction has an optional cc_out operand and an implicit CPSR
	///     def, the implicit def is removed; cc_out is then set to CPSR unless
	///     the flags are dead on a non-Thumb1 target.
	void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
	                                                      SDNode *Node) const {
	  if (MI.getOpcode() == ARM::MEMCPY) {
	    attachMEMCPYScratchRegs(Subtarget, MI, Node);
	    return;
	  }

	  const MCInstrDesc *MCID = &MI.getDesc();
	  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
	  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
	  // operand is still set to noreg. If needed, set the optional operand's
	  // register to CPSR, and remove the redundant implicit def.
	  //
	  // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).

	  // Rename pseudo opcodes.
	  unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
	  unsigned ccOutIdx;
	  if (NewOpc) {
	    const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
	    MCID = &TII->get(NewOpc);

	    assert(MCID->getNumOperands() ==
	           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
	        && "converted opcode should be the same except for cc_out"
	           " (and, on Thumb1, pred)");

	    MI.setDesc(*MCID);

	    // Add the optional cc_out operand
	    MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));

	    // On Thumb1, move all input operands to the end, then add the predicate
	    if (Subtarget->isThumb1Only()) {
	      // Each iteration rotates operand 1 to the back of the operand list.
	      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
	        MI.addOperand(MI.getOperand(1));
	        MI.RemoveOperand(1);
	      }

	      // Restore the ties
	      for (unsigned i = MI.getNumOperands(); i--;) {
	        const MachineOperand& op = MI.getOperand(i);
	        if (op.isReg() && op.isUse()) {
	          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
	          if (DefIdx != -1)
	            MI.tieOperands(DefIdx, i);
	        }
	      }

	      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
	      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
	      ccOutIdx = 1;
	    } else
	      ccOutIdx = MCID->getNumOperands() - 1;
	  } else
	    ccOutIdx = MCID->getNumOperands() - 1;

	  // Any ARM instruction that sets the 's' bit should specify an optional
	  // "cc_out" operand in the last operand position.
	  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
	    assert(!NewOpc && "Optional cc_out operand required");
	    return;
	  }
	  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
	  // since we already have an optional CPSR def.
	  bool definesCPSR = false;
	  bool deadCPSR = false;
	  // Only operands past the descriptor's declared ones are scanned.
	  for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
	       ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
	      definesCPSR = true;
	      if (MO.isDead())
	        deadCPSR = true;
	      MI.RemoveOperand(i);
	      break;
	    }
	  }
	  if (!definesCPSR) {
	    assert(!NewOpc && "Optional cc_out operand required");
	    return;
	  }
	  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
	  if (deadCPSR) {
	    assert(!MI.getOperand(ccOutIdx).getReg() &&
	           "expect uninitialized optional cc_out operand");
	    // Thumb1 instructions must have the S bit even if the CPSR is dead.
	    if (!Subtarget->isThumb1Only())
	      return;
	  }

	  // If this instruction was defined with an optional CPSR def and its dag node
	  // had a live implicit CPSR def, then activate the optional CPSR def.
	  MachineOperand &MO = MI.getOperand(ccOutIdx);
	  MO.setReg(ARM::CPSR);
	  MO.setIsDef(true);
	}

	//===----------------------------------------------------------------------===//
	// ARM Optimization Hooks
	//===----------------------------------------------------------------------===//

	// Constant test selected by AllOnes: when AllOnes is set, report whether N is
	// the all-ones constant; otherwise report whether N is the null constant.
	static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
	  if (AllOnes)
	    return isAllOnesConstant(N);
	  return isNullConstant(N);
	}

	// Return true if N is conditionally 0 or all ones.
	// Detects these expressions where cc is an i1 value:
	//
	//   (select cc 0, y)   [AllOnes=0]
	//   (select cc y, 0)   [AllOnes=0]
	//   (zext cc)          [AllOnes=0]
	//   (sext cc)          [AllOnes=0/1]
	//   (select cc -1, y)  [AllOnes=1]
	//   (select cc y, -1)  [AllOnes=1]
	//
	// Invert is set when N is the null/all ones constant when CC is false.
	// OtherOp is set to the alternative value of N.
	//
	// CC, Invert and OtherOp are outputs. Invert and OtherOp are only written on
	// the paths that return true; CC may be overwritten even when the result is
	// false. DAG is used only to materialize the constant OtherOp for the
	// zext/sext forms.
	static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
	                                       SDValue &CC, bool &Invert,
	                                       SDValue &OtherOp,
	                                       SelectionDAG &DAG) {
	  switch (N->getOpcode()) {
	  default: return false;
	  case ISD::SELECT: {
	    CC = N->getOperand(0);
	    SDValue N1 = N->getOperand(1);
	    SDValue N2 = N->getOperand(2);
	    // The true arm is checked first, so it wins if both arms match.
	    if (isZeroOrAllOnes(N1, AllOnes)) {
	      Invert = false;
	      OtherOp = N2;
	      return true;
	    }
	    if (isZeroOrAllOnes(N2, AllOnes)) {
	      Invert = true;
	      OtherOp = N1;
	      return true;
	    }
	    return false;
	  }
	  case ISD::ZERO_EXTEND:
	    // (zext cc) can never be the all ones value.
	    if (AllOnes)
	      return false;
	    LLVM_FALLTHROUGH;
	  case ISD::SIGN_EXTEND: {
	    SDLoc dl(N);
	    EVT VT = N->getValueType(0);
	    CC = N->getOperand(0);
	    // Only an extension of an i1 SETCC result is accepted.
	    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
	      return false;
	    Invert = !AllOnes;
	    if (AllOnes)
	      // When looking for an AllOnes constant, N is an sext, and the 'other'
	      // value is 0.
	      OtherOp = DAG.getConstant(0, dl, VT);
	    else if (N->getOpcode() == ISD::ZERO_EXTEND)
	      // When looking for a 0 constant, N can be zext or sext.
	      OtherOp = DAG.getConstant(1, dl, VT);
	    else
	      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
	                                VT);
	    return true;
	  }
	  }
	}

	// Fold a select with a constant arm into the operation that uses it:
	//
	//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
	//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
	//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
	//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
	//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
	//
	// Nothing is done unless one arm of the select is the null constant (or the
	// all-ones constant when AllOnes is set).
	//
	// An i1 extended by sext/zext is treated the same way:
	//
	//   (add (zext cc), x) -> (select cc (add x, 1), x)
	//   (add (sext cc), x) -> (select cc (add x, -1), x)
	//
	// The resulting selects eventually become predicated instructions.
	//
	// @param N       The node to transform.
	// @param Slct    The operand of N that is the select.
	// @param OtherOp The remaining operand of N (x above).
	// @param DCI     Combiner context; supplies the SelectionDAG.
	// @param AllOnes Look for an all-ones constant arm rather than a null one.
	// @returns The replacement select node, or SDValue() if Slct does not match.
	static
	SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            bool AllOnes = false) {
	  SelectionDAG &DAG = DCI.DAG;

	  SDValue Cond;
	  SDValue VariableArm;
	  bool IdentityWhenFalse;
	  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, Cond,
	                                  IdentityWhenFalse, VariableArm, DAG))
	    return SDValue();

	  const EVT ResultVT = N->getValueType(0);

	  // On the side where Slct is the identity constant the result is just
	  // OtherOp; on the other side N's operation is applied to OtherOp and the
	  // non-constant arm.
	  SDValue Applied =
	      DAG.getNode(N->getOpcode(), SDLoc(N), ResultVT, OtherOp, VariableArm);

	  // The identity normally sits on the true side of the condition; when the
	  // matcher reports it on the false side, the two arms trade places.
	  if (IdentityWhenFalse)
	    return DAG.getNode(ISD::SELECT, SDLoc(N), ResultVT,
	                       Cond, Applied, OtherOp);
	  return DAG.getNode(ISD::SELECT, SDLoc(N), ResultVT,
	                     Cond, OtherOp, Applied);
	}

	// For a commutative operator N, try combineSelectAndUse with operand 0 as the
	// select and then with operand 1 as the select. An operand is only tried when
	// its node has a single use. Returns the first successful combine, or
	// SDValue() when neither operand works.
	static
	SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
	                                       TargetLowering::DAGCombinerInfo &DCI) {
	  auto TryAsSelect = [&](SDValue Candidate, SDValue Remaining) -> SDValue {
	    if (!Candidate.getNode()->hasOneUse())
	      return SDValue();
	    return combineSelectAndUse(N, Candidate, Remaining, DCI, AllOnes);
	  };

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  if (SDValue Combined = TryAsSelect(LHS, RHS))
	    return Combined;
	  return TryAsSelect(RHS, LHS);
	}

	// Returns true when N is a VUZP shuffle: either an ARMISD::VUZP node, or an
	// ARMISD::VTRN node whose first result is v2i32 ("VUZP" on i32 is an alias
	// for VTRN).
	static bool IsVUZPShuffleNode(SDNode *N) {
	  switch (N->getOpcode()) {
	  case ARMISD::VUZP:
	    return true;
	  case ARMISD::VTRN:
	    return N->getValueType(0) == MVT::v2i32;
	  default:
	    return false;
	  }
	}

	/// Turn ADD(VUZP.0, VUZP.1) into a call to the arm_neon_vpadd intrinsic on
	/// the VUZP's two inputs.
	///
	/// \param N  The ADD node.
	/// \param N0 First operand of the ADD.
	/// \param N1 Second operand of the ADD.
	/// \returns The INTRINSIC_WO_CHAIN node, or SDValue() if the operands are not
	/// two different results of one VUZP node or the ADD is not a 64-bit vector.
	static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  // Look for ADD(VUZP.0, VUZP.1).
	  // Both operands must come from the same node but be different results of it.
	  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
	      N0 == N1)
	    return SDValue();

	  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
	  if (!N->getValueType(0).is64BitVector())
	    return SDValue();

	  // Generate vpadd.
	  SelectionDAG &DAG = DCI.DAG;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDLoc dl(N);
	  SDNode *Unzip = N0.getNode();
	  EVT VT = N->getValueType(0);

	  // Intrinsic operand list: intrinsic id first, then the two unzip inputs.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
	                                TLI.getPointerTy(DAG.getDataLayout())));
	  Ops.push_back(Unzip->getOperand(0));
	  Ops.push_back(Unzip->getOperand(1));

	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
	}

	/// Turn ADD(EXT(VUZP.0), EXT(VUZP.1)), where both extensions are sign
	/// extensions or both are zero extensions, into a call to the
	/// arm_neon_vpaddls / arm_neon_vpaddlu intrinsic on the concatenation of the
	/// VUZP's two inputs.
	///
	/// \param N  The ADD node.
	/// \param N0 First operand of the ADD.
	/// \param N1 Second operand of the ADD.
	/// \returns The INTRINSIC_WO_CHAIN node, or SDValue() when the pattern does
	/// not match.
	static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const ARMSubtarget *Subtarget) {
	  // Check for two extended operands.
	  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
	        N1.getOpcode() == ISD::SIGN_EXTEND) &&
	      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
	        N1.getOpcode() == ISD::ZERO_EXTEND))
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  SDValue N10 = N1.getOperand(0);

	  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
	  // Both extended values must be different results of one VUZP node.
	  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
	      N00 == N10)
	    return SDValue();

	  // We only recognize Q register paddl here; this can't be reached until
	  // after type legalization.
	  if (!N00.getValueType().is64BitVector() ||
	      !N0.getValueType().is128BitVector())
	    return SDValue();

	  // Generate vpaddl.
	  SelectionDAG &DAG = DCI.DAG;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  SmallVector<SDValue, 8> Ops;
	  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
	  unsigned Opcode;
	  if (N0.getOpcode() == ISD::SIGN_EXTEND)
	    Opcode = Intrinsic::arm_neon_vpaddls;
	  else
	    Opcode = Intrinsic::arm_neon_vpaddlu;
	  Ops.push_back(DAG.getConstant(Opcode, dl,
	                                TLI.getPointerTy(DAG.getDataLayout())));
	  // The intrinsic input is the two unzip inputs concatenated: the narrow
	  // element type with twice as many lanes as the result.
	  EVT ElemTy = N00.getValueType().getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();
	  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
	  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
	                               N00.getOperand(0), N00.getOperand(1));
	  Ops.push_back(Concat);

	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
	}

	// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
	// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
	// much easier to match.
	//
	// Matches an ADD of two BUILD_VECTORs whose elements are, respectively, the
	// even-indexed and odd-indexed EXTRACT_VECTOR_ELTs of one source vector, and
	// replaces it with an arm_neon_vpaddls intrinsic on that vector followed by
	// an ANY_EXTEND or TRUNCATE to the ADD's type. Returns SDValue() when the
	// pattern does not match.
	static SDValue
	AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const ARMSubtarget *Subtarget) {
	  // Only perform optimization if after legalize, and if NEON is available. We
	  // also expected both operands to be BUILD_VECTORs.
	  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
	      || N0.getOpcode() != ISD::BUILD_VECTOR
	      || N1.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
	  EVT VT = N->getValueType(0);
	  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
	    return SDValue();

	  // Check that the vector operands are of the right form.
	  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
	  // operands, where N is the size of the formed vector.
	  // Each EXTRACT_VECTOR should have the same input vector and odd or even
	  // index such that we have a pair wise add pattern.

	  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
	  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();
	  SDValue Vec = N0->getOperand(0)->getOperand(0);
	  SDNode *V = Vec.getNode();
	  unsigned nextIndex = 0;

	  // For each operands to the ADD which are BUILD_VECTORs,
	  // check to see if each of their operands are an EXTRACT_VECTOR with
	  // the same vector and appropriate index.
	  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
	    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
	        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

	      SDValue ExtVec0 = N0->getOperand(i);
	      SDValue ExtVec1 = N1->getOperand(i);

	      // First operand is the vector, verify its the same.
	      if (V != ExtVec0->getOperand(0).getNode() ||
	          V != ExtVec1->getOperand(0).getNode())
	        return SDValue();

	      // Second is the constant, verify its correct.
	      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
	      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));

	      // For the constant, we want to see all the even or all the odd.
	      // N0 must supply lane nextIndex and N1 lane nextIndex+1.
	      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
	          || C1->getZExtValue() != nextIndex+1)
	        return SDValue();

	      // Increment index.
	      nextIndex+=2;
	    } else
	      return SDValue();
	  }

	  // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
	  // we're using the entire input vector, otherwise there's a size/legality
	  // mismatch somewhere.
	  if (nextIndex != Vec.getValueType().getVectorNumElements() ||
	      Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
	    return SDValue();

	  // Create VPADDL node.
	  SelectionDAG &DAG = DCI.DAG;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  SDLoc dl(N);

	  // Build operand list.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
	                                TLI.getPointerTy(DAG.getDataLayout())));

	  // Input is the vector.
	  Ops.push_back(Vec);

	  // Get widened type and narrowed type.
	  MVT widenType;
	  unsigned numElem = VT.getVectorNumElements();

	  // The intrinsic result has lanes twice as wide as the input lanes.
	  EVT inputLaneType = Vec.getValueType().getVectorElementType();
	  switch (inputLaneType.getSimpleVT().SimpleTy) {
	    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
	    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
	    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
	    default:
	      llvm_unreachable("Invalid vector element type for padd optimization.");
	  }

	  // Bring the widened result to the ADD's type, extending or truncating.
	  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
	  unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
	  return DAG.getNode(ExtOp, dl, VT, tmp);
	}

	/// Returns \p V when its node is an ISD::UMUL_LOHI or ISD::SMUL_LOHI, and an
	/// empty SDValue otherwise.
	static SDValue findMUL_LOHI(SDValue V) {
	  if (V->getOpcode() == ISD::UMUL_LOHI ||
	      V->getOpcode() == ISD::SMUL_LOHI)
	    return V;
	  return SDValue();
	}

	/// Try to merge an ADDC/ADDE pair that accumulates a 16x16-bit product into a
	/// 64-bit value into one of ARMISD::SMLALBB/BT/TB/TT.
	///
	/// \param AddcNode The low-half add; one operand must be an ISD::MUL.
	/// \param AddeNode The high-half add; one operand must be (sra Mul, 31).
	/// \returns SDValue(AddcNode, 0) after replacing the uses of both adds with
	/// the new node's results, or SDValue() when the pattern or subtarget does
	/// not qualify.
	static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        const ARMSubtarget *Subtarget) {
	  // Thumb needs the DSP extension; ARM mode needs at least v5TE.
	  if (Subtarget->isThumb()) {
	    if (!Subtarget->hasDSP())
	      return SDValue();
	  } else if (!Subtarget->hasV5TEOps())
	    return SDValue();

	  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
	  // accumulates the product into a 64-bit value. The 16-bit values will
	  // be sign extended somehow or SRA'd into 32-bit values
	  // (addc (adde (mul 16bit, 16bit), lo), hi)
	  SDValue Mul = AddcNode->getOperand(0);
	  SDValue Lo = AddcNode->getOperand(1);
	  if (Mul.getOpcode() != ISD::MUL) {
	    // The multiply may be on either side of the add.
	    Lo = AddcNode->getOperand(0);
	    Mul = AddcNode->getOperand(1);
	    if (Mul.getOpcode() != ISD::MUL)
	      return SDValue();
	  }

	  SDValue SRA = AddeNode->getOperand(0);
	  SDValue Hi = AddeNode->getOperand(1);
	  if (SRA.getOpcode() != ISD::SRA) {
	    SRA = AddeNode->getOperand(1);
	    Hi = AddeNode->getOperand(0);
	    if (SRA.getOpcode() != ISD::SRA)
	      return SDValue();
	  }
	  // The shift amount must be the constant 31.
	  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
	    if (Const->getZExtValue() != 31)
	      return SDValue();
	  } else
	    return SDValue();

	  // The value being shifted must be the same multiply the ADDC consumes.
	  if (SRA.getOperand(0) != Mul)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(AddcNode);
	  unsigned Opcode = 0;
	  SDValue Op0;
	  SDValue Op1;

	  // Choose the B/T variant per operand from the isS16 / isSRA16 checks.
	  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
	    Opcode = ARMISD::SMLALBB;
	    Op0 = Mul.getOperand(0);
	    Op1 = Mul.getOperand(1);
	  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
	    Opcode = ARMISD::SMLALBT;
	    Op0 = Mul.getOperand(0);
	    Op1 = Mul.getOperand(1).getOperand(0);
	  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
	    Opcode = ARMISD::SMLALTB;
	    Op0 = Mul.getOperand(0).getOperand(0);
	    Op1 = Mul.getOperand(1);
	  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
	    Opcode = ARMISD::SMLALTT;
	    Op0 = Mul->getOperand(0).getOperand(0);
	    Op1 = Mul->getOperand(1).getOperand(0);
	  }

	  // No variant matched.
	  if (!Op0 || !Op1)
	    return SDValue();

	  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
	                              Op0, Op1, Lo, Hi);
	  // Replace the ADDs' nodes uses by the MLA node's values.
	  SDValue HiMLALResult(SMLAL.getNode(), 1);
	  SDValue LoMLALResult(SMLAL.getNode(), 0);

	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);

	  // Return original node to notify the driver to stop replacing.
	  SDValue resNode(AddcNode, 0);
	  return resNode;
	}

	/// Try to merge a U/SMUL_LOHI plus the ADDC/ADDE (or SUBC/SUBE) pair that
	/// accumulates onto it into a single multiply-accumulate node.
	///
	/// \param AddeSubeNode The ARMISD::ADDE or ARMISD::SUBE node; its operand 2
	///                     must come from the matching ADDC / SUBC.
	/// \returns SDValue(AddeSubeNode, 0) after replacing uses with the new node's
	/// results, the result of AddCombineTo64BitSMLAL16 when the ADDC does not
	/// consume a MUL_LOHI, or SDValue() when nothing matches.
	static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const ARMSubtarget *Subtarget) {
	  // Look for multiply add opportunities.
	  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
	  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
	  // a glue link from the first add to the second add.
	  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
	  // a S/UMLAL instruction.
	  //                  UMUL_LOHI
	  //                 / :lo    \ :hi
	  //                V          \          [no multiline comment]
	  //    loAdd ->  ADDC         |
	  //                 \ :carry /
	  //                  V      V
	  //                    ADDE   <- hiAdd
	  //
	  // In the special case where only the higher part of a signed result is used
	  // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
	  // a constant with the exact value of 0x80000000, we recognize we are dealing
	  // with a "rounded multiply and add" (or subtract) and transform it into
	  // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.

	  assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
	          AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
	         "Expect an ADDE or SUBE");

	  assert(AddeSubeNode->getNumOperands() == 3 &&
	         AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
	         "ADDE node has the wrong inputs");

	  // Check that we are chained to the right ADDC or SUBC node.
	  SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
	  if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
	       AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
	      (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
	       AddcSubcNode->getOpcode() != ARMISD::SUBC))
	    return SDValue();

	  SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
	  SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);

	  // Check if the two operands are from the same mul_lohi node.
	  if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
	    return SDValue();

	  assert(AddcSubcNode->getNumValues() == 2 &&
	         AddcSubcNode->getValueType(0) == MVT::i32 &&
	         "Expect ADDC with two result values. First: i32");

	  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
	  // maybe a SMLAL which multiplies two 16-bit values.
	  if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
	      AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
	      AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
	      AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
	      AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
	    return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);

	  // Check for the triangle shape.
	  SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
	  SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);

	  // Make sure that the ADDE/SUBE operands are not coming from the same node.
	  if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
	    return SDValue();

	  // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
	  bool IsLeftOperandMUL = false;
	  SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
	  if (MULOp == SDValue())
	    MULOp = findMUL_LOHI(AddeSubeOp1);
	  else
	    IsLeftOperandMUL = true;
	  if (MULOp == SDValue())
	    return SDValue();

	  // Figure out the right opcode.
	  unsigned Opc = MULOp->getOpcode();
	  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;

	  // Figure out the high and low input values to the MLAL node.
	  SDValue *HiAddSub = nullptr;
	  SDValue *LoMul = nullptr;
	  SDValue *LowAddSub = nullptr;

	  // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
	  if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
	    return SDValue();

	  // The high addend is whichever ADDE/SUBE operand is not the multiply.
	  if (IsLeftOperandMUL)
	    HiAddSub = &AddeSubeOp1;
	  else
	    HiAddSub = &AddeSubeOp0;

	  // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
	  // whose low result is fed to the ADDC/SUBC we are checking.

	  if (AddcSubcOp0 == MULOp.getValue(0)) {
	    LoMul = &AddcSubcOp0;
	    LowAddSub = &AddcSubcOp1;
	  }
	  if (AddcSubcOp1 == MULOp.getValue(0)) {
	    LoMul = &AddcSubcOp1;
	    LowAddSub = &AddcSubcOp0;
	  }

	  if (!LoMul)
	    return SDValue();

	  // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
	  // the replacement below will create a cycle.
	  if (AddcSubcNode == HiAddSub->getNode() ||
	      AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
	    return SDValue();

	  // Create the merged node.
	  SelectionDAG &DAG = DCI.DAG;

	  // Start building operand list.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(LoMul->getOperand(0));
	  Ops.push_back(LoMul->getOperand(1));

	  // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
	  // the case, we must be doing signed multiplication and only use the higher
	  // part of the result of the MLAL, furthermore the LowAddSub must be a constant
	  // addition or subtraction with the value of 0x80000000.
	  if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
	      FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
	      LowAddSub->getNode()->getOpcode() == ISD::Constant &&
	      static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
	          0x80000000) {
	    Ops.push_back(*HiAddSub);
	    if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
	      FinalOpc = ARMISD::SMMLSR;
	    } else {
	      FinalOpc = ARMISD::SMMLAR;
	    }
	    SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);

	    return SDValue(AddeSubeNode, 0);
	  } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
	    // SMMLS is generated during instruction selection and the rest of this
	    // function can not handle the case where AddcSubcNode is a SUBC.
	    return SDValue();

	  // Finish building the operand list for {U/S}MLAL
	  Ops.push_back(*LowAddSub);
	  Ops.push_back(*HiAddSub);

	  SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
	                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);

	  // Replace the ADDs' nodes uses by the MLA node's values.
	  SDValue HiMLALResult(MLALNode.getNode(), 1);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);

	  SDValue LoMLALResult(MLALNode.getNode(), 0);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);

	  // Return original node to notify the driver to stop replacing.
	  return SDValue(AddeSubeNode, 0);
	}

	/// Try to turn an ADDC/ADDE pair that consumes an ARMISD::UMLAL into a single
	/// ARMISD::UMAAL.
	///
	/// UMAAL is similar to UMLAL except that it adds two unsigned values. The
	/// pattern handled here is an ADDC that uses a node which has already been
	/// combined into a UMLAL; the other pattern (a UMLAL using ADDC/ADDE as its
	/// addend) is handled in PerformUMLALCombine. Whenever UMAAL cannot apply
	/// (subtarget lacks the required features, or no UMLAL feeds the ADDC) the
	/// work is delegated to AddCombineTo64bitMLAL.
	///
	/// \returns SDValue(AddeNode, 0) after the uses were rewritten, the result of
	/// AddCombineTo64bitMLAL when delegating, or an empty SDValue when nothing
	/// matched.
	static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const ARMSubtarget *Subtarget) {
	  const bool CanUseUMAAL = Subtarget->hasV6Ops() && Subtarget->hasDSP();
	  if (!CanUseUMAAL)
	    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);

	  // Operand 2 of the ADDE must be produced by an ADDC.
	  SDNode *AddcNode = AddeNode->getOperand(2).getNode();
	  if (AddcNode->getOpcode() != ARMISD::ADDC)
	    return SDValue();

	  // Locate the UMLAL feeding the ADDC; the remaining ADDC operand becomes the
	  // extra addend of the UMAAL.
	  SDValue AddcLHS = AddcNode->getOperand(0);
	  SDValue AddcRHS = AddcNode->getOperand(1);
	  SDNode *UmlalNode = nullptr;
	  SDValue AddHi;
	  if (AddcLHS.getOpcode() == ARMISD::UMLAL) {
	    UmlalNode = AddcLHS.getNode();
	    AddHi = AddcRHS;
	  } else if (AddcRHS.getOpcode() == ARMISD::UMLAL) {
	    UmlalNode = AddcRHS.getNode();
	    AddHi = AddcLHS;
	  } else {
	    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
	  }

	  // Operand 3 of the UMLAL has to be zero.
	  if (!isNullConstant(UmlalNode->getOperand(3)))
	    return SDValue();

	  // The ADDE must add zero to a value of the very same UMLAL, in either
	  // operand order.
	  SDValue AddeLHS = AddeNode->getOperand(0);
	  SDValue AddeRHS = AddeNode->getOperand(1);
	  const bool ZeroPlusUmlal =
	      isNullConstant(AddeLHS) && AddeRHS.getNode() == UmlalNode;
	  const bool UmlalPlusZero =
	      AddeLHS.getNode() == UmlalNode && isNullConstant(AddeRHS);
	  if (!ZeroPlusUmlal && !UmlalPlusZero)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
	                    UmlalNode->getOperand(2), AddHi };
	  SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
	                              DAG.getVTList(MVT::i32, MVT::i32), Ops);

	  // Result 1 of the UMAAL stands in for the ADDE, result 0 for the ADDC.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0),
	                                SDValue(UMAAL.getNode(), 1));
	  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0),
	                                SDValue(UMAAL.getNode(), 0));

	  // Hand back the original node so the driver stops replacing.
	  return SDValue(AddeNode, 0);
	}

	/// Fold a UMLAL whose operands 2 and 3 are an ADDC / ADDE pair into a UMAAL.
	///
	/// The fold applies only when both addends of the ADDE are zero and the ADDE's
	/// operand 2 is that same ADDC. Requires V6 ops and DSP.
	///
	/// \returns the new ARMISD::UMAAL node, or an empty SDValue if nothing matched.
	static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
	                                   const ARMSubtarget *Subtarget) {
	  if (!(Subtarget->hasV6Ops() && Subtarget->hasDSP()))
	    return SDValue();

	  SDNode *Addc = N->getOperand(2).getNode();
	  SDNode *Adde = N->getOperand(3).getNode();

	  // Operands 2 and 3 must be an ADDC and an ADDE respectively.
	  if (Addc->getOpcode() != ARMISD::ADDC)
	    return SDValue();
	  if (Adde->getOpcode() != ARMISD::ADDE)
	    return SDValue();

	  // Both addends of the ADDE must be zero.
	  if (!isNullConstant(Adde->getOperand(0)) ||
	      !isNullConstant(Adde->getOperand(1)))
	    return SDValue();

	  // The ADDE must take its third operand from exactly this ADDC.
	  if (Adde->getOperand(2).getNode() != Addc)
	    return SDValue();

	  return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
	                     DAG.getVTList(MVT::i32, MVT::i32),
	                     {N->getOperand(0), N->getOperand(1),
	                      Addc->getOperand(0), Addc->getOperand(1)});
	}

	/// Target combines for ARMISD::ADDC / ARMISD::SUBC.
	///
	///  * (SUBC (ADDE 0, 0, C), 1) -> C
	///  * On Thumb1, a negative constant right-hand operand is negated and the
	///    opcode is flipped between ADDC and SUBC.
	static SDValue PerformAddcSubcCombine(SDNode *N,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG(DCI.DAG);

	  if (N->getOpcode() == ARMISD::SUBC) {
	    // (SUBC (ADDE 0, 0, C), 1) -> C
	    SDValue Minuend = N->getOperand(0);
	    SDValue Subtrahend = N->getOperand(1);
	    const bool IsCarryMaterialization =
	        Minuend->getOpcode() == ARMISD::ADDE &&
	        isNullConstant(Minuend->getOperand(0)) &&
	        isNullConstant(Minuend->getOperand(1)) && isOneConstant(Subtrahend);
	    if (IsCarryMaterialization)
	      return DCI.CombineTo(N, SDValue(N, 0), Minuend->getOperand(2));
	  }

	  if (!Subtarget->isThumb1Only())
	    return SDValue();

	  auto *ImmNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!ImmNode)
	    return SDValue();

	  // Only strictly negative immediates whose negation is representable are
	  // rewritten.
	  int32_t Imm = ImmNode->getSExtValue();
	  if (Imm >= 0 || Imm <= std::numeric_limits<int>::min())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue NegatedImm = DAG.getConstant(-Imm, DL, MVT::i32);
	  unsigned FlippedOpc =
	      (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC : ARMISD::ADDC;
	  return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
	                     NegatedImm);
	}

	/// Target combines for ARMISD::ADDE / ARMISD::SUBE.
	///
	/// On Thumb1 a negative constant right-hand operand is replaced by its bitwise
	/// complement and the opcode is flipped between ADDE and SUBE. On every other
	/// subtarget, a node whose operand 1 is an ISD::SMUL_LOHI is handed to
	/// AddCombineTo64bitMLAL.
	static SDValue PerformAddeSubeCombine(SDNode *N,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const ARMSubtarget *Subtarget) {
	  if (!Subtarget->isThumb1Only()) {
	    if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI)
	      return AddCombineTo64bitMLAL(N, DCI, Subtarget);
	    return SDValue();
	  }

	  auto *ImmNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!ImmNode)
	    return SDValue();

	  int64_t Imm = ImmNode->getSExtValue();
	  if (Imm >= 0)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  // The with-carry-in form matches bitwise not instead of the negation:
	  // the inverse interpretation of the carry flag already accounts for part
	  // of the negation.
	  SDValue InvertedImm = DAG.getConstant(~Imm, DL, MVT::i32);

	  unsigned FlippedOpc =
	      (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE : ARMISD::ADDE;
	  return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
	                     InvertedImm, N->getOperand(2));
	}

	/// Expand an ABS node through TargetLowering::expandABS when the operation is
	/// not legal for its value type.
	///
	/// \returns the expansion, or an empty SDValue if the operation is legal or
	/// the expansion failed.
	static SDValue PerformABSCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Nothing to do when the operation is already legal.
	  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
	    return SDValue();

	  SDValue Expanded;
	  return TLI.expandABS(N, Expanded, DAG) ? Expanded : SDValue();
	}

	/// PerformADDECombine - Target-specific dag combine for ARMISD::ADDE.
	///
	/// Thumb1 has no UMLAL/SMLAL, so it only gets the generic ADDE/SUBE combine.
	/// For ARM and Thumb2 the MLAL/UMAAL patterns are tried through
	/// AddCombineTo64bitUMAAL, but not before legalization.
	static SDValue PerformADDECombine(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const ARMSubtarget *Subtarget) {
	  // Only ARM and Thumb2 support UMLAL/SMLAL.
	  if (Subtarget->isThumb1Only())
	    return PerformAddeSubeCombine(N, DCI, Subtarget);

	  // The patterns are only looked for once legalization has run.
	  return DCI.isBeforeLegalize() ? SDValue()
	                                : AddCombineTo64bitUMAAL(N, DCI, Subtarget);
	}

	/// PerformADDCombineWithOperands - Run the ADD combines for one particular
	/// operand order (N0, N1). PerformADDCombine calls this first with the default
	/// order and, if that yields nothing, again with the operands commuted.
	///
	/// The combines are attempted in a fixed order and the first one that
	/// produces a value wins.
	static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
	                                             TargetLowering::DAGCombinerInfo &DCI,
	                                             const ARMSubtarget *Subtarget){
	  // vpadd first.
	  SDValue Folded = AddCombineToVPADD(N, N0, N1, DCI, Subtarget);

	  // Then the two vpaddl forms.
	  if (!Folded)
	    Folded = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget);
	  if (!Folded)
	    Folded = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, Subtarget);

	  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
	  if (!Folded && N0.getNode()->hasOneUse())
	    Folded = combineSelectAndUse(N, N0, N1, DCI);

	  return Folded;
	}

	/// Decide whether the generic combiner may commute the shift \p N with the
	/// binary operation feeding it.
	///
	/// \param N     the shift node being considered.
	/// \param Level the current combine level.
	/// \returns true when commuting is allowed. It is always allowed before type
	/// legalization and for anything that is not an ISD::SHL. For SHL on Thumb1 it
	/// is refused only when the inner ADD/AND/OR/XOR uses a small constant that
	/// commuting would turn into an expensive immediate. For SHL on ARM/Thumb2 it
	/// is refused so the transform does not fight with PerformSHLSimplify.
	bool
	ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
	                                                 CombineLevel Level) const {
	  if (Level == BeforeLegalizeTypes)
	    return true;

	  // From here on N is known to be an SHL.
	  if (N->getOpcode() != ISD::SHL)
	    return true;

	  if (Subtarget->isThumb1Only()) {
	    // Avoid making expensive immediates by commuting shifts. (This logic
	    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
	    // for free.)
	    SDValue N1 = N->getOperand(0);
	    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
	        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
	      return true;
	    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
	      // Constants in [0, 256) are kept next to their operation.
	      if (Const->getAPIntValue().ult(256))
	        return false;
	      // For ADD the same holds for constants in (-256, 0).
	      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
	          Const->getAPIntValue().sgt(-256))
	        return false;
	    }
	    return true;
	  }

	  // Turn off commute-with-shift transform after legalization, so it doesn't
	  // conflict with PerformSHLSimplify. (We could try to detect when
	  // PerformSHLSimplify would trigger more precisely, but it isn't
	  // really necessary.)
	  return false;
	}

	/// Report whether a pair of constant shifts should be folded into a mask.
	///
	/// Always true outside Thumb1; on Thumb1 only before type legalization.
	bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
	    const SDNode *N, CombineLevel Level) const {
	  return !Subtarget->isThumb1Only() || Level == BeforeLegalizeTypes;
	}

	/// Report whether the "inc of add" form is preferred over "sub of not" for
	/// values of type \p VT.
	///
	/// With NEON only scalar integers qualify. Without NEON every type qualifies,
	/// except that Thumb1 limits it to scalar sizes of at most 32 bits.
	bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
	  if (Subtarget->hasNEON())
	    return VT.isScalarInteger();

	  return !Subtarget->isThumb1Only() || VT.getScalarSizeInBits() <= 32;
	}

	/// Undo the generic "commute constant with shl" fold when every user of \p N
	/// can perform the shift itself.
	///
	/// The DAG combiner folds
	///   (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
	///   (shl (or x, c1), c2)  -> (or (shl x, c2), c1 << c2)
	/// and similar shapes exist for AND and XOR, e.g. b + ((a << 1) | 510).
	/// Many instructions can perform the shift for free, but only when both
	/// operands are registers, and a large c1 << c2 needs a mov immediate. So
	/// when N is (op (shl x, c2), c1 << c2), this rebuilds (shl (op x, c1), c2)
	/// provided that:
	///  - c1 and c2 are small enough not to require mov immediates, and
	///  - every user of N can perform an shl itself.
	///
	/// \returns the rebuilt shl node, or an empty SDValue if nothing was done.
	static SDValue PerformSHLSimplify(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const ARMSubtarget *ST) {
	  // Allow the generic combiner to identify potential bswaps.
	  if (DCI.isBeforeLegalize())
	    return SDValue();

	  // No shifted operands for 16-bit instructions.
	  if (ST->isThumb() && ST->isThumb1Only())
	    return SDValue();

	  // A user qualifies when it is one of the listed opcodes, neither of its
	  // first two operands is a constant (no instruction supports an immediate
	  // operand together with a shifted operand), and it is not already using a
	  // shift.
	  auto UserCanAbsorbShl = [](SDNode *User) {
	    switch (User->getOpcode()) {
	    case ISD::SUB:
	    case ISD::ADD:
	    case ISD::AND:
	    case ISD::OR:
	    case ISD::XOR:
	    case ISD::SETCC:
	    case ARMISD::CMP:
	      break;
	    default:
	      return false;
	    }

	    SDValue LHS = User->getOperand(0);
	    SDValue RHS = User->getOperand(1);
	    if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
	      return false;
	    return LHS.getOpcode() != ISD::SHL && RHS.getOpcode() != ISD::SHL;
	  };

	  for (SDNode *User : N->uses())
	    if (!UserCanAbsorbShl(User))
	      return SDValue();

	  switch (N->getOpcode()) {
	  case ISD::ADD:
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::AND:
	    break;
	  default:
	    return SDValue();
	  }

	  SDValue ShlVal = N->getOperand(0);
	  if (ShlVal.getOpcode() != ISD::SHL)
	    return SDValue();

	  auto *ShiftedImm = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  auto *ShiftAmt = dyn_cast<ConstantSDNode>(ShlVal.getOperand(1));
	  if (!ShiftedImm || !ShiftAmt)
	    return SDValue();

	  APInt ShiftAmtInt = ShiftAmt->getAPIntValue();
	  APInt ImmInt = ShiftedImm->getAPIntValue();

	  // Shifting the immediate right must not drop any set bit.
	  const unsigned AmtWidth = ShiftAmtInt.getBitWidth();
	  APInt KeptBits =
	      APInt::getHighBitsSet(AmtWidth, AmtWidth - ShiftAmt->getZExtValue());
	  if ((ImmInt & KeptBits) != ImmInt)
	    return SDValue();

	  // Recover c1 from c1 << c2.
	  ImmInt.lshrInPlace(ShiftAmtInt);

	  // The immediates are encoded as an 8-bit value that can be rotated.
	  auto NeedsMoreThan8Bits = [](const APInt &Imm) {
	    unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
	    return Imm.getBitWidth() - Zeros > 8;
	  };

	  if (NeedsMoreThan8Bits(ImmInt) || NeedsMoreThan8Bits(ShiftAmtInt))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);
	  SDValue X = ShlVal.getOperand(0);
	  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
	                              DAG.getConstant(ImmInt, dl, MVT::i32));
	  // Shift left again to compensate for the lshr applied to the immediate.
	  SDValue Res =
	      DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, ShlVal.getOperand(1));

	  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; ShlVal.getOperand(0).dump();
	             ShlVal.dump(); N->dump());
	  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
	  return Res;
	}


	/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
	///
	/// Tries PerformSHLSimplify first, then the operand-order-sensitive combines
	/// with the default operand order, and finally with the operands commuted.
	static SDValue PerformADDCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Only works one way, because it needs an immediate operand.
	  SDValue Combined = PerformSHLSimplify(N, DCI, Subtarget);

	  // Default operand order.
	  if (!Combined)
	    Combined = PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget);

	  // Commuted operand order as the last resort.
	  if (!Combined)
	    Combined = PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget);

	  return Combined;
	}

	/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
	///
	/// fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)), attempted
	/// only when the subtrahend has a single use.
	static SDValue PerformSUBCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Minuend = N->getOperand(0);
	  SDValue Subtrahend = N->getOperand(1);

	  if (!Subtrahend.getNode()->hasOneUse())
	    return SDValue();

	  return combineSelectAndUse(N, Subtrahend, Minuend, DCI);
	}

	/// PerformVMULCombine
	/// Distribute (A + B) * C to (A * C) + (B * C) so that the special multiplier
	/// accumulator forwarding can be used:
	///   vmul d3, d0, d2
	///   vmla d3, d1, d2
	/// is faster than
	///   vadd d3, d0, d1
	///   vmul d3, d3, d2
	/// The exception is (A + B) * (A + B), where
	///   vadd d2, d0, d1
	///   vmul d3, d0, d2
	///   vmla d3, d1, d2
	/// is slower than
	///   vadd d2, d0, d1
	///   vmul d3, d2, d2
	/// so that case is left untouched. Only done when the subtarget reports VMLx
	/// forwarding.
	static SDValue PerformVMULCombine(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const ARMSubtarget *Subtarget) {
	  if (!Subtarget->hasVMLxForwarding())
	    return SDValue();

	  auto IsAddOrSub = [](unsigned Opc) {
	    return Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::FADD ||
	           Opc == ISD::FSUB;
	  };

	  // Arrange for Sum to be the add/sub operand and Factor the other one.
	  SDValue Sum = N->getOperand(0);
	  SDValue Factor = N->getOperand(1);
	  unsigned Opcode = Sum.getOpcode();
	  if (!IsAddOrSub(Opcode)) {
	    Opcode = Factor.getOpcode();
	    if (!IsAddOrSub(Opcode))
	      return SDValue();
	    std::swap(Sum, Factor);
	  }

	  // (A + B) * (A + B) is better left alone.
	  if (Sum == Factor)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  SDValue A = Sum->getOperand(0);
	  SDValue B = Sum->getOperand(1);
	  return DAG.getNode(Opcode, DL, VT,
	                     DAG.getNode(ISD::MUL, DL, VT, A, Factor),
	                     DAG.getNode(ISD::MUL, DL, VT, B, Factor));
	}

	/// Target combine for ISD::MUL.
	///
	/// 64- and 128-bit vector multiplies are forwarded to PerformVMULCombine. An
	/// i32 multiply by a constant of the form +/-(2^N + 1) << S or
	/// +/-(2^N - 1) << S is rewritten as shift/add/sub. Nothing is done on
	/// Thumb1, before legalization, or when called by the legalizer.
	///
	/// \returns the vector combine's result for vectors; otherwise always an
	/// empty SDValue, because the scalar replacement is installed through
	/// DCI.CombineTo.
	static SDValue PerformMULCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  if (Subtarget->isThumb1Only())
	    return SDValue();

	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT.is64BitVector() || VT.is128BitVector())
	    return PerformVMULCombine(N, DCI, Subtarget);
	  if (VT != MVT::i32)
	    return SDValue();

	  auto *MulImm = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!MulImm)
	    return SDValue();

	  // Split the multiplier into (odd part) << TrailingShift.
	  int64_t MulAmt = MulImm->getSExtValue();
	  const unsigned TrailingShift =
	      countTrailingZeros<uint64_t>(MulAmt) & (32 - 1);
	  MulAmt >>= TrailingShift;

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue X = N->getOperand(0);
	  SDLoc DL(N);

	  // Builds (shl Val, Amt) with an i32 shift amount.
	  auto ShiftLeft = [&](SDValue Val, unsigned Amt) {
	    return DAG.getNode(ISD::SHL, DL, VT, Val,
	                       DAG.getConstant(Amt, DL, MVT::i32));
	  };

	  SDValue Res;
	  if (MulAmt >= 0) {
	    if (isPowerOf2_32(MulAmt - 1)) {
	      // (mul x, 2^N + 1) => (add (shl x, N), x)
	      Res = DAG.getNode(ISD::ADD, DL, VT, X,
	                        ShiftLeft(X, Log2_32(MulAmt - 1)));
	    } else if (isPowerOf2_32(MulAmt + 1)) {
	      // (mul x, 2^N - 1) => (sub (shl x, N), x)
	      Res = DAG.getNode(ISD::SUB, DL, VT,
	                        ShiftLeft(X, Log2_32(MulAmt + 1)), X);
	    } else {
	      return SDValue();
	    }
	  } else {
	    uint64_t MulAmtAbs = -MulAmt;
	    if (isPowerOf2_32(MulAmtAbs + 1)) {
	      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
	      Res = DAG.getNode(ISD::SUB, DL, VT, X,
	                        ShiftLeft(X, Log2_32(MulAmtAbs + 1)));
	    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
	      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
	      Res = DAG.getNode(ISD::ADD, DL, VT, X,
	                        ShiftLeft(X, Log2_32(MulAmtAbs - 1)));
	      Res = DAG.getNode(ISD::SUB, DL, VT,
	                        DAG.getConstant(0, DL, MVT::i32), Res);
	    } else {
	      return SDValue();
	    }
	  }

	  // Re-apply the power-of-two part of the multiplier.
	  if (TrailingShift != 0)
	    Res = ShiftLeft(Res, TrailingShift);

	  // Do not add new nodes to DAG combiner worklist.
	  DCI.CombineTo(N, Res, false);
	  return SDValue();
	}

	/// Rewrite "(and (shl x, c2), c1)" or "(and (srl x, c2), c1)" on i32 as a pair
	/// of shifts when c1 is a (shifted) mask, so that c1 does not have to be
	/// materialized.
	///
	/// Nothing is done before legalization or when called by the legalizer, for
	/// the uxtb/uxth masks (255 / 65535), when the shift has more than one use, or
	/// when the shift amount is not a constant in [1, 31].
	///
	/// \returns the replacement shift pair, or an empty SDValue if no pattern
	/// applied.
	static SDValue CombineANDShift(SDNode *N,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const ARMSubtarget *Subtarget) {
	  // Allow DAGCombine to pattern-match before we touch the canonical form.
	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  if (N->getValueType(0) != MVT::i32)
	    return SDValue();

	  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!N1C)
	    return SDValue();

	  uint32_t C1 = (uint32_t)N1C->getZExtValue();
	  // Don't transform uxtb/uxth.
	  if (C1 == 255 || C1 == 65535)
	    return SDValue();

	  SDNode *N0 = N->getOperand(0).getNode();
	  if (!N0->hasOneUse())
	    return SDValue();

	  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
	    return SDValue();

	  bool LeftShift = N0->getOpcode() == ISD::SHL;

	  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	  if (!N01C)
	    return SDValue();

	  uint32_t C2 = (uint32_t)N01C->getZExtValue();
	  if (!C2 || C2 >= 32)
	    return SDValue();

	  // Clear irrelevant bits in the mask: the shift already zeroed them.
	  if (LeftShift)
	    C1 &= (-1U << C2);
	  else
	    C1 &= (-1U >> C2);

	  // If no mask bit survives, the AND is simply zero. Bail out: otherwise the
	  // "left shift, then mask off trailing bits" pattern below would see
	  // isMask_32(~0u) and countTrailingZeros(0) == 32, and emit shifts by the
	  // full bit width.
	  if (C1 == 0)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  // We have a pattern of the form "(and (shl x, c2) c1)" or
	  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
	  // transform to a pair of shifts, to save materializing c1.

	  // First pattern: right shift, then mask off leading bits.
	  // FIXME: Use demanded bits?
	  if (!LeftShift && isMask_32(C1)) {
	    uint32_t C3 = countLeadingZeros(C1);
	    if (C2 < C3) {
	      SDValue Inner = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
	                                  DAG.getConstant(C3 - C2, DL, MVT::i32));
	      return DAG.getNode(ISD::SRL, DL, MVT::i32, Inner,
	                         DAG.getConstant(C3, DL, MVT::i32));
	    }
	  }

	  // First pattern, reversed: left shift, then mask off trailing bits.
	  if (LeftShift && isMask_32(~C1)) {
	    uint32_t C3 = countTrailingZeros(C1);
	    if (C2 < C3) {
	      SDValue Inner = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
	                                  DAG.getConstant(C3 - C2, DL, MVT::i32));
	      return DAG.getNode(ISD::SHL, DL, MVT::i32, Inner,
	                         DAG.getConstant(C3, DL, MVT::i32));
	    }
	  }

	  // Second pattern: left shift, then mask off leading bits.
	  // FIXME: Use demanded bits?
	  if (LeftShift && isShiftedMask_32(C1)) {
	    uint32_t Trailing = countTrailingZeros(C1);
	    uint32_t C3 = countLeadingZeros(C1);
	    if (Trailing == C2 && C2 + C3 < 32) {
	      SDValue Inner = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
	                                  DAG.getConstant(C2 + C3, DL, MVT::i32));
	      return DAG.getNode(ISD::SRL, DL, MVT::i32, Inner,
	                         DAG.getConstant(C3, DL, MVT::i32));
	    }
	  }

	  // Second pattern, reversed: right shift, then mask off trailing bits.
	  // FIXME: Handle other patterns of known/demanded bits.
	  if (!LeftShift && isShiftedMask_32(C1)) {
	    uint32_t Leading = countLeadingZeros(C1);
	    uint32_t C3 = countTrailingZeros(C1);
	    if (Leading == C2 && C2 + C3 < 32) {
	      SDValue Inner = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
	                                  DAG.getConstant(C2 + C3, DL, MVT::i32));
	      return DAG.getNode(ISD::SHL, DL, MVT::i32, Inner,
	                         DAG.getConstant(C3, DL, MVT::i32));
	    }
	  }

	  // FIXME: Transform "(and (shl x, c2) c1)" ->
	  // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
	  // c1.
	  return SDValue();
	}

	/// Target combine for ISD::AND.
	///
	/// In order:
	///  * with NEON, an AND with a constant splat becomes an immediate-form VBIC
	///    when the complemented splat is a valid modified immediate;
	///  * on Thumb1, CombineANDShift is tried;
	///  * elsewhere, the select fold and then PerformSHLSimplify are tried.
	/// Illegal value types are left alone.
	static SDValue PerformANDCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDLoc dl(N);

	  // Attempt to use immediate-form VBIC
	  auto *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (BVN && Subtarget->hasNEON() &&
	      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                           HasAnyUndefs) &&
	      SplatBitSize <= 64) {
	    EVT VbicVT;
	    SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
	                                    SplatUndef.getZExtValue(), SplatBitSize,
	                                    DAG, dl, VbicVT, VT.is128BitVector(),
	                                    OtherModImm);
	    if (Val.getNode()) {
	      SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
	      SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
	      return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
	    }
	  }

	  if (Subtarget->isThumb1Only())
	    return CombineANDShift(N, DCI, Subtarget);

	  // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
	  if (SDValue Folded = combineSelectAndUseCommutative(N, true, DCI))
	    return Folded;

	  return PerformSHLSimplify(N, DCI, Subtarget);
	}

	/// Try combining OR nodes to SMULWB, SMULWT.
	///
	/// Matches
	///   (or (srl (smul_lohi a, b):0, 16), (shl (smul_lohi a, b):1, 16))
	/// in either operand order, where both shifts read the same smul_lohi. One
	/// multiplicand must satisfy isS16 (-> SMULWB) or isSRA16 (-> SMULWT, using
	/// the value underneath the sra); the other one is passed as the first
	/// operand of the new node.
	///
	/// \returns SDValue(OR, 0) after all uses were replaced, or an empty SDValue.
	static SDValue PerformORCombineToSMULWBT(SDNode *OR,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const ARMSubtarget *Subtarget) {
	  const bool Supported =
	      Subtarget->hasV6Ops() &&
	      (!Subtarget->isThumb() ||
	       (Subtarget->hasThumb2() && Subtarget->hasDSP()));
	  if (!Supported)
	    return SDValue();

	  // Identify which OR operand is the srl and which is the shl.
	  SDValue LoPart = OR->getOperand(0);
	  SDValue HiPart = OR->getOperand(1);
	  if (LoPart.getOpcode() != ISD::SRL || HiPart.getOpcode() != ISD::SHL) {
	    LoPart = OR->getOperand(1);
	    HiPart = OR->getOperand(0);
	  }
	  if (!isSRL16(LoPart) || !isSHL16(HiPart))
	    return SDValue();

	  // The first operands to the shifts need to be the two results from the
	  // same smul_lohi node.
	  SDValue SrlSrc = LoPart.getOperand(0);
	  SDValue ShlSrc = HiPart.getOperand(0);
	  if (SrlSrc.getNode() != ShlSrc.getNode() ||
	      SrlSrc.getOpcode() != ISD::SMUL_LOHI)
	    return SDValue();

	  // The srl must read result 0 and the shl result 1.
	  SDNode *Mul = SrlSrc.getNode();
	  if (SrlSrc != SDValue(Mul, 0) || ShlSrc != SDValue(Mul, 1))
	    return SDValue();

	  // Work out which multiplicand is the 16-bit one; check both sides of the
	  // smul_lohi.
	  SDValue Narrow = Mul->getOperand(0);
	  SDValue Wide = Mul->getOperand(1);

	  SelectionDAG &DAG = DCI.DAG;
	  if (!isS16(Narrow, DAG) && !isSRA16(Narrow)) {
	    Narrow = Wide;
	    Wide = Mul->getOperand(0);
	  }

	  SDLoc dl(OR);
	  unsigned Opcode = 0;
	  if (isS16(Narrow, DAG)) {
	    Opcode = ARMISD::SMULWB;
	  } else if (isSRA16(Narrow)) {
	    // For SMULWT only the value being shifted by the sra is needed.
	    Opcode = ARMISD::SMULWT;
	    Narrow = Narrow->getOperand(0);
	  } else {
	    return SDValue();
	  }

	  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, Wide, Narrow);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
	  return SDValue(OR, 0);
	}

	/// Try to turn an i32 OR whose first operand is (and A, mask) into an
	/// ARMISD::BFI (bitfield insert).
	///
	/// Patterns:
	///   1) or (and A, mask), val => ARMbfi A, val, mask
	///        iff (val & mask) == val
	///   2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
	///     2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
	///         && mask == ~mask2
	///     2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
	///         && ~mask == mask2
	///      (i.e., copy a bitfield value into another bitfield of the same width)
	///   3) or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
	///        where lsb(mask) == #shamt and the masked bits of B are known zero.
	///
	/// The function reads operands of N's first operand without checking its
	/// opcode, so it relies on the caller having established that it is an AND.
	///
	/// \returns SDValue(N, 0) once N has been replaced through DCI.CombineTo, or
	/// an empty SDValue if no pattern applied.
	static SDValue PerformORCombineToBFI(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const ARMSubtarget *Subtarget) {
	  // BFI is only available on V6T2+
	  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32)
	    return SDValue();

	  SDValue AndNode = N->getOperand(0);
	  SDValue Other = N->getOperand(1);
	  SDValue Base = AndNode.getOperand(0);

	  // The value and the mask need to be constants so we can verify this is
	  // actually a bitfield set. If the mask is 0xffff, we can do better
	  // via a movt instruction, so don't use BFI in that case.
	  auto *MaskC = dyn_cast<ConstantSDNode>(AndNode.getOperand(1));
	  if (!MaskC)
	    return SDValue();
	  unsigned Mask = MaskC->getZExtValue();
	  if (Mask == 0xffff)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  auto I32Imm = [&](unsigned V) { return DAG.getConstant(V, DL, MVT::i32); };

	  // Installs the replacement and returns the value of the original node to
	  // inform the combiner that N is now dead.
	  auto ReplaceWith = [&](SDValue Bfi) {
	    DCI.CombineTo(N, Bfi, false);
	    return SDValue(N, 0);
	  };

	  // The pack halfword instruction works better for masks that fit it.
	  auto IsHalfwordMask = [](unsigned M) {
	    return M == 0xffff || M == 0xffff0000;
	  };

	  if (auto *ValC = dyn_cast<ConstantSDNode>(Other)) {
	    // Case (1): or (and A, mask), val => ARMbfi A, val, mask
	    unsigned Val = ValC->getZExtValue();
	    if ((Val & ~Mask) != Val)
	      return SDValue();

	    if (ARM::isBitFieldInvertedMask(Mask)) {
	      Val >>= countTrailingZeros(~Mask);
	      return ReplaceWith(DAG.getNode(ARMISD::BFI, DL, VT, Base, I32Imm(Val),
	                                     I32Imm(Mask)));
	    }
	  } else if (Other.getOpcode() == ISD::AND) {
	    // Case (2): or (and A, mask), (and B, mask2)
	    //   => ARMbfi A, (lsr B, amt), mask
	    auto *Mask2C = dyn_cast<ConstantSDNode>(Other.getOperand(1));
	    if (!Mask2C)
	      return SDValue();
	    unsigned Mask2 = Mask2C->getZExtValue();

	    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
	    // as is to match.
	    if (ARM::isBitFieldInvertedMask(Mask) && (Mask == ~Mask2)) {
	      // 2a: prefer pack halfword when it is available.
	      if (Subtarget->hasDSP() && IsHalfwordMask(Mask))
	        return SDValue();

	      unsigned Amt = countTrailingZeros(Mask2);
	      SDValue Field =
	          DAG.getNode(ISD::SRL, DL, VT, Other.getOperand(0), I32Imm(Amt));
	      return ReplaceWith(
	          DAG.getNode(ARMISD::BFI, DL, VT, Base, Field, I32Imm(Mask)));
	    }

	    if (ARM::isBitFieldInvertedMask(~Mask) && (~Mask == Mask2)) {
	      // 2b: prefer pack halfword when it is available.
	      if (Subtarget->hasDSP() && IsHalfwordMask(Mask2))
	        return SDValue();

	      unsigned Lsb = countTrailingZeros(Mask);
	      SDValue Field = DAG.getNode(ISD::SRL, DL, VT, Base, I32Imm(Lsb));
	      return ReplaceWith(DAG.getNode(ARMISD::BFI, DL, VT, Other.getOperand(0),
	                                     Field, I32Imm(Mask2)));
	    }
	  }

	  // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
	  // where lsb(mask) == #shamt and masked bits of B are known zero.
	  const bool MatchesShiftedInsert =
	      DAG.MaskedValueIsZero(Other, MaskC->getAPIntValue()) &&
	      Base.getOpcode() == ISD::SHL &&
	      isa<ConstantSDNode>(Base.getOperand(1)) &&
	      ARM::isBitFieldInvertedMask(~Mask);
	  if (!MatchesShiftedInsert)
	    return SDValue();

	  unsigned ShAmtC = cast<ConstantSDNode>(Base.getOperand(1))->getZExtValue();
	  unsigned LSB = countTrailingZeros(Mask);
	  if (ShAmtC != LSB)
	    return SDValue();

	  return ReplaceWith(DAG.getNode(ARMISD::BFI, DL, VT, Other,
	                                 Base.getOperand(0), I32Imm(~Mask)));
	}

	/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
	///
	/// In order:
	///  * with NEON, an OR with a constant splat becomes an immediate-form VORR
	///    when the splat is a valid modified immediate;
	///  * outside Thumb1, the select fold and the SMULWB/SMULWT fold;
	///  * with NEON, (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a
	///    constant splat;
	///  * the BFI (bitfield insert) fold;
	///  * PerformSHLSimplify.
	/// Illegal value types are left alone.
	static SDValue PerformORCombine(SDNode *N,
	                                TargetLowering::DAGCombinerInfo &DCI,
	                                const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDLoc dl(N);

	  // Attempt to use immediate-form VORR
	  {
	    auto *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
	    APInt SplatBits, SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    if (BVN && Subtarget->hasNEON() &&
	        BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                             HasAnyUndefs) &&
	        SplatBitSize <= 64) {
	      EVT VorrVT;
	      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
	                                      SplatUndef.getZExtValue(), SplatBitSize,
	                                      DAG, dl, VorrVT, VT.is128BitVector(),
	                                      OtherModImm);
	      if (Val.getNode()) {
	        SDValue Input =
	            DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
	        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
	        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
	      }
	    }
	  }

	  if (!Subtarget->isThumb1Only()) {
	    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
	    if (SDValue Folded = combineSelectAndUseCommutative(N, false, DCI))
	      return Folded;
	    if (SDValue Folded = PerformORCombineToSMULWBT(N, DCI, Subtarget))
	      return Folded;
	  }

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
	  if (Subtarget->hasNEON() && RHS.getOpcode() == ISD::AND && VT.isVector() &&
	      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {

	    // The code below optimizes (or (and X, Y), Z).
	    // The AND operand needs to have a single user to make these optimizations
	    // profitable.
	    if (LHS.getOpcode() != ISD::AND || !LHS.hasOneUse())
	      return SDValue();

	    APInt SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    APInt MaskBits, InvMaskBits;
	    auto *MaskBV = dyn_cast<BuildVectorSDNode>(LHS->getOperand(1));
	    auto *InvMaskBV = dyn_cast<BuildVectorSDNode>(RHS->getOperand(1));

	    // Both ANDs need a fully defined constant splat as second operand, the
	    // two splats must have the same bit width, and they must be logical
	    // inverses of each other.
	    if (MaskBV &&
	        MaskBV->isConstantSplat(MaskBits, SplatUndef, SplatBitSize,
	                                HasAnyUndefs) &&
	        !HasAnyUndefs && InvMaskBV &&
	        InvMaskBV->isConstantSplat(InvMaskBits, SplatUndef, SplatBitSize,
	                                   HasAnyUndefs) &&
	        !HasAnyUndefs &&
	        MaskBits.getBitWidth() == InvMaskBits.getBitWidth() &&
	        MaskBits == ~InvMaskBits) {
	      // Canonicalize the vector type to make instruction selection
	      // simpler.
	      EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
	      SDValue Select = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
	                                   LHS->getOperand(1),
	                                   LHS->getOperand(0),
	                                   RHS->getOperand(0));
	      return DAG.getNode(ISD::BITCAST, dl, VT, Select);
	    }
	  }

	  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
	  // reasonable.
	  if (LHS.getOpcode() == ISD::AND && LHS.hasOneUse())
	    if (SDValue Bfi = PerformORCombineToBFI(N, DCI, Subtarget))
	      return Bfi;

	  return PerformSHLSimplify(N, DCI, Subtarget);
	}

	/// Target combine for ISD::XOR.
	///
	/// For legal types outside Thumb1, tries the select fold and then
	/// PerformSHLSimplify. Does nothing otherwise.
	static SDValue PerformXORCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  if (Subtarget->isThumb1Only())
	    return SDValue();

	  // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
	  if (SDValue Folded = combineSelectAndUseCommutative(N, false, DCI))
	    return Folded;

	  return PerformSHLSimplify(N, DCI, Subtarget);
	}

	// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and
	// return it. ToMask receives the complement of the BFI's mask operand, i.e.
	// the bits written in "to" (Rd). FromMask receives the same number of
	// consecutive bits, describing which bits of "from" are extracted.
	//
	// When "from" is itself (srl X, #C) with a constant C, X is returned instead
	// and FromMask is moved up by C, so that it is expressed in terms of X.
	static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
	  assert(N->getOpcode() == ARMISD::BFI);

	  SDValue From = N->getOperand(1);
	  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
	  FromMask =
	      APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());

	  // Nothing more to do unless the source is a right shift by a constant.
	  if (From->getOpcode() != ISD::SRL)
	    return From;
	  auto *ShiftC = dyn_cast<ConstantSDNode>(From->getOperand(1));
	  if (!ShiftC)
	    return From;

	  // Look through the shift: the inserted bits really come from higher up in
	  // the shift's input.
	  const APInt &Shift = ShiftC->getAPIntValue();
	  assert(Shift.getLimitedValue() < 32 && "Shift too large!");
	  FromMask <<= Shift.getLimitedValue(31);
	  return From->getOperand(0);
	}

	// Returns true when the lowest set bit of A sits directly above the highest
	// set bit of B. For masks that each hold one contiguous run of bits this
	// answers: does A | B == A . B (the runs concatenate, A on top)?
	//
	// Neither A nor B must be zero.
	static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
	  const unsigned LowestSetBitOfA = A.countTrailingZeros();
	  const unsigned HighestSetBitOfB =
	      B.getBitWidth() - B.countLeadingZeros() - 1;
	  return HighestSetBitOfB == LowestSetBitOfA - 1;
	}

	/// Given the ARMISD::BFI node N, walk down the chain of BFIs feeding its
	/// destination operand (operand 0) and return the first one N can be combined
	/// with, or an empty SDValue if there is none.
	///
	/// A candidate must insert from the same source value as N, and both its
	/// destination mask and its source mask must concatenate with N's (on either
	/// side). BFIs with a different source are stepped over, but the destination
	/// bits written by every BFI passed so far are accumulated: a same-source BFI
	/// that writes any of those bits ends the search.
	static SDValue FindBFIToCombineWith(SDNode *N) {
	  APInt ToMask, FromMask;
	  SDValue From = ParseBFI(N, ToMask, FromMask);

	  // Destination bits written by N and by each BFI already walked past.
	  APInt WrittenBits = ToMask;

	  for (SDValue Cand = N->getOperand(0); Cand.getOpcode() == ARMISD::BFI;
	       Cand = Cand.getOperand(0)) {
	    APInt CandToMask, CandFromMask;
	    SDValue CandFrom = ParseBFI(Cand.getNode(), CandToMask, CandFromMask);

	    // A BFI with a different base is only recorded and skipped.
	    if (CandFrom == From) {
	      // Conflicting destination bits - bail out because going further is
	      // unsafe.
	      if ((CandToMask & WrittenBits).getBoolValue())
	        return SDValue();

	      // Are the new bits contiguous with ours, in either order?
	      const bool OursOnTop = BitsProperlyConcatenate(ToMask, CandToMask) &&
	                             BitsProperlyConcatenate(FromMask, CandFromMask);
	      if (OursOnTop || (BitsProperlyConcatenate(CandToMask, ToMask) &&
	                        BitsProperlyConcatenate(CandFromMask, FromMask)))
	        return Cand;
	    }

	    // We've seen a write to some bits, so track it before moving on.
	    WrittenBits |= CandToMask;
	  }

	  return SDValue();
	}

	/// PerformBFICombine - Target-specific dag combine xforms for the BFI node N.
	///
	/// Two rewrites are attempted:
	///  - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) when the AND only
	///    clears bits the BFI does not demand;
	///  - a BFI whose destination is itself a BFI is merged with a compatible BFI
	///    further up the chain (see FindBFIToCombineWith).
	/// Returns the replacement node, or an empty SDValue if nothing applies.
	static SDValue PerformBFICombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Inserted = N->getOperand(1);

	  if (Inserted.getOpcode() == ISD::AND) {
	    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
	    // the bits being cleared by the AND are not demanded by the BFI.
	    ConstantSDNode *AndMaskC = dyn_cast<ConstantSDNode>(Inserted.getOperand(1));
	    if (!AndMaskC)
	      return SDValue();

	    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
	    unsigned LSB = countTrailingZeros(~InvMask);
	    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
	    assert(Width <
	               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
	           "undefined behavior");
	    // Low Width bits: the part of the inserted value the BFI actually reads.
	    unsigned DemandedBits = (1u << Width) - 1;
	    unsigned AndMask = AndMaskC->getZExtValue();
	    if ((DemandedBits & ~AndMask) != 0)
	      return SDValue();

	    return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
	                       N->getOperand(0), Inserted.getOperand(0),
	                       N->getOperand(2));
	  }

	  if (N->getOperand(0).getOpcode() != ARMISD::BFI)
	    return SDValue();

	  // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
	  // Keep track of any consecutive bits set that all come from the same base
	  // value. We can combine these together into a single BFI.
	  SDValue Partner = FindBFIToCombineWith(N);
	  if (Partner == SDValue())
	    return SDValue();

	  // We've found a BFI to merge with; decompose both.
	  APInt ToMaskN, FromMaskN;
	  SDValue Source = ParseBFI(N, ToMaskN, FromMaskN);

	  APInt ToMaskPartner, FromMaskPartner;
	  SDValue PartnerSource =
	      ParseBFI(Partner.getNode(), ToMaskPartner, FromMaskPartner);
	  assert(Source == PartnerSource);
	  (void)PartnerSource;

	  // First, unlink the partner: its users now see its destination operand.
	  DAG.ReplaceAllUsesWith(Partner, Partner.getOperand(0));

	  // Then create a new BFI, combining the two together.
	  APInt MergedFromMask = FromMaskN | FromMaskPartner;
	  APInt MergedToMask = ToMaskN | ToMaskPartner;

	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);

	  // If the merged source field does not start at bit 0, shift it down first.
	  if (MergedFromMask[0] == 0)
	    Source = DAG.getNode(
	        ISD::SRL, dl, VT, Source,
	        DAG.getConstant(MergedFromMask.countTrailingZeros(), dl, VT));

	  // N's operand 0 is read after the unlinking above.
	  return DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), Source,
	                     DAG.getConstant(~MergedToMask, dl, VT));
	}

	/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
	/// ARMISD::VMOVRRD.
	///
	/// Handles two shapes of the f64 input:
	///  - a VMOVDRR (when the subtarget has FP64): the two halves are forwarded;
	///  - a single-use, non-volatile, normal f64 load from a frame index: it is
	///    replaced by two i32 loads at offsets 0 and 4.
	static SDValue PerformVMOVRRDCombine(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const ARMSubtarget *Subtarget) {
	  SDValue Input = N->getOperand(0);

	  // vmovrrd(vmovdrr x, y) -> x,y
	  if (Input.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
	    return DCI.CombineTo(N, Input.getOperand(0), Input.getOperand(1));

	  // vmovrrd(load f64) -> (load i32), (load i32)
	  // TODO: Should this be done for non-FrameIndex operands?
	  SDNode *InputNode = Input.getNode();
	  if (!ISD::isNormalLoad(InputNode) || !InputNode->hasOneUse() ||
	      InputNode->getValueType(0) != MVT::f64 ||
	      InputNode->getOperand(1).getOpcode() != ISD::FrameIndex ||
	      cast<LoadSDNode>(InputNode)->isVolatile())
	    return SDValue();

	  LoadSDNode *Load = cast<LoadSDNode>(InputNode);
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(Load);
	  SDValue Base = Load->getBasePtr();

	  // Word at the original address, with the original alignment.
	  SDValue WordAt0 =
	      DAG.getLoad(MVT::i32, DL, Load->getChain(), Base,
	                  Load->getPointerInfo(), Load->getAlignment(),
	                  Load->getMemOperand()->getFlags());

	  // Word four bytes further on; its alignment is capped at 4.
	  SDValue BasePlus4 = DAG.getNode(ISD::ADD, DL, MVT::i32, Base,
	                                  DAG.getConstant(4, DL, MVT::i32));
	  SDValue WordAt4 =
	      DAG.getLoad(MVT::i32, DL, Load->getChain(), BasePlus4,
	                  Load->getPointerInfo().getWithOffset(4),
	                  std::min(4U, Load->getAlignment()),
	                  Load->getMemOperand()->getFlags());

	  // Users of result 1 of the old load now hang off the second new load.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), WordAt4.getValue(1));

	  // On big-endian targets the two words trade places in the result pair.
	  const bool BigEndian = DCI.DAG.getDataLayout().isBigEndian();
	  return DCI.CombineTo(N, BigEndian ? WordAt4 : WordAt0,
	                       BigEndian ? WordAt0 : WordAt4);
	}

	/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
	/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
	///
	/// When both operands are (possibly bitcast) results 0 and 1 of one and the
	/// same VMOVRRD, the pair is replaced by a bitcast of that VMOVRRD's input.
	static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
	  // Look through at most one bitcast on an operand.
	  const auto SkipOneBitcast = [](SDValue V) {
	    return V.getOpcode() == ISD::BITCAST ? V.getOperand(0) : V;
	  };

	  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
	  SDValue First = SkipOneBitcast(N->getOperand(0));
	  SDValue Second = SkipOneBitcast(N->getOperand(1));

	  const bool RebuildsSplitValue = First.getOpcode() == ARMISD::VMOVRRD &&
	                                  First.getNode() == Second.getNode() &&
	                                  First.getResNo() == 0 &&
	                                  Second.getResNo() == 1;
	  if (!RebuildsSplitValue)
	    return SDValue();

	  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
	                     First.getOperand(0));
	}

	/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
	/// are normal, non-volatile loads. If so, it is profitable to bitcast an
	/// i64 vector to have f64 elements, since the value can then be loaded
	/// directly into a VFP register.
	///
	/// Only the first getVectorNumElements() operands of N are inspected.
	static bool hasNormalLoadOperand(SDNode *N) {
	  for (unsigned Idx = 0, End = N->getValueType(0).getVectorNumElements();
	       Idx != End; ++Idx) {
	    SDNode *Operand = N->getOperand(Idx).getNode();
	    if (!ISD::isNormalLoad(Operand))
	      continue;
	    if (!cast<LoadSDNode>(Operand)->isVolatile())
	      return true;
	  }
	  return false;
	}

	/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
	/// ISD::BUILD_VECTOR.
	///
	/// Two things are tried, in this order:
	///  1. For two-operand nodes, the VMOVDRR combine (undoing a VMOVRRD split).
	///  2. For i64-element vectors with at least one normal, non-volatile load
	///     operand, rebuild the vector with f64 elements and bitcast it back.
	static SDValue PerformBUILD_VECTORCombine(SDNode *N,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          const ARMSubtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;

	  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
	  // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
	  // into a pair of GPRs, which is fine when the value is used as a scalar,
	  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
	  if (N->getNumOperands() == 2) {
	    if (SDValue Undone = PerformVMOVDRRCombine(N, DAG))
	      return Undone;
	  }

	  // Load i64 elements as f64 values so that type legalization does not split
	  // them up into i32 values.
	  const EVT VT = N->getValueType(0);
	  const bool WorthConverting =
	      VT.getVectorElementType() == MVT::i64 && hasNormalLoadOperand(N);
	  if (!WorthConverting)
	    return SDValue();

	  SDLoc dl(N);
	  const unsigned NumElts = VT.getVectorNumElements();
	  SmallVector<SDValue, 8> FloatElts;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    SDValue AsF64 = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(Idx));
	    FloatElts.push_back(AsF64);
	    // Make the DAGCombiner fold the bitcast.
	    DCI.AddToWorklist(AsF64.getNode());
	  }

	  const EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
	  SDValue FloatVec = DAG.getBuildVector(FloatVT, dl, FloatElts);
	  return DAG.getNode(ISD::BITCAST, dl, VT, FloatVec);
	}

	/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
	///
	/// Rewrites a single-use f32 ARMISD::BUILD_VECTOR whose only user bitcasts it
	/// to a non-floating-point type into a chain of INSERT_VECTOR_ELT nodes on an
	/// i32 vector, when more than half of the relevant operands are bitcasts from
	/// i32. Returns the replacement (bitcast back to the original type) or an
	/// empty SDValue when the rewrite is not possible or not profitable.
	static SDValue
	PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
	  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
	  // At that time, we may have inserted bitcasts from integer to float.
	  // If these bitcasts have survived DAGCombine, change the lowering of this
	  // BUILD_VECTOR in something more vector friendly, i.e., that does not
	  // force to use floating point types.

	  // Make sure we can change the type of the vector.
	  // This is possible iff:
	  // 1. The vector is only used in a bitcast to a integer type. I.e.,
	  //    1.1. Vector is used only once.
	  //    1.2. Use is a bit convert to an integer type.
	  // 2. The size of its operands are 32-bits (64-bits are not legal).
	  EVT VT = N->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Check 1.1. and 2.
	  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
	    return SDValue();

	  // By construction, the input type must be float.
	  assert(EltVT == MVT::f32 && "Unexpected type!");

	  // Check 1.2.
	  // use_begin() is an iterator; dereferencing it yields the using node.
	  // hasOneUse() above guarantees there is exactly one.
	  SDNode *Use = *N->use_begin();
	  if (Use->getOpcode() != ISD::BITCAST ||
	      Use->getValueType(0).isFloatingPoint())
	    return SDValue();

	  // Check profitability.
	  // Model is, if more than half of the relevant operands are bitcast from
	  // i32, turn the build_vector into a sequence of insert_vector_elt.
	  // Relevant operands are everything that is not statically
	  // (i.e., at compile time) bitcasted.
	  unsigned NumOfBitCastedElts = 0;
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumOfRelevantElts = NumElts;
	  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
	    SDValue Elt = N->getOperand(Idx);
	    if (Elt->getOpcode() == ISD::BITCAST) {
	      // Assume only bit cast to i32 will go away.
	      if (Elt->getOperand(0).getValueType() == MVT::i32)
	        ++NumOfBitCastedElts;
	    } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) {
	      // Constants are statically casted, thus do not count them as
	      // relevant operands.
	      --NumOfRelevantElts;
	    }
	  }

	  // Check if more than half of the elements require a non-free bitcast.
	  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  // Create the new vector type.
	  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
	  // Check if the type is legal.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(VecVT))
	    return SDValue();

	  // Combine:
	  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
	  // => BITCAST INSERT_VECTOR_ELT
	  //      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
	  //      (BITCAST EN), N.
	  SDValue Vec = DAG.getUNDEF(VecVT);
	  SDLoc dl(N);
	  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
	    SDValue V = N->getOperand(Idx);
	    // Undef lanes are simply left undef in the new vector.
	    if (V.isUndef())
	      continue;
	    if (V.getOpcode() == ISD::BITCAST &&
	        V->getOperand(0).getValueType() == MVT::i32) {
	      // Fold obvious case.
	      V = V.getOperand(0);
	    } else {
	      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
	      // Make the DAGCombiner fold the bitcasts.
	      DCI.AddToWorklist(V.getNode());
	    }
	    SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
	    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
	  }
	  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
	  // Make the DAGCombiner fold the bitcasts.
	  DCI.AddToWorklist(Vec.getNode());
	  return Vec;
	}

	/// PerformInsertEltCombine - Target-specific dag combine xforms for
	/// ISD::INSERT_VECTOR_ELT.
	///
	/// Only fires when the vector has i64 elements and the inserted element is a
	/// normal, non-volatile load: the insertion is then redone on the f64
	/// equivalent of the vector and the result is bitcast back.
	static SDValue PerformInsertEltCombine(SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI) {
	  // Bitcast an i64 load inserted into a vector to f64.
	  // Otherwise, the i64 value will be legalized to a pair of i32 values.
	  const EVT VT = N->getValueType(0);
	  SDNode *Inserted = N->getOperand(1).getNode();
	  const bool IsPlainI64Load = VT.getVectorElementType() == MVT::i64 &&
	                              ISD::isNormalLoad(Inserted) &&
	                              !cast<LoadSDNode>(Inserted)->isVolatile();
	  if (!IsPlainI64Load)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc dl(N);
	  const EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
	                                       VT.getVectorNumElements());
	  SDValue FloatVec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
	  SDValue FloatElt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
	  // Make the DAGCombiner fold the bitcasts.
	  DCI.AddToWorklist(FloatVec.getNode());
	  DCI.AddToWorklist(FloatElt.getNode());

	  SDValue FloatInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
	                                    FloatVec, FloatElt, N->getOperand(2));
	  return DAG.getNode(ISD::BITCAST, dl, VT, FloatInsert);
	}

	/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
	/// ISD::VECTOR_SHUFFLE.
	///
	/// Rewrites shuffle(concat(v1, undef), concat(v2, undef)) into
	/// shuffle(concat(v1, v2), undef), remapping the mask accordingly. Mask
	/// entries that referred to one of the undef halves become -1.
	static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
	  // The LLVM shufflevector instruction does not require the shuffle mask
	  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
	  // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
	  // operands do not match the mask length, they are extended by concatenating
	  // them with undef vectors. That is probably the right thing for other
	  // targets, but for NEON it is better to concatenate two double-register
	  // size vector operands into a single quad-register size vector. Do that
	  // transformation here:
	  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
	  //   shuffle(concat(v1, v2), undef)
	  const auto IsTwoOperandConcat = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2;
	  };

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  if (!IsTwoOperandConcat(LHS) || !IsTwoOperandConcat(RHS))
	    return SDValue();

	  // Both upper halves have to be undef.
	  SDValue LHSUpper = LHS.getOperand(1);
	  SDValue RHSUpper = RHS.getOperand(1);
	  if (!(LHSUpper.isUndef() && RHSUpper.isUndef()))
	    return SDValue();

	  // Skip the transformation if any of the types are illegal.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const EVT VT = N->getValueType(0);
	  const bool AllLegal = TLI.isTypeLegal(VT) &&
	                        TLI.isTypeLegal(LHSUpper.getValueType()) &&
	                        TLI.isTypeLegal(RHSUpper.getValueType());
	  if (!AllLegal)
	    return SDValue();

	  SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
	                               LHS.getOperand(0), RHS.getOperand(0));

	  // Translate the shuffle mask.
	  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
	  const unsigned NumElts = VT.getVectorNumElements();
	  const unsigned HalfElts = NumElts / 2;
	  SmallVector<int, 16> Remapped;
	  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
	    const int Old = Shuffle->getMaskElt(Lane);
	    int New = -1;
	    if (Old < static_cast<int>(HalfElts)) {
	      // Low half of the first operand (or already undef): unchanged.
	      New = Old;
	    } else if (Old >= static_cast<int>(NumElts) &&
	               Old < static_cast<int>(NumElts + HalfElts)) {
	      // Low half of the second operand: now the upper half of the new concat.
	      New = HalfElts + Old - NumElts;
	    }
	    Remapped.push_back(New);
	  }

	  return DAG.getVectorShuffle(VT, SDLoc(N), Merged, DAG.getUNDEF(VT),
	                              Remapped);
	}

	/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
	/// NEON load/store intrinsics, and generic vector load/stores, to merge
	/// base address updates.
	/// For generic load/stores, the memory type is assumed to be a vector.
	/// The caller is assumed to have checked legality.
	///
	/// The function looks for an ADD user of the address operand that can be
	/// folded into an updating (_UPD) variant of N. On success both N and that ADD
	/// are replaced through DCI.CombineTo. The function itself always returns an
	/// empty SDValue.
	static SDValue CombineBaseUpdate(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
	                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
	  const bool isStore = N->getOpcode() == ISD::STORE;
	  // Intrinsics carry the intrinsic ID, and stores the stored value, in
	  // operand 1, so the address is operand 2 for those and operand 1 otherwise.
	  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
	  SDValue Addr = N->getOperand(AddrOpIdx);
	  MemSDNode *MemN = cast<MemSDNode>(N);
	  SDLoc dl(N);

	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use_iterator yields the using node.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load/store. Otherwise, folding
	    // it would create a cycle. We can avoid searching through Addr as it's a
	    // predecessor to both.
	    SmallPtrSet<const SDNode *, 32> Visited;
	    SmallVector<const SDNode *, 16> Worklist;
	    Visited.insert(Addr.getNode());
	    Worklist.push_back(N);
	    Worklist.push_back(User);
	    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
	        SDNode::hasPredecessorHelper(User, Visited, Worklist))
	      continue;

	    // Find the new opcode for the updating load/store.
	    bool isLoadOp = true;
	    bool isLaneOp = false;
	    unsigned NewOpc = 0;
	    unsigned NumVecs = 0;
	    if (isIntrinsic) {
	      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	      switch (IntNo) {
	      default: llvm_unreachable("unexpected intrinsic for Neon base update");
	      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
	        NumVecs = 1; break;
	      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
	        NumVecs = 2; break;
	      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
	        NumVecs = 3; break;
	      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
	        NumVecs = 4; break;
	      case Intrinsic::arm_neon_vld2dup:
	      case Intrinsic::arm_neon_vld3dup:
	      case Intrinsic::arm_neon_vld4dup:
	        // TODO: Support updating VLDxDUP nodes. For now, we just skip
	        // combining base updates for such intrinsics.
	        continue;
	      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
	        NumVecs = 2; isLaneOp = true; break;
	      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
	        NumVecs = 3; isLaneOp = true; break;
	      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
	        NumVecs = 4; isLaneOp = true; break;
	      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
	        NumVecs = 1; isLoadOp = false; break;
	      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
	        NumVecs = 2; isLoadOp = false; break;
	      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
	        NumVecs = 3; isLoadOp = false; break;
	      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
	        NumVecs = 4; isLoadOp = false; break;
	      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
	        NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
	      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
	        NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
	      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
	        NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
	      }
	    } else {
	      isLaneOp = true;
	      switch (N->getOpcode()) {
	      default: llvm_unreachable("unexpected opcode for Neon base update");
	      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
	      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
	      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
	      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
	      case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
	        NumVecs = 1; isLaneOp = false; break;
	      case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
	        NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
	      }
	    }

	    // Find the size of memory referenced by the load/store.
	    EVT VecTy;
	    if (isLoadOp) {
	      VecTy = N->getValueType(0);
	    } else if (isIntrinsic) {
	      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
	    } else {
	      assert(isStore && "Node has to be a load, a store, or an intrinsic!");
	      VecTy = N->getOperand(1).getValueType();
	    }

	    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
	    // Lane/dup operations touch a single element per vector.
	    if (isLaneOp)
	      NumBytes /= VecTy.getVectorNumElements();

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
	    if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
	      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
	      // separate instructions that make it harder to use a non-constant update.
	      continue;
	    }

	    // OK, we found an ADD we can fold into the base update.
	    // Now, create a _UPD node, taking care of not breaking alignment.

	    EVT AlignedVecTy = VecTy;
	    unsigned Alignment = MemN->getAlignment();

	    // If this is a less-than-standard-aligned load/store, change the type to
	    // match the standard alignment.
	    // The alignment is overlooked when selecting _UPD variants; and it's
	    // easier to introduce bitcasts here than fix that.
	    // There are 3 ways to get to this base-update combine:
	    // - intrinsics: they are assumed to be properly aligned (to the standard
	    //   alignment of the memory type), so we don't need to do anything.
	    // - ARMISD::VLDx nodes: they are only generated from the aforementioned
	    //   intrinsics, so, likewise, there's nothing to do.
	    // - generic load/store instructions: the alignment is specified as an
	    //   explicit operand, rather than implicitly as the standard alignment
	    //   of the memory type (like the intrinsics). We need to change the
	    //   memory type to match the explicit alignment. That way, we don't
	    //   generate non-standard-aligned ARMISD::VLDx nodes.
	    if (isa<LSBaseSDNode>(N)) {
	      if (Alignment == 0)
	        Alignment = 1;
	      if (Alignment < VecTy.getScalarSizeInBits() / 8) {
	        MVT EltTy = MVT::getIntegerVT(Alignment * 8);
	        assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
	        assert(!isLaneOp && "Unexpected generic load/store lane.");
	        unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
	        AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
	      }
	      // Don't set an explicit alignment on regular load/stores that we want
	      // to transform to VLD/VST 1_UPD nodes.
	      // This matches the behavior of regular load/stores, which only get an
	      // explicit alignment if the MMO alignment is larger than the standard
	      // alignment of the memory type.
	      // Intrinsics, however, always get an explicit alignment, set to the
	      // alignment of the MMO.
	      Alignment = 1;
	    }

	    // Create the new updating load/store node.
	    // First, create an SDVTList for the new updating node's results:
	    // the loaded vectors (if any), the updated address, then the chain.
	    EVT Tys[6];
	    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
	    unsigned n;
	    for (n = 0; n < NumResultVecs; ++n)
	      Tys[n] = AlignedVecTy;
	    Tys[n++] = MVT::i32;
	    Tys[n] = MVT::Other;
	    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));

	    // Then, gather the new node's operands.
	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(N->getOperand(0)); // incoming chain
	    Ops.push_back(N->getOperand(AddrOpIdx));
	    Ops.push_back(Inc);

	    if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
	      // Try to match the intrinsic's signature
	      Ops.push_back(StN->getValue());
	    } else {
	      // Loads (and of course intrinsics) match the intrinsics' signature,
	      // so just add all but the alignment operand.
	      for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
	        Ops.push_back(N->getOperand(i));
	    }

	    // For all node types, the alignment operand is always the last one.
	    Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));

	    // If this is a non-standard-aligned STORE, the penultimate operand is the
	    // stored value. Bitcast it to the aligned type.
	    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
	      SDValue &StVal = Ops[Ops.size()-2];
	      StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
	    }

	    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
	                                           MemN->getMemOperand());

	    // Update the uses.
	    SmallVector<SDValue, 5> NewResults;
	    for (unsigned i = 0; i < NumResultVecs; ++i)
	      NewResults.push_back(SDValue(UpdN.getNode(), i));

	    // If this is an non-standard-aligned LOAD, the first result is the loaded
	    // value. Bitcast it to the expected result type.
	    if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
	      SDValue &LdVal = NewResults[0];
	      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
	    }

	    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
	    DCI.CombineTo(N, NewResults);
	    // The ADD is replaced by the updated-address result of the new node.
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

	    break;
	  }
	  return SDValue();
	}

	/// Tries the base-update combine on N, but not before legalization and not
	/// when invoked by the legalizer; in those cases an empty SDValue is returned.
	static SDValue PerformVLDCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  const bool SkipCombine = DCI.isBeforeLegalize() || DCI.isCalledByLegalizer();
	  return SkipCombine ? SDValue() : CombineBaseUpdate(N, DCI);
	}

	/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
	/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
	/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
	/// return true.
	///
	/// Returns false, without having changed anything, when N is not a 64-bit
	/// vector, when the source is not a vld2/3/4-lane intrinsic, or when some
	/// non-chain user is not a VDUPLANE of the loaded lane.
	static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  // vldN-dup instructions only support 64-bit vectors for N > 1.
	  if (!VT.is64BitVector())
	    return false;

	  // Check if the VDUPLANE operand is a vldN-lane intrinsic.
	  SDNode *VLD = N->getOperand(0).getNode();
	  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
	    return false;
	  unsigned NumVecs = 0;
	  unsigned NewOpc = 0;
	  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
	  if (IntNo == Intrinsic::arm_neon_vld2lane) {
	    NumVecs = 2;
	    NewOpc = ARMISD::VLD2DUP;
	  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
	    NumVecs = 3;
	    NewOpc = ARMISD::VLD3DUP;
	  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
	    NumVecs = 4;
	    NewOpc = ARMISD::VLD4DUP;
	  } else {
	    return false;
	  }

	  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
	  // numbers match the load.
	  unsigned VLDLaneNo =
	      cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
	  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
	       UI != UE; ++UI) {
	    // Ignore uses of the chain result.
	    if (UI.getUse().getResNo() == NumVecs)
	      continue;
	    // Dereferencing the use_iterator yields the using node.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ARMISD::VDUPLANE ||
	        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
	      return false;
	  }

	  // Create the vldN-dup node: NumVecs vector results followed by the chain.
	  EVT Tys[5];
	  unsigned n;
	  for (n = 0; n < NumVecs; ++n)
	    Tys[n] = VT;
	  Tys[n] = MVT::Other;
	  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
	  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
	  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
	  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
	                                           Ops, VLDMemInt->getMemoryVT(),
	                                           VLDMemInt->getMemOperand());

	  // Update the uses: each VDUPLANE user becomes the matching vldN-dup result.
	  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
	       UI != UE; ++UI) {
	    unsigned ResNo = UI.getUse().getResNo();
	    // Ignore uses of the chain result.
	    if (ResNo == NumVecs)
	      continue;
	    SDNode *User = *UI;
	    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
	  }

	  // Now the vldN-lane intrinsic is dead except for its chain result.
	  // Update uses of the chain.
	  std::vector<SDValue> VLDDupResults;
	  for (unsigned ResIdx = 0; ResIdx < NumVecs; ++ResIdx)
	    VLDDupResults.push_back(SDValue(VLDDup.getNode(), ResIdx));
	  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
	  DCI.CombineTo(VLD, VLDDupResults);

	  return true;
	}

	/// PerformVDUPLANECombine - Target-specific dag combine xforms for
	/// ARMISD::VDUPLANE.
	///
	/// First tries to fold the node into a vldN-dup (see CombineVLDDUP). Failing
	/// that, a VDUPLANE of a VMOVIMM/VMVNIMM splat is replaced by a bitcast of
	/// that splat, provided the splat's element size fits the result's.
	static SDValue PerformVDUPLANECombine(SDNode *N,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
	  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
	  if (CombineVLDDUP(N, DCI))
	    return SDValue(N, 0);

	  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
	  // redundant. Ignore bit_converts for now; element sizes are checked below.
	  SDValue Splat = N->getOperand(0);
	  for (; Splat.getOpcode() == ISD::BITCAST; Splat = Splat.getOperand(0))
	    ;
	  switch (Splat.getOpcode()) {
	  case ARMISD::VMOVIMM:
	  case ARMISD::VMVNIMM:
	    break;
	  default:
	    return SDValue();
	  }

	  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
	  // The canonical VMOV for a zero vector uses a 32-bit element size, so an
	  // immediate that decodes to zero is treated as having 8-bit elements.
	  unsigned SplatEltBits = Splat.getScalarValueSizeInBits();
	  const unsigned EncodedImm =
	      cast<ConstantSDNode>(Splat.getOperand(0))->getZExtValue();
	  unsigned DecodedEltBits;
	  if (ARM_AM::decodeNEONModImm(EncodedImm, DecodedEltBits) == 0)
	    SplatEltBits = 8;

	  const EVT VT = N->getValueType(0);
	  if (SplatEltBits > VT.getScalarSizeInBits())
	    return SDValue();

	  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Splat);
	}

	/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
	///
	/// Requires NEON. A VDUP whose operand is a single-use, unindexed load of
	/// exactly the vector's element type becomes an ARMISD::VLD1DUP memory node;
	/// users of result 1 of the load are moved to result 1 of the new node.
	static SDValue PerformVDUPCombine(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const ARMSubtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Scalar = N->getOperand(0);
	  const EVT VecVT = N->getValueType(0);

	  // Match VDUP(LOAD) -> VLD1DUP.
	  // We match this pattern here rather than waiting for isel because the
	  // transform is only legal for unindexed loads.
	  LoadSDNode *Load = dyn_cast<LoadSDNode>(Scalar.getNode());
	  const bool CanFold = Load && Scalar.hasOneUse() && Load->isUnindexed() &&
	                       Load->getMemoryVT() == VecVT.getVectorElementType();
	  if (!CanFold)
	    return SDValue();

	  // Operands: chain, address, alignment.
	  SDValue DupOps[] = {Load->getOperand(0), Load->getOperand(1),
	                      DAG.getConstant(Load->getAlignment(), SDLoc(N),
	                                      MVT::i32)};
	  SDVTList DupVTs = DAG.getVTList(VecVT, MVT::Other);
	  SDValue DupLoad =
	      DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), DupVTs, DupOps,
	                              Load->getMemoryVT(), Load->getMemOperand());
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), DupLoad.getValue(1));
	  return DupLoad;
	}

	/// Target-specific dag combine for the load node N: a normal load of a legal
	/// vector type is handed to CombineBaseUpdate (which may turn it into a
	/// VLD1_UPD); anything else yields an empty SDValue.
	static SDValue PerformLOADCombine(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI) {
	  const EVT LoadedVT = N->getValueType(0);

	  // If this is a legal vector load, try to combine it into a VLD1_UPD.
	  const bool IsLegalVectorLoad =
	      ISD::isNormalLoad(N) && LoadedVT.isVector() &&
	      DCI.DAG.getTargetLoweringInfo().isTypeLegal(LoadedVT);
	  if (!IsLegalVectorLoad)
	    return SDValue();

	  return CombineBaseUpdate(N, DCI);
	}

	/// PerformSTORECombine - Target-specific dag combine xforms for
	/// ISD::STORE.
	static SDValue PerformSTORECombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI) {
	StoreSDNode *St = cast<StoreSDNode>(N);
	if (St->isVolatile())
	return SDValue();

	// Optimize trunc store (of multiple scalars) to shuffle and store. First,
	// pack all of the elements in one place. Next, store to memory in fewer
	// chunks.
	SDValue StVal = St->getValue();
	EVT VT = StVal.getValueType();
	if (St->isTruncatingStore() && VT.isVector()) {
	SelectionDAG &DAG = DCI.DAG;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT StVT = St->getMemoryVT();
	unsigned NumElems = VT.getVectorNumElements();
	assert(StVT != VT && "Cannot truncate to the same type");
	unsigned FromEltSz = VT.getScalarSizeInBits();
	unsigned ToEltSz = StVT.getScalarSizeInBits();

	// From, To sizes and ElemCount must be pow of two
	if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();

	// We are going to use the original vector elt for storing.
	// Accumulated smaller vector elements must be a multiple of the store size.
	if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();

	unsigned SizeRatio = FromEltSz / ToEltSz;
	assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
	NumElems*SizeRatio);
	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	SDLoc DL(St);
	SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
	SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i < NumElems; ++i)
	ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
	? (i + 1) * SizeRatio - 1
	: i * SizeRatio;

	// Can't shuffle using an illegal type.
	if (!TLI.isTypeLegal(WideVecVT)) return SDValue();

	SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
	DAG.getUNDEF(WideVec.getValueType()),
	ShuffleVec);
	// At this point all of the data is stored at the bottom of the
	// register. We now need to save it to mem.

	// Find the largest store unit
	MVT StoreType = MVT::i8;
	for (MVT Tp : MVT::integer_valuetypes()) {
	if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
	StoreType = Tp;
	}
	// Didn't find a legal store type.
	if (!TLI.isTypeLegal(StoreType))
	return SDValue();

	// Bitcast the original vector into a vector of store-size units
	EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
	assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
	SmallVector<SDValue, 8> Chains;
	SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
	TLI.getPointerTy(DAG.getDataLayout()));
	SDValue BasePtr = St->getBasePtr();

	// Perform one or more big stores into memory.
	unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
	for (unsigned I = 0; I < E; I++) {
	SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	StoreType, ShuffWide,
	DAG.getIntPtrConstant(I, DL));
	SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
	St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags());
	BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
	Increment);
	Chains.push_back(Ch);
	}
	return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	}

	if (!ISD::isNormalStore(St))
	return SDValue();

	// Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
	// ARM stores of arguments in the same cache line.
	if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
	StVal.getNode()->hasOneUse()) {
	SelectionDAG &DAG = DCI.DAG;
	bool isBigEndian = DAG.getDataLayout().isBigEndian();
	SDLoc DL(St);
	SDValue BasePtr = St->getBasePtr();
	SDValue NewST1 = DAG.getStore(
	St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
	BasePtr, St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags());

	SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
	DAG.getConstant(4, DL, MVT::i32));
	return DAG.getStore(NewST1.getValue(0), DL,
	StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
	OffsetPtr, St->getPointerInfo(),
	std::min(4U, St->getAlignment() / 2),
	St->getMemOperand()->getFlags());
	}

	if (StVal.getValueType() == MVT::i64 &&
	StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {

	// Bitcast an i64 store extracted from a vector to f64.
	// Otherwise, the i64 value will be legalized to a pair of i32 values.
	SelectionDAG &DAG = DCI.DAG;
	SDLoc dl(StVal);
	SDValue IntVec = StVal.getOperand(0);
	EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
	IntVec.getValueType().getVectorNumElements());
	SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
	SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	Vec, StVal.getOperand(1));
	dl = SDLoc(N);
	SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
	// Make the DAGCombiner fold the bitcasts.
	DCI.AddToWorklist(Vec.getNode());
	DCI.AddToWorklist(ExtElt.getNode());
	DCI.AddToWorklist(V.getNode());
	return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
	St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags(), St->getAAInfo());
	}

	// If this is a legal vector store, try to combine it into a VST1_UPD.
	if (ISD::isNormalStore(N) && VT.isVector() &&
	DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
	return CombineBaseUpdate(N, DCI);

	return SDValue();
	}

	/// PerformVCVTCombine - Fold a vector multiply by a power-of-two splat into
	/// the fixed-point form of VCVT (floating-point to fixed-point, Advanced SIMD).
	///
	/// N is an FP_TO_SINT / FP_TO_UINT node. When its operand is a vector FMUL
	/// whose second operand is a constant splat of 2^C (C nonzero and at most 32),
	/// the pair is replaced by the arm.neon.vcvtfp2fx[su] intrinsic applied to the
	/// FMUL's first operand with C as the immediate.
	///
	/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
	///   vmul.f32     d16, d17, d16
	///   vcvt.s32.f32 d16, d16
	/// becomes:
	///   vcvt.s32.f32 d16, d16, #3
	///
	/// Returns an empty SDValue when the pattern does not apply.
	static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
	                                  const ARMSubtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  // The converted value has to be a simple-typed vector FMUL.
	  SDValue Mul = N->getOperand(0);
	  if (!Mul.getValueType().isVector())
	    return SDValue();
	  if (!Mul.getValueType().isSimple())
	    return SDValue();
	  if (Mul.getOpcode() != ISD::FMUL)
	    return SDValue();

	  // Its second operand must be a BUILD_VECTOR (checked for a splat below).
	  auto *Multiplier = dyn_cast<BuildVectorSDNode>(Mul->getOperand(1));
	  if (!Multiplier)
	    return SDValue();

	  const uint32_t FloatBits =
	      Mul.getSimpleValueType().getVectorElementType().getSizeInBits();
	  const uint32_t IntBits =
	      N->getSimpleValueType(0).getVectorElementType().getSizeInBits();
	  const unsigned NumLanes = Mul.getValueType().getVectorNumElements();

	  // These instructions only exist converting from f32 to i32. We can handle
	  // smaller integers by generating an extra truncate, but larger ones would
	  // be lossy. We also can't handle anything other than 2 or 4 lanes, since
	  // these instructions only support v2i32/v4i32 types.
	  if (FloatBits != 32)
	    return SDValue();
	  if (IntBits > 32)
	    return SDValue();
	  if (NumLanes != 2 && NumLanes != 4)
	    return SDValue();

	  // Ask for log2 of the splatted constant; -1, 0 and anything above 32 are
	  // rejected.
	  BitVector UndefLanes;
	  const int32_t Log2Scale =
	      Multiplier->getConstantFPSplatPow2ToLog2Int(&UndefLanes, 33);
	  const bool ScaleUsable =
	      Log2Scale != -1 && Log2Scale != 0 && Log2Scale <= 32;
	  if (!ScaleUsable)
	    return SDValue();

	  SDLoc Loc(N);
	  unsigned IntrinsicID = Intrinsic::arm_neon_vcvtfp2fxu;
	  if (N->getOpcode() == ISD::FP_TO_SINT)
	    IntrinsicID = Intrinsic::arm_neon_vcvtfp2fxs;

	  SDValue Result = DAG.getNode(
	      ISD::INTRINSIC_WO_CHAIN, Loc, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
	      DAG.getConstant(IntrinsicID, Loc, MVT::i32), Mul->getOperand(0),
	      DAG.getConstant(Log2Scale, Loc, MVT::i32));

	  // The intrinsic yields 32-bit lanes; narrow when N's result is narrower.
	  if (IntBits < FloatBits)
	    return DAG.getNode(ISD::TRUNCATE, Loc, N->getValueType(0), Result);

	  return Result;
	}

	/// PerformVDIVCombine - Fold a vector divide by a power-of-two splat into the
	/// fixed-point form of VCVT (fixed-point to floating-point, Advanced SIMD).
	///
	/// N is an FDIV node. When its first operand is a SINT_TO_FP / UINT_TO_FP and
	/// its second operand is a constant splat of 2^C (C nonzero and at most 32),
	/// the pair is replaced by the arm.neon.vcvtfx[su]2fp intrinsic applied to the
	/// integer input with C as the immediate.
	///
	/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
	///   vcvt.f32.s32 d16, d16
	///   vdiv.f32     d16, d16, d17
	/// becomes:
	///   vcvt.f32.s32 d16, d16, #3
	///
	/// Returns an empty SDValue when the pattern does not apply.
	static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
	                                  const ARMSubtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  // The dividend must be an int-to-fp conversion producing a simple vector.
	  SDValue Conv = N->getOperand(0);
	  const unsigned ConvOpc = Conv.getNode()->getOpcode();
	  if (!N->getValueType(0).isVector())
	    return SDValue();
	  if (!N->getValueType(0).isSimple())
	    return SDValue();
	  if (ConvOpc != ISD::SINT_TO_FP && ConvOpc != ISD::UINT_TO_FP)
	    return SDValue();

	  // The divisor must be a BUILD_VECTOR (checked for a splat below).
	  auto *Divisor = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
	  if (!Divisor)
	    return SDValue();

	  const uint32_t FloatBits =
	      N->getSimpleValueType(0).getVectorElementType().getSizeInBits();
	  const uint32_t IntBits = Conv.getOperand(0)
	                               .getSimpleValueType()
	                               .getVectorElementType()
	                               .getSizeInBits();
	  const unsigned NumLanes = Conv.getValueType().getVectorNumElements();

	  // These instructions only exist converting from i32 to f32. We can handle
	  // smaller integers by generating an extra extend, but larger ones would
	  // be lossy. We also can't handle anything other than 2 or 4 lanes, since
	  // these instructions only support v2i32/v4i32 types.
	  if (FloatBits != 32)
	    return SDValue();
	  if (IntBits > 32)
	    return SDValue();
	  if (NumLanes != 2 && NumLanes != 4)
	    return SDValue();

	  // Ask for log2 of the splatted constant; -1, 0 and anything above 32 are
	  // rejected.
	  BitVector UndefLanes;
	  const int32_t Log2Scale =
	      Divisor->getConstantFPSplatPow2ToLog2Int(&UndefLanes, 33);
	  const bool ScaleUsable =
	      Log2Scale != -1 && Log2Scale != 0 && Log2Scale <= 32;
	  if (!ScaleUsable)
	    return SDValue();

	  SDLoc Loc(N);
	  const bool FromSigned = ConvOpc == ISD::SINT_TO_FP;

	  // Widen narrow integer lanes to 32 bits, matching the conversion's
	  // signedness, before handing them to the intrinsic.
	  SDValue IntInput = Conv.getOperand(0);
	  if (IntBits < FloatBits) {
	    const unsigned ExtendOpc =
	        FromSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	    IntInput = DAG.getNode(ExtendOpc, Loc,
	                           NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, IntInput);
	  }

	  unsigned IntrinsicID = Intrinsic::arm_neon_vcvtfxu2fp;
	  if (FromSigned)
	    IntrinsicID = Intrinsic::arm_neon_vcvtfxs2fp;

	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, Loc, Conv.getValueType(),
	                     DAG.getConstant(IntrinsicID, Loc, MVT::i32), IntInput,
	                     DAG.getConstant(Log2Scale, Loc, MVT::i32));
	}

	/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
	///
	/// N is an INTRINSIC_WO_CHAIN node whose operand 0 holds the intrinsic ID.
	/// For the NEON vector-shift intrinsics listed below, the shift-count operand
	/// is checked for being an immediate and, if so, the node is replaced by the
	/// matching ARMISD::*IMM node carrying the count as an i32 constant.
	/// Every other intrinsic is left alone and an empty SDValue is returned.
	static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
	unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	switch (IntNo) {
	default:
	// Don't do anything for most intrinsics.
	break;

	// Vector shifts: check for immediate versions and lower them.
	// Note: This is done during DAG combining instead of DAG legalizing because
	// the build_vectors for 64-bit vector element shift counts are generally
	// not legal, and it is hard to see their values after they get legalized to
	// loads from a constant pool.
	case Intrinsic::arm_neon_vshifts:
	case Intrinsic::arm_neon_vshiftu:
	case Intrinsic::arm_neon_vrshifts:
	case Intrinsic::arm_neon_vrshiftu:
	case Intrinsic::arm_neon_vrshiftn:
	case Intrinsic::arm_neon_vqshifts:
	case Intrinsic::arm_neon_vqshiftu:
	case Intrinsic::arm_neon_vqshiftsu:
	case Intrinsic::arm_neon_vqshiftns:
	case Intrinsic::arm_neon_vqshiftnu:
	case Intrinsic::arm_neon_vqshiftnsu:
	case Intrinsic::arm_neon_vqrshiftns:
	case Intrinsic::arm_neon_vqrshiftnu:
	case Intrinsic::arm_neon_vqrshiftnsu: {
	// Operand 1 is the value being shifted, operand 2 the shift count.
	EVT VT = N->getOperand(1).getValueType();
	int64_t Cnt;
	unsigned VShiftOpc = 0;

	// First pass: validate the shift count and extract it into Cnt. Only the
	// plain vshift cases also pick their opcode here, because the direction
	// (left vs. right) depends on which immediate check succeeds.
	switch (IntNo) {
	case Intrinsic::arm_neon_vshifts:
	case Intrinsic::arm_neon_vshiftu:
	if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
	VShiftOpc = ARMISD::VSHLIMM;
	break;
	}
	if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
	VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
	: ARMISD::VSHRuIMM);
	break;
	}
	// Not an immediate shift: leave the intrinsic untouched.
	return SDValue();

	case Intrinsic::arm_neon_vrshifts:
	case Intrinsic::arm_neon_vrshiftu:
	if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
	break;
	return SDValue();

	case Intrinsic::arm_neon_vqshifts:
	case Intrinsic::arm_neon_vqshiftu:
	if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
	break;
	return SDValue();

	case Intrinsic::arm_neon_vqshiftsu:
	// Unlike the cases above, a non-immediate count is treated as a
	// programming error here rather than a reason to bail out.
	if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
	break;
	llvm_unreachable("invalid shift count for vqshlu intrinsic");

	case Intrinsic::arm_neon_vrshiftn:
	case Intrinsic::arm_neon_vqshiftns:
	case Intrinsic::arm_neon_vqshiftnu:
	case Intrinsic::arm_neon_vqshiftnsu:
	case Intrinsic::arm_neon_vqrshiftns:
	case Intrinsic::arm_neon_vqrshiftnu:
	case Intrinsic::arm_neon_vqrshiftnsu:
	// Narrowing shifts require an immediate right shift.
	// NOTE(review): the third argument (true here, false elsewhere) is
	// presumably isVShiftRImm's "narrowing" flag - confirm at its definition.
	if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
	break;
	llvm_unreachable("invalid shift count for narrowing vector shift "
	"intrinsic");

	default:
	llvm_unreachable("unhandled vector shift");
	}

	// Second pass: map the intrinsic to its immediate-form ARMISD opcode.
	switch (IntNo) {
	case Intrinsic::arm_neon_vshifts:
	case Intrinsic::arm_neon_vshiftu:
	// Opcode already set above.
	break;
	case Intrinsic::arm_neon_vrshifts:
	VShiftOpc = ARMISD::VRSHRsIMM;
	break;
	case Intrinsic::arm_neon_vrshiftu:
	VShiftOpc = ARMISD::VRSHRuIMM;
	break;
	case Intrinsic::arm_neon_vrshiftn:
	VShiftOpc = ARMISD::VRSHRNIMM;
	break;
	case Intrinsic::arm_neon_vqshifts:
	VShiftOpc = ARMISD::VQSHLsIMM;
	break;
	case Intrinsic::arm_neon_vqshiftu:
	VShiftOpc = ARMISD::VQSHLuIMM;
	break;
	case Intrinsic::arm_neon_vqshiftsu:
	VShiftOpc = ARMISD::VQSHLsuIMM;
	break;
	case Intrinsic::arm_neon_vqshiftns:
	VShiftOpc = ARMISD::VQSHRNsIMM;
	break;
	case Intrinsic::arm_neon_vqshiftnu:
	VShiftOpc = ARMISD::VQSHRNuIMM;
	break;
	case Intrinsic::arm_neon_vqshiftnsu:
	VShiftOpc = ARMISD::VQSHRNsuIMM;
	break;
	case Intrinsic::arm_neon_vqrshiftns:
	VShiftOpc = ARMISD::VQRSHRNsIMM;
	break;
	case Intrinsic::arm_neon_vqrshiftnu:
	VShiftOpc = ARMISD::VQRSHRNuIMM;
	break;
	case Intrinsic::arm_neon_vqrshiftnsu:
	VShiftOpc = ARMISD::VQRSHRNsuIMM;
	break;
	}

	// Build the immediate shift: (VShiftOpc value, i32 Cnt), typed like N.
	SDLoc dl(N);
	return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
	N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
	}

	case Intrinsic::arm_neon_vshiftins: {
	// Shift-and-insert: operands 1 and 2 are passed through unchanged and
	// operand 3 is the shift count, which must be an immediate.
	EVT VT = N->getOperand(1).getValueType();
	int64_t Cnt;
	unsigned VShiftOpc = 0;

	if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
	VShiftOpc = ARMISD::VSLIIMM;
	else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
	VShiftOpc = ARMISD::VSRIIMM;
	else {
	llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
	}

	SDLoc dl(N);
	return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
	N->getOperand(1), N->getOperand(2),
	DAG.getConstant(Cnt, dl, MVT::i32));
	}

	case Intrinsic::arm_neon_vqrshifts:
	case Intrinsic::arm_neon_vqrshiftu:
	// No immediate versions of these to check for.
	break;
	}

	return SDValue();
	}

	/// PerformShiftCombine - Checks for immediate versions of vector shifts and
	/// lowers them. As with the vector shift intrinsics, this is done during DAG
	/// combining instead of DAG legalizing because the build_vectors for 64-bit
	/// vector element shift counts are generally not legal, and it is hard to see
	/// their values after they get legalized to loads from a constant pool.
	///
	/// Before the vector handling, two scalar i32 patterns are tried:
	///  - (srl (bswap x), 16) -> (rotr (bswap x), 16) on V6+ when the high 16
	///    bits of x are known zero;
	///  - on Thumb1, (shl (and x, Mask), Amt) -> (srl (shl x, B), B - Amt) where
	///    Mask is a low-bit mask with B leading zeros and B > Amt.
	///
	/// N is an ISD::SHL, ISD::SRA or ISD::SRL node. Returns the replacement
	/// value, or an empty SDValue when nothing applies.
	static SDValue PerformShiftCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	const ARMSubtarget *ST) {
	SelectionDAG &DAG = DCI.DAG;
	EVT VT = N->getValueType(0);
	if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
	// Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
	// 16 bits of x are zero. This optimizes rev + lsr 16 to rev16.
	SDValue N1 = N->getOperand(1);
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
	SDValue N0 = N->getOperand(0);
	if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
	DAG.MaskedValueIsZero(N0.getOperand(0),
	APInt::getHighBitsSet(32, 16)))
	return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
	}
	}

	if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
	N->getOperand(0)->getOpcode() == ISD::AND &&
	N->getOperand(0)->hasOneUse()) {
	// This rewrite is only attempted once legalization has finished.
	if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
	return SDValue();
	// Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
	// usually show up because instcombine prefers to canonicalize it to
	// (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
	// out of GEP lowering in some cases.
	SDValue N0 = N->getOperand(0);
	ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!ShiftAmtNode)
	return SDValue();
	uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
	ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	if (!AndMaskNode)
	return SDValue();
	uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
	// Don't transform uxtb/uxth.
	if (AndMask == 255 \|\| AndMask == 65535)
	return SDValue();
	if (isMask_32(AndMask)) {
	// MaskedBits is the number of high bits the AND clears. Shifting left
	// by that amount drops them, and shifting right by the difference
	// leaves the value shifted left by ShiftAmt overall.
	uint32_t MaskedBits = countLeadingZeros(AndMask);
	if (MaskedBits > ShiftAmt) {
	SDLoc DL(N);
	SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
	DAG.getConstant(MaskedBits, DL, MVT::i32));
	return DAG.getNode(
	ISD::SRL, DL, MVT::i32, SHL,
	DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
	}
	}
	}

	// Nothing to be done for scalar shifts.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (!VT.isVector() \|\| !TLI.isTypeLegal(VT))
	return SDValue();
	// v2i64 shifts are skipped when the subtarget reports MVE integer ops.
	if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
	return SDValue();

	int64_t Cnt;

	// Replace a vector shift by a constant splat with the immediate-form
	// ARMISD node carrying the count as an i32 constant.
	switch (N->getOpcode()) {
	default: llvm_unreachable("unexpected shift opcode");

	case ISD::SHL:
	if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
	SDLoc dl(N);
	return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
	DAG.getConstant(Cnt, dl, MVT::i32));
	}
	break;

	case ISD::SRA:
	case ISD::SRL:
	if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
	unsigned VShiftOpc =
	(N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
	SDLoc dl(N);
	return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
	DAG.getConstant(Cnt, dl, MVT::i32));
	}
	}
	return SDValue();
	}

	/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
	/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
	///
	/// Recognises an extension to i32 of an i8/i16 lane extracted from a legal
	/// vector at a constant index, and replaces it with ARMISD::VGETLANEs (sign
	/// extension) or ARMISD::VGETLANEu (zero/any extension). NEON supports these
	/// directly; they are matched here because type legalization would promote
	/// them to 32-bit types, after which the pattern is messy to recognise.
	///
	/// Returns an empty SDValue when the pattern does not apply.
	static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
	                                    const ARMSubtarget *ST) {
	  SDValue Extract = N->getOperand(0);

	  if (!ST->hasNEON())
	    return SDValue();
	  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue SrcVec = Extract.getOperand(0);
	  SDValue LaneIdx = Extract.getOperand(1);
	  EVT ResultVT = N->getValueType(0);
	  EVT LaneVT = Extract.getValueType();

	  // Only i8/i16 lanes widened to i32 qualify.
	  if (ResultVT != MVT::i32)
	    return SDValue();
	  if (LaneVT != MVT::i8 && LaneVT != MVT::i16)
	    return SDValue();

	  // The source vector must be legal and the lane index a constant.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVec.getValueType()))
	    return SDValue();
	  if (!isa<ConstantSDNode>(LaneIdx))
	    return SDValue();

	  unsigned GetLaneOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::SIGN_EXTEND:
	    GetLaneOpc = ARMISD::VGETLANEs;
	    break;
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    GetLaneOpc = ARMISD::VGETLANEu;
	    break;
	  default:
	    llvm_unreachable("unexpected opcode");
	  }

	  return DAG.getNode(GetLaneOpc, SDLoc(N), ResultVT, SrcVec, LaneIdx);
	}

	/// Returns a pointer to V's constant value when V is a ConstantSDNode whose
	/// value is a power of two, and nullptr otherwise. The pointer refers to the
	/// APInt held by the node itself.
	static const APInt *isPowerOf2Constant(SDValue V) {
	  if (auto *ConstNode = dyn_cast<ConstantSDNode>(V)) {
	    const APInt &Value = ConstNode->getAPIntValue();
	    if (Value.isPowerOf2())
	      return &Value;
	  }
	  return nullptr;
	}

	/// Try to turn a CMOV that conditionally ORs constant bits into a value into
	/// a chain of ARMISD::BFI nodes that copy the tested bit directly.
	///
	/// CMOV is an ARMISD::CMOV node; its operand 4 is expected to be an
	/// ARMISD::CMPZ (asserted below). Returns the replacement value, or an empty
	/// SDValue when the pattern does not match or is not considered profitable.
	SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
	// If we have a CMOV, OR and AND combination such as:
	// if (x & CN)
	// y |= CM;
	//
	// And:
	// * CN is a single bit;
	// * All bits covered by CM are known zero in y
	//
	// Then we can convert this into a sequence of BFI instructions. This will
	// always be a win if CM is a single bit, will always be no worse than the
	// TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
	// three bits (due to the extra IT instruction).

	SDValue Op0 = CMOV->getOperand(0);
	SDValue Op1 = CMOV->getOperand(1);
	auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
	auto CC = CCNode->getAPIntValue().getLimitedValue();
	SDValue CmpZ = CMOV->getOperand(4);

	// The compare must be against zero.
	if (!isNullConstant(CmpZ->getOperand(1)))
	return SDValue();

	// NOTE(review): this assert runs after CmpZ's operand has already been read
	// above; the visible caller checks for CMPZ before calling.
	assert(CmpZ->getOpcode() == ARMISD::CMPZ);
	// The compared value must be (and X, single-bit constant).
	SDValue And = CmpZ->getOperand(0);
	if (And->getOpcode() != ISD::AND)
	return SDValue();
	const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
	if (!AndC)
	return SDValue();
	SDValue X = And->getOperand(0);

	if (CC == ARMCC::EQ) {
	// We're performing an "equal to zero" compare. Swap the operands so we
	// canonicalize on a "not equal to zero" compare.
	std::swap(Op0, Op1);
	} else {
	assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
	}

	// After canonicalization Op1 must be (or Y, constant) and Op0 must be Y.
	if (Op1->getOpcode() != ISD::OR)
	return SDValue();

	ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
	if (!OrC)
	return SDValue();
	SDValue Y = Op1->getOperand(0);

	if (Op0 != Y)
	return SDValue();

	// Now, is it profitable to continue?
	// One BFI is emitted per set bit of the OR constant, so cap the number of
	// set bits: 3 when the subtarget is Thumb, 2 otherwise.
	APInt OrCI = OrC->getAPIntValue();
	unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
	if (OrCI.countPopulation() > Heuristic)
	return SDValue();

	// Lastly, can we determine that the bits defined by OrCI
	// are zero in Y?
	KnownBits Known = DAG.computeKnownBits(Y);
	if ((OrCI & Known.Zero) != OrCI)
	return SDValue();

	// OK, we can do the combine.
	SDValue V = Y;
	SDLoc dl(X);
	EVT VT = X.getValueType();
	unsigned BitInX = AndC->logBase2();

	if (BitInX != 0) {
	// We must shift X first.
	// This moves the tested bit down to bit 0 of X.
	X = DAG.getNode(ISD::SRL, dl, VT, X,
	DAG.getConstant(BitInX, dl, VT));
	}

	// Emit one BFI for every set bit of the OR constant, each fed by the
	// (shifted) X and chained through V.
	for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
	BitInY < NumActiveBits; ++BitInY) {
	if (OrCI[BitInY] == 0)
	continue;
	APInt Mask(VT.getSizeInBits(), 0);
	Mask.setBit(BitInY);
	V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
	// Confusingly, the operand is an inverted mask.
	DAG.getConstant(~Mask, dl, VT));
	}

	return V;
	}

	/// Combine an ISD::BRCOND whose condition comes from the
	/// test.set.loop.iterations intrinsic into an ARMISD::WLS node.
	///
	/// N is the BRCOND node: operand 0 is the chain, operand 1 the condition and
	/// operand 2 the branch target. Returns the new WLS node, or an empty
	/// SDValue when the condition is not derived from that intrinsic.
	static SDValue PerformHWLoopCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	const ARMSubtarget *ST) {
	// Look for (brcond (xor test.set.loop.iterations, 1)), the same shape with
	// setcc instead of xor, or a brcond fed directly by the intrinsic.
	SDValue CC = N->getOperand(1);
	unsigned Opc = CC->getOpcode();
	SDValue Int;

	if ((Opc == ISD::XOR \|\| Opc == ISD::SETCC) &&
	(CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {

	// The second operand of the xor/setcc is required to be the constant 1.
	assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
	cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
	"Expected to compare against 1");

	Int = CC->getOperand(0);
	} else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
	Int = CC;
	else
	return SDValue();

	// For a chained intrinsic, operand 0 is the chain and operand 1 the ID.
	unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
	if (IntOp != Intrinsic::test_set_loop_iterations)
	return SDValue();

	SDLoc dl(Int);
	SDValue Chain = N->getOperand(0);
	SDValue Elements = Int.getOperand(2);
	SDValue ExitBlock = N->getOperand(2);

	// TODO: Once we start supporting tail predication, we can add another
	// operand to WLS for the number of elements processed in a vector loop.

	SDValue Ops[] = { Chain, Elements, ExitBlock };
	SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
	// Redirect users of the intrinsic's second result (its output chain) to the
	// intrinsic's input chain, so nothing stays chained on the intrinsic.
	DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
	return Res;
	}

	/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
	///
	/// Folds a branch on a materialised 0/1 condition back onto the flags that
	/// produced it:
	///   (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
	///     -> (brcond Chain BB CC CPSR Cmp)
	///
	/// Returns an empty SDValue when the pattern does not apply.
	SDValue
	ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
	  SDValue Flags = N->getOperand(4);
	  // Only CMPZ-based branches are of interest.
	  if (Flags.getOpcode() != ARMISD::CMPZ)
	    return SDValue();

	  SDValue CmpLHS = Flags.getOperand(0);
	  SDValue CmpRHS = Flags.getOperand(1);
	  const auto BranchCC = static_cast<ARMCC::CondCodes>(
	      cast<ConstantSDNode>(N->getOperand(2))->getZExtValue());

	  // The branch must be "ne" on a single-use (and (cmov ...), ...) whose cmov
	  // is itself single-use.
	  if (BranchCC != ARMCC::NE)
	    return SDValue();
	  if (CmpLHS.getOpcode() != ISD::AND)
	    return SDValue();
	  if (!CmpLHS->hasOneUse())
	    return SDValue();

	  SDNode *Select = CmpLHS->getOperand(0).getNode();
	  if (Select->getOpcode() != ARMISD::CMOV)
	    return SDValue();
	  if (!Select->hasOneUse())
	    return SDValue();

	  // True when V is a constant node whose zero-extended value equals Want.
	  auto IsConstantEqualTo = [](SDValue V, uint64_t Want) {
	    auto *C = dyn_cast<ConstantSDNode>(V);
	    return C && C->getZExtValue() == Want;
	  };

	  // Require exactly (cmpz (and (cmov 0 1 ...) 1) 0).
	  if (!IsConstantEqualTo(Select->getOperand(0), 0))
	    return SDValue();
	  if (!IsConstantEqualTo(Select->getOperand(1), 1))
	    return SDValue();
	  if (!IsConstantEqualTo(CmpLHS->getOperand(1), 1))
	    return SDValue();
	  if (!IsConstantEqualTo(CmpRHS, 0))
	    return SDValue();

	  // Branch directly on the cmov's own condition code, CPSR and compare.
	  return DAG.getNode(ARMISD::BRCOND, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), N->getOperand(1),
	                     Select->getOperand(2), Select->getOperand(3),
	                     Select->getOperand(4));
	}

	/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
	///
	/// N's operands are read as (FalseVal, TrueVal, condition code, operand 3,
	/// Cmp). Only CMOVs whose compare is an ARMISD::CMPZ are handled. The
	/// function tries, in order:
	///  1. conversion to a BFI sequence (PerformCMOVToBFICombine);
	///  2. dropping a redundant move when one arm equals the compared value;
	///  3. folding a CMOV that tests the 0/1 result of another CMOV;
	///  4. for integer types, branch-free materialisation of boolean results.
	/// Returns the replacement value, or an empty SDValue when nothing applies.
	SDValue
	ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
	SDValue Cmp = N->getOperand(4);
	if (Cmp.getOpcode() != ARMISD::CMPZ)
	// Only looking at EQ and NE cases.
	return SDValue();

	EVT VT = N->getValueType(0);
	SDLoc dl(N);
	SDValue LHS = Cmp.getOperand(0);
	SDValue RHS = Cmp.getOperand(1);
	SDValue FalseVal = N->getOperand(0);
	SDValue TrueVal = N->getOperand(1);
	SDValue ARMcc = N->getOperand(2);
	ARMCC::CondCodes CC =
	(ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

	// BFI is only available on V6T2+.
	if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
	SDValue R = PerformCMOVToBFICombine(N, DAG);
	if (R)
	return R;
	}

	// Simplify
	// mov r1, r0
	// cmp r1, x
	// mov r0, y
	// moveq r0, x
	// to
	// cmp r0, x
	// movne r0, y
	//
	// mov r1, r0
	// cmp r1, x
	// mov r0, x
	// movne r0, y
	// to
	// cmp r0, x
	// movne r0, y
	/// FIXME: Turn this into a target neutral optimization?
	SDValue Res;
	if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
	Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
	N->getOperand(3), Cmp);
	} else if (CC == ARMCC::EQ && TrueVal == RHS) {
	// This local ARMcc shadows the outer one; getARMCmp receives it by name
	// and it then serves as the new CMOV's condition-code operand.
	SDValue ARMcc;
	SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
	Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
	N->getOperand(3), NewCmp);
	}

	// (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
	// -> (cmov F T CC CPSR Cmp)
	if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
	auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
	auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
	auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
	if ((LHS0C && LHS0C->getZExtValue() == 0) &&
	(LHS1C && LHS1C->getZExtValue() == 1) &&
	(RHSC && RHSC->getZExtValue() == 0)) {
	return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
	LHS->getOperand(2), LHS->getOperand(3),
	LHS->getOperand(4));
	}
	}

	// NOTE(review): for non-integer types this returns an empty SDValue even if
	// Res was set by the simplification above.
	if (!VT.isInteger())
	return SDValue();

	// Materialize a boolean comparison for integers so we can avoid branching.
	if (isNullConstant(FalseVal)) {
	if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
	if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
	// If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
	// right 5 bits will make that 32 be 1, otherwise it will be 0.
	// CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
	SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
	Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
	DAG.getConstant(5, dl, MVT::i32));
	} else {
	// CMOV 0, 1, ==, (CMPZ x, y) ->
	// (ADDCARRY (SUB x, y), t:0, 1 - t:1)
	// where t = (USUBO 0, (SUB x, y))
	//
	// The USUBO computes 0 - (x - y) and this will give a borrow when
	// x != y. In other words, a carry C == 1 when x == y, C == 0
	// otherwise.
	// The final ADDCARRY computes
	// x - y + (0 - (x - y)) + C == C
	SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	// FalseVal is known to be the constant zero in this branch.
	SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
	// ISD::USUBO returns a borrow but we want the carry here, so compute
	// 1 - borrow.
	SDValue Carry =
	DAG.getNode(ISD::SUB, dl, MVT::i32,
	DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
	Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
	}
	} else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
	(!Subtarget->isThumb1Only() \|\| isPowerOf2Constant(TrueVal))) {
	// This seems pointless but will allow us to combine it further below.
	// CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
	SDValue Sub =
	DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
	SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
	Sub.getValue(1), SDValue());
	Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
	N->getOperand(3), CPSRGlue.getValue(1));
	// Record the new false arm so the Thumb1 pattern below can match it.
	FalseVal = Sub;
	}
	} else if (isNullConstant(TrueVal)) {
	if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
	(!Subtarget->isThumb1Only() \|\| isPowerOf2Constant(FalseVal))) {
	// This seems pointless but will allow us to combine it further below
	// Note that we change == for != as this is the dual for the case above.
	// CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
	SDValue Sub =
	DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
	SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
	Sub.getValue(1), SDValue());
	Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
	DAG.getConstant(ARMCC::NE, dl, MVT::i32),
	N->getOperand(3), CPSRGlue.getValue(1));
	FalseVal = Sub;
	}
	}

	// On Thumb1, the DAG above may be further combined if z is a power of 2
	// (z == 2 ^ K).
	// CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
	// t1 = (USUBO (SUB x, y), 1)
	// t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
	// Result = if K != 0 then (SHL t2:0, K) else t2:0
	//
	// This also handles the special case of comparing against zero; it's
	// essentially, the same pattern, except there's no SUBS:
	// CMOV x, z, !=, (CMPZ x, 0) ->
	// t1 = (USUBO x, 1)
	// t2 = (SUBCARRY x, t1:0, t1:1)
	// Result = if K != 0 then (SHL t2:0, K) else t2:0
	const APInt *TrueConst;
	if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
	((FalseVal.getOpcode() == ARMISD::SUBS &&
	FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) \|\|
	(FalseVal == LHS && isNullConstant(RHS))) &&
	(TrueConst = isPowerOf2Constant(TrueVal))) {
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	unsigned ShiftAmount = TrueConst->logBase2();
	// When z != 1, compute the 0/1 result first and shift it into place after.
	if (ShiftAmount)
	TrueVal = DAG.getConstant(1, dl, VT);
	SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
	Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

	if (ShiftAmount)
	Res = DAG.getNode(ISD::SHL, dl, VT, Res,
	DAG.getConstant(ShiftAmount, dl, MVT::i32));
	}

	if (Res.getNode()) {
	KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
	// Capture demanded bits information that would be otherwise lost.
	// Only the exact i1 / i8 / i16 zero-extension masks are recognised, and
	// the AssertZext is built as MVT::i32.
	if (Known.Zero == 0xfffffffe)
	Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
	DAG.getValueType(MVT::i1));
	else if (Known.Zero == 0xffffff00)
	Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
	DAG.getValueType(MVT::i8));
	else if (Known.Zero == 0xffff0000)
	Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
	DAG.getValueType(MVT::i16));
	}

	return Res;
	}

	/// Entry point for ARM target-specific DAG combines: dispatches on N's
	/// opcode to the matching Perform*Combine helper.
	///
	/// Returns whatever the helper returns. An empty SDValue is returned when no
	/// combine applies, and also after SimplifyDemandedBits has succeeded in the
	/// SMULW* / SMLAL* cases.
	SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	switch (N->getOpcode()) {
	default: break;
	case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
	case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
	case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
	case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
	case ISD::SUB: return PerformSUBCombine(N, DCI);
	case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
	case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
	case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
	case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
	case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
	case ARMISD::ADDC:
	case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
	case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
	case ARMISD::BFI: return PerformBFICombine(N, DCI);
	case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
	case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
	case ISD::STORE: return PerformSTORECombine(N, DCI);
	case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
	case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
	case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
	case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
	case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	return PerformVCVTCombine(N, DCI.DAG, Subtarget);
	case ISD::FDIV:
	return PerformVDIVCombine(N, DCI.DAG, Subtarget);
	case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL:
	return PerformShiftCombine(N, DCI, Subtarget);
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
	case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
	case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
	case ISD::LOAD: return PerformLOADCombine(N, DCI);
	case ARMISD::VLD1DUP:
	case ARMISD::VLD2DUP:
	case ARMISD::VLD3DUP:
	case ARMISD::VLD4DUP:
	return PerformVLDCombine(N, DCI);
	case ARMISD::BUILD_VECTOR:
	return PerformARMBUILD_VECTORCombine(N, DCI);
	// The cases below pass a 16-bit demanded mask (low or high half) for the
	// listed operands to SimplifyDemandedBits, so that bits outside the mask
	// can be simplified away.
	case ARMISD::SMULWB: {
	// Only the low 16 bits of operand 1 are demanded.
	unsigned BitWidth = N->getValueType(0).getSizeInBits();
	APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
	if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
	return SDValue();
	break;
	}
	case ARMISD::SMULWT: {
	// Only the high 16 bits of operand 1 are demanded.
	unsigned BitWidth = N->getValueType(0).getSizeInBits();
	APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
	if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
	return SDValue();
	break;
	}
	case ARMISD::SMLALBB: {
	// Low 16 bits of both operand 0 and operand 1.
	unsigned BitWidth = N->getValueType(0).getSizeInBits();
	APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
	if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) \|\|
	(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
	return SDValue();
	break;
	}
	case ARMISD::SMLALBT: {
	// Low 16 bits of operand 0, high 16 bits of operand 1.
	unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
	APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
	unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
	APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
	if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) \|\|
	(SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
	return SDValue();
	break;
	}
	case ARMISD::SMLALTB: {
	// High 16 bits of operand 0, low 16 bits of operand 1.
	unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
	APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
	unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
	APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
	if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) \|\|
	(SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
	return SDValue();
	break;
	}
	case ARMISD::SMLALTT: {
	// High 16 bits of both operand 0 and operand 1.
	unsigned BitWidth = N->getValueType(0).getSizeInBits();
	APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
	if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) \|\|
	(SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
	return SDValue();
	break;
	}
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN:
	// For chained intrinsics the intrinsic ID is operand 1 (operand 0 is the
	// chain). The NEON load/store intrinsics go to PerformVLDCombine.
	switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
	case Intrinsic::arm_neon_vld1:
	case Intrinsic::arm_neon_vld1x2:
	case Intrinsic::arm_neon_vld1x3:
	case Intrinsic::arm_neon_vld1x4:
	case Intrinsic::arm_neon_vld2:
	case Intrinsic::arm_neon_vld3:
	case Intrinsic::arm_neon_vld4:
	case Intrinsic::arm_neon_vld2lane:
	case Intrinsic::arm_neon_vld3lane:
	case Intrinsic::arm_neon_vld4lane:
	case Intrinsic::arm_neon_vld2dup:
	case Intrinsic::arm_neon_vld3dup:
	case Intrinsic::arm_neon_vld4dup:
	case Intrinsic::arm_neon_vst1:
	case Intrinsic::arm_neon_vst1x2:
	case Intrinsic::arm_neon_vst1x3:
	case Intrinsic::arm_neon_vst1x4:
	case Intrinsic::arm_neon_vst2:
	case Intrinsic::arm_neon_vst3:
	case Intrinsic::arm_neon_vst4:
	case Intrinsic::arm_neon_vst2lane:
	case Intrinsic::arm_neon_vst3lane:
	case Intrinsic::arm_neon_vst4lane:
	return PerformVLDCombine(N, DCI);
	default: break;
	}
	break;
	}
	return SDValue();
	}

	/// Returns true only when \p VT is f32 and \p Opc is ISD::LOAD or ISD::STORE;
	/// every other opcode/type pair yields false.
	bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
	EVT VT) const {
	return (VT == MVT::f32) && (Opc == ISD::LOAD \|\| Opc == ISD::STORE);
	}

	/// Decides whether a memory access of type \p VT is permitted when it is not
	/// naturally aligned.
	///
	/// The unnamed address-space and MachineMemOperand::Flags parameters are not
	/// consulted. \p Alignment is only examined on the big-endian MVE path.
	/// Whenever true is returned and \p Fast is non-null, *Fast is written:
	/// hasV7Ops() for i8/i16/i32, and true for every other accepted type.
	/// Non-simple types are always rejected.
	bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
	unsigned Alignment,
	MachineMemOperand::Flags,
	bool *Fast) const {
	// Depends what it gets converted into if the type is weird.
	if (!VT.isSimple())
	return false;

	// The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
	bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
	auto Ty = VT.getSimpleVT().SimpleTy;

	if (Ty == MVT::i8 \|\| Ty == MVT::i16 \|\| Ty == MVT::i32) {
	// Unaligned access can use (for example) LDRB, LDRH, LDR
	if (AllowsUnaligned) {
	if (Fast)
	*Fast = Subtarget->hasV7Ops();
	return true;
	}
	}

	if (Ty == MVT::f64 \|\| Ty == MVT::v2f64) {
	// For any little-endian targets with neon, we can support unaligned ld/st
	// of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
	// A big-endian target may also explicitly support unaligned accesses
	if (Subtarget->hasNEON() && (AllowsUnaligned \|\| Subtarget->isLittle())) {
	if (Fast)
	*Fast = true;
	return true;
	}
	}

	// Everything below is MVE-only; without MVE integer ops nothing else is
	// accepted.
	if (!Subtarget->hasMVEIntegerOps())
	return false;
	// Reject any type that is not one of the listed vector types.
	if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
	Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
	Ty != MVT::v2f64 &&
	// These are for truncated stores
	Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
	return false;

	if (Subtarget->isLittle()) {
	// In little-endian MVE, the store instructions VSTRB.U8,
	// VSTRH.U16 and VSTRW.U32 all store the vector register in
	// exactly the same format, and differ only in the range of
	// their immediate offset field and the required alignment.
	//
	// In particular, VSTRB.U8 can store a vector at byte alignment.
	// So at this stage we can simply say that loads/stores of all
	// 128-bit wide vector types are permitted at any alignment,
	// because we know at least _one_ instruction can manage that.
	//
	// Later on we might find that some of those loads are better
	// generated as VLDRW.U32 if alignment permits, to take
	// advantage of the larger immediate range. But for the moment,
	// all that matters is that if we don't lower the load then
	// _some_ instruction can handle it.
	if (Fast)
	*Fast = true;
	return true;
	} else {
	// In big-endian MVE, those instructions aren't so similar
	// after all, because they reorder the bytes of the vector
	// differently. So this time we can only store a particular
	// kind of vector if its alignment is at least the element
	// type. And we can't store vectors of i64 or f64 at all
	// without having to do some postprocessing, because there's
	// no VSTRD.U64.
	if (Ty == MVT::v16i8 \|\|
	((Ty == MVT::v8i16 \|\| Ty == MVT::v8f16) && Alignment >= 2) \|\|
	((Ty == MVT::v4i32 \|\| Ty == MVT::v4f32) && Alignment >= 4)) {
	if (Fast)
	*Fast = true;
	return true;
	}
	}

	// Big-endian MVE with an unsupported type or insufficient alignment.
	return false;
	}

	/// Returns true when both \p SrcAlign and \p DstAlign are either 0 or an exact
	/// multiple of \p AlignCheck. A value of 0 always passes the check
	/// (presumably it means "no constraint" - confirm against callers).
	static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
	unsigned AlignCheck) {
	return ((SrcAlign == 0 \|\| SrcAlign % AlignCheck == 0) &&
	(DstAlign == 0 \|\| DstAlign % AlignCheck == 0));
	}

	/// Chooses the value type used to expand a memory operation of \p Size bytes.
	///
	/// A NEON type is only considered when the operation is not a non-zero
	/// memset, the subtarget has NEON, and the function does not carry the
	/// NoImplicitFloat attribute. In that case v2f64 is returned for Size >= 16
	/// and f64 for Size >= 8, provided the alignments pass memOpAlign or the
	/// corresponding misaligned access is reported as allowed and fast.
	/// Otherwise MVT::Other is returned. \p MemcpyStrSrc is not consulted.
	EVT ARMTargetLowering::getOptimalMemOpType(
	uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
	bool ZeroMemset, bool MemcpyStrSrc,
	const AttributeList &FuncAttributes) const {
	// See if we can use NEON instructions for this...
	if ((!IsMemset \|\| ZeroMemset) && Subtarget->hasNEON() &&
	!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
	// Fast is only read after allowsMisalignedMemoryAccesses has returned true
	// (short-circuit &&).
	bool Fast;
	// Note: memOpAlign is called with (SrcAlign, DstAlign), the reverse of its
	// parameter order; the check treats both arguments identically.
	if (Size >= 16 &&
	(memOpAlign(SrcAlign, DstAlign, 16) \|\|
	(allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
	MachineMemOperand::MONone, &Fast) &&
	Fast))) {
	return MVT::v2f64;
	} else if (Size >= 8 &&
	(memOpAlign(SrcAlign, DstAlign, 8) \|\|
	(allowsMisalignedMemoryAccesses(
	MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
	Fast))) {
	return MVT::f64;
	}
	}

	// Let the target-independent logic figure it out.
	return MVT::Other;
	}

	// 64-bit integers are split into their high and low parts and held in two
	// different registers, so the trunc is free since the low register can just
	// be used.
	//
	// IR-type overload: returns true only for an integer truncation from exactly
	// 64 bits to exactly 32 bits; non-integer types are rejected.
	bool ARMTargetLowering::isTruncateFree(Type SrcTy, Type DstTy) const {
	if (!SrcTy->isIntegerTy() \|\| !DstTy->isIntegerTy())
	return false;
	unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
	unsigned DestBits = DstTy->getPrimitiveSizeInBits();
	return (SrcBits == 64 && DestBits == 32);
	}

	/// EVT overload: returns true only for a scalar integer truncation from
	/// exactly 64 bits to exactly 32 bits. Vector and non-integer types are
	/// rejected.
	bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
	if (SrcVT.isVector() \|\| DstVT.isVector() \|\| !SrcVT.isInteger() \|\|
	!DstVT.isInteger())
	return false;
	unsigned SrcBits = SrcVT.getSizeInBits();
	unsigned DestBits = DstVT.getSizeInBits();
	return (SrcBits == 64 && DestBits == 32);
	}

	/// Returns true when \p Val is an ISD::LOAD producing i1, i8 or i16 and both
	/// its type and \p VT2 are simple integer types. \p VT2 is otherwise not
	/// inspected. All other inputs return false.
	bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	if (Val.getOpcode() != ISD::LOAD)
	return false;

	EVT VT1 = Val.getValueType();
	if (!VT1.isSimple() \|\| !VT1.isInteger() \|\|
	!VT2.isSimple() \|\| !VT2.isInteger())
	return false;

	switch (VT1.getSimpleVT().SimpleTy) {
	default: break;
	case MVT::i1:
	case MVT::i8:
	case MVT::i16:
	// 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
	return true;
	}

	return false;
	}

	/// Returns true only for f16 on subtargets with full FP16 support; every
	/// other type (including non-simple ones) returns false.
	bool ARMTargetLowering::isFNegFree(EVT VT) const {
	if (!VT.isSimple())
	return false;

	// There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
	// negate values directly (fneg is free). So, we don't want to let the DAG
	// combiner rewrite fneg into xors and some other instructions. For f16 and
	// FullFP16 argument passing, some bitcast nodes may be introduced,
	// triggering this DAG combine rewrite, so we are avoiding that with this.
	switch (VT.getSimpleVT().SimpleTy) {
	default: break;
	case MVT::f16:
	return Subtarget->hasFullFP16();
	}

	return false;
	}

	/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
	/// of the vector elements.
	///
	/// Both values must match a zext or sext, and each must produce a scalar
	/// width exactly twice that of its own operand.
	static bool areExtractExts(Value Ext1, Value Ext2) {
	// True when the extend's result scalar width is exactly 2x its operand's.
	auto areExtDoubled = [](Instruction *Ext) {
	return Ext->getType()->getScalarSizeInBits() ==
	2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
	};

	// The pattern matches are evaluated first; the casts are only reached once
	// both values have matched an extend.
	if (!match(Ext1, m_ZExtOrSExt(m_Value())) \|\|
	!match(Ext2, m_ZExtOrSExt(m_Value())) \|\|
	!areExtDoubled(cast<Instruction>(Ext1)) \|\|
	!areExtDoubled(cast<Instruction>(Ext2)))
	return false;

	return true;
	}

	/// Check if sinking \p I's operands to I's basic block is profitable, because
	/// the operands can be folded into a target instruction, e.g.
	/// sext/zext can be folded into vsubl.
	///
	/// Only vector-typed Add/Sub instructions on NEON subtargets are handled.
	/// On success the uses of operand 0 and operand 1 are appended to \p Ops.
	bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
	SmallVectorImpl<Use *> &Ops) const {
	if (!Subtarget->hasNEON() \|\| !I->getType()->isVectorTy())
	return false;

	switch (I->getOpcode()) {
	case Instruction::Sub:
	case Instruction::Add: {
	// Both operands must be width-doubling extends.
	if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
	return false;
	Ops.push_back(&I->getOperandUse(0));
	Ops.push_back(&I->getOperandUse(1));
	return true;
	}
	default:
	return false;
	}
	// Not reached: every path through the switch above returns.
	return false;
	}

	/// Decides whether \p ExtVal should be combined into an extending load.
	///
	/// Returns false when the extended type is not legal, or when the sole user
	/// of the extend is an ADD, SUB, SHL or ARMISD::VSHLIMM node. Returns true
	/// in every other case.
	bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
	EVT VT = ExtVal.getValueType();

	if (!isTypeLegal(VT))
	return false;

	// Don't create a loadext if we can fold the extension into a wide/long
	// instruction.
	// If there's more than one user instruction, the loadext is desirable no
	// matter what. There can be two uses by the same instruction.
	if (ExtVal->use_empty() \|\|
	!ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
	return true;

	// Exactly one user: inspect its opcode.
	SDNode U = ExtVal->use_begin();
	if ((U->getOpcode() == ISD::ADD \|\| U->getOpcode() == ISD::SUB \|\|
	U->getOpcode() == ISD::SHL \|\| U->getOpcode() == ARMISD::VSHLIMM))
	return false;

	return true;
	}

	/// Returns true when both types are integer types and \p Ty1 maps to a legal
	/// value type; otherwise false. Asserts that \p Ty1 is at most 64 bits wide.
	bool ARMTargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// Returns the cost of the scaling factor used by addressing mode \p AM.
	///
	/// Yields -1 when the addressing mode is not legal. For a legal mode the cost
	/// is 0, except on subtargets reporting hasFPAO(), where a negative scale
	/// costs 1.
	int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS) const {
	if (isLegalAddressingMode(DL, AM, Ty, AS)) {
	if (Subtarget->hasFPAO())
	return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
	return 0;
	}
	return -1;
	}

	/// Returns true when \p V is a legal Thumb1 load/store offset for \p VT: it
	/// must be non-negative, a multiple of the access scale (1 for i1/i8, 2 for
	/// i16, 4 for everything else), and V / Scale must fit in 5 unsigned bits.
	static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
	if (V < 0)
	return false;

	unsigned Scale = 1;
	switch (VT.getSimpleVT().SimpleTy) {
	case MVT::i1:
	case MVT::i8:
	// Scale == 1;
	break;
	case MVT::i16:
	// Scale == 2;
	Scale = 2;
	break;
	default:
	// On thumb1 we load most things (i32, i64, floats, etc) with a LDR
	// Scale == 4;
	Scale = 4;
	break;
	}

	// Scale is 1, 2 or 4, so this mask test rejects offsets that are not a
	// multiple of Scale.
	if ((V & (Scale - 1)) != 0)
	return false;
	return isUInt<5>(V / Scale);
	}

	/// Returns true when \p V is a legal Thumb2 load/store offset for \p VT.
	///
	/// Types that are neither integer nor floating point are rejected, as are
	/// all vectors when NEON is present and floating-point vectors when MVE
	/// integer ops are available without MVE float ops. The sign of \p V is
	/// stripped up front; it only matters for the final imm12 / imm8 case.
	static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
	const ARMSubtarget *Subtarget) {
	if (!VT.isInteger() && !VT.isFloatingPoint())
	return false;
	if (VT.isVector() && Subtarget->hasNEON())
	return false;
	if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
	!Subtarget->hasMVEFloatOps())
	return false;

	// Work on the magnitude from here on, remembering the original sign.
	bool IsNeg = false;
	if (V < 0) {
	IsNeg = true;
	V = -V;
	}

	// Access size in bytes, clamped to at least 1.
	unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);

	// MVE: size * imm7
	if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
	switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
	case MVT::i32:
	case MVT::f32:
	return isShiftedUInt<7,2>(V);
	case MVT::i16:
	case MVT::f16:
	return isShiftedUInt<7,1>(V);
	case MVT::i8:
	return isUInt<7>(V);
	default:
	return false;
	}
	}

	// half VLDR: 2 * imm8
	if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
	return isShiftedUInt<8, 1>(V);
	// VLDR and LDRD: 4 * imm8
	if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) \|\| NumBytes == 8)
	return isShiftedUInt<8, 2>(V);

	if (NumBytes == 1 \|\| NumBytes == 2 \|\| NumBytes == 4) {
	// + imm12 or - imm8
	if (IsNeg)
	return isUInt<8>(V);
	return isUInt<12>(V);
	}

	return false;
	}

	/// isLegalAddressImmediate - Return true if the integer value can be used
	/// as the offset of the target addressing mode for load / store of the
	/// given type.
	///
	/// A zero offset is always legal. Non-simple types are rejected. Thumb1-only
	/// and Thumb2 subtargets are delegated to their dedicated helpers; the
	/// remaining code handles ARM mode on the magnitude of \p V.
	static bool isLegalAddressImmediate(int64_t V, EVT VT,
	const ARMSubtarget *Subtarget) {
	if (V == 0)
	return true;

	if (!VT.isSimple())
	return false;

	if (Subtarget->isThumb1Only())
	return isLegalT1AddressImmediate(V, VT);
	else if (Subtarget->isThumb2())
	return isLegalT2AddressImmediate(V, VT, Subtarget);

	// ARM mode.
	if (V < 0)
	V = - V;
	switch (VT.getSimpleVT().SimpleTy) {
	default: return false;
	case MVT::i1:
	case MVT::i8:
	case MVT::i32:
	// +- imm12
	return isUInt<12>(V);
	case MVT::i16:
	// +- imm8
	return isUInt<8>(V);
	case MVT::f32:
	case MVT::f64:
	if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
	return false;
	return isShiftedUInt<8, 2>(V);
	}
	}

	/// Returns true when the register scale in \p AM is usable in Thumb2 for an
	/// access of type \p VT. Negative scales and types not listed in the switch
	/// are rejected.
	bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
	EVT VT) const {
	int Scale = AM.Scale;
	if (Scale < 0)
	return false;

	switch (VT.getSimpleVT().SimpleTy) {
	default: return false;
	case MVT::i1:
	case MVT::i8:
	case MVT::i16:
	case MVT::i32:
	if (Scale == 1)
	return true;
	// r + r << imm
	// Bit 0 is cleared first, so 3, 5 and 9 are accepted alongside 2, 4 and 8.
	Scale = Scale & ~1;
	return Scale == 2 \|\| Scale == 4 \|\| Scale == 8;
	case MVT::i64:
	// FIXME: What are we trying to model here? ldrd doesn't have an r + r
	// version in Thumb mode.
	// r + r
	if (Scale == 1)
	return true;
	// r * 2 (this can be lowered to r + r).
	if (!AM.HasBaseReg && Scale == 2)
	return true;
	return false;
	case MVT::isVoid:
	// Note, we allow "void" uses (basically, uses that aren't loads or
	// stores), because arm allows folding a scale into many arithmetic
	// operations. This should be made more precise and revisited later.

	// Allow r << imm, but the imm has to be a multiple of two.
	if (Scale & 1) return false;
	return isPowerOf2_32(Scale);
	}
	}

	/// Returns true when the register scale in \p AM is usable in Thumb1: either
	/// a scale of 1, or a scale of 2 with no base register. \p VT is not
	/// consulted.
	bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
	EVT VT) const {
	const int Scale = AM.Scale;

	// Negative scales are not supported in Thumb1.
	if (Scale < 0)
	return false;

	// Thumb1 addressing modes do not support register scaling excepting the
	// following cases:
	// 1. Scale == 1 means no scaling.
	// 2. Scale == 2 this can be lowered to r + r if there is no base register.
	return (Scale == 1) \|\| (!AM.HasBaseReg && Scale == 2);
	}

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	///
	/// The base offset must be a legal immediate for the type and no global may
	/// be part of the address. A non-zero scale is additionally only accepted
	/// without a base offset, for a simple type, and subject to the per-mode
	/// (Thumb1 / Thumb2 / ARM) rules below. \p AS and \p I are not consulted.
	bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS, Instruction *I) const {
	EVT VT = getValueType(DL, Ty, true);
	if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
	return false;

	// Can never fold addr of global into load/store.
	if (AM.BaseGV)
	return false;

	switch (AM.Scale) {
	case 0: // no scale reg, must be "r+i" or "r", or "i".
	break;
	default:
	// ARM doesn't support any R+R*scale+imm addr modes.
	if (AM.BaseOffs)
	return false;

	if (!VT.isSimple())
	return false;

	if (Subtarget->isThumb1Only())
	return isLegalT1ScaledAddressingMode(AM, VT);

	if (Subtarget->isThumb2())
	return isLegalT2ScaledAddressingMode(AM, VT);

	// ARM mode from here on.
	int Scale = AM.Scale;
	switch (VT.getSimpleVT().SimpleTy) {
	default: return false;
	case MVT::i1:
	case MVT::i8:
	case MVT::i32:
	// Only the magnitude of the scale matters for these types.
	if (Scale < 0) Scale = -Scale;
	if (Scale == 1)
	return true;
	// r + r << imm
	return isPowerOf2_32(Scale & ~1);
	case MVT::i16:
	case MVT::i64:
	// Scale keeps its sign here, so -1 is tested explicitly.
	// r +/- r
	if (Scale == 1 \|\| (AM.HasBaseReg && Scale == -1))
	return true;
	// r * 2 (this can be lowered to r + r).
	if (!AM.HasBaseReg && Scale == 2)
	return true;
	return false;

	case MVT::isVoid:
	// Note, we allow "void" uses (basically, uses that aren't loads or
	// stores), because arm allows folding a scale into many arithmetic
	// operations. This should be made more precise and revisited later.

	// Allow r << imm, but the imm has to be a multiple of two.
	if (Scale & 1) return false;
	return isPowerOf2_32(Scale);
	}
	}
	// Reached only for Scale == 0.
	return true;
	}

	/// isLegalICmpImmediate - Return true if the specified immediate is legal
	/// icmp immediate, that is the target has icmp instructions which can compare
	/// a register against the immediate without having to materialize the
	/// immediate into a register.
	///
	/// In ARM and Thumb2 modes the immediate is truncated to 32 bits and is
	/// accepted when either it or its 32-bit negation is encodable.
	bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	// Thumb2 and ARM modes can use cmn for negative immediates.
	if (!Subtarget->isThumb())
	return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 \|\|
	ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
	if (Subtarget->isThumb2())
	return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 \|\|
	ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
	// Thumb1 doesn't have cmn, and only 8-bit immediates.
	return Imm >= 0 && Imm <= 255;
	}

	/// isLegalAddImmediate - Return true if the specified immediate is a legal add
	/// or sub immediate, that is the target has add or sub instructions which can
	/// add a register with the immediate without having to materialize the
	/// immediate into a register.
	///
	/// The check is performed on the absolute value of \p Imm.
	bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
	// Same encoding for add/sub, just flip the sign.
	// NOTE(review): std::abs is undefined for the most negative int64_t;
	// confirm callers never pass it.
	int64_t AbsImm = std::abs(Imm);
	if (!Subtarget->isThumb())
	return ARM_AM::getSOImmVal(AbsImm) != -1;
	if (Subtarget->isThumb2())
	return ARM_AM::getT2SOImmVal(AbsImm) != -1;
	// Thumb1 only has 8-bit unsigned immediate.
	return AbsImm >= 0 && AbsImm <= 255;
	}

	/// Splits an ADD/SUB pointer computation into \p Base and \p Offset for an
	/// ARM-mode indexed load/store of type \p VT and reports through \p isInc
	/// whether the offset is added (true) or subtracted (false).
	///
	/// Returns false when \p Ptr is neither ADD nor SUB, or when \p VT is not one
	/// of the handled integer types. \p DAG is only used to create the negated
	/// constant offset.
	static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
	bool isSEXTLoad, SDValue &Base,
	SDValue &Offset, bool &isInc,
	SelectionDAG &DAG) {
	if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
	return false;

	if (VT == MVT::i16 \|\| ((VT == MVT::i8 \|\| VT == MVT::i1) && isSEXTLoad)) {
	// AddressingMode 3
	Base = Ptr->getOperand(0);
	if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
	int RHSC = (int)RHS->getZExtValue();
	// A small negative constant becomes a decrement by its magnitude.
	if (RHSC < 0 && RHSC > -256) {
	assert(Ptr->getOpcode() == ISD::ADD);
	isInc = false;
	Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
	return true;
	}
	}
	isInc = (Ptr->getOpcode() == ISD::ADD);
	Offset = Ptr->getOperand(1);
	return true;
	} else if (VT == MVT::i32 \|\| VT == MVT::i8 \|\| VT == MVT::i1) {
	// AddressingMode 2
	if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
	int RHSC = (int)RHS->getZExtValue();
	// A negative constant above -0x1000 becomes a decrement by its magnitude.
	if (RHSC < 0 && RHSC > -0x1000) {
	assert(Ptr->getOpcode() == ISD::ADD);
	isInc = false;
	Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
	Base = Ptr->getOperand(0);
	return true;
	}
	}

	if (Ptr->getOpcode() == ISD::ADD) {
	isInc = true;
	// If operand 0 is itself a shift, use it as the offset and operand 1 as
	// the base; otherwise keep the operands in their original roles.
	ARM_AM::ShiftOpc ShOpcVal=
	ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
	if (ShOpcVal != ARM_AM::no_shift) {
	Base = Ptr->getOperand(1);
	Offset = Ptr->getOperand(0);
	} else {
	Base = Ptr->getOperand(0);
	Offset = Ptr->getOperand(1);
	}
	return true;
	}

	// Only SUB reaches this point, so isInc evaluates to false here.
	isInc = (Ptr->getOpcode() == ISD::ADD);
	Base = Ptr->getOperand(0);
	Offset = Ptr->getOperand(1);
	return true;
	}

	// FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
	return false;
	}

	/// Thumb2 counterpart of the indexed-address split: succeeds only when
	/// \p Ptr is an ADD/SUB whose second operand is a constant with a magnitude
	/// between 1 and 255. \p VT and \p isSEXTLoad are not consulted.
	///
	/// Note that \p Base is assigned for any ADD/SUB pointer, even when false is
	/// returned afterwards.
	static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
	bool isSEXTLoad, SDValue &Base,
	SDValue &Offset, bool &isInc,
	SelectionDAG &DAG) {
	if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
	return false;

	Base = Ptr->getOperand(0);
	if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
	int RHSC = (int)RHS->getZExtValue();
	if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
	assert(Ptr->getOpcode() == ISD::ADD);
	isInc = false;
	Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
	return true;
	} else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
	isInc = Ptr->getOpcode() == ISD::ADD;
	Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
	return true;
	}
	}

	return false;
	}

	/// getPreIndexedAddressParts - returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if the node's address
	/// can be legally represented as pre-indexed load / store address.
	///
	/// Always fails on Thumb1-only subtargets and for nodes that are neither a
	/// load nor a store.
	bool
	ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	SDValue &Offset,
	ISD::MemIndexedMode &AM,
	SelectionDAG &DAG) const {
	if (Subtarget->isThumb1Only())
	return false;

	// Pull the address, memory type and sign-extension flag out of the node.
	EVT VT;
	SDValue Ptr;
	bool isSEXTLoad = false;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	Ptr = LD->getBasePtr();
	VT = LD->getMemoryVT();
	isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	Ptr = ST->getBasePtr();
	VT = ST->getMemoryVT();
	} else
	return false;

	// isInc is only read after one of the helpers below has reported success.
	bool isInc;
	bool isLegal = false;
	if (Subtarget->isThumb2())
	isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
	Offset, isInc, DAG);
	else
	isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
	Offset, isInc, DAG);
	if (!isLegal)
	return false;

	AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
	return true;
	}

	/// getPostIndexedAddressParts - returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if this node can be
	/// combined with a load / store to form a post-indexed load / store.
	///
	/// \p N is the load or store; \p Op is the node computing the updated
	/// address. Nodes that are neither a load nor a store are rejected.
	bool ARMTargetLowering::getPostIndexedAddressParts(SDNode N, SDNode Op,
	SDValue &Base,
	SDValue &Offset,
	ISD::MemIndexedMode &AM,
	SelectionDAG &DAG) const {
	EVT VT;
	SDValue Ptr;
	// isNonExt is assigned on both the load and the store path below.
	bool isSEXTLoad = false, isNonExt;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	Ptr = LD->getBasePtr();
	isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
	isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	Ptr = ST->getBasePtr();
	isNonExt = !ST->isTruncatingStore();
	} else
	return false;

	if (Subtarget->isThumb1Only()) {
	// Thumb-1 can do a limited post-inc load or store as an updating LDM. It
	// must be non-extending/truncating, i32, with an offset of 4.
	assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
	if (Op->getOpcode() != ISD::ADD \|\| !isNonExt)
	return false;
	auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	if (!RHS \|\| RHS->getZExtValue() != 4)
	return false;

	Offset = Op->getOperand(1);
	Base = Op->getOperand(0);
	AM = ISD::POST_INC;
	return true;
	}

	// isInc is only read after one of the helpers below has reported success.
	bool isInc;
	bool isLegal = false;
	if (Subtarget->isThumb2())
	isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
	isInc, DAG);
	else
	isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
	isInc, DAG);
	if (!isLegal)
	return false;

	if (Ptr != Base) {
	// Swap base ptr and offset to catch more post-index load / store when
	// it's legal. In Thumb2 mode, offset must be an immediate.
	if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
	!Subtarget->isThumb2())
	std::swap(Base, Offset);

	// Post-indexed load / store update the base pointer.
	if (Ptr != Base)
	return false;
	}

	AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
	return true;
	}

	/// Computes known-zero / known-one bits for the ARM-specific nodes handled in
	/// the switch below and stores them in \p Known.
	///
	/// \p Known is reset first, so any opcode not handled leaves it with nothing
	/// known. \p DemandedElts is not consulted by this implementation.
	void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	Known.resetAll();
	switch (Op.getOpcode()) {
	default: break;
	case ARMISD::ADDC:
	case ARMISD::ADDE:
	case ARMISD::SUBC:
	case ARMISD::SUBE:
	// Special cases when we convert a carry to a boolean.
	if (Op.getResNo() == 0) {
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	// (ADDE 0, 0, C) will give us a single bit.
	if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
	isNullConstant(RHS)) {
	// Everything above bit 0 is known zero.
	Known.Zero \|= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
	return;
	}
	}
	break;
	case ARMISD::CMOV: {
	// Bits are known zero/one if known on the LHS and RHS.
	Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
	// Nothing known on one side means nothing can be known for the result.
	if (Known.isUnknown())
	return;

	KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
	Known.Zero &= KnownRHS.Zero;
	Known.One &= KnownRHS.One;
	return;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
	Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	// Every path through this inner switch returns, so control never falls
	// through into the BFI case below.
	switch (IntID) {
	default: return;
	case Intrinsic::arm_ldaex:
	case Intrinsic::arm_ldrex: {
	// Bits above the width of the memory type are known zero.
	EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero \|= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
	return;
	}
	}
	}
	case ARMISD::BFI: {
	// Conservatively, we can recurse down the first operand
	// and just mask out all affected bits.
	Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

	// The operand to BFI is already a mask suitable for removing the bits it
	// sets.
	ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
	const APInt &Mask = CI->getAPIntValue();
	Known.Zero &= Mask;
	Known.One &= Mask;
	return;
	}
	case ARMISD::VGETLANEs:
	case ARMISD::VGETLANEu: {
	const SDValue &SrcSV = Op.getOperand(0);
	EVT VecVT = SrcSV.getValueType();
	assert(VecVT.isVector() && "VGETLANE expected a vector type");
	const unsigned NumSrcElts = VecVT.getVectorNumElements();
	ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
	assert(Pos->getAPIntValue().ult(NumSrcElts) &&
	"VGETLANE index out of bounds");
	unsigned Idx = Pos->getZExtValue();
	// Query the known bits of the selected lane only.
	APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
	Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

	EVT VT = Op.getValueType();
	const unsigned DstSz = VT.getScalarSizeInBits();
	const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
	// SrcSz is only used by the asserts below; the cast silences the
	// unused-variable warning when asserts are compiled out.
	(void)SrcSz;
	assert(SrcSz == Known.getBitWidth());
	assert(DstSz > SrcSz);
	// Widen the lane's known bits to the result width, signed or unsigned to
	// match the opcode.
	if (Op.getOpcode() == ARMISD::VGETLANEs)
	Known = Known.sext(DstSz);
	else {
	Known = Known.zext(DstSz, true /* extended bits are known zero */);
	}
	assert(DstSz == Known.getBitWidth());
	break;
	}
	}
	}

	/// Tries to replace the constant mask of a scalar AND with a different mask
	/// that is equivalent on the demanded bits.
	///
	/// Only runs once TLO.LegalOps is set, and only for non-vector AND nodes with
	/// a constant second operand. Candidate masks are tried in a fixed order of
	/// preference: 0xFF, 0xFFFF, the shrunk mask when below 256, then the
	/// expanded mask when it lies in [-256, -2] as a signed value. Returns false
	/// when no rewrite applies.
	bool
	ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
	const APInt &DemandedAPInt,
	TargetLoweringOpt &TLO) const {
	// Delay optimization, so we don't have to deal with illegal types, or block
	// optimizations.
	if (!TLO.LegalOps)
	return false;

	// Only optimize AND for now.
	if (Op.getOpcode() != ISD::AND)
	return false;

	EVT VT = Op.getValueType();

	// Ignore vectors.
	if (VT.isVector())
	return false;

	assert(VT == MVT::i32 && "Unexpected integer type");

	// Make sure the RHS really is a constant.
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	if (!C)
	return false;

	unsigned Mask = C->getZExtValue();

	// ShrunkMask keeps only the demanded bits of Mask; ExpandedMask additionally
	// sets every bit that is not demanded.
	unsigned Demanded = DemandedAPInt.getZExtValue();
	unsigned ShrunkMask = Mask & Demanded;
	unsigned ExpandedMask = Mask \| ~Demanded;

	// If the mask is all zeros, let the target-independent code replace the
	// result with zero.
	if (ShrunkMask == 0)
	return false;

	// If the mask is all ones, erase the AND. (Currently, the target-independent
	// code won't do this, so we have to do it explicitly to avoid an infinite
	// loop in obscure cases.)
	if (ExpandedMask == ~0U)
	return TLO.CombineTo(Op, Op.getOperand(0));

	// A candidate is acceptable when it contains every bit of ShrunkMask and no
	// bit outside ExpandedMask.
	auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
	return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
	};
	// Rebuilds the AND with NewMask; returns true without creating any node
	// when the mask is already NewMask.
	auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
	if (NewMask == Mask)
	return true;
	SDLoc DL(Op);
	SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
	SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
	return TLO.CombineTo(Op, NewOp);
	};

	// Prefer uxtb mask.
	if (IsLegalMask(0xFF))
	return UseMask(0xFF);

	// Prefer uxth mask.
	if (IsLegalMask(0xFFFF))
	return UseMask(0xFFFF);

	// [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
	// FIXME: Prefer a contiguous sequence of bits for other optimizations.
	if (ShrunkMask < 256)
	return UseMask(ShrunkMask);

	// [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
	// FIXME: Prefer a contiguous sequence of bits for other optimizations.
	if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
	return UseMask(ExpandedMask);

	// Potential improvements:
	//
	// We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
	// We could try to prefer Thumb1 immediates which can be lowered to a
	// two-instruction sequence.
	// We could try to recognize more legal ARM/Thumb2 immediates here.

	return false;
	}


	//===----------------------------------------------------------------------===//
	// ARM Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// Recognizes the inline-asm idiom "rev $0, $1" and lowers it to a byte swap.
	///
	/// The rewrite only happens on V6+ subtargets, when the asm string is a
	/// single statement consisting of exactly those three tokens, the constraint
	/// string begins with "=l,l", and the call returns a 32-bit integer. In every
	/// other case false is returned.
	bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
	// Looking for "rev" which is V6+.
	if (!Subtarget->hasV6Ops())
	return false;

	InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
	std::string AsmStr = IA->getAsmString();
	// Split into statements on ';' and newline.
	SmallVector<StringRef, 4> AsmPieces;
	SplitString(AsmStr, AsmPieces, ";\n");

	switch (AsmPieces.size()) {
	default: return false;
	case 1:
	// Re-split the single statement into tokens on space, tab and comma.
	AsmStr = AsmPieces[0];
	AsmPieces.clear();
	SplitString(AsmStr, AsmPieces, " \t,");

	// rev $0, $1
	if (AsmPieces.size() == 3 &&
	AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
	IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
	IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	if (Ty && Ty->getBitWidth() == 32)
	return IntrinsicLowering::LowerToByteSwap(CI);
	}
	break;
	}

	return false;
	}

	/// Maps the permissive "X" constraint to a concrete one for \p ConstraintVT.
	///
	/// Returns "w" for floating-point types, and for 64- or 128-bit vectors when
	/// NEON is available, provided the subtarget has a VFP2 base; returns "r" in
	/// every other case.
	const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	// At this point, we have to lower this constraint to something else, so we
	// lower it to an "r" or "w". However, by doing this we will force the result
	// to be in register, while the X constraint is much more permissive.
	//
	// Although we are correct (we are free to emit anything, without
	// constraints), we might break use cases that would expect us to be more
	// efficient and emit something else.
	if (!Subtarget->hasVFP2Base())
	return "r";
	if (ConstraintVT.isFloatingPoint())
	return "w";
	if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
	(ConstraintVT.getSizeInBits() == 64 \|\|
	ConstraintVT.getSizeInBits() == 128))
	return "w";

	return "r";
	}

	/// getConstraintType - Given a constraint letter, return the type of
	/// constraint it is for this target.
	/// Constraints not matched by the one- and two-character cases below are
	/// forwarded to TargetLowering::getConstraintType.
	ARMTargetLowering::ConstraintType
	ARMTargetLowering::getConstraintType(StringRef Constraint) const {
	- if (Constraint.size() == 1) {
	+ unsigned S = Constraint.size();
	+ if (S == 1) {
	switch (Constraint[0]) {
	default: break;
	case 'l': return C_RegisterClass;
	case 'w': return C_RegisterClass;
	case 'h': return C_RegisterClass;
	case 'x': return C_RegisterClass;
	case 't': return C_RegisterClass;
	- case 'j': return C_Other; // Constant for movw.
	- // An address with a single base register. Due to the way we
	- // currently handle addresses it is the same as an 'r' memory constraint.
	+ case 'j': return C_Immediate; // Constant for movw.
	+ // An address with a single base register. Due to the way we
	+ // currently handle addresses it is the same as an 'r' memory constraint.
	case 'Q': return C_Memory;
	}
	- } else if (Constraint.size() == 2) {
	+ } else if (S == 2) {
	switch (Constraint[0]) {
	default: break;
	case 'T': return C_RegisterClass;
	// All 'U+' constraints are addresses.
	case 'U': return C_Memory;
	}
	}
	return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// Only 'l' and 'w' are weighed here; every other constraint letter is
	/// delegated to the TargetLowering implementation. For 'l' and 'w' an operand
	/// of a non-matching type keeps the initial CW_Invalid weight.
	TargetLowering::ConstraintWeight
	ARMTargetLowering::getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const {
	ConstraintWeight weight = CW_Invalid;
	Value *CallOperandVal = info.CallOperandVal;
	// If we don't have a value, we can't do a match,
	// but allow it at the lowest weight.
	if (!CallOperandVal)
	return CW_Default;
	Type *type = CallOperandVal->getType();
	// Look at the constraint type.
	switch (*constraint) {
	default:
	weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	break;
	case 'l':
	// Integer operands: CW_SpecificReg in Thumb, CW_Register otherwise.
	if (type->isIntegerTy()) {
	if (Subtarget->isThumb())
	weight = CW_SpecificReg;
	else
	weight = CW_Register;
	}
	break;
	case 'w':
	if (type->isFloatingPointTy())
	weight = CW_Register;
	break;
	}
	return weight;
	}

	using RCPair = std::pair<unsigned, const TargetRegisterClass *>;

	/// Translate an inline-asm register constraint into a (register, register
	/// class) pair.
	///
	/// One-letter GCC ARM constraints ('l', 'h', 'r', 'w', 'x', 't') and the
	/// two-letter "Te"/"To" constraints yield register number 0 plus the selected
	/// class. "{cc}" (compared case-insensitively) yields CPSR in CCRRegClass.
	/// Anything not recognised here is forwarded to TargetLowering.
	///
	/// \param TRI        register info, only passed on to the generic fallback.
	/// \param Constraint the constraint string to decode.
	/// \param VT         operand value type; picks the S/D/Q class for the
	///                   'w', 'x' and 't' constraints.
	RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
	    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	  switch (Constraint.size()) {
	  case 1:
	    // GCC ARM Constraint Letters
	    switch (Constraint[0]) {
	    case 'l': // Low regs or general regs.
	      if (Subtarget->isThumb())
	        return RCPair(0U, &ARM::tGPRRegClass);
	      return RCPair(0U, &ARM::GPRRegClass);
	    case 'h': // High regs or no regs.
	      if (Subtarget->isThumb())
	        return RCPair(0U, &ARM::hGPRRegClass);
	      break;
	    case 'r':
	      if (Subtarget->isThumb1Only())
	        return RCPair(0U, &ARM::tGPRRegClass);
	      return RCPair(0U, &ARM::GPRRegClass);
	    case 'w':
	      if (VT == MVT::Other)
	        break;
	      if (VT == MVT::f32)
	        return RCPair(0U, &ARM::SPRRegClass);
	      if (VT.getSizeInBits() == 64)
	        return RCPair(0U, &ARM::DPRRegClass);
	      if (VT.getSizeInBits() == 128)
	        return RCPair(0U, &ARM::QPRRegClass);
	      break;
	    case 'x':
	      if (VT == MVT::Other)
	        break;
	      if (VT == MVT::f32)
	        return RCPair(0U, &ARM::SPR_8RegClass);
	      if (VT.getSizeInBits() == 64)
	        return RCPair(0U, &ARM::DPR_8RegClass);
	      if (VT.getSizeInBits() == 128)
	        return RCPair(0U, &ARM::QPR_8RegClass);
	      break;
	    case 't':
	      if (VT == MVT::Other)
	        break;
	      // Unlike 'w' and 'x', 't' also accepts a 32-bit integer operand.
	      if (VT == MVT::f32 || VT == MVT::i32)
	        return RCPair(0U, &ARM::SPRRegClass);
	      if (VT.getSizeInBits() == 64)
	        return RCPair(0U, &ARM::DPR_VFP2RegClass);
	      if (VT.getSizeInBits() == 128)
	        return RCPair(0U, &ARM::QPR_VFP2RegClass);
	      break;
	    }
	    break;

	  case 2:
	    if (Constraint[0] == 'T') {
	      switch (Constraint[1]) {
	      default:
	        break;
	      case 'e':
	        return RCPair(0U, &ARM::tGPREvenRegClass);
	      case 'o':
	        return RCPair(0U, &ARM::tGPROddRegClass);
	      }
	    }
	    break;

	  default:
	    break;
	  }

	  if (StringRef("{cc}").equals_lower(Constraint))
	    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

	  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Only the single-letter immediate constraints 'j' and 'I'..'O' are handled
	/// here. For those, \p Op must be a constant that fits in 32 bits and that
	/// satisfies the per-letter (and per-subtarget) range check below; a target
	/// constant is then appended to \p Ops. Other single-letter constraints are
	/// forwarded to TargetLowering; constraints of any other length are ignored.
	void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                     std::string &Constraint,
	                                                     std::vector<SDValue>&Ops,
	                                                     SelectionDAG &DAG) const {
	  SDValue Result;

	  // Currently only support length 1 constraints.
	  if (Constraint.length() != 1) return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default: break;
	  case 'j':
	  case 'I': case 'J': case 'K': case 'L':
	  case 'M': case 'N': case 'O':
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	    if (!C)
	      return;

	    int64_t CVal64 = C->getSExtValue();
	    int CVal = (int) CVal64;
	    // None of these constraints allow values larger than 32 bits.  Check
	    // that the value fits in an int.
	    if (CVal != CVal64)
	      return;

	    // Unsigned view of the value, used where negation or decrement would
	    // otherwise overflow a signed int (CVal == INT_MIN).
	    const unsigned UVal = static_cast<unsigned>(CVal);

	    switch (ConstraintLetter) {
	      case 'j':
	        // Constant suitable for movw, must be between 0 and
	        // 65535.
	        if (Subtarget->hasV6T2Ops())
	          if (CVal >= 0 && CVal <= 65535)
	            break;
	        return;
	      case 'I':
	        if (Subtarget->isThumb1Only()) {
	          // This must be a constant between 0 and 255, for ADD
	          // immediates.
	          if (CVal >= 0 && CVal <= 255)
	            break;
	        } else if (Subtarget->isThumb2()) {
	          // A constant that can be used as an immediate value in a
	          // data-processing instruction.
	          if (ARM_AM::getT2SOImmVal(CVal) != -1)
	            break;
	        } else {
	          // A constant that can be used as an immediate value in a
	          // data-processing instruction.
	          if (ARM_AM::getSOImmVal(CVal) != -1)
	            break;
	        }
	        return;

	      case 'J':
	        if (Subtarget->isThumb1Only()) {
	          // This must be a constant between -255 and -1, for negated ADD
	          // immediates. This can be used in GCC with an "n" modifier that
	          // prints the negated value, for use with SUB instructions. It is
	          // not useful otherwise but is implemented for compatibility.
	          if (CVal >= -255 && CVal <= -1)
	            break;
	        } else {
	          // This must be a constant between -4095 and 4095. It is not clear
	          // what this constraint is intended for. Implemented for
	          // compatibility with GCC.
	          if (CVal >= -4095 && CVal <= 4095)
	            break;
	        }
	        return;

	      case 'K':
	        if (Subtarget->isThumb1Only()) {
	          // A 32-bit value where only one byte has a nonzero value. Exclude
	          // zero to match GCC. This constraint is used by GCC internally for
	          // constants that can be loaded with a move/shift combination.
	          // It is not useful otherwise but is implemented for compatibility.
	          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
	            break;
	        } else if (Subtarget->isThumb2()) {
	          // A constant whose bitwise inverse can be used as an immediate
	          // value in a data-processing instruction. This can be used in GCC
	          // with a "B" modifier that prints the inverted value, for use with
	          // BIC and MVN instructions. It is not useful otherwise but is
	          // implemented for compatibility.
	          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
	            break;
	        } else {
	          // A constant whose bitwise inverse can be used as an immediate
	          // value in a data-processing instruction. This can be used in GCC
	          // with a "B" modifier that prints the inverted value, for use with
	          // BIC and MVN instructions. It is not useful otherwise but is
	          // implemented for compatibility.
	          if (ARM_AM::getSOImmVal(~CVal) != -1)
	            break;
	        }
	        return;

	      case 'L':
	        if (Subtarget->isThumb1Only()) {
	          // This must be a constant between -7 and 7 (both inclusive),
	          // for 3-operand ADD/SUB immediate instructions.
	          if (CVal >= -7 && CVal <= 7)
	            break;
	        } else if (Subtarget->isThumb2()) {
	          // A constant whose negation can be used as an immediate value in a
	          // data-processing instruction. This can be used in GCC with an "n"
	          // modifier that prints the negated value, for use with SUB
	          // instructions. It is not useful otherwise but is implemented for
	          // compatibility. The negation is done on the unsigned view so it
	          // wraps instead of overflowing.
	          if (ARM_AM::getT2SOImmVal(0U - UVal) != -1)
	            break;
	        } else {
	          // A constant whose negation can be used as an immediate value in a
	          // data-processing instruction. This can be used in GCC with an "n"
	          // modifier that prints the negated value, for use with SUB
	          // instructions. It is not useful otherwise but is implemented for
	          // compatibility. The negation is done on the unsigned view so it
	          // wraps instead of overflowing.
	          if (ARM_AM::getSOImmVal(0U - UVal) != -1)
	            break;
	        }
	        return;

	      case 'M':
	        if (Subtarget->isThumb1Only()) {
	          // This must be a multiple of 4 between 0 and 1020, for
	          // ADD sp + immediate.
	          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
	            break;
	        } else {
	          // A power of two or a constant between 0 and 32.  This is used in
	          // GCC for the shift amount on shifted register operands, but it is
	          // useful in general for any shift amounts. The power-of-two test
	          // uses the unsigned view so the decrement cannot overflow.
	          if ((CVal >= 0 && CVal <= 32) || ((UVal & (UVal - 1)) == 0))
	            break;
	        }
	        return;

	      case 'N':
	        if (Subtarget->isThumb()) {  // FIXME thumb2
	          // This must be a constant between 0 and 31, for shift amounts.
	          if (CVal >= 0 && CVal <= 31)
	            break;
	        }
	        return;

	      case 'O':
	        if (Subtarget->isThumb()) {  // FIXME thumb2
	          // This must be a multiple of 4 between -508 and 508, for
	          // ADD/SUB sp = sp + immediate.
	          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
	            break;
	        }
	        return;
	    }
	    // The constant passed its constraint check; emit it as a target constant.
	    Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
	    break;
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }
	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Pick the combined divide/remainder runtime-library call for node \p N.
	///
	/// \param N   an SDIVREM, UDIVREM, SREM or UREM node (asserted); SDIVREM and
	///            SREM select the signed libcall, the others the unsigned one.
	/// \param SVT the operand width; only i8, i16, i32 and i64 are handled, any
	///            other type is unreachable.
	static RTLIB::Libcall getDivRemLibcall(
	    const SDNode *N, MVT::SimpleValueType SVT) {
	  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
	          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
	         "Unhandled Opcode in getDivRemLibcall");
	  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
	                  N->getOpcode() == ISD::SREM;
	  RTLIB::Libcall LC;
	  switch (SVT) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
	  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
	  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
	  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
	  }
	  return LC;
	}

	/// Build the libcall argument list for a divide/remainder node.
	///
	/// Every operand of \p N becomes one argument, sign-extended for SDIVREM /
	/// SREM and zero-extended otherwise. On Windows targets the first two
	/// arguments are swapped.
	///
	/// \param N         an SDIVREM, UDIVREM, SREM or UREM node (asserted).
	/// \param Context   LLVM context used to turn each operand EVT into a Type.
	/// \param Subtarget queried only for isTargetWindows().
	static TargetLowering::ArgListTy getDivRemArgList(
	    const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
	  assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
	          N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
	         "Unhandled Opcode in getDivRemArgList");
	  bool isSigned = N->getOpcode() == ISD::SDIVREM ||
	                  N->getOpcode() == ISD::SREM;
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = N->getOperand(i).getValueType();
	    Type *ArgTy = ArgVT.getTypeForEVT(*Context);
	    Entry.Node = N->getOperand(i);
	    Entry.Ty = ArgTy;
	    Entry.IsSExt = isSigned;
	    Entry.IsZExt = !isSigned;
	    Args.push_back(Entry);
	  }
	  if (Subtarget->isTargetWindows() && Args.size() >= 2)
	    std::swap(Args[0], Args[1]);
	  return Args;
	}

	/// Lower an SDIVREM / UDIVREM node.
	///
	/// For i32 on a subtarget with hardware divide in the current mode, the node
	/// is expanded inline to a divide, multiply and subtract. Otherwise a call to
	/// the combined divrem runtime routine is emitted and the call's result value
	/// is returned.
	SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
	  assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
	          Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
	          Subtarget->isTargetWindows()) &&
	         "Register-based DivRem lowering only");
	  unsigned Opcode = Op->getOpcode();
	  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
	         "Invalid opcode for Div/Rem lowering");
	  bool isSigned = (Opcode == ISD::SDIVREM);
	  EVT VT = Op->getValueType(0);
	  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
	  SDLoc dl(Op);

	  // If the target has hardware divide, use divide + multiply + subtract:
	  //     div = a / b
	  //     rem = a - b * div
	  //     return {div, rem}
	  // This should be lowered into UDIV/SDIV + MLS later on.
	  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
	                                        : Subtarget->hasDivideInARMMode();
	  if (hasDivide && Op->getValueType(0).isSimple() &&
	      Op->getSimpleValueType(0) == MVT::i32) {
	    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
	    const SDValue Dividend = Op->getOperand(0);
	    const SDValue Divisor = Op->getOperand(1);
	    SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
	    SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

	    SDValue Values[2] = {Div, Rem};
	    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
	  }

	  RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
	                                       VT.getSimpleVT().SimpleTy);
	  SDValue InChain = DAG.getEntryNode();

	  TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
	                                                    DAG.getContext(),
	                                                    Subtarget);

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  // The libcall returns both results as a two-element struct.
	  Type *RetTy = StructType::get(Ty, Ty);

	  if (Subtarget->isTargetWindows())
	    InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(InChain)
	    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
	    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return CallInfo.first;
	}

	// Lowers REM using divmod helpers
	// see RTABI section 4.2/4.3
	//
	// Emits a call to the combined divrem libcall for the node's i8/i16/i32/i64
	// type (any other type is unreachable) and returns operand 1 of the call's
	// result node, i.e. the remainder.
	SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
	  // Build return types (div and rem)
	  std::vector<Type*> RetTyParams;
	  Type *RetTyElement;

	  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
	  case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
	  case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
	  case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
	  }

	  RetTyParams.push_back(RetTyElement);
	  RetTyParams.push_back(RetTyElement);
	  ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
	  Type *RetTy = StructType::get(*DAG.getContext(), ret);

	  RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
	                                                               SimpleTy);
	  SDValue InChain = DAG.getEntryNode();
	  TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
	                                                    Subtarget);
	  bool isSigned = N->getOpcode() == ISD::SREM;
	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  if (Subtarget->isTargetWindows())
	    InChain = WinDBZCheckDenominator(DAG, N, InChain);

	  // Lower call
	  CallLoweringInfo CLI(DAG);
	  CLI.setChain(InChain)
	     .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
	     .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

	  // Return second (rem) result operand (first contains div)
	  SDNode *ResNode = CallResult.first.getNode();
	  assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
	  return ResNode->getOperand(1);
	}

	/// Lower DYNAMIC_STACKALLOC; asserts that the target is Windows.
	///
	/// Operand 0 of \p Op is the chain, operand 1 the allocation size and
	/// operand 2 the alignment constant. The result merges the new stack pointer
	/// value with the output chain.
	SDValue
	ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetWindows() && "unsupported target platform");
	  SDLoc DL(Op);

	  // Get the inputs.
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size = Op.getOperand(1);

	  const bool NoProbe =
	      DAG.getMachineFunction().getFunction().hasFnAttribute(
	          "no-stack-arg-probe");

	  if (!NoProbe) {
	    // Probing path: put Size >> 2 into R4, emit WIN__CHKSTK glued to that
	    // copy, then read the updated SP back.
	    SDValue WordCount = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
	                                    DAG.getConstant(2, DL, MVT::i32));

	    SDValue Glue;
	    Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, WordCount, Glue);
	    Glue = Chain.getValue(1);

	    SDVTList ChkStkVTs = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, ChkStkVTs, Chain, Glue);

	    SDValue ProbedSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
	    Chain = ProbedSP.getValue(1);

	    SDValue Merged[2] = { ProbedSP, Chain };
	    return DAG.getMergeValues(Merged, DL);
	  }

	  // "no-stack-arg-probe": adjust SP directly, without the probe call.
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
	  Chain = SP.getValue(1);
	  SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
	  // A non-zero alignment masks the new SP with -Align.
	  if (Align)
	    SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
	                     DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
	  Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
	  SDValue Merged[2] = { SP, Chain };
	  return DAG.getMergeValues(Merged, DL);
	}

	/// Custom-lower FP_EXTEND for conversions the subtarget cannot do in one
	/// native instruction.
	///
	/// A 16-bit source is first widened to f32 (instruction with FP16, libcall
	/// without). If the destination is 64 bits, the f32 value is then widened to
	/// f64 (instruction with FP64, libcall without).
	SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
	  SDValue SrcVal = Op.getOperand(0);
	  const unsigned DstSz = Op.getValueType().getSizeInBits();
	  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
	  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
	         "Unexpected type for custom-lowering FP_EXTEND");

	  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
	         "With both FP DP and 16, any FP conversion is legal!");

	  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
	         "With FP16, 16 to 32 conversion is legal!");

	  // Either we are converting from 16 -> 64, without FP16 and/or
	  // FP.double-precision or without Armv8-fp. So we must do it in two
	  // steps.
	  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
	  // without FP16. So we must do a function call.
	  SDLoc Loc(Op);
	  RTLIB::Libcall LC;
	  if (SrcSz == 16) {
	    // Instruction from 16 -> 32
	    if (Subtarget->hasFP16())
	      SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
	    // Lib call from 16 -> 32
	    else {
	      LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
	      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
	             "Unexpected type for custom-lowering FP_EXTEND");
	      SrcVal =
	          makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
	    }
	  }

	  if (DstSz != 64)
	    return SrcVal;
	  // For sure now SrcVal is 32 bits
	  if (Subtarget->hasFP64()) // Instruction from 32 -> 64
	    return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);

	  LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
	  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
	         "Unexpected type for custom-lowering FP_EXTEND");
	  return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
	}

	/// Custom-lower FP_ROUND.
	///
	/// A 32-bit source on a subtarget with FP16 is returned unchanged; every
	/// other case becomes the FPROUND libcall for the (source, destination)
	/// type pair.
	SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
	  SDValue SrcVal = Op.getOperand(0);
	  EVT SrcVT = SrcVal.getValueType();
	  EVT DstVT = Op.getValueType();
	  const unsigned DstSz = Op.getValueType().getSizeInBits();
	  const unsigned SrcSz = SrcVT.getSizeInBits();
	  // DstSz is only read by the assert below.
	  (void)DstSz;
	  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
	         "Unexpected type for custom-lowering FP_ROUND");

	  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
	         "With both FP DP and 16, any FP conversion is legal!");

	  SDLoc Loc(Op);

	  // Instruction from 32 -> 16 if hasFP16 is valid
	  if (SrcSz == 32 && Subtarget->hasFP16())
	    return Op;

	  // Lib call from 32 -> 16 / 64 -> [32, 16]
	  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
	  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
	         "Unexpected type for custom-lowering FP_ROUND");
	  return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
	}

	/// Expand an i64 ABS node into operations on its two i32 halves.
	///
	/// With Tmp = Hi >> 31 (arithmetic shift), Tmp is added to both halves with
	/// carry propagation (UADDO / ADDCARRY) and each sum is XORed with Tmp. The
	/// low and then the high result are appended to \p Results. Nothing is
	/// appended when ADDCARRY or UADDO is not legal or custom for i32.
	void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                 SelectionDAG &DAG) const {
	  assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
	  MVT HalfT = MVT::i32;
	  SDLoc dl(N);
	  SDValue Hi, Lo, Tmp;

	  if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
	      !isOperationLegalOrCustom(ISD::UADDO, HalfT))
	    return ;

	  unsigned OpTypeBits = HalfT.getScalarSizeInBits();
	  SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

	  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	                   DAG.getConstant(0, dl, HalfT));
	  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	                   DAG.getConstant(1, dl, HalfT));

	  // Arithmetic shift of the high half by (width - 1).
	  Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
	                    DAG.getConstant(OpTypeBits - 1, dl,
	                    getShiftAmountTy(HalfT, DAG.getDataLayout())));
	  Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
	  // Result 1 of the UADDO is its carry-out, consumed by the high-half add.
	  Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
	                   SDValue(Lo.getNode(), 1));
	  Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
	  Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);

	  Results.push_back(Lo);
	  Results.push_back(Hi);
	}

	/// Offset folding is never reported as legal: the ARM target isn't yet aware
	/// of offsets.
	bool
	ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	  (void)GA; // The answer does not depend on the global address.
	  return false;
	}

	/// Returns true when the complement of \p v is a shifted mask according to
	/// isShiftedMask_32. An all-ones \p v is rejected up front.
	bool ARM::isBitFieldInvertedMask(unsigned v) {
	  // there can be 1's on either or both "outsides", all the "inside"
	  // bits must be 0's
	  const bool AllOnes = (v == 0xffffffff);
	  return !AllOnes && isShiftedMask_32(~v);
	}

	/// isFPImmLegal - Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	///
	/// Requires VFP3; f16 additionally requires full FP16 and f64 requires FP64.
	/// \p ForCodeSize is not consulted.
	bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	                                     bool ForCodeSize) const {
	  if (!Subtarget->hasVFP3Base())
	    return false;

	  // Encoded immediate for the given type, or -1 when the type is not
	  // supported or the value has no encoding.
	  int Encoded = -1;
	  if (VT == MVT::f16 && Subtarget->hasFullFP16())
	    Encoded = ARM_AM::getFP16Imm(Imm);
	  else if (VT == MVT::f32)
	    Encoded = ARM_AM::getFP32Imm(Imm);
	  else if (VT == MVT::f64 && Subtarget->hasFP64())
	    Encoded = ARM_AM::getFP64Imm(Imm);

	  return Encoded != -1;
	}

	/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
	/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
	/// specified in the intrinsic calls.
	///
	/// The exclusive load/store intrinsics (ldrex/ldaex/strex/stlex and their
	/// 64-bit "d" forms) are handled as well and are flagged volatile.
	///
	/// \param Info      filled in with opcode, memory VT, pointer, offset,
	///                  alignment and flags when the intrinsic is recognised.
	/// \param I         the intrinsic call being described.
	/// \param MF        not used by this implementation.
	/// \param Intrinsic the intrinsic ID of \p I.
	/// \returns true if \p Info was filled in, false for any other intrinsic.
	bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {
	  switch (Intrinsic) {
	  case Intrinsic::arm_neon_vld1:
	  case Intrinsic::arm_neon_vld2:
	  case Intrinsic::arm_neon_vld3:
	  case Intrinsic::arm_neon_vld4:
	  case Intrinsic::arm_neon_vld2lane:
	  case Intrinsic::arm_neon_vld3lane:
	  case Intrinsic::arm_neon_vld4lane:
	  case Intrinsic::arm_neon_vld2dup:
	  case Intrinsic::arm_neon_vld3dup:
	  case Intrinsic::arm_neon_vld4dup: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded.
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    // The last argument is read as the alignment constant.
	    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
	    // volatile loads with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOLoad;
	    return true;
	  }
	  case Intrinsic::arm_neon_vld1x2:
	  case Intrinsic::arm_neon_vld1x3:
	  case Intrinsic::arm_neon_vld1x4: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded.
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // Here the last argument is used as the pointer and the alignment is 0.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    // volatile loads with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOLoad;
	    return true;
	  }
	  case Intrinsic::arm_neon_vst1:
	  case Intrinsic::arm_neon_vst2:
	  case Intrinsic::arm_neon_vst3:
	  case Intrinsic::arm_neon_vst4:
	  case Intrinsic::arm_neon_vst2lane:
	  case Intrinsic::arm_neon_vst3lane:
	  case Intrinsic::arm_neon_vst4lane: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored.
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    unsigned NumElts = 0;
	    // Sum the sizes of the vector arguments that follow the pointer,
	    // stopping at the first non-vector argument.
	    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
	    // volatile stores with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOStore;
	    return true;
	  }
	  case Intrinsic::arm_neon_vst1x2:
	  case Intrinsic::arm_neon_vst1x3:
	  case Intrinsic::arm_neon_vst1x4: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored.
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    unsigned NumElts = 0;
	    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 0;
	    // volatile stores with NEON intrinsics not supported
	    Info.flags = MachineMemOperand::MOStore;
	    return true;
	  }
	  case Intrinsic::arm_ldaex:
	  case Intrinsic::arm_ldrex: {
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::arm_stlex:
	  case Intrinsic::arm_strex: {
	    auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
	    // For the store forms the pointer is argument 1.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;
	  }
	  case Intrinsic::arm_stlexd:
	  case Intrinsic::arm_strexd:
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i64;
	    // For the 64-bit store forms the pointer is argument 2.
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = 8;
	    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
	    return true;

	  case Intrinsic::arm_ldaexd:
	  case Intrinsic::arm_ldrexd:
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i64;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 8;
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
	    return true;

	  default:
	    break;
	  }

	  return false;
	}

	/// Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// The answer depends only on \p Ty (asserted to be an integer type): it is
	/// true when the type's primitive size is between 1 and 32 bits. \p Imm is
	/// not inspected.
	bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned Bits = Ty->getPrimitiveSizeInBits();
	  if (Bits == 0 || Bits > 32)
	    return false;
	  return true;
	}

	/// Returns true when EXTRACT_SUBVECTOR is legal or custom for \p ResVT and
	/// \p Index is either 0 or the element count of \p ResVT. \p SrcVT is not
	/// inspected.
	bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  return (Index == 0 || Index == ResVT.getVectorNumElements());
	}

	/// Emit a data memory barrier at the builder's insertion point.
	///
	/// With a native data barrier this is a call to the arm_dmb intrinsic with
	/// \p Domain (forced to SY on M-class). Without one, v6 non-Thumb targets get
	/// a call to the arm_mcr intrinsic; any other target is unreachable.
	Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
	                                        ARM_MB::MemBOpt Domain) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

	  if (Subtarget->hasDataBarrier()) {
	    Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
	    // Only a full system barrier exists in the M-class architectures.
	    if (Subtarget->isMClass())
	      Domain = ARM_MB::SY;
	    Constant *CDomain = Builder.getInt32(Domain);
	    return Builder.CreateCall(DMB, CDomain);
	  }

	  // The target has no DMB, see what fallback we can use.
	  // Thumb1 and pre-v6 ARM mode use a libcall instead of barriers for atomic
	  // accesses and should never get here.
	  if (!Subtarget->hasV6Ops() || Subtarget->isThumb())
	    llvm_unreachable("makeDMB on a target so old that it has no barriers");

	  // Some ARMv6 cpus can support data barriers with an mcr instruction.
	  Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
	  Value *McrArgs[6] = {Builder.getInt32(15), Builder.getInt32(0),
	                       Builder.getInt32(0),  Builder.getInt32(7),
	                       Builder.getInt32(10), Builder.getInt32(5)};
	  return Builder.CreateCall(MCR, McrArgs);
	}

	// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
	//
	// Emits the barrier that precedes an atomic operation with ordering Ord, or
	// returns nullptr when none is needed. Non-atomic and unordered orderings
	// are unreachable.
	Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
	                                                 Instruction *Inst,
	                                                 AtomicOrdering Ord) const {
	  switch (Ord) {
	  case AtomicOrdering::NotAtomic:
	  case AtomicOrdering::Unordered:
	    llvm_unreachable("Invalid fence: unordered/non-atomic");
	  case AtomicOrdering::Monotonic:
	  case AtomicOrdering::Acquire:
	    return nullptr; // Nothing to do
	  case AtomicOrdering::SequentiallyConsistent:
	    // Only instructions with an atomic store get a leading barrier.
	    if (!Inst->hasAtomicStore())
	      return nullptr; // Nothing to do
	    LLVM_FALLTHROUGH;
	  case AtomicOrdering::Release:
	  case AtomicOrdering::AcquireRelease: {
	    // FIXME: add a comment with a link to documentation justifying this.
	    const bool UseISHST = Subtarget->preferISHSTBarriers();
	    return makeDMB(Builder, UseISHST ? ARM_MB::ISHST : ARM_MB::ISH);
	  }
	  }
	  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
	}

	/// Emit the barrier that follows an atomic operation with ordering \p Ord.
	/// Acquire, acquire-release and sequentially-consistent orderings get an ISH
	/// barrier; monotonic and release get none (nullptr). Non-atomic and
	/// unordered orderings are unreachable. \p Inst is not inspected.
	Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
	                                                  Instruction *Inst,
	                                                  AtomicOrdering Ord) const {
	  switch (Ord) {
	  case AtomicOrdering::Acquire:
	  case AtomicOrdering::AcquireRelease:
	  case AtomicOrdering::SequentiallyConsistent:
	    return makeDMB(Builder, ARM_MB::ISH);
	  case AtomicOrdering::Monotonic:
	  case AtomicOrdering::Release:
	    return nullptr; // Nothing to do
	  case AtomicOrdering::NotAtomic:
	  case AtomicOrdering::Unordered:
	    llvm_unreachable("Invalid fence: unordered/not-atomic");
	  }
	  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
	}

	// Loads and stores less than 64-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
	// anything for those.
	//
	// Net effect: only 64-bit stores on non-M-class subtargets are expanded.
	bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  if (Subtarget->isMClass())
	    return false;
	  Type *StoredTy = SI->getValueOperand()->getType();
	  return StoredTy->getPrimitiveSizeInBits() == 64;
	}

	// Loads and stores less than 64-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
	// anything for those.
	// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
	// guarantee, see DDI0406C ARM architecture reference manual,
	// sections A8.8.72-74 LDRD)
	//
	// Net effect: 64-bit loads on non-M-class subtargets use LLOnly expansion,
	// everything else is left alone.
	TargetLowering::AtomicExpansionKind
	ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  const bool Is64Bit = LI->getType()->getPrimitiveSizeInBits() == 64;
	  if (Is64Bit && !Subtarget->isMClass())
	    return AtomicExpansionKind::LLOnly;
	  return AtomicExpansionKind::None;
	}

	// For the real atomic operations, we have ldrex/strex up to 32 bits,
	// and up to 64 bits on the non-M profiles
	//
	// Floating-point RMW operations are always expanded via cmpxchg. Integer
	// operations use LL/SC when the size fits (32 bits on M-class, 64 bits
	// otherwise) and the subtarget is either not Thumb or has the v8-M baseline
	// ops; otherwise no expansion is requested.
	TargetLowering::AtomicExpansionKind
	ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  if (AI->isFloatingPointOperation())
	    return AtomicExpansionKind::CmpXChg;

	  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
	  bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
	  return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
	             ? AtomicExpansionKind::LLSC
	             : AtomicExpansionKind::None;
	}

	/// Choose how cmpxchg is expanded: LL/SC when optimising and the subtarget is
	/// either not Thumb or has the v8-M baseline ops, no IR expansion otherwise.
	/// \p AI itself is not inspected.
	TargetLowering::AtomicExpansionKind
	ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
	  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
	  // implement cmpxchg without spilling. If the address being exchanged is also
	  // on the stack and close enough to the spill slot, this can lead to a
	  // situation where the monitor always gets cleared and the atomic operation
	  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
	  bool HasAtomicCmpXchg =
	      !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
	  if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
	    return AtomicExpansionKind::LLSC;
	  return AtomicExpansionKind::None;
	}

	/// Reports the value of InsertFencesForAtomic; the answer is the same for
	/// every instruction.
	bool ARMTargetLowering::shouldInsertFencesForAtomic(
	    const Instruction *I) const {
	  (void)I; // The instruction is not consulted.
	  return InsertFencesForAtomic;
	}

	// The load-stack-guard node is used only on MachO targets; it has so far
	// only been implemented for MachO.
	bool ARMTargetLowering::useLoadStackGuardNode() const {
	  const bool IsMachO = Subtarget->isTargetMachO();
	  return IsMachO;
	}

	/// Insert the stack-protector declarations into \p M.
	///
	/// In the Windows MSVC environment this declares the "__security_cookie"
	/// global and the "__security_check_cookie" function; every other
	/// environment uses the generic TargetLowering declarations.
	void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
	  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
	    TargetLowering::insertSSPDeclarations(M);
	    return;
	  }

	  LLVMContext &Ctx = M.getContext();

	  // MSVC CRT has a global variable holding security cookie.
	  M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	  // MSVC CRT has a function to validate security cookie.
	  FunctionCallee SecurityCheckCookie =
	      M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(Ctx),
	                            Type::getInt8PtrTy(Ctx));
	  // When the callee is a plain Function, attribute index 1 gets InReg.
	  Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee());
	  if (F)
	    F->addAttribute(1, Attribute::AttrKind::InReg);
	}

	/// Return the stack guard value for \p M: the "__security_cookie" global in
	/// the Windows MSVC environment, the generic TargetLowering guard otherwise.
	Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
	  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	/// Return the stack guard check function for \p M: "__security_check_cookie"
	/// in the Windows MSVC environment, the generic TargetLowering answer
	/// otherwise.
	Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	bool ARMTargetLowering::canCombineStoreAndExtract(Type VectorTy, Value Idx,
	unsigned &Cost) const {
	// If we do not have NEON, vector types are not natively supported.
	if (!Subtarget->hasNEON())
	return false;

	// Floating point values and vector values map to the same register file.
	// Therefore, although we could do a store extract of a vector type, this is
	// better to leave at float as we have more freedom in the addressing mode for
	// those.
	if (VectorTy->isFPOrFPVectorTy())
	return false;

	// If the index is unknown at compile time, this is very expensive to lower
	// and it is not possible to combine the store with the extract.
	if (!isa<ConstantInt>(Idx))
	return false;

	assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
	unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
	// We can do a store + vector extract on any vector that fits perfectly in a D
	// or Q register.
	if (BitWidth == 64 \|\| BitWidth == 128) {
	Cost = 0;
	return true;
	}
	return false;
	}

	/// Speculating cttz is reported as cheap exactly when the subtarget has
	/// V6T2 operations.
	bool ARMTargetLowering::isCheapToSpeculateCttz() const {
	  const bool HasV6T2 = Subtarget->hasV6T2Ops();
	  return HasV6T2;
	}

	/// Speculating ctlz is reported as cheap exactly when the subtarget has
	/// V6T2 operations.
	bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
	  const bool HasV6T2 = Subtarget->hasV6T2Ops();
	  return HasV6T2;
	}

	/// Shifts are expanded unless the subtarget reports that it is optimising
	/// for minimum size. \p DAG and \p N are not consulted.
	bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
	  if (Subtarget->hasMinSize())
	    return false;
	  return true;
	}

	/// Emit a load-linked (exclusive load) of the value \p Addr points to.
	///
	/// \param Builder IR builder positioned where the load is emitted.
	/// \param Addr    Pointer to the value to load.
	/// \param Ord     Atomic ordering; acquire-or-stronger selects the
	///                ldaex/ldaexd intrinsics instead of ldrex/ldrexd.
	/// \returns the loaded value, converted to the pointee type of \p Addr.
	Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                                         AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
	  bool IsAcquire = isAcquireOrStronger(Ord);

	  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
	  // intrinsic must return {i32, i32} and we have to recombine them into a
	  // single i64 here.
	  if (ValTy->getPrimitiveSizeInBits() == 64) {
	    Intrinsic::ID Int =
	        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
	    Function *Ldrex = Intrinsic::getDeclaration(M, Int);

	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");

	    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
	    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
	    // On big-endian subtargets the two halves come back in the other order.
	    if (!Subtarget->isLittle())
	      std::swap(Lo, Hi);
	    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
	    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
	    return Builder.CreateOr(
	        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
	  }

	  Type *Tys[] = { Addr->getType() };
	  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
	  Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);

	  // Addr is not modified on this path, so ValTy is still its pointee type.
	  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldrex, Addr), ValTy);
	}

	/// Emit a call to the arm_clrex intrinsic at the builder's insertion point.
	/// Nothing is emitted when the subtarget lacks v7 operations.
	void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
	    IRBuilder<> &Builder) const {
	  if (Subtarget->hasV7Ops()) {
	    Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
	    Builder.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::arm_clrex));
	  }
	}

	/// Emit a store-conditional (exclusive store) of \p Val to \p Addr.
	///
	/// \param Builder IR builder positioned where the store is emitted.
	/// \param Val     Value to store.
	/// \param Addr    Destination pointer.
	/// \param Ord     Atomic ordering; release-or-stronger selects the
	///                stlex/stlexd intrinsics instead of strex/strexd.
	/// \returns the call to the store-exclusive intrinsic.
	Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
	                                               Value *Addr,
	                                               AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  bool IsRelease = isReleaseOrStronger(Ord);

	  // Since the intrinsics must have legal type, the i64 intrinsics take two
	  // parameters: "i32, i32". We must marshal Val into the appropriate form
	  // before the call.
	  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
	    Intrinsic::ID Int =
	        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
	    Function *Strex = Intrinsic::getDeclaration(M, Int);
	    Type *Int32Ty = Type::getInt32Ty(M->getContext());

	    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
	    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
	    // On big-endian subtargets the two halves are passed in the other order.
	    if (!Subtarget->isLittle())
	      std::swap(Lo, Hi);
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
	  }

	  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
	  Type *Tys[] = { Addr->getType() };
	  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);

	  // Widen (or bitcast) Val to the type of the intrinsic's first parameter.
	  return Builder.CreateCall(
	      Strex, {Builder.CreateZExtOrBitCast(
	                  Val, Strex->getFunctionType()->getParamType(0)),
	              Addr});
	}


	/// Returns true exactly when the subtarget is M-class.
	bool ARMTargetLowering::alignLoopsWithOptSize() const {
	  const bool IsMClass = Subtarget->isMClass();
	  return IsMClass;
	}

	/// Compute how many interleaved accesses are generated when lowering an
	/// access of type \p VecTy: its size in bits under \p DL divided by 128,
	/// rounded up.
	unsigned
	ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
	                                             const DataLayout &DL) const {
	  const auto SizeInBits = DL.getTypeSizeInBits(VecTy);
	  return (SizeInBits + 127) / 128;
	}

	/// Check whether \p VecTy can be used as the sub-vector type of an
	/// interleaved access.
	///
	/// \param VecTy Vector type to check.
	/// \param DL    Data layout used to size the vector and its elements.
	/// \returns true when the element type is not half, there are at least two
	///          elements of 8, 16 or 32 bits, and the total size is 64 bits or a
	///          multiple of 128 bits.
	bool ARMTargetLowering::isLegalInterleavedAccessType(
	    VectorType *VecTy, const DataLayout &DL) const {

	  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
	  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

	  // Ensure the vector doesn't have f16 elements. Even though we could do an
	  // i16 vldN, we can't hold the f16 vectors and will end up converting via
	  // f32.
	  if (VecTy->getElementType()->isHalfTy())
	    return false;

	  // Ensure the number of vector elements is greater than 1.
	  if (VecTy->getNumElements() < 2)
	    return false;

	  // Ensure the element type is legal.
	  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
	    return false;

	  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
	  // 128 will be split into multiple interleaved accesses.
	  return VecSize == 64 || VecSize % 128 == 0;
	}

	/// Lower an interleaved load into a vldN intrinsic.
	///
	/// E.g. Lower an interleaved load (Factor = 2):
	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
	/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
	/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
	///
	/// Into:
	/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
	/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
	/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
	bool ARMTargetLowering::lowerInterleavedLoad(
	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
	ArrayRef<unsigned> Indices, unsigned Factor) const {
	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	"Invalid interleave factor");
	assert(!Shuffles.empty() && "Empty shufflevector input");
	assert(Shuffles.size() == Indices.size() &&
	"Unmatched number of shufflevectors and indices");

	VectorType *VecTy = Shuffles[0]->getType();
	Type *EltTy = VecTy->getVectorElementType();

	const DataLayout &DL = LI->getModule()->getDataLayout();

	// Skip if we do not have NEON and skip illegal vector types. We can
	// "legalize" wide vector types into multiple interleaved accesses as long as
	// the vector types are divisible by 128.
	if (!Subtarget->hasNEON() \|\| !isLegalInterleavedAccessType(VecTy, DL))
	return false;

	unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

	// A pointer vector can not be the return type of the ldN intrinsics. Need to
	// load integer vectors first and then convert to pointer vectors.
	if (EltTy->isPointerTy())
	VecTy =
	VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

	IRBuilder<> Builder(LI);

	// The base address of the load.
	Value *BaseAddr = LI->getPointerOperand();

	if (NumLoads > 1) {
	// If we're going to generate more than one load, reset the sub-vector type
	// to something legal.
	VecTy = VectorType::get(VecTy->getVectorElementType(),
	VecTy->getVectorNumElements() / NumLoads);

	// We will compute the pointer operand of each load from the original base
	// address using GEPs. Cast the base address to a pointer to the scalar
	// element type.
	BaseAddr = Builder.CreateBitCast(
	BaseAddr, VecTy->getVectorElementType()->getPointerTo(
	LI->getPointerAddressSpace()));
	}

	assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");

	Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
	Type *Tys[] = {VecTy, Int8Ptr};
	static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
	Intrinsic::arm_neon_vld3,
	Intrinsic::arm_neon_vld4};
	Function *VldnFunc =
	Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

	// Holds sub-vectors extracted from the load intrinsic return values. The
	// sub-vectors are associated with the shufflevector instructions they will
	// replace.
	DenseMap<ShuffleVectorInst , SmallVector<Value , 4>> SubVecs;

	for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
	// If we're generating more than one load, compute the base address of
	// subsequent loads as an offset from the previous.
	if (LoadCount > 0)
	BaseAddr =
	Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
	VecTy->getVectorNumElements() * Factor);

	SmallVector<Value *, 2> Ops;
	Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
	Ops.push_back(Builder.getInt32(LI->getAlignment()));

	CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

	// Replace uses of each shufflevector with the corresponding vector loaded
	// by ldN.
	for (unsigned i = 0; i < Shuffles.size(); i++) {
	ShuffleVectorInst *SV = Shuffles[i];
	unsigned Index = Indices[i];

	Value *SubVec = Builder.CreateExtractValue(VldN, Index);

	// Convert the integer vector to pointer vector if the element is pointer.
	if (EltTy->isPointerTy())
	SubVec = Builder.CreateIntToPtr(
	SubVec, VectorType::get(SV->getType()->getVectorElementType(),
	VecTy->getVectorNumElements()));

	SubVecs[SV].push_back(SubVec);
	}
	}

	// Replace uses of the shufflevector instructions with the sub-vectors
	// returned by the load intrinsic. If a shufflevector instruction is
	// associated with more than one sub-vector, those sub-vectors will be
	// concatenated into a single wide vector.
	for (ShuffleVectorInst *SVI : Shuffles) {
	auto &SubVec = SubVecs[SVI];
	auto *WideVec =
	SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
	SVI->replaceAllUsesWith(WideVec);
	}

	return true;
	}

	/// Lower an interleaved store into a vstN intrinsic.
	///
	/// E.g. Lower an interleaved store (Factor = 3):
	/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
	/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
	///
	/// Into:
	/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
	/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
	/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
	///
	/// Note that the new shufflevectors will be removed and we'll only generate one
	/// vst3 instruction in CodeGen.
	///
	/// Example for a more general valid mask (Factor 3). Lower:
	/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
	/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	/// Into:
	/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
	/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
	/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
	///
	/// \param SI     The wide store being lowered.
	/// \param SVI    The shufflevector that interleaves the stored value.
	/// \param Factor Interleave factor; selects vst2, vst3 or vst4.
	/// \returns false (emitting nothing) without NEON or for an illegal
	///          sub-vector type; true once the vstN calls have been emitted.
	bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
	                                              ShuffleVectorInst *SVI,
	                                              unsigned Factor) const {
	  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	         "Invalid interleave factor");

	  VectorType *VecTy = SVI->getType();
	  assert(VecTy->getVectorNumElements() % Factor == 0 &&
	         "Invalid interleaved store");

	  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
	  Type *EltTy = VecTy->getVectorElementType();
	  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

	  const DataLayout &DL = SI->getModule()->getDataLayout();

	  // Skip if we do not have NEON and skip illegal vector types. We can
	  // "legalize" wide vector types into multiple interleaved accesses as long as
	  // the vector types are divisible by 128.
	  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
	    return false;

	  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

	  Value *Op0 = SVI->getOperand(0);
	  Value *Op1 = SVI->getOperand(1);
	  IRBuilder<> Builder(SI);

	  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
	  // vectors to integer vectors.
	  if (EltTy->isPointerTy()) {
	    Type *IntTy = DL.getIntPtrType(EltTy);

	    // Convert to the corresponding integer vector.
	    Type *IntVecTy =
	        VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
	    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
	    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

	    SubVecTy = VectorType::get(IntTy, LaneLen);
	  }

	  // The base address of the store.
	  Value *BaseAddr = SI->getPointerOperand();

	  if (NumStores > 1) {
	    // If we're going to generate more than one store, reset the lane length
	    // and sub-vector type to something legal.
	    LaneLen /= NumStores;
	    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

	    // We will compute the pointer operand of each store from the original base
	    // address using GEPs. Cast the base address to a pointer to the scalar
	    // element type.
	    BaseAddr = Builder.CreateBitCast(
	        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
	                      SI->getPointerAddressSpace()));
	  }

	  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");

	  auto Mask = SVI->getShuffleMask();

	  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
	  Type *Tys[] = {Int8Ptr, SubVecTy};
	  // Indexed by Factor - 2.
	  static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
	                                             Intrinsic::arm_neon_vst3,
	                                             Intrinsic::arm_neon_vst4};

	  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
	    // If we are generating more than one store, we compute the base address
	    // of subsequent stores as an offset from the previous.
	    if (StoreCount > 0)
	      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
	                                            BaseAddr, LaneLen * Factor);

	    SmallVector<Value *, 6> Ops;
	    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));

	    Function *VstNFunc =
	        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

	    // Split the shufflevector operands into sub vectors for the new vstN call.
	    for (unsigned i = 0; i < Factor; i++) {
	      // Mask position of the first element of field i in this store's chunk.
	      unsigned IdxI = StoreCount * LaneLen * Factor + i;
	      if (Mask[IdxI] >= 0) {
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
	      } else {
	        unsigned StartMask = 0;
	        for (unsigned j = 1; j < LaneLen; j++) {
	          // Element j of field i sits j * Factor positions after IdxI. IdxI
	          // already carries the chunk offset, so it must not be added (or
	          // scaled by Factor) a second time; doing so reads past the end of
	          // Mask whenever StoreCount > 0.
	          unsigned IdxJ = IdxI + j * Factor;
	          if (Mask[IdxJ] >= 0) {
	            // Element j holds Start + j, so the field starts j earlier.
	            StartMask = Mask[IdxJ] - j;
	            break;
	          }
	        }
	        // Note: If all elements in a chunk are undefs, StartMask=0!
	        // Note: Filling undef gaps with random elements is ok, since
	        // those elements were being written anyway (with undefs).
	        // In the case of all undefs we're defaulting to using elems from 0
	        // Note: StartMask cannot be negative, it's checked in
	        // isReInterleaveMask
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
	      }
	    }

	    Ops.push_back(Builder.getInt32(SI->getAlignment()));
	    Builder.CreateCall(VstNFunc, Ops);
	  }
	  return true;
	}

	/// Base element kind recorded while classifying a homogeneous aggregate.
	enum HABaseType {
	  HA_UNKNOWN = 0, ///< No base kind has been fixed yet.
	  HA_FLOAT = 1,   ///< Float members.
	  HA_DOUBLE = 2,  ///< Double members.
	  HA_VECT64 = 3,  ///< 64-bit vector members.
	  HA_VECT128 = 4  ///< 128-bit vector members.
	};

	/// Recursively classify \p Ty as a homogeneous aggregate.
	///
	/// \param Ty      Type to inspect.
	/// \param Base    In/out base kind: HA_UNKNOWN until a leaf fixes it, after
	///                which every further leaf must agree with it.
	/// \param Members In/out member count: increased by the members found in
	///                structs and arrays, set to 1 by a float, double or vector.
	/// \returns false on a base-kind mismatch or an unsupported vector width;
	///          otherwise whether \p Members lies in [1, 4] (vectors with a
	///          matching or newly fixed base return true directly).
	static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
	                                   uint64_t &Members) {
	  const auto MemberCountOk = [&Members] {
	    return Members > 0 && Members <= 4;
	  };

	  if (auto *StructTy = dyn_cast<StructType>(Ty)) {
	    // Accumulate the members contributed by every field in turn.
	    for (unsigned Field = 0; Field < StructTy->getNumElements(); ++Field) {
	      uint64_t FieldMembers = 0;
	      if (!isHomogeneousAggregate(StructTy->getElementType(Field), Base,
	                                  FieldMembers))
	        return false;
	      Members += FieldMembers;
	    }
	    return MemberCountOk();
	  }

	  if (auto *ArrayTy = dyn_cast<ArrayType>(Ty)) {
	    // Classify one element, then scale by the element count.
	    uint64_t ElementMembers = 0;
	    if (!isHomogeneousAggregate(ArrayTy->getElementType(), Base,
	                                ElementMembers))
	      return false;
	    Members += ElementMembers * ArrayTy->getNumElements();
	    return MemberCountOk();
	  }

	  if (Ty->isFloatTy()) {
	    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
	      return false;
	    Members = 1;
	    Base = HA_FLOAT;
	    return MemberCountOk();
	  }

	  if (Ty->isDoubleTy()) {
	    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
	      return false;
	    Members = 1;
	    Base = HA_DOUBLE;
	    return MemberCountOk();
	  }

	  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
	    Members = 1;
	    // A vector never mixes with a scalar floating-point base.
	    if (Base == HA_FLOAT || Base == HA_DOUBLE)
	      return false;
	    if (Base == HA_VECT64)
	      return VecTy->getBitWidth() == 64;
	    if (Base == HA_VECT128)
	      return VecTy->getBitWidth() == 128;
	    if (Base == HA_UNKNOWN) {
	      // First leaf seen: the vector width decides the base kind.
	      const auto Bits = VecTy->getBitWidth();
	      if (Bits == 64) {
	        Base = HA_VECT64;
	        return true;
	      }
	      if (Bits == 128) {
	        Base = HA_VECT128;
	        return true;
	      }
	      return false;
	    }
	  }

	  // Any other type leaves Members untouched.
	  return MemberCountOk();
	}

	/// Return the alignment to use for an argument of type \p ArgTy: the ABI
	/// type alignment from \p DL, capped at the stack alignment for vectors.
	unsigned
	ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
	                                                 DataLayout DL) const {
	  const auto ABIAlign = DL.getABITypeAlignment(ArgTy);
	  if (ArgTy->isVectorTy()) {
	    // Avoid over-aligning vector parameters. It would require realigning the
	    // stack and waste space for no real benefit.
	    return std::min(ABIAlign, DL.getStackAlignment());
	  }
	  return ABIAlign;
	}

	/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
	/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
	/// passing according to AAPCS rules.
	///
	/// \param Ty       Argument type to classify.
	/// \param CallConv Calling convention of the call.
	/// \param isVarArg Whether the callee is variadic.
	/// \returns false unless the effective calling convention is ARM_AAPCS_VFP;
	///          otherwise true for a homogeneous aggregate or an array whose
	///          element type is an integer.
	bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
	    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	  if (getEffectiveCallingConv(CallConv, isVarArg) !=
	      CallingConv::ARM_AAPCS_VFP)
	    return false;

	  HABaseType Base = HA_UNKNOWN;
	  uint64_t Members = 0;
	  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
	  LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());

	  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
	  return IsHA || IsIntArray;
	}

	/// Return the register carrying the exception pointer: ARM::NoRegister when
	/// the subtarget uses SjLj EH, ARM::R0 otherwise. \p PersonalityFn is not
	/// consulted.
	unsigned ARMTargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  // Platforms which do not use SjLj EH may return values in these registers
	  // via the personality function.
	  if (Subtarget->useSjLjEH())
	    return ARM::NoRegister;
	  return ARM::R0;
	}

	/// Return the register carrying the exception selector: ARM::NoRegister
	/// when the subtarget uses SjLj EH, ARM::R1 otherwise. \p PersonalityFn is
	/// not consulted.
	unsigned ARMTargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Platforms which do not use SjLj EH may return values in these registers
	  // via the personality function.
	  if (Subtarget->useSjLjEH())
	    return ARM::NoRegister;
	  return ARM::R1;
	}

	/// Mark the function containing \p Entry as using split callee-saved
	/// registers by setting IsSplitCSR in its ARMFunctionInfo.
	void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  Entry->getParent()->getInfo<ARMFunctionInfo>()->setIsSplitCSR(true);
	}

	/// Insert the copies that save and restore callee-saved registers handled
	/// via copy.
	///
	/// For each register in the zero-terminated list returned by
	/// getCalleeSavedRegsViaCopy, a COPY into a fresh virtual register is
	/// inserted at the start of \p Entry and a COPY back is inserted before the
	/// first terminator of every block in \p Exits.
	///
	/// \param Entry Entry block of the function.
	/// \param Exits Exit blocks that receive the copy-back instructions.
	void ARMTargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  // Nothing to do when no registers are saved via copy.
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list until its zero terminator.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (ARM::GPRRegClass.contains(*I))
	      RC = &ARM::GPRRegClass;
	    else if (ARM::DPRRegClass.contains(*I))
	      RC = &ARM::DPRRegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Compute the maximum call frame size for \p MF, then run the base-class
	/// finalization.
	void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
	  auto &FrameInfo = MF.getFrameInfo();
	  FrameInfo.computeMaxCallFrameSize(MF);
	  TargetLoweringBase::finalizeLowering(MF);
	}
	Index: vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/ARM/ARMInstrThumb.td (revision 351303)
	@@ -1,1736 +1,1737 @@
	//===-- ARMInstrThumb.td - Thumb support for ARM ------------ tablegen --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the Thumb instruction set.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Thumb specific DAG Nodes.
	//

	// Transform used by imm_sr: an immediate of 32 is emitted as the target
	// constant 0; every other value is passed through unchanged.
	def imm_sr_XFORM: SDNodeXForm<imm, [{
	unsigned Imm = N->getZExtValue();
	return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
	}]>;
	// Assembly operand class named "ImmThumbSR", built from ImmAsmOperand<1,32>.
	def ThumbSRImmAsmOperand: ImmAsmOperand<1,32> { let Name = "ImmThumbSR"; }
	// i32 immediate operand that matches only when 0 < Imm <= 32; the matched
	// value is rewritten by imm_sr_XFORM.
	def imm_sr : Operand<i32>, PatLeaf<(imm), [{
	uint64_t Imm = N->getZExtValue();
	return Imm > 0 && Imm <= 32;
	}], imm_sr_XFORM> {
	let PrintMethod = "printThumbSRImm";
	let ParserMatchClass = ThumbSRImmAsmOperand;
	}

	// Matches an i32 immediate whose negation, taken as a 32-bit unsigned
	// value, is less than 8. The matched value is rewritten by imm_neg_XFORM.
	def imm0_7_neg : PatLeaf<(i32 imm), [{
	return (uint32_t)-N->getZExtValue() < 8;
	}], imm_neg_XFORM>;

	// Assembly operand class named "ThumbModImmNeg1_7".
	def ThumbModImmNeg1_7AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg1_7"; }
	// i32 immediate operand that matches when the unsigned negation of the
	// immediate lies strictly between 0 and 8; rewritten by imm_neg_XFORM.
	def mod_imm1_7_neg : Operand<i32>, PatLeaf<(imm), [{
	unsigned Value = -(unsigned)N->getZExtValue();
	return 0 < Value && Value < 8;
	}], imm_neg_XFORM> {
	let ParserMatchClass = ThumbModImmNeg1_7AsmOperand;
	}

	// Assembly operand class named "ThumbModImmNeg8_255".
	def ThumbModImmNeg8_255AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg8_255"; }
	// i32 immediate operand that matches when the unsigned negation of the
	// immediate lies strictly between 7 and 256; rewritten by imm_neg_XFORM.
	def mod_imm8_255_neg : Operand<i32>, PatLeaf<(imm), [{
	unsigned Value = -(unsigned)N->getZExtValue();
	return 7 < Value && Value < 256;
	}], imm_neg_XFORM> {
	let ParserMatchClass = ThumbModImmNeg8_255AsmOperand;
	}


	// Matches an i32 immediate whose 32-bit bitwise complement is less than 256.
	def imm0_255_comp : PatLeaf<(i32 imm), [{
	return ~((uint32_t)N->getZExtValue()) < 256;
	}]>;

	// Matches an i32 immediate whose negation (truncated to unsigned) is at
	// least 8 and less than 256; rewritten by imm_neg_XFORM.
	def imm8_255_neg : PatLeaf<(i32 imm), [{
	unsigned Val = -N->getZExtValue();
	return Val >= 8 && Val < 256;
	}], imm_neg_XFORM>;

	// Break imm's up into two pieces: an immediate + a left shift. This uses
	// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
	// to get the val/shift pieces.
	// Predicate: accepts immediates for which ARM_AM::isThumbImmShiftedVal holds.
	def thumb_immshifted : PatLeaf<(imm), [{
	return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
	}]>;

	// Produces the value piece via ARM_AM::getThumbImmNonShiftedVal.
	def thumb_immshifted_val : SDNodeXForm<imm, [{
	unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
	return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
	}]>;

	// Produces the shift-amount piece via ARM_AM::getThumbImmValShift.
	def thumb_immshifted_shamt : SDNodeXForm<imm, [{
	unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
	return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);
	}]>;

	// Matches i32 immediates in the inclusive range 256..510.
	def imm256_510 : ImmLeaf<i32, [{
	return Imm >= 256 && Imm < 511;
	}]>;

	// Emits the immediate minus 255 as an i32 target constant.
	def thumb_imm256_510_addend : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(N->getZExtValue() - 255, SDLoc(N), MVT::i32);
	}]>;

	// Scaled 4 immediate.
	// Operand class "Imm0_1020s4" and the i32 operand that uses it; printed with
	// printThumbS4ImmOperand.
	def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
	def t_imm0_1020s4 : Operand<i32> {
	let PrintMethod = "printThumbS4ImmOperand";
	let ParserMatchClass = t_imm0_1020s4_asmoperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// Operand class "Imm0_508s4" and the i32 operand that uses it; printed with
	// printThumbS4ImmOperand.
	def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
	def t_imm0_508s4 : Operand<i32> {
	let PrintMethod = "printThumbS4ImmOperand";
	let ParserMatchClass = t_imm0_508s4_asmoperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}
	// Alias use only, so no printer is necessary.
	// Operand class "Imm0_508s4Neg" and the i32 operand that uses it.
	def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
	def t_imm0_508s4_neg : Operand<i32> {
	let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
	let OperandType = "OPERAND_IMMEDIATE";
	}

	// Define Thumb specific addressing modes.

	// unsigned 8-bit, 2-scaled memory offset
	// The parser validates it with the predicate isUnsignedOffset<8, 2>.
	class OperandUnsignedOffset_b8s2 : AsmOperandClass {
	let Name = "UnsignedOffset_b8s2";
	let PredicateMethod = "isUnsignedOffset<8, 2>";
	}

	// Concrete record of the class above, referenced as a ParserMatchClass.
	def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;

	// thumb style PC relative operand. signed, 8 bits magnitude,
	// two bits shift. can be represented as either [pc, #imm], #imm,
	// or relocatable expression...
	def ThumbMemPC : AsmOperandClass {
	let Name = "ThumbMemPC";
	}

	// Every operand defined inside this block gets OperandType "OPERAND_PCREL".
	let OperandType = "OPERAND_PCREL" in {
	// Branch target operand of type OtherVT with custom encoder and decoder.
	def t_brtarget : Operand<OtherVT> {
	let EncoderMethod = "getThumbBRTargetOpValue";
	let DecoderMethod = "DecodeThumbBROperand";
	}

	// ADR instruction labels.
	def t_adrlabel : Operand<i32> {
	let EncoderMethod = "getThumbAdrLabelOpValue";
	let PrintMethod = "printAdrLabelOperand<2>";
	let ParserMatchClass = UnsignedOffset_b8s2;
	}


	// Branch target parsed as ThumbBranchTarget. It sets OperandType explicitly
	// to the same value the enclosing let already supplies.
	def thumb_br_target : Operand<OtherVT> {
	let ParserMatchClass = ThumbBranchTarget;
	let EncoderMethod = "getThumbBranchTargetOpValue";
	let OperandType = "OPERAND_PCREL";
	}

	// i32 target operand parsed as ThumbBranchTarget, encoded by
	// getThumbBLTargetOpValue.
	def thumb_bl_target : Operand<i32> {
	let ParserMatchClass = ThumbBranchTarget;
	let EncoderMethod = "getThumbBLTargetOpValue";
	let DecoderMethod = "DecodeThumbBLTargetOperand";
	}

	// Target for BLX from thumb mode.
	def thumb_blx_target : Operand<i32> {
	let ParserMatchClass = ARMBranchTarget;
	let EncoderMethod = "getThumbBLXTargetOpValue";
	let DecoderMethod = "DecodeThumbBLXOffset";
	}

	// OtherVT target operand parsed as ThumbBranchTarget, encoded by
	// getThumbBCCTargetOpValue.
	def thumb_bcc_target : Operand<OtherVT> {
	let ParserMatchClass = ThumbBranchTarget;
	let EncoderMethod = "getThumbBCCTargetOpValue";
	let DecoderMethod = "DecodeThumbBCCTargetOperand";
	}

	// OtherVT target operand parsed as ThumbBranchTarget, encoded by
	// getThumbCBTargetOpValue.
	def thumb_cb_target : Operand<OtherVT> {
	let ParserMatchClass = ThumbBranchTarget;
	let EncoderMethod = "getThumbCBTargetOpValue";
	let DecoderMethod = "DecodeThumbCmpBROperand";
	}

	// t_addrmode_pc := <label> => pc + imm8 * 4
	//
	def t_addrmode_pc : MemOperand {
	let EncoderMethod = "getAddrModePCOpValue";
	let DecoderMethod = "DecodeThumbAddrModePC";
	let PrintMethod = "printThumbLdrLabelOperand";
	let ParserMatchClass = ThumbMemPC;
	}
	}

	// t_addrmode_rr := reg + reg
	//
	// Selected by SelectThumbAddrModeRR; the two sub-operands are a tGPR base
	// and a tGPR offset register.
	def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
	def t_addrmode_rr : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
	let EncoderMethod = "getThumbAddrModeRegRegOpValue";
	let PrintMethod = "printThumbAddrModeRROperand";
	let DecoderMethod = "DecodeThumbAddrModeRR";
	let ParserMatchClass = t_addrmode_rr_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
	}

	// t_addrmode_rr_sext := reg + reg
	//
	// This is similar to t_addrmode_rr, but uses different heuristics for
	// ldrsb/ldrsh.
	// Selected by SelectThumbAddrModeRRSext; encoder, printer, decoder and
	// parser class are the same as for t_addrmode_rr.
	def t_addrmode_rr_sext : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeRRSext", []> {
	let EncoderMethod = "getThumbAddrModeRegRegOpValue";
	let PrintMethod = "printThumbAddrModeRROperand";
	let DecoderMethod = "DecodeThumbAddrModeRR";
	let ParserMatchClass = t_addrmode_rr_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
	}

	// t_addrmode_rrs := reg + reg
	//
	// We use separate scaled versions because the Select* functions need
	// to explicitly check for a matching constant and return false here so that
	// the reg+imm forms will match instead. This is a horrible way to do that,
	// as it forces tight coupling between the methods, but it's how selectiondag
	// currently works.
	// The three variants differ only in their selection function
	// (SelectThumbAddrModeRI5S1 / S2 / S4).
	def t_addrmode_rrs1 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
	let EncoderMethod = "getThumbAddrModeRegRegOpValue";
	let PrintMethod = "printThumbAddrModeRROperand";
	let DecoderMethod = "DecodeThumbAddrModeRR";
	let ParserMatchClass = t_addrmode_rr_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
	}
	def t_addrmode_rrs2 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
	let EncoderMethod = "getThumbAddrModeRegRegOpValue";
	let DecoderMethod = "DecodeThumbAddrModeRR";
	let PrintMethod = "printThumbAddrModeRROperand";
	let ParserMatchClass = t_addrmode_rr_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
	}
	def t_addrmode_rrs4 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
	let EncoderMethod = "getThumbAddrModeRegRegOpValue";
	let DecoderMethod = "DecodeThumbAddrModeRR";
	let PrintMethod = "printThumbAddrModeRROperand";
	let ParserMatchClass = t_addrmode_rr_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
	}

	// t_addrmode_is4 := reg + imm5 * 4
	//
	// Selected by SelectThumbAddrModeImm5S4; the two sub-operands are a tGPR
	// base and an i32 immediate offset.
	def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
	def t_addrmode_is4 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
	let EncoderMethod = "getAddrModeISOpValue";
	let DecoderMethod = "DecodeThumbAddrModeIS";
	let PrintMethod = "printThumbAddrModeImm5S4Operand";
	let ParserMatchClass = t_addrmode_is4_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
	}

	// t_addrmode_is2 := reg + imm5 * 2
	//
	// Selected by SelectThumbAddrModeImm5S2; the two sub-operands are a tGPR
	// base and an i32 immediate offset.
	def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
	def t_addrmode_is2 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
	let EncoderMethod = "getAddrModeISOpValue";
	let DecoderMethod = "DecodeThumbAddrModeIS";
	let PrintMethod = "printThumbAddrModeImm5S2Operand";
	let ParserMatchClass = t_addrmode_is2_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
	}

	// t_addrmode_is1 := reg + imm5
	//
	// Selected by SelectThumbAddrModeImm5S1; the two sub-operands are a tGPR
	// base and an i32 immediate offset.
	def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
	def t_addrmode_is1 : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
	let EncoderMethod = "getAddrModeISOpValue";
	let DecoderMethod = "DecodeThumbAddrModeIS";
	let PrintMethod = "printThumbAddrModeImm5S1Operand";
	let ParserMatchClass = t_addrmode_is1_asm_operand;
	let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
	}

	// t_addrmode_sp := sp + imm8 * 4
	//
	// FIXME: This really shouldn't have an explicit SP operand at all. It should
	// be implicit, just like in the instruction encoding itself.
	// Selected by SelectThumbAddrModeSP; the base sub-operand is in class GPR
	// rather than tGPR.
	def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
	def t_addrmode_sp : MemOperand,
	ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
	let EncoderMethod = "getAddrModeThumbSPOpValue";
	let DecoderMethod = "DecodeThumbAddrModeSP";
	let PrintMethod = "printThumbAddrModeSPOperand";
	let ParserMatchClass = t_addrmode_sp_asm_operand;
	let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
	}

	// Inspects parent to determine whether an or instruction can be implemented as
	// an add (i.e. whether we know overflow won't occur in the add).
	// Selected by SelectAddLikeOr; SDNPWantParent passes the parent node to it.
	def AddLikeOrOp : ComplexPattern<i32, 1, "SelectAddLikeOr", [],
	[SDNPWantParent]>;

	// Pattern to exclude immediates from matching
	// It accepts an i32 GPR node only when the node is not a ConstantSDNode.
	def non_imm32 : PatLeaf<(i32 GPR), [{ return !isa<ConstantSDNode>(N); }]>;

	//===----------------------------------------------------------------------===//
	// Miscellaneous Instructions.
	//

	// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
	// from removing one half of the matched pairs. That breaks PEI, which assumes
	// these will always be in pairs, and asserts if it finds otherwise. Better way?
	// Both pseudo-instructions define and use SP and require IsThumb and
	// IsThumb1Only.
	let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
	// Selected from ARMcallseq_end with two immediates.
	def tADJCALLSTACKUP :
	PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
	[(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
	Requires<[IsThumb, IsThumb1Only]>;

	// Selected from ARMcallseq_start with two immediates.
	def tADJCALLSTACKDOWN :
	PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
	[(ARMcallseq_start imm:$amt, imm:$amt2)]>,
	Requires<[IsThumb, IsThumb1Only]>;
	}

	// Encoding helper built on T1Encoding<0b101111>: fixes Inst{9-8} to 0b11 and
	// places the 8-bit opc parameter in Inst{7-0}.
	class T1SystemEncoding<bits<8> opc>
	: T1Encoding<0b101111> {
	let Inst{9-8} = 0b11;
	let Inst{7-0} = opc;
	}

	// "hint" instruction taking a 4-bit immediate, selected from int_arm_hint.
	// It uses T1SystemEncoding<0x00> and overrides Inst{7-4} with the immediate.
	// Requires IsThumb and HasV6M.
	def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
	[(int_arm_hint imm0_15:$imm)]>,
	T1SystemEncoding<0x00>,
	Requires<[IsThumb, HasV6M]> {
	bits<4> imm;
	let Inst{7-4} = imm;
	}

	// Note: When EmitPriority == 1, the alias will be used for printing
	// tInstAlias wrapper that sets the predicates to IsThumb and HasV6M.
	class tHintAlias<string Asm, dag Result, bit EmitPriority = 0> : tInstAlias<Asm, Result, EmitPriority> {
	let Predicates = [IsThumb, HasV6M];
	}

	// Mnemonic aliases for tHINT with immediates 0 through 4.
	def : tHintAlias<"nop$p", (tHINT 0, pred:$p), 1>; // A8.6.110
	def : tHintAlias<"yield$p", (tHINT 1, pred:$p), 1>; // A8.6.410
	def : tHintAlias<"wfe$p", (tHINT 2, pred:$p), 1>; // A8.6.408
	def : tHintAlias<"wfi$p", (tHINT 3, pred:$p), 1>; // A8.6.409
	def : tHintAlias<"sev$p", (tHINT 4, pred:$p), 1>; // A8.6.157
	// sevl (tHINT 5) uses tInstAlias directly with predicates IsThumb2 and HasV8.
	def : tInstAlias<"sevl$p", (tHINT 5, pred:$p), 1> {
	let Predicates = [IsThumb2, HasV8];
	}

	// The imm operand $val can be used by a debugger to store more information
	// about the breakpoint.
	// Encoding: T1Encoding<0b101111>, Inst{9-8} = 0b10, and the 8-bit value in
	// Inst{7-0}. It has no selection pattern.
	def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
	[]>,
	T1Encoding<0b101111> {
	let Inst{9-8} = 0b10;
	// A8.6.22
	bits<8> val;
	let Inst{7-0} = val;
	}
	// default immediate for breakpoint mnemonic
	def : InstAlias<"bkpt", (tBKPT 0), 0>, Requires<[IsThumb]>;

	def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
	[]>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
	let Inst{9-6} = 0b1010;
	bits<6> val;
	let Inst{5-0} = val;
	}

	def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
	[]>, T1Encoding<0b101101>, Requires<[IsThumb, IsNotMClass]>, Deprecated<HasV8Ops> {
	bits<1> end;
	// A8.6.156
	let Inst{9-5} = 0b10010;
	let Inst{4} = 1;
	let Inst{3} = end;
	let Inst{2-0} = 0b000;
	}

	// Change Processor State is a system instruction -- for disassembly only.
	def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
	NoItinerary, "cps$imod $iflags", []>,
	T1Misc<0b0110011> {
	// A8.6.38 & B6.1.1
	bit imod;
	bits<3> iflags;

	let Inst{4} = imod;
	let Inst{3} = 0;
	let Inst{2-0} = iflags;
	let DecoderMethod = "DecodeThumbCPS";
	}

	// For both thumb1 and thumb2.
	let isNotDuplicable = 1, isCodeGenOnly = 1 in
	def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
	[(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
	T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.6
	bits<3> dst;
	let Inst{6-3} = 0b1111; // Rm = pc
	let Inst{2-0} = dst;
	}

	// ADD <Rd>, sp, #<imm8>
	// FIXME: This should not be marked as having side effects, and it should be
	// rematerializable. Clearing the side effect bit causes miscompilations,
	// probably because the instruction can be moved around.
	def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
	IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
	T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
	// A6.2 & A8.6.8
	bits<3> dst;
	bits<8> imm;
	let Inst{10-8} = dst;
	let Inst{7-0} = imm;
	let DecoderMethod = "DecodeThumbAddSpecialReg";
	}

	// Thumb1 frame lowering is rather fragile, we hope to be able to use
	// tADDrSPi, but we may need to insert a sequence that clobbers CPSR.
	def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset),
	NoItinerary, []>,
	Requires<[IsThumb, IsThumb1Only]> {
	let Defs = [CPSR];
	}

	// ADD sp, sp, #<imm7>
	def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
	IIC_iALUi, "add", "\t$Rdn, $imm", []>,
	T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
	// A6.2.5 & A8.6.8
	bits<7> imm;
	let Inst{6-0} = imm;
	let DecoderMethod = "DecodeThumbAddSPImm";
	}

	// SUB sp, sp, #<imm7>
	// FIXME: The encoding and the ASM string don't match up.
	def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
	IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
	T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
	// A6.2.5 & A8.6.214
	bits<7> imm;
	let Inst{6-0} = imm;
	let DecoderMethod = "DecodeThumbAddSPImm";
	}

	def : tInstSubst<"add${p} sp, $imm",
	(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
	def : tInstSubst<"add${p} sp, sp, $imm",
	(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;

	// Can optionally specify SP as a three operand instruction.
	def : tInstAlias<"add${p} sp, sp, $imm",
	(tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
	def : tInstAlias<"sub${p} sp, sp, $imm",
	(tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;

	// ADD <Rm>, sp
	def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
	"add", "\t$Rdn, $sp, $Rn", []>,
	T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.9 Encoding T1
	bits<4> Rdn;
	let Inst{7} = Rdn{3};
	let Inst{6-3} = 0b1101;
	let Inst{2-0} = Rdn{2-0};
	let DecoderMethod = "DecodeThumbAddSPReg";
	}

	// ADD sp, <Rm>
	def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
	"add", "\t$Rdn, $Rm", []>,
	T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.9 Encoding T2
	bits<4> Rm;
	let Inst{7} = 1;
	let Inst{6-3} = Rm;
	let Inst{2-0} = 0b101;
	let DecoderMethod = "DecodeThumbAddSPReg";
	}

	//===----------------------------------------------------------------------===//
	// Control Flow Instructions.
	//

	// Indirect branches
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
	T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
	// A6.2.3 & A8.6.25
	bits<4> Rm;
	let Inst{6-3} = Rm;
	let Inst{2-0} = 0b000;
	let Unpredictable{2-0} = 0b111;
	}
	def tBXNS : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bxns${p}\t$Rm", []>,
	Requires<[IsThumb, Has8MSecExt]>,
	T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
	bits<4> Rm;
	let Inst{6-3} = Rm;
	let Inst{2-0} = 0b100;
	let Unpredictable{1-0} = 0b11;
	}
	}

	let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
	def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
	[(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;

	// Alternative return instruction used by vararg functions.
	def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
	2, IIC_Br, [],
	(tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
	}

	// All calls clobber the non-callee saved registers. SP is marked as a use to
	// prevent stack-pointer assignments that appear immediately before calls from
	// potentially appearing dead.
	let isCall = 1,
	Defs = [LR], Uses = [SP] in {
	// Also used for Thumb2
	def tBL : TIx2<0b11110, 0b11, 1,
	(outs), (ins pred:$p, thumb_bl_target:$func), IIC_Br,
	"bl${p}\t$func",
	[(ARMcall tglobaladdr:$func)]>,
	Requires<[IsThumb]>, Sched<[WriteBrL]> {
	bits<24> func;
	let Inst{26} = func{23};
	let Inst{25-16} = func{20-11};
	let Inst{13} = func{22};
	let Inst{11} = func{21};
	let Inst{10-0} = func{10-0};
	}

	// ARMv5T and above, also used for Thumb2
	def tBLXi : TIx2<0b11110, 0b11, 0,
	(outs), (ins pred:$p, thumb_blx_target:$func), IIC_Br,
	"blx${p}\t$func", []>,
	Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
	bits<24> func;
	let Inst{26} = func{23};
	let Inst{25-16} = func{20-11};
	let Inst{13} = func{22};
	let Inst{11} = func{21};
	let Inst{10-1} = func{10-1};
	let Inst{0} = 0; // func{0} is assumed zero
	}

	// Also used for Thumb2
	def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
	"blx${p}\t$func",
	[(ARMcall GPR:$func)]>,
	Requires<[IsThumb, HasV5T]>,
	T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
	bits<4> func;
	let Inst{6-3} = func;
	let Inst{2-0} = 0b000;
	}

	// ARMv8-M Security Extensions
	def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br,
	"blxns${p}\t$func", []>,
	Requires<[IsThumb, Has8MSecExt]>,
	T1Special<{1,1,1,?}>, Sched<[WriteBrL]> {
	bits<4> func;
	let Inst{6-3} = func;
	let Inst{2-0} = 0b100;
	let Unpredictable{1-0} = 0b11;
	}

	// ARMv4T
	def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
	4, IIC_Br,
	[(ARMcall_nolink tGPR:$func)]>,
	Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
	}

	let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
	let isPredicable = 1 in
	def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
	"b", "\t$target", [(br bb:$target)]>,
	T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
	bits<11> target;
	let Inst{10-0} = target;
	let AsmMatchConverter = "cvtThumbBranches";
	}

	// Far jump
	// Just a pseudo for a tBL instruction. Needed to let regalloc know about
	// the clobber of LR.
	let Defs = [LR] in
	def tBfar : tPseudoExpand<(outs), (ins thumb_bl_target:$target, pred:$p),
	4, IIC_Br, [],
	(tBL pred:$p, thumb_bl_target:$target)>,
	Sched<[WriteBrTbl]>;

	def tBR_JTr : tPseudoInst<(outs),
	(ins tGPR:$target, i32imm:$jt),
	0, IIC_Br,
	[(ARMbrjt tGPR:$target, tjumptable:$jt)]>,
	Sched<[WriteBrTbl]> {
	let Size = 2;
	+ let isNotDuplicable = 1;
	list<Predicate> Predicates = [IsThumb, IsThumb1Only];
	}
	}

	// FIXME: should be able to write a pattern for ARMBrcond, but can't use
	// a two-value operand where a dag node expects two operands. :(
	//
	// Conditional branch. The condition code $p is encoded in Inst{11-8} and the
	// 8-bit branch target in Inst{7-0}. Because of the FIXME above, the selection
	// pattern is kept only as a block comment, so the pattern list is empty.
	let isBranch = 1, isTerminator = 1 in
	def tBcc : T1I<(outs), (ins thumb_bcc_target:$target, pred:$p), IIC_Br,
	"b${p}\t$target",
	[/*(ARMbrcond bb:$target, imm:$cc)*/]>,
	T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
	bits<4> p;
	bits<8> target;
	let Inst{11-8} = p;
	let Inst{7-0} = target;
	let AsmMatchConverter = "cvtThumbBranches";
	}


	// Tail calls
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
	// IOS versions.
	let Uses = [SP] in {
	def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
	4, IIC_Br, [],
	(tBX GPR:$dst, (ops 14, zero_reg))>,
	Requires<[IsThumb]>, Sched<[WriteBr]>;
	}
	// tTAILJMPd: MachO version uses a Thumb2 branch (no Thumb1 tail calls
	// on MachO), so it's in ARMInstrThumb2.td.
	// Non-MachO version:
	let Uses = [SP] in {
	def tTAILJMPdND : tPseudoExpand<(outs),
	(ins t_brtarget:$dst, pred:$p),
	4, IIC_Br, [],
	(tB t_brtarget:$dst, pred:$p)>,
	Requires<[IsThumb, IsNotMachO]>, Sched<[WriteBr]>;
	}
	}


	// A8.6.218 Supervisor Call (Software Interrupt)
	// A8.6.16 B: Encoding T1
	// If Inst{11-8} == 0b1111 then SEE SVC
	let isCall = 1, Uses = [SP] in
	def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
	"svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
	bits<8> imm;
	let Inst{15-12} = 0b1101;
	let Inst{11-8} = 0b1111;
	let Inst{7-0} = imm;
	}

	// The assembler uses 0xDEFE for a trap instruction.
	let isBarrier = 1, isTerminator = 1 in
	def tTRAP : TI<(outs), (ins), IIC_Br,
	"trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
	let Inst = 0xdefe;
	}

	//===----------------------------------------------------------------------===//
	// Load Store Instructions.
	//

	// PC-relative loads need to be matched first as constant pool accesses need to
	// always be PC-relative. We do this using AddedComplexity, as the pattern is
	// simpler than the patterns of the other load instructions.
	let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in
	def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
	"ldr", "\t$Rt, $addr",
	[(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
	T1Encoding<{0,1,0,0,1,?}>, Sched<[WriteLd]> {
	// A6.2 & A8.6.59
	bits<3> Rt;
	bits<8> addr;
	let Inst{10-8} = Rt;
	let Inst{7-0} = addr;
	}

	// SP-relative loads should be matched before standard immediate-offset loads as
	// it means we avoid having to move SP to another register.
	let canFoldAsLoad = 1 in
	def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
	"ldr", "\t$Rt, $addr",
	[(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
	T1LdStSP<{1,?,?}>, Sched<[WriteLd]> {
	bits<3> Rt;
	bits<8> addr;
	let Inst{10-8} = Rt;
	let Inst{7-0} = addr;
	}

	// Loads: reg/reg and reg/imm5
	let canFoldAsLoad = 1, isReMaterializable = 1 in
	multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
	Operand AddrMode_r, Operand AddrMode_i,
	AddrMode am, InstrItinClass itin_r,
	InstrItinClass itin_i, string asm,
	PatFrag opnode> {
	// Immediate-offset loads should be matched before register-offset loads as
	// when the offset is a constant it's simpler to first check if it fits in the
	// immediate offset field then fall back to register-offset if it doesn't.
	def i : // reg/imm5
	T1pILdStEncodeImm<imm_opc, 1 /* Load */,
	(outs tGPR:$Rt), (ins AddrMode_i:$addr),
	am, itin_i, asm, "\t$Rt, $addr",
	[(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
	// Register-offset loads are matched last.
	def r : // reg/reg
	T1pILdStEncode<reg_opc,
	(outs tGPR:$Rt), (ins AddrMode_r:$addr),
	am, itin_r, asm, "\t$Rt, $addr",
	[(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
	}
	// Stores: reg/reg and reg/imm5
	// Counterpart of thumb_ld_rr_ri_enc for stores. Each defm of this multiclass
	// produces two records: `i` (immediate-offset form, encoded with imm_opc) and
	// `r` (register-offset form, encoded with reg_opc). Both take the stored
	// register $Rt and the address as inputs and have no outputs.
	multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
	Operand AddrMode_r, Operand AddrMode_i,
	AddrMode am, InstrItinClass itin_r,
	InstrItinClass itin_i, string asm,
	PatFrag opnode> {
	def i : // reg/imm5
	T1pILdStEncodeImm<imm_opc, 0 /* Store */,
	(outs), (ins tGPR:$Rt, AddrMode_i:$addr),
	am, itin_i, asm, "\t$Rt, $addr",
	[(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
	def r : // reg/reg
	T1pILdStEncode<reg_opc,
	(outs), (ins tGPR:$Rt, AddrMode_r:$addr),
	am, itin_r, asm, "\t$Rt, $addr",
	[(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
	}

	// A8.6.57 & A8.6.60
	defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr,
	t_addrmode_is4, AddrModeT1_4,
	IIC_iLoad_r, IIC_iLoad_i, "ldr",
	load>, Sched<[WriteLd]>;

	// A8.6.64 & A8.6.61
	defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr,
	t_addrmode_is1, AddrModeT1_1,
	IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
	zextloadi8>, Sched<[WriteLd]>;

	// A8.6.76 & A8.6.73
	defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr,
	t_addrmode_is2, AddrModeT1_2,
	IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
	zextloadi16>, Sched<[WriteLd]>;

	let AddedComplexity = 10 in
	def tLDRSB : // A8.6.80
	T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
	AddrModeT1_1, IIC_iLoad_bh_r,
	"ldrsb", "\t$Rt, $addr",
	[(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;

	let AddedComplexity = 10 in
	def tLDRSH : // A8.6.84
	T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr),
	AddrModeT1_2, IIC_iLoad_bh_r,
	"ldrsh", "\t$Rt, $addr",
	[(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>;


	def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
	"str", "\t$Rt, $addr",
	[(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
	T1LdStSP<{0,?,?}>, Sched<[WriteST]> {
	bits<3> Rt;
	bits<8> addr;
	let Inst{10-8} = Rt;
	let Inst{7-0} = addr;
	}

	// A8.6.194 & A8.6.192
	defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr,
	t_addrmode_is4, AddrModeT1_4,
	IIC_iStore_r, IIC_iStore_i, "str",
	store>, Sched<[WriteST]>;

	// A8.6.197 & A8.6.195
	defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr,
	t_addrmode_is1, AddrModeT1_1,
	IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
	truncstorei8>, Sched<[WriteST]>;

	// A8.6.207 & A8.6.205
	defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr,
	t_addrmode_is2, AddrModeT1_2,
	IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
	truncstorei16>, Sched<[WriteST]>;


	//===----------------------------------------------------------------------===//
	// Load / store multiple Instructions.
	//

	// These require base address to be written back or one of the loaded regs.
	let hasSideEffects = 0 in {

	let mayLoad = 1, hasExtraDefRegAllocReq = 1, variadicOpsAreDefs = 1 in
	def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
	IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
	bits<3> Rn;
	bits<8> regs;
	let Inst{10-8} = Rn;
	let Inst{7-0} = regs;
	}

	// Writeback version is just a pseudo, as there's no encoding difference.
	// Writeback happens iff the base register is not in the destination register
	// list.
	let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
	def tLDMIA_UPD :
	InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
	"$Rn = $wb", IIC_iLoad_mu>,
	PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
	let Size = 2;
	let OutOperandList = (outs tGPR:$wb);
	let InOperandList = (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops);
	let Pattern = [];
	let isCodeGenOnly = 1;
	let isPseudo = 1;
	list<Predicate> Predicates = [IsThumb];
	}

	// There is no non-writeback version of STM for Thumb.
	let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
	def tSTMIA_UPD : Thumb1I<(outs tGPR:$wb),
	(ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
	AddrModeNone, 2, IIC_iStore_mu,
	"stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
	T1Encoding<{1,1,0,0,0,?}> {
	bits<3> Rn;
	bits<8> regs;
	let Inst{10-8} = Rn;
	let Inst{7-0} = regs;
	}

	} // hasSideEffects

	def : InstAlias<"ldm${p} $Rn!, $regs",
	(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs), 0>,
	Requires<[IsThumb, IsThumb1Only]>;

	// POP: marked as a load that both reads and writes SP; the variadic register
	// operands are treated as defs (variadicOpsAreDefs). Only part of the 16-bit
	// register list is encoded: regs{7-0} go to Inst{7-0} and regs{15} to
	// Inst{8}.
	// NOTE(review): bit 15 presumably corresponds to PC -- confirm against the
	// reglist operand encoding.
	let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1,
	variadicOpsAreDefs = 1 in
	def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
	IIC_iPop,
	"pop${p}\t$regs", []>,
	T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> {
	bits<16> regs;
	let Inst{8} = regs{15};
	let Inst{7-0} = regs{7-0};
	}

	// PUSH: marked as a store that both reads and writes SP. Only part of the
	// 16-bit register list is encoded: regs{7-0} go to Inst{7-0} and regs{14} to
	// Inst{8}.
	// NOTE(review): bit 14 presumably corresponds to LR -- confirm against the
	// reglist operand encoding.
	let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
	def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
	IIC_iStore_m,
	"push${p}\t$regs", []>,
	T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> {
	bits<16> regs;
	let Inst{8} = regs{14};
	let Inst{7-0} = regs{7-0};
	}

	//===----------------------------------------------------------------------===//
	// Arithmetic Instructions.
	//

	// Helper classes for encoding T1pI patterns:
	// T1pI instruction combined with the T1DataProcessing<opA> encoding.
	// Two 3-bit register fields: Rm in Inst{5-3}, Rn in Inst{2-0}.
	class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1pI<oops, iops, itin, opc, asm, pattern>,
	T1DataProcessing<opA> {
	bits<3> Rm;
	bits<3> Rn;
	let Inst{5-3} = Rm;
	let Inst{2-0} = Rn;
	}
	// T1pI instruction combined with the T1Misc<opA> encoding.
	// Two 3-bit register fields: Rm in Inst{5-3}, Rd in Inst{2-0}.
	class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1pI<oops, iops, itin, opc, asm, pattern>,
	T1Misc<opA> {
	bits<3> Rm;
	bits<3> Rd;
	let Inst{5-3} = Rm;
	let Inst{2-0} = Rd;
	}

	// Helper classes for encoding T1sI patterns:
	// T1sI instruction combined with the T1DataProcessing<opA> encoding.
	// Two 3-bit register fields: Rn in Inst{5-3}, Rd in Inst{2-0}.
	class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1sI<oops, iops, itin, opc, asm, pattern>,
	T1DataProcessing<opA> {
	bits<3> Rd;
	bits<3> Rn;
	let Inst{5-3} = Rn;
	let Inst{2-0} = Rd;
	}
	// T1sI instruction combined with the T1General<opA> encoding, three-register
	// form: Rm in Inst{8-6}, Rn in Inst{5-3}, Rd in Inst{2-0}.
	class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1sI<oops, iops, itin, opc, asm, pattern>,
	T1General<opA> {
	bits<3> Rm;
	bits<3> Rn;
	bits<3> Rd;
	let Inst{8-6} = Rm;
	let Inst{5-3} = Rn;
	let Inst{2-0} = Rd;
	}
	// T1sI instruction combined with the T1General<opA> encoding, register +
	// immediate form: Rm in Inst{5-3}, Rd in Inst{2-0}. The immediate field is
	// not encoded here; each instruction using this class declares and places
	// its own (e.g. imm3 in Inst{8-6} for tADDi3, imm5 in Inst{10-6} for tASRri).
	class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1sI<oops, iops, itin, opc, asm, pattern>,
	T1General<opA> {
	bits<3> Rd;
	bits<3> Rm;
	let Inst{5-3} = Rm;
	let Inst{2-0} = Rd;
	}

	// Helper classes for encoding T1sIt patterns:
	// T1sIt instruction combined with the T1DataProcessing<opA> encoding.
	// Two 3-bit register fields: Rm in Inst{5-3}, Rdn in Inst{2-0}.
	class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1sIt<oops, iops, itin, opc, asm, pattern>,
	T1DataProcessing<opA> {
	bits<3> Rdn;
	bits<3> Rm;
	let Inst{5-3} = Rm;
	let Inst{2-0} = Rdn;
	}
	// T1sIt instruction combined with the T1General<opA> encoding, register +
	// 8-bit immediate form: Rdn in Inst{10-8}, imm8 in Inst{7-0}.
	class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
	string opc, string asm, list<dag> pattern>
	: T1sIt<oops, iops, itin, opc, asm, pattern>,
	T1General<opA> {
	bits<3> Rdn;
	bits<8> imm8;
	let Inst{10-8} = Rdn;
	let Inst{7-0} = imm8;
	}

	let isAdd = 1 in {
	// Add with carry register
	let isCommutable = 1, Uses = [CPSR] in
	def tADC : // A8.6.2
	T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
	"adc", "\t$Rdn, $Rm",
	[]>, Sched<[WriteALU]>;

	// Add immediate
	def tADDi3 : // A8.6.4 T1
	T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
	IIC_iALUi,
	"add", "\t$Rd, $Rm, $imm3",
	[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
	Sched<[WriteALU]> {
	bits<3> imm3;
	let Inst{8-6} = imm3;
	}

	def tADDi8 : // A8.6.4 T2
	T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
	(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
	"add", "\t$Rdn, $imm8",
	[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
	Sched<[WriteALU]>;

	// Add register
	let isCommutable = 1 in
	def tADDrr : // A8.6.6 T1
	T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iALUr,
	"add", "\t$Rd, $Rn, $Rm",
	[(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	/// Similar to the above except these set the 's' bit so the
	/// instruction modifies the CPSR register.
	///
	/// These opcodes will be converted to the real non-S opcodes by
	/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
	let hasPostISelHook = 1, Defs = [CPSR] in {
	let isCommutable = 1, Uses = [CPSR] in
	def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	2, IIC_iALUr,
	[(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm,
	CPSR))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tADDSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
	2, IIC_iALUi,
	[(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rm,
	imm0_7:$imm3))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tADDSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
	2, IIC_iALUi,
	[(set tGPR:$Rdn, CPSR, (ARMaddc tGPR:$Rn,
	imm8_255:$imm8))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	let isCommutable = 1 in
	def tADDSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
	2, IIC_iALUr,
	[(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rn,
	tGPR:$Rm))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;
	}

	let hasSideEffects = 0 in
	def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
	"add", "\t$Rdn, $Rm", []>,
	T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.6 T2
	bits<4> Rdn;
	bits<4> Rm;
	let Inst{7} = Rdn{3};
	let Inst{6-3} = Rm;
	let Inst{2-0} = Rdn{2-0};
	}
	}

	// Thumb has more flexible short encodings for ADD than ORR, so use those where
	// possible.
	def : T1Pat<(or AddLikeOrOp:$Rn, imm0_7:$imm), (tADDi3 $Rn, imm0_7:$imm)>;

	def : T1Pat<(or AddLikeOrOp:$Rn, imm8_255:$imm), (tADDi8 $Rn, imm8_255:$imm)>;

	def : T1Pat<(or AddLikeOrOp:$Rn, tGPR:$Rm), (tADDrr $Rn, $Rm)>;


	def : tInstAlias <"add${s}${p} $Rdn, $Rm",
	(tADDrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;

	def : tInstSubst<"sub${s}${p} $rd, $rn, $imm",
	(tADDi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
	def : tInstSubst<"sub${s}${p} $rdn, $imm",
	(tADDi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;


	// AND register
	let isCommutable = 1 in
	def tAND : // A8.6.12
	T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iBITr,
	"and", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// ASR immediate
	def tASRri : // A8.6.14
	T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
	IIC_iMOVsi,
	"asr", "\t$Rd, $Rm, $imm5",
	[(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
	Sched<[WriteALU]> {
	bits<5> imm5;
	let Inst{10-6} = imm5;
	}

	// ASR register
	def tASRrr : // A8.6.15
	T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iMOVsr,
	"asr", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// BIC register
	def tBIC : // A8.6.20
	T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iBITr,
	"bic", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
	Sched<[WriteALU]>;

	// CMN register
	let isCompare = 1, Defs = [CPSR] in {
	//FIXME: Disable CMN, as CCodes are backwards from compare expectations
	// Compare-to-zero still works out, just not the relationals
	//def tCMN : // A8.6.33
	// T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
	// IIC_iCMPr,
	// "cmn", "\t$lhs, $rhs",
	// [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;

	def tCMNz : // A8.6.33
	T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iCMPr,
	"cmn", "\t$Rn, $Rm",
	[(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;

	} // isCompare = 1, Defs = [CPSR]

	// CMP immediate
	let isCompare = 1, Defs = [CPSR] in {
	def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
	"cmp", "\t$Rn, $imm8",
	[(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
	T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
	// A8.6.35
	bits<3> Rn;
	bits<8> imm8;
	let Inst{10-8} = Rn;
	let Inst{7-0} = imm8;
	}

	// CMP register
	def tCMPr : // A8.6.36 T1
	T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iCMPr,
	"cmp", "\t$Rn, $Rm",
	[(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;

	def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
	"cmp", "\t$Rn, $Rm", []>,
	T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
	// A8.6.36 T2
	bits<4> Rm;
	bits<4> Rn;
	let Inst{7} = Rn{3};
	let Inst{6-3} = Rm;
	let Inst{2-0} = Rn{2-0};
	}
	} // isCompare = 1, Defs = [CPSR]


	// XOR register
	let isCommutable = 1 in
	def tEOR : // A8.6.45
	T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iBITr,
	"eor", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// LSL immediate
	def tLSLri : // A8.6.88
	T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
	IIC_iMOVsi,
	"lsl", "\t$Rd, $Rm, $imm5",
	[(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
	Sched<[WriteALU]> {
	bits<5> imm5;
	let Inst{10-6} = imm5;
	}

	// LSL register
	def tLSLrr : // A8.6.89
	T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iMOVsr,
	"lsl", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// LSR immediate
	def tLSRri : // A8.6.90
	T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
	IIC_iMOVsi,
	"lsr", "\t$Rd, $Rm, $imm5",
	[(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
	Sched<[WriteALU]> {
	bits<5> imm5;
	let Inst{10-6} = imm5;
	}

	// LSR register
	def tLSRrr : // A8.6.91
	T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iMOVsr,
	"lsr", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// Move immediate
	let isMoveImm = 1 in
	def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
	"mov", "\t$Rd, $imm8",
	[(set tGPR:$Rd, imm0_255:$imm8)]>,
	T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.96
	bits<3> Rd;
	bits<8> imm8;
	let Inst{10-8} = Rd;
	let Inst{7-0} = imm8;
	}
	// Because we have an explicit tMOVSr below, we need an alias to handle
	// the immediate "movs" form here. Blech.
	def : tInstAlias <"movs $Rdn, $imm",
	(tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;

	// A7-73: MOV(2) - mov setting flag.

	let hasSideEffects = 0, isMoveReg = 1 in {
	def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
	2, IIC_iMOVr,
	"mov", "\t$Rd, $Rm", "", []>,
	T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
	// A8.6.97
	bits<4> Rd;
	bits<4> Rm;
	let Inst{7} = Rd{3};
	let Inst{6-3} = Rm;
	let Inst{2-0} = Rd{2-0};
	}
	let Defs = [CPSR] in
	def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
	"movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
	// A8.6.97
	bits<3> Rd;
	bits<3> Rm;
	let Inst{15-6} = 0b0000000000;
	let Inst{5-3} = Rm;
	let Inst{2-0} = Rd;
	}
	} // hasSideEffects

	// Multiply register
	let isCommutable = 1 in
	def tMUL : // A8.6.105 T1
	Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
	IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
	[(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
	T1DataProcessing<0b1101>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
	bits<3> Rd;
	bits<3> Rn;
	let Inst{5-3} = Rn;
	let Inst{2-0} = Rd;
	let AsmMatchConverter = "cvtThumbMultiply";
	}

	def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
	pred:$p)>;

	// Move inverse register
	def tMVN : // A8.6.107
	T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
	"mvn", "\t$Rd, $Rn",
	[(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;

	// Bitwise or register
	let isCommutable = 1 in
	def tORR : // A8.6.114
	T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iBITr,
	"orr", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;

	// Swaps
	def tREV : // A8.6.134
	T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"rev", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;

	def tREV16 : // A8.6.135
	T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"rev16", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;

	def tREVSH : // A8.6.136
	T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"revsh", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;

	// Rotate right register
	def tROR : // A8.6.139
	T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iMOVsr,
	"ror", "\t$Rdn, $Rm",
	[(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
	Sched<[WriteALU]>;

	// Negate register
	def tRSB : // A8.6.141
	T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
	IIC_iALUi,
	"rsb", "\t$Rd, $Rn, #0",
	[(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;

	// Subtract with carry register
	let Uses = [CPSR] in
	def tSBC : // A8.6.151
	T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iALUr,
	"sbc", "\t$Rdn, $Rm",
	[]>,
	Sched<[WriteALU]>;

	// Subtract immediate
	def tSUBi3 : // A8.6.210 T1
	T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
	IIC_iALUi,
	"sub", "\t$Rd, $Rm, $imm3",
	[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
	Sched<[WriteALU]> {
	bits<3> imm3;
	let Inst{8-6} = imm3;
	}

	def tSUBi8 : // A8.6.210 T2
	T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
	(ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
	"sub", "\t$Rdn, $imm8",
	[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
	Sched<[WriteALU]>;

	def : tInstSubst<"add${s}${p} $rd, $rn, $imm",
	(tSUBi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;


	def : tInstSubst<"add${s}${p} $rdn, $imm",
	(tSUBi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;


	// Subtract register
	def tSUBrr : // A8.6.212
	T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
	IIC_iALUr,
	"sub", "\t$Rd, $Rn, $Rm",
	[(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
	Sched<[WriteALU]>;

	def : tInstAlias <"sub${s}${p} $Rdn, $Rm",
	(tSUBrr tGPR:$Rdn,s_cc_out:$s, tGPR:$Rdn, tGPR:$Rm, pred:$p)>;

	/// Similar to the above except these set the 's' bit so the
	/// instruction modifies the CPSR register.
	///
	/// These opcodes will be converted to the real non-S opcodes by
	/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
	let hasPostISelHook = 1, Defs = [CPSR] in {
	let Uses = [CPSR] in
	def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
	2, IIC_iALUr,
	[(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm,
	CPSR))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tSUBSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
	2, IIC_iALUi,
	[(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rm,
	imm0_7:$imm3))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tSUBSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
	2, IIC_iALUi,
	[(set tGPR:$Rdn, CPSR, (ARMsubc tGPR:$Rn,
	imm8_255:$imm8))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tSUBSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
	2, IIC_iALUr,
	[(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rn,
	tGPR:$Rm))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;

	def tRSBS : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn),
	2, IIC_iALUr,
	[(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
	Requires<[IsThumb1Only]>,
	Sched<[WriteALU]>;
	}


	def : T1Pat<(ARMsubs tGPR:$Rn, tGPR:$Rm), (tSUBSrr $Rn, $Rm)>;
	def : T1Pat<(ARMsubs tGPR:$Rn, imm0_7:$imm3), (tSUBSi3 $Rn, imm0_7:$imm3)>;
	def : T1Pat<(ARMsubs tGPR:$Rn, imm0_255:$imm8), (tSUBSi8 $Rn, imm0_255:$imm8)>;


	// Sign-extend byte
	def tSXTB : // A8.6.222
	T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"sxtb", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>,
	Sched<[WriteALU]>;

	// Sign-extend short
	def tSXTH : // A8.6.224
	T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"sxth", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>,
	Sched<[WriteALU]>;

	// Test
	let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
	def tTST : // A8.6.230
	T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
	"tst", "\t$Rn, $Rm",
	[(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
	Sched<[WriteALU]>;

	// A8.8.247 UDF - Undefined (Encoding T1)
	def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
	[(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
	bits<8> imm8;
	let Inst{15-12} = 0b1101;
	let Inst{11-8} = 0b1110;
	let Inst{7-0} = imm8;
	}

	// debugtrap selects (tBKPT 0) when HasV5T holds, and falls back to
	// (tUDF 254) under NoV5T.
	def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>;
	def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>;

	// Windows-only Thumb instruction with the fixed 16-bit encoding 0xdef9,
	// selected for (int_arm_undefined 249) and marked as a terminator.
	// NOTE(review): the name suggests this is the divide-by-zero break used on
	// Windows -- confirm against the lowering that emits it.
	def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
	[(int_arm_undefined 249)]>, Encoding16,
	Requires<[IsThumb, IsWindows]> {
	let Inst = 0xdef9;
	let isTerminator = 1;
	}

	// Zero-extend byte
	// Selected for (and $Rm, 0xFF); requires Thumb1-only with HasV6.
	def tUXTB : // A8.6.262
	T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"uxtb", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>,
	Sched<[WriteALU]>;

	// Zero-extend short
	// Selected for (and $Rm, 0xFFFF); requires Thumb1-only with HasV6.
	def tUXTH : // A8.6.264
	T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
	IIC_iUNAr,
	"uxth", "\t$Rd, $Rm",
	[(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;

	// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
	// Expanded after instruction selection into a branch sequence.
	// Selected for (ARMcmov $false, $true, $p); $p is a cmovpred operand.
	let usesCustomInserter = 1 in // Expanded after instruction selection.
	def tMOVCCr_pseudo :
	PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
	NoItinerary,
	[(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;

	// tLEApcrel - Load a pc-relative address into a register without offending the
	// assembler.

	// tADR is the encoded "adr" form: Rd goes in bits 10-8 and the label operand
	// in bits 7-0. It has no selection pattern and is decoded by
	// DecodeThumbAddSpecialReg.
	def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
	IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
	T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
	bits<3> Rd;
	bits<8> addr;
	let Inst{10-8} = Rd;
	let Inst{7-0} = addr;
	let DecoderMethod = "DecodeThumbAddSpecialReg";
	}

	// Pseudo form of the pc-relative address load: declared side-effect free and
	// rematerializable; it has no selection pattern of its own.
	let hasSideEffects = 0, isReMaterializable = 1 in
	def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
	2, IIC_iALUi, []>, Sched<[WriteALU]>;

	// Variant with the same operands that is marked hasSideEffects = 1 and is
	// not rematerializable.
	// NOTE(review): presumably the side-effect flag keeps the jump-table address
	// computation from being moved or removed -- confirm.
	let hasSideEffects = 1 in
	def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
	(ins i32imm:$label, pred:$p),
	2, IIC_iALUi, []>, Sched<[WriteALU]>;

	// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
	// and make use of the same compressed jump table format as Thumb-2.
	// Both pseudos take a base (tGPRwithpc), an index, the jump table and a
	// pc-label operand, produce no outputs and have no selection pattern. They
	// are indirect-branch terminators and barriers; the changed line below also
	// marks them isNotDuplicable.
	let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
	- isIndirectBranch = 1 in {
	+ isIndirectBranch = 1, isNotDuplicable = 1 in {
	def tTBB_JT : tPseudoInst<(outs),
	(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
	IIC_Br, []>, Sched<[WriteBr]>;

	def tTBH_JT : tPseudoInst<(outs),
	(ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
	IIC_Br, []>, Sched<[WriteBr]>;
	}

	//===----------------------------------------------------------------------===//
	// TLS Instructions
	//

	// __aeabi_read_tp preserves the registers r1-r3.
	// This is a pseudo inst so that we can get the encoding right,
	// complete with fixup for the aeabi_read_tp function.
	// Selected for ARMthread_pointer with the result in R0. Modelled as a call
	// that defines R0, R12, LR and CPSR and uses SP.
	let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
	def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
	[(set R0, ARMthread_pointer)]>,
	Sched<[WriteBr]>;

	//===----------------------------------------------------------------------===//
	// SJLJ Exception handling intrinsics
	//

	// eh_sjlj_setjmp() is an instruction sequence to store the return address and
	// save #0 in R0 for the non-longjmp case. Since by its nature we may be coming
	// from some other function to get here, and we're using the stack frame for the
	// containing function to save/restore registers, we can't keep anything live in
	// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
	// tromped upon when we get here from a longjmp(). We force everything out of
	// registers except for our own input by listing the relevant registers in
	// Defs. By doing so, we also cause the prologue/epilogue code to actively
	// preserve all of the callee-saved registers, which is exactly what we want.
	// $val is a scratch register for our use.
	let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
	hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
	usesCustomInserter = 1 in
	def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
	AddrModeNone, 0, NoItinerary, "","",
	[(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;

	// FIXME: Non-IOS version(s)
	// Thumb longjmp pseudo for non-Windows targets, selected for
	// ARMeh_sjlj_longjmp. It is a terminator and barrier that defines R7, LR
	// and SP.
	let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
	Defs = [ R7, LR, SP ] in
	def tInt_eh_sjlj_longjmp : XI<(outs), (ins tGPR:$src, tGPR:$scratch),
	AddrModeNone, 0, IndexModeNone,
	Pseudo, NoItinerary, "", "",
	[(ARMeh_sjlj_longjmp tGPR:$src, tGPR:$scratch)]>,
	Requires<[IsThumb,IsNotWindows]>;

	// (Windows is Thumb2-only)
	// Windows counterpart of tInt_eh_sjlj_longjmp: it takes GPR rather than tGPR
	// operands and defines R11 instead of R7.
	let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
	Defs = [ R11, LR, SP ] in
	def tInt_WIN_eh_sjlj_longjmp
	: XI<(outs), (ins GPR:$src, GPR:$scratch), AddrModeNone, 0, IndexModeNone,
	Pseudo, NoItinerary, "", "", [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
	Requires<[IsThumb,IsWindows]>;

	//===----------------------------------------------------------------------===//
	// Non-Instruction Patterns
	//

	// Comparisons
	// ARMcmpZ selects the same instructions as a plain compare: tCMPi8 for an
	// imm0_255 operand and tCMPr for a register operand.
	def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
	(tCMPi8 tGPR:$Rn, imm0_255:$imm8)>;
	def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
	(tCMPr tGPR:$Rn, tGPR:$Rm)>;

	// Bswap 16 with load/store
	// (srl (bswap x), 16) applied to a halfword load, or feeding a halfword
	// truncating store, selects tREV16 combined with the halfword load/store in
	// both the immediate-offset (is2) and register-register (rr) address modes.
	def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
	(tREV16 (tLDRHi t_addrmode_is2:$addr))>;
	def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)),
	(tREV16 (tLDRHr t_addrmode_rr:$addr))>;
	def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
	t_addrmode_is2:$addr),
	(tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
	def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
	t_addrmode_rr:$addr),
	(tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>;

	// ConstantPool
	// A wrapped constant-pool address is materialized with tLEApcrel.
	def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>;

	// GlobalAddress
	// Pseudo selected for (ARMWrapperPIC tglobaladdr) when DontUseMovtInPic
	// holds.
	def tLDRLIT_ga_pcrel : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr),
	IIC_iLoadiALU,
	[(set tGPR:$dst,
	(ARMWrapperPIC tglobaladdr:$addr))]>,
	Requires<[IsThumb, DontUseMovtInPic]>;

	// Pseudo selected for (ARMWrapper tglobaladdr) when DontUseMovt holds.
	def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
	IIC_iLoad_i,
	[(set tGPR:$dst,
	(ARMWrapper tglobaladdr:$src))]>,
	Requires<[IsThumb, DontUseMovt]>;

	// TLS globals
	// TLS global addresses reuse the tLDRLIT_ga_* pseudos under the same
	// predicates as ordinary globals.
	def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),
	(tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>,
	Requires<[IsThumb, DontUseMovtInPic]>;
	def : Pat<(ARMWrapper tglobaltlsaddr:$addr),
	(tLDRLIT_ga_abs tglobaltlsaddr:$addr)>,
	Requires<[IsThumb, DontUseMovt]>;


	// JumpTable
	// A wrapped jump-table address is materialized with tLEApcrelJT.
	def : T1Pat<(ARMWrapperJT tjumptable:$dst),
	(tLEApcrelJT tjumptable:$dst)>;

	// Direct calls
	// A call to an external symbol selects tBL.
	def : T1Pat<(ARMcall texternalsym:$func), (tBL texternalsym:$func)>,
	Requires<[IsThumb]>;

	// zextload i1 -> zextload i8
	// Both address modes select the byte load (tLDRBi / tLDRBr).
	def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
	(tLDRBi t_addrmode_is1:$addr)>;
	def : T1Pat<(zextloadi1 t_addrmode_rr:$addr),
	(tLDRBr t_addrmode_rr:$addr)>;

	// extload from the stack -> word load from the stack, as it avoids having to
	// materialize the base in a separate register. This only works when a word
	// load puts the byte/halfword value in the same place in the register that the
	// byte/halfword load would, i.e. when little-endian.
	// All three widths (i1, i8, i16) select tLDRspi and are gated on IsLE.
	def : T1Pat<(extloadi1 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
	Requires<[IsThumb, IsThumb1Only, IsLE]>;
	def : T1Pat<(extloadi8 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
	Requires<[IsThumb, IsThumb1Only, IsLE]>;
	def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>,
	Requires<[IsThumb, IsThumb1Only, IsLE]>;

	// extload -> zextload
	// Any-extending i1/i8 loads select the byte load and i16 loads select the
	// halfword load, in both the immediate-offset and register-register modes.
	def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
	def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
	def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
	def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>;
	def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
	def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;

	// post-inc loads and stores

	// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
	// different to how ISel expects them for a post-inc load, so use a pseudo
	// and expand it just after ISel.
	// $Rn_wb is tied to $Rn and marked early-clobber; the pseudo has no
	// selection pattern.
	// NOTE(review): the itinerary is IIC_iStore_ru although this is a load
	// (mayLoad = 1) -- confirm whether a load itinerary was intended.
	let usesCustomInserter = 1, mayLoad =1,
	Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
	def tLDR_postidx: tPseudoInst<(outs tGPR:$Rt, tGPR:$Rn_wb),
	(ins tGPR:$Rn, pred:$p),
	4, IIC_iStore_ru,
	[]>;

	// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
	// multiple registers) is the same in ISel as MachineInstr, so there's no need
	// for a pseudo.
	// Only a post-increment of 4 is matched.
	def : T1Pat<(post_store tGPR:$Rt, tGPR:$Rn, 4),
	(tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>;

	// If it's impossible to use [r,r] address mode for sextload, select to
	// ldr{b|h} + sxt{b|h} instead.
	// These forms need the sign-extend instructions and are gated on HasV6.
	def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
	(tSXTB (tLDRBi t_addrmode_is1:$addr))>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>;
	def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
	(tSXTB (tLDRBr t_addrmode_rr:$addr))>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>;
	def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
	(tSXTH (tLDRHi t_addrmode_is2:$addr))>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>;
	def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
	(tSXTH (tLDRHr t_addrmode_rr:$addr))>,
	Requires<[IsThumb, IsThumb1Only, HasV6]>;

	// Sign-extending loads without a HasV6 requirement: load the byte/halfword,
	// then shift left and arithmetic-shift right by 24 (byte) or 16 (halfword).
	def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
	(tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
	def : T1Pat<(sextloadi8 t_addrmode_rr:$addr),
	(tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>;
	def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
	(tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
	def : T1Pat<(sextloadi16 t_addrmode_rr:$addr),
	(tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>;

	// Atomic 8/16/32-bit loads and stores select the ordinary load/store
	// instructions of the same width, in both the immediate-offset and
	// register-register address modes.
	def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
	(tLDRBi t_addrmode_is1:$src)>;
	def : T1Pat<(atomic_load_8 t_addrmode_rr:$src),
	(tLDRBr t_addrmode_rr:$src)>;
	def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
	(tLDRHi t_addrmode_is2:$src)>;
	def : T1Pat<(atomic_load_16 t_addrmode_rr:$src),
	(tLDRHr t_addrmode_rr:$src)>;
	def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
	(tLDRi t_addrmode_is4:$src)>;
	def : T1Pat<(atomic_load_32 t_addrmode_rr:$src),
	(tLDRr t_addrmode_rr:$src)>;
	def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
	(tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
	def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val),
	(tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>;
	def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
	(tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
	def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val),
	(tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>;
	def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
	(tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
	def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val),
	(tSTRr tGPR:$val, t_addrmode_rr:$ptr)>;

	// Large immediate handling.

	// Two piece imms.
	// A shifted 8-bit immediate: tMOVi8 of the value, then tLSLri by the shift
	// amount.
	def : T1Pat<(i32 thumb_immshifted:$src),
	(tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
	(thumb_immshifted_shamt imm:$src))>;

	// An imm0_255_comp immediate: tMOVi8 of the transformed value, then tMVN.
	def : T1Pat<(i32 imm0_255_comp:$src),
	(tMVN (tMOVi8 (imm_not_XFORM imm:$src)))>;

	// An imm256_510 immediate: tMOVi8 255, then tADDi8 of the addend.
	def : T1Pat<(i32 imm256_510:$src),
	(tADDi8 (tMOVi8 255),
	(thumb_imm256_510_addend imm:$src))>;

	// Pseudo instruction that combines ldr from constpool and add pc. This should
	// be expanded into two instructions late to allow if-conversion and
	// scheduling.
	// Selected for (ARMpic_add (load (ARMWrapper tconstpool)), imm); marked
	// rematerializable and Thumb1-only.
	let isReMaterializable = 1 in
	def tLDRpci_pic : PseudoInst<(outs tGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
	NoItinerary,
	[(set tGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
	imm:$cp))]>,
	Requires<[IsThumb, IsThumb1Only]>;

	// Pseudo-instruction for merged POP and return.
	// FIXME: remove when we have a way to marking a MI with these properties.
	// Expands to (tPOP $p, $regs) while carrying the return/terminator/barrier
	// flags.
	let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
	hasExtraDefRegAllocReq = 1 in
	def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
	2, IIC_iPop_Br, [],
	(tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;

	// Indirect branch using "mov pc, $Rm"
	// Selected for (brind $Rm) and expanded to (tMOVr PC, $Rm, $p).
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
	2, IIC_Br, [(brind GPR:$Rm)],
	(tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
	}


	// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
	// encoding is available on ARMv6K, but we don't differentiate that finely.
	// The alias is restricted to Thumb1-only targets.
	def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;


	// "neg" is an alias for "rsb rd, rn, #0"
	def : tInstAlias<"neg${s}${p} $Rd, $Rm",
	(tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;


	// Implied destination operand forms for shifts.
	// The two-operand assembly form reuses $Rdm as both destination and source.
	def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
	(tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
	def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
	(tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
	def : tInstAlias<"asr${s}${p} $Rdm, $imm",
	(tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;

	// Pseudo instruction ldr Rt, =immediate
	// Assembler-only pseudo taking the destination register, a
	// const_pool_asm_imm operand and a predicate.
	def tLDRConstPool
	: tAsmPseudo<"ldr${p} $Rt, $immediate",
	(ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
	Index: vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/AVR/AVRISelLowering.cpp (revision 351303)
	@@ -1,2049 +1,2049 @@
	//===-- AVRISelLowering.cpp - AVR DAG Lowering Implementation -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that AVR uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "AVRISelLowering.h"

	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Support/ErrorHandling.h"

	#include "AVR.h"
	#include "AVRMachineFunctionInfo.h"
	#include "AVRSubtarget.h"
	#include "AVRTargetMachine.h"
	#include "MCTargetDesc/AVRMCTargetDesc.h"

	namespace llvm {

	/// Configures instruction-selection lowering for AVR.
	///
	/// Registers the i8 and i16 register classes and then fills in the
	/// legalization tables: which operations are Legal, Custom, Expand or Promote
	/// for each value type, which indexed addressing modes are available, and
	/// which runtime-library routines (and calling conventions) are used.
	///
	/// \param TM   Target machine forwarded to the TargetLowering base class.
	/// \param STI  Subtarget; consulted for register info and for whether
	///             hardware multiplication is supported.
	AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
	const AVRSubtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	// Set up the register classes.
	addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
	addRegisterClass(MVT::i16, &AVR::DREGSRegClass);

	// Compute derived properties from the register classes.
	computeRegisterProperties(Subtarget.getRegisterInfo());

	setBooleanContents(ZeroOrOneBooleanContent);
	setBooleanVectorContents(ZeroOrOneBooleanContent);
	setSchedulingPreference(Sched::RegPressure);
	setStackPointerRegisterToSaveRestore(AVR::SP);
	setSupportsUnalignedAtomics(true);

	// Global and block addresses go through LowerGlobalAddress /
	// LowerBlockAddress.
	setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
	setOperationAction(ISD::BlockAddress, MVT::i16, Custom);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);

	// Extending loads: i1 sources are promoted, i8 sources are expanded, for
	// every integer result type and every extension kind.
	for (MVT VT : MVT::integer_valuetypes()) {
	for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
	setLoadExtAction(N, VT, MVT::i1, Promote);
	setLoadExtAction(N, VT, MVT::i8, Expand);
	}
	}

	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	// Carry-producing and carry-consuming add/sub are legal for all integer
	// types.
	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::ADDC, VT, Legal);
	setOperationAction(ISD::SUBC, VT, Legal);
	setOperationAction(ISD::ADDE, VT, Legal);
	setOperationAction(ISD::SUBE, VT, Legal);
	}

	// sub (x, imm) gets canonicalized to add (x, -imm), so for illegal types
	// revert into a sub since we don't have an add with immediate instruction.
	setOperationAction(ISD::ADD, MVT::i32, Custom);
	setOperationAction(ISD::ADD, MVT::i64, Custom);

	// our shift instructions are only able to shift 1 bit at a time, so handle
	// this in a custom way.
	setOperationAction(ISD::SRA, MVT::i8, Custom);
	setOperationAction(ISD::SHL, MVT::i8, Custom);
	setOperationAction(ISD::SRL, MVT::i8, Custom);
	setOperationAction(ISD::SRA, MVT::i16, Custom);
	setOperationAction(ISD::SHL, MVT::i16, Custom);
	setOperationAction(ISD::SRL, MVT::i16, Custom);
	setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
	setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
	setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);

	// Rotates: custom for i8, expanded for i16.
	setOperationAction(ISD::ROTL, MVT::i8, Custom);
	setOperationAction(ISD::ROTL, MVT::i16, Expand);
	setOperationAction(ISD::ROTR, MVT::i8, Custom);
	setOperationAction(ISD::ROTR, MVT::i16, Expand);

	// Conditional branches, selects and setcc.
	setOperationAction(ISD::BR_CC, MVT::i8, Custom);
	setOperationAction(ISD::BR_CC, MVT::i16, Custom);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);

	setOperationAction(ISD::SELECT_CC, MVT::i8, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i16, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
	setOperationAction(ISD::SETCC, MVT::i8, Custom);
	setOperationAction(ISD::SETCC, MVT::i16, Custom);
	setOperationAction(ISD::SETCC, MVT::i32, Custom);
	setOperationAction(ISD::SETCC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::i8, Expand);
	setOperationAction(ISD::SELECT, MVT::i16, Expand);

	setOperationAction(ISD::BSWAP, MVT::i16, Expand);

	// Add support for postincrement and predecrement load/stores.
	setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
	setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
	setIndexedLoadAction(ISD::PRE_DEC, MVT::i8, Legal);
	setIndexedLoadAction(ISD::PRE_DEC, MVT::i16, Legal);
	setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
	setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
	setIndexedStoreAction(ISD::PRE_DEC, MVT::i8, Legal);
	setIndexedStoreAction(ISD::PRE_DEC, MVT::i16, Legal);

	setOperationAction(ISD::BR_JT, MVT::Other, Expand);

	// Varargs: only VASTART is custom, the rest is expanded.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);
	setOperationAction(ISD::VAARG, MVT::Other, Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Expand);

	// Atomic operations which must be lowered to rtlib calls
	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::ATOMIC_SWAP, VT, Expand);
	setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
	setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
	}

	// Division/remainder
	setOperationAction(ISD::UDIV, MVT::i8, Expand);
	setOperationAction(ISD::UDIV, MVT::i16, Expand);
	setOperationAction(ISD::UREM, MVT::i8, Expand);
	setOperationAction(ISD::UREM, MVT::i16, Expand);
	setOperationAction(ISD::SDIV, MVT::i8, Expand);
	setOperationAction(ISD::SDIV, MVT::i16, Expand);
	setOperationAction(ISD::SREM, MVT::i8, Expand);
	setOperationAction(ISD::SREM, MVT::i16, Expand);

	// Make division and modulus custom
	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::UDIVREM, VT, Custom);
	setOperationAction(ISD::SDIVREM, VT, Custom);
	}

	// Do not use MUL. The AVR instructions are closer to SMUL_LOHI &co.
	setOperationAction(ISD::MUL, MVT::i8, Expand);
	setOperationAction(ISD::MUL, MVT::i16, Expand);

	// Expand 16 bit multiplications.
	setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
	setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);

	// Expand multiplications to libcalls when there is
	// no hardware MUL.
	if (!Subtarget.supportsMultiplication()) {
	setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
	setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
	}

	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	}

	// Bit counting is expanded for every integer type.
	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	}

	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
	// TODO: The generated code is pretty poor. Investigate using the
	// same "shift and subtract with carry" trick that we do for
	// extending 8-bit to 16-bit. This may require infrastructure
	// improvements in how we treat 16-bit "registers" to be feasible.
	}

	// Division rtlib functions (not supported)
	setLibcallName(RTLIB::SDIV_I8, nullptr);
	setLibcallName(RTLIB::SDIV_I16, nullptr);
	setLibcallName(RTLIB::SDIV_I32, nullptr);
	setLibcallName(RTLIB::SDIV_I64, nullptr);
	setLibcallName(RTLIB::SDIV_I128, nullptr);
	setLibcallName(RTLIB::UDIV_I8, nullptr);
	setLibcallName(RTLIB::UDIV_I16, nullptr);
	setLibcallName(RTLIB::UDIV_I32, nullptr);
	setLibcallName(RTLIB::UDIV_I64, nullptr);
	setLibcallName(RTLIB::UDIV_I128, nullptr);

	// Modulus rtlib functions (not supported)
	setLibcallName(RTLIB::SREM_I8, nullptr);
	setLibcallName(RTLIB::SREM_I16, nullptr);
	setLibcallName(RTLIB::SREM_I32, nullptr);
	setLibcallName(RTLIB::SREM_I64, nullptr);
	setLibcallName(RTLIB::SREM_I128, nullptr);
	setLibcallName(RTLIB::UREM_I8, nullptr);
	setLibcallName(RTLIB::UREM_I16, nullptr);
	setLibcallName(RTLIB::UREM_I32, nullptr);
	setLibcallName(RTLIB::UREM_I64, nullptr);
	setLibcallName(RTLIB::UREM_I128, nullptr);

	// Division and modulus rtlib functions
	setLibcallName(RTLIB::SDIVREM_I8, "__divmodqi4");
	setLibcallName(RTLIB::SDIVREM_I16, "__divmodhi4");
	setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
	setLibcallName(RTLIB::SDIVREM_I64, "__divmoddi4");
	setLibcallName(RTLIB::SDIVREM_I128, "__divmodti4");
	setLibcallName(RTLIB::UDIVREM_I8, "__udivmodqi4");
	setLibcallName(RTLIB::UDIVREM_I16, "__udivmodhi4");
	setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
	setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
	setLibcallName(RTLIB::UDIVREM_I128, "__udivmodti4");

	// Several of the runtime library functions use a special calling conv
	// (only the 8- and 16-bit divmod routines are switched to AVR_BUILTIN here).
	setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::AVR_BUILTIN);
	setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::AVR_BUILTIN);
	setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::AVR_BUILTIN);
	setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::AVR_BUILTIN);

	// Trigonometric rtlib functions
	setLibcallName(RTLIB::SIN_F32, "sin");
	setLibcallName(RTLIB::COS_F32, "cos");

	setMinFunctionAlignment(1);
	// NOTE(review): UINT_MAX as the minimum entry count presumably disables
	// jump-table generation altogether -- confirm.
	setMinimumJumpTableEntries(UINT_MAX);
	}

	/// Returns the printable name of the AVR-specific DAG node \p Opcode, or
	/// nullptr when \p Opcode is not one of the AVRISD nodes listed below.
	const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
	// Expands to a case label that returns the stringified enumerator name.
	#define NODE(name)                                                             \
	  case AVRISD::name:                                                           \
	    return #name

	  switch (Opcode) {
	  default:
	    return nullptr;
	    NODE(RET_FLAG);
	    NODE(RETI_FLAG);
	    NODE(CALL);
	    NODE(WRAPPER);
	    NODE(LSL);
	    NODE(LSR);
	    NODE(ROL);
	    NODE(ROR);
	    NODE(ASR);
	    NODE(LSLLOOP);
	    NODE(LSRLOOP);
	    // LowerShifts creates ROLLOOP/RORLOOP nodes for variable rotates, so they
	    // need a name as well; without one they have no printable name in DAG
	    // dumps.
	    NODE(ROLLOOP);
	    NODE(RORLOOP);
	    NODE(ASRLOOP);
	    NODE(BRCOND);
	    NODE(CMP);
	    NODE(CMPC);
	    NODE(TST);
	    NODE(SELECT_CC);
	#undef NODE
	  }
	}

	/// Result type of a SETCC node on AVR: always i8, whatever the operand type
	/// \p VT is. Vector operand types are rejected by assertion.
	EVT AVRTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
	                                          EVT VT) const {
	  assert(!VT.isVector() && "No AVR SetCC type for vectors!");
	  EVT ResultVT = MVT::i8;
	  return ResultVT;
	}

	/// Lowers SHL, SRL, SRA, ROTL and ROTR.
	///
	/// A non-constant amount becomes the matching AVRISD::*LOOP node that takes
	/// the value and the amount as operands. A constant amount N becomes a chain
	/// of N single-operand AVRISD shift/rotate nodes applied to the value.
	SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
	  //:TODO: this function has to be completely rewritten to produce optimal
	  // code, for now it's producing very long but correct code.
	  const SDNode *N = Op.getNode();
	  EVT VT = Op.getValueType();
	  SDLoc dl(N);

	  // Expand non-constant shifts to loops.
	  if (!isa<ConstantSDNode>(N->getOperand(1))) {
	    unsigned LoopOpc;
	    switch (Op.getOpcode()) {
	    case ISD::SHL:
	      LoopOpc = AVRISD::LSLLOOP;
	      break;
	    case ISD::SRL:
	      LoopOpc = AVRISD::LSRLOOP;
	      break;
	    case ISD::ROTL:
	      LoopOpc = AVRISD::ROLLOOP;
	      break;
	    case ISD::ROTR:
	      LoopOpc = AVRISD::RORLOOP;
	      break;
	    case ISD::SRA:
	      LoopOpc = AVRISD::ASRLOOP;
	      break;
	    default:
	      llvm_unreachable("Invalid shift opcode!");
	    }
	    return DAG.getNode(LoopOpc, dl, VT, N->getOperand(0), N->getOperand(1));
	  }

	  const uint64_t ShiftAmount =
	      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	  SDValue Victim = N->getOperand(0);

	  // Pick the node that performs a single step of the requested operation.
	  unsigned StepOpc;
	  switch (Op.getOpcode()) {
	  case ISD::SHL:
	    StepOpc = AVRISD::LSL;
	    break;
	  case ISD::SRL:
	    StepOpc = AVRISD::LSR;
	    break;
	  case ISD::SRA:
	    StepOpc = AVRISD::ASR;
	    break;
	  case ISD::ROTL:
	    StepOpc = AVRISD::ROL;
	    break;
	  case ISD::ROTR:
	    StepOpc = AVRISD::ROR;
	    break;
	  default:
	    llvm_unreachable("Invalid shift opcode");
	  }

	  // Apply the single-step node once per position.
	  for (uint64_t Step = 0; Step != ShiftAmount; ++Step)
	    Victim = DAG.getNode(StepOpc, dl, VT, Victim);

	  return Victim;
	}

	/// Lowers ISD::SDIVREM / ISD::UDIVREM to a call to the combined
	/// divide-and-remainder runtime routine for the operand width.
	///
	/// Every operand of \p Op is passed as a call argument, flagged sign- or
	/// zero-extended according to the signedness of the operation. The callee is
	/// declared as returning a struct of two values of the node's first result
	/// type. The call is chained to the DAG entry node.
	///
	/// \returns the call's result value (the first element of the pair returned
	///          by LowerCallTo).
	SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
	  unsigned Opcode = Op->getOpcode();
	  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
	         "Invalid opcode for Div/Rem lowering");
	  bool IsSigned = (Opcode == ISD::SDIVREM);
	  EVT VT = Op->getValueType(0);
	  Type *Ty = VT.getTypeForEVT(*DAG.getContext());

	  // Pick the libcall matching the signedness and the bit width.
	  RTLIB::Libcall LC;
	  switch (VT.getSimpleVT().SimpleTy) {
	  default:
	    llvm_unreachable("Unexpected request for libcall!");
	  case MVT::i8:
	    LC = IsSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
	    break;
	  case MVT::i16:
	    LC = IsSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
	    break;
	  case MVT::i32:
	    LC = IsSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
	    break;
	  case MVT::i64:
	    LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
	    break;
	  case MVT::i128:
	    LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
	    break;
	  }

	  SDValue InChain = DAG.getEntryNode();

	  // Forward each operand of the node as a call argument.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (SDValue const &Value : Op->op_values()) {
	    Entry.Node = Value;
	    Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
	    Entry.IsSExt = IsSigned;
	    Entry.IsZExt = !IsSigned;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  // The routine hands back both results at once: a two-element struct.
	  Type *RetTy = static_cast<Type *>(StructType::get(Ty, Ty));

	  SDLoc dl(Op);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
	      .setInRegister()
	      .setSExtResult(IsSigned)
	      .setZExtResult(!IsSigned);

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return CallInfo.first;
	}

	/// Lowers a GlobalAddress node to a TargetGlobalAddress (with the constant
	/// offset folded in) wrapped in an AVRISD::WRAPPER node of pointer type.
	SDValue AVRTargetLowering::LowerGlobalAddress(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Bind by reference: a plain `auto` would deduce a value type and copy the
	  // whole DataLayout on every call.
	  const auto &DL = DAG.getDataLayout();

	  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
	  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();

	  // Create the TargetGlobalAddress node, folding in the constant offset.
	  SDValue Result =
	      DAG.getTargetGlobalAddress(GV, SDLoc(Op), getPointerTy(DL), Offset);
	  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
	}

	/// Lowers a BlockAddress node to a TargetBlockAddress wrapped in an
	/// AVRISD::WRAPPER node of pointer type.
	SDValue AVRTargetLowering::LowerBlockAddress(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Bind by reference: a plain `auto` would deduce a value type and copy the
	  // whole DataLayout on every call.
	  const auto &DL = DAG.getDataLayout();
	  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();

	  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(DL));

	  return DAG.getNode(AVRISD::WRAPPER, SDLoc(Op), getPointerTy(DL), Result);
	}

	/// intCCToAVRCC - Map a DAG integer condition code onto the AVR condition
	/// code. Only EQ, NE, GE, LT, UGE and ULT have a mapping; any other code is
	/// treated as unreachable.
	static AVRCC::CondCodes intCCToAVRCC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:
	    return AVRCC::COND_EQ;
	  case ISD::SETNE:
	    return AVRCC::COND_NE;
	  // Signed ordering.
	  case ISD::SETLT:
	    return AVRCC::COND_LT;
	  case ISD::SETGE:
	    return AVRCC::COND_GE;
	  // Unsigned ordering.
	  case ISD::SETULT:
	    return AVRCC::COND_LO;
	  case ISD::SETUGE:
	    return AVRCC::COND_SH;
	  default:
	    llvm_unreachable("Unknown condition code!");
	  }
	}

	/// Returns appropriate AVR CMP/CMPC nodes and corresponding condition code for
	/// the given operands.
	///
	/// Condition codes that intCCToAVRCC has no mapping for (LE, GT, ULE, UGT) are
	/// first rewritten into GE/LT/UGE/ULT, either by swapping the operands or by
	/// adjusting a constant right-hand side. The signed tests "lhs > -1" and
	/// "lhs < 0" are emitted as a TST of the most significant byte instead of a
	/// compare chain. i32 and i64 operands are split into i16 pieces that are
	/// compared with one CMP followed by CMPC nodes linked through glue.
	///
	/// \param LHS    Left operand; i8, i16, i32 or i64.
	/// \param RHS    Right operand, same type as \p LHS.
	/// \param CC     Requested integer condition code.
	/// \param AVRcc  [out] i8 constant holding the AVRCC condition to test.
	/// \param DAG    DAG in which the nodes are created.
	/// \param DL     Debug location attached to the new nodes.
	/// \returns the last TST/CMP/CMPC node, of type MVT::Glue.
	SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                                     SDValue &AVRcc, SelectionDAG &DAG,
	                                     SDLoc DL) const {
	  SDValue Cmp;
	  EVT VT = LHS.getValueType();
	  bool UseTest = false;

	  switch (CC) {
	  default:
	    break;
	  case ISD::SETLE: {
	    // Swap operands and reverse the branching condition.
	    std::swap(LHS, RHS);
	    CC = ISD::SETGE;
	    break;
	  }
	  case ISD::SETGT: {
	    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
	      switch (C->getSExtValue()) {
	      case -1: {
	        // When doing lhs > -1 use a tst instruction on the top part of lhs
	        // and use brpl instead of using a chain of cp/cpc.
	        UseTest = true;
	        AVRcc = DAG.getConstant(AVRCC::COND_PL, DL, MVT::i8);
	        break;
	      }
	      case 0: {
	        // Turn lhs > 0 into 0 < lhs since 0 can be materialized with
	        // __zero_reg__ in lhs.
	        RHS = LHS;
	        LHS = DAG.getConstant(0, DL, VT);
	        CC = ISD::SETLT;
	        break;
	      }
	      default: {
	        // Turn lhs > C into lhs >= C+1, this allows us to fold the constant
	        // into the cmp instruction.
	        // NOTE(review): C+1 wraps when C is the largest signed value of VT;
	        // presumably such comparisons are folded away before reaching this
	        // point -- confirm.
	        RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
	        CC = ISD::SETGE;
	        break;
	      }
	      }
	      break;
	    }
	    // Swap operands and reverse the branching condition.
	    std::swap(LHS, RHS);
	    CC = ISD::SETLT;
	    break;
	  }
	  case ISD::SETLT: {
	    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
	      switch (C->getSExtValue()) {
	      case 1: {
	        // Turn lhs < 1 into 0 >= lhs since 0 can be materialized with
	        // __zero_reg__ in lhs.
	        RHS = LHS;
	        LHS = DAG.getConstant(0, DL, VT);
	        CC = ISD::SETGE;
	        break;
	      }
	      case 0: {
	        // When doing lhs < 0 use a tst instruction on the top part of lhs
	        // and use brmi instead of using a chain of cp/cpc.
	        UseTest = true;
	        AVRcc = DAG.getConstant(AVRCC::COND_MI, DL, MVT::i8);
	        break;
	      }
	      }
	    }
	    break;
	  }
	  case ISD::SETULE: {
	    // Swap operands and reverse the branching condition.
	    std::swap(LHS, RHS);
	    CC = ISD::SETUGE;
	    break;
	  }
	  case ISD::SETUGT: {
	    // Turn lhs >u C into lhs >=u C+1, this allows us to fold the constant
	    // into the cmp instruction.
	    // NOTE(review): C+1 wraps to 0 when C is all ones; presumably such
	    // comparisons are folded away before reaching this point -- confirm.
	    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
	      RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
	      CC = ISD::SETUGE;
	      break;
	    }
	    // Swap operands and reverse the branching condition.
	    std::swap(LHS, RHS);
	    CC = ISD::SETULT;
	    break;
	  }
	  }

	  // Extracts element Idx (0 = low half, 1 = high half) of V as a PartVT value.
	  auto ExtractPart = [&DAG, &DL](SDValue V, EVT PartVT, uint64_t Idx) {
	    return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, PartVT, V,
	                       DAG.getIntPtrConstant(Idx, DL));
	  };

	  // Expand 32 and 64 bit comparisons with custom CMP and CMPC nodes instead of
	  // using the default and/or/xor expansion code which is much longer.
	  if (VT == MVT::i32) {
	    SDValue LHSlo = ExtractPart(LHS, MVT::i16, 0);
	    SDValue LHShi = ExtractPart(LHS, MVT::i16, 1);
	    SDValue RHSlo = ExtractPart(RHS, MVT::i16, 0);
	    SDValue RHShi = ExtractPart(RHS, MVT::i16, 1);

	    if (UseTest) {
	      // When using tst we only care about the highest part.
	      SDValue Top = ExtractPart(LHShi, MVT::i8, 1);
	      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
	    } else {
	      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHSlo, RHSlo);
	      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHShi, RHShi, Cmp);
	    }
	  } else if (VT == MVT::i64) {
	    // Split each operand into two i32 halves, then each half into two i16
	    // pieces numbered 0 (lowest) to 3 (highest).
	    SDValue LHS_0 = ExtractPart(LHS, MVT::i32, 0);
	    SDValue LHS_1 = ExtractPart(LHS, MVT::i32, 1);

	    SDValue LHS0 = ExtractPart(LHS_0, MVT::i16, 0);
	    SDValue LHS1 = ExtractPart(LHS_0, MVT::i16, 1);
	    SDValue LHS2 = ExtractPart(LHS_1, MVT::i16, 0);
	    SDValue LHS3 = ExtractPart(LHS_1, MVT::i16, 1);

	    SDValue RHS_0 = ExtractPart(RHS, MVT::i32, 0);
	    SDValue RHS_1 = ExtractPart(RHS, MVT::i32, 1);

	    SDValue RHS0 = ExtractPart(RHS_0, MVT::i16, 0);
	    SDValue RHS1 = ExtractPart(RHS_0, MVT::i16, 1);
	    SDValue RHS2 = ExtractPart(RHS_1, MVT::i16, 0);
	    SDValue RHS3 = ExtractPart(RHS_1, MVT::i16, 1);

	    if (UseTest) {
	      // When using tst we only care about the highest part.
	      SDValue Top = ExtractPart(LHS3, MVT::i8, 1);
	      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue, Top);
	    } else {
	      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS0, RHS0);
	      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS1, RHS1, Cmp);
	      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS2, RHS2, Cmp);
	      Cmp = DAG.getNode(AVRISD::CMPC, DL, MVT::Glue, LHS3, RHS3, Cmp);
	    }
	  } else if (VT == MVT::i8 || VT == MVT::i16) {
	    if (UseTest) {
	      // When using tst we only care about the highest part.
	      Cmp = DAG.getNode(AVRISD::TST, DL, MVT::Glue,
	                        (VT == MVT::i8) ? LHS
	                                        : ExtractPart(LHS, MVT::i8, 1));
	    } else {
	      Cmp = DAG.getNode(AVRISD::CMP, DL, MVT::Glue, LHS, RHS);
	    }
	  } else {
	    llvm_unreachable("Invalid comparison size");
	  }

	  // When using a test instruction AVRcc is already set.
	  if (!UseTest) {
	    AVRcc = DAG.getConstant(intCCToAVRCC(CC), DL, MVT::i8);
	  }

	  return Cmp;
	}

	/// Custom-lower a BR_CC node into an AVR compare feeding AVRISD::BRCOND.
	///
	/// Operands read from \p Op: 0 = chain, 1 = condition code, 2 = LHS,
	/// 3 = RHS, 4 = branch destination.
	SDValue AVRTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue InChain = Op.getOperand(0);
	  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue Left = Op.getOperand(2);
	  SDValue Right = Op.getOperand(3);
	  SDValue Target = Op.getOperand(4);

	  // getAVRCmp returns the glue-producing compare node and reports the AVR
	  // condition code to branch on through AVRCond.
	  SDValue AVRCond;
	  SDValue Compare = getAVRCmp(Left, Right, Cond, AVRCond, DAG, Loc);

	  return DAG.getNode(AVRISD::BRCOND, Loc, MVT::Other, InChain, Target, AVRCond,
	                     Compare);
	}

	/// Custom-lower a SELECT_CC node into an AVR compare feeding
	/// AVRISD::SELECT_CC.
	///
	/// Operands read from \p Op: 0 = LHS, 1 = RHS, 2 = value if true,
	/// 3 = value if false, 4 = condition code.
	SDValue AVRTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue Left = Op.getOperand(0);
	  SDValue Right = Op.getOperand(1);
	  SDValue IfTrue = Op.getOperand(2);
	  SDValue IfFalse = Op.getOperand(3);
	  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();

	  // The compare produces the glue; AVRCond receives the AVR condition code.
	  SDValue AVRCond;
	  SDValue Compare = getAVRCmp(Left, Right, Cond, AVRCond, DAG, Loc);

	  // The select yields the selected value plus a glue result.
	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
	  SDValue SelectOps[] = {IfTrue, IfFalse, AVRCond, Compare};

	  return DAG.getNode(AVRISD::SELECT_CC, Loc, ResultTys, SelectOps);
	}

	/// Custom-lower a SETCC node as an AVRISD::SELECT_CC choosing between the
	/// constants 1 and 0 of the result type.
	///
	/// Operands read from \p Op: 0 = LHS, 1 = RHS, 2 = condition code.
	SDValue AVRTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue Left = Op.getOperand(0);
	  SDValue Right = Op.getOperand(1);
	  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  // The compare produces the glue; AVRCond receives the AVR condition code.
	  SDValue AVRCond;
	  SDValue Compare = getAVRCmp(Left, Right, Cond, AVRCond, DAG, Loc);

	  // Materialise the boolean result by selecting between 1 and 0.
	  SDValue One = DAG.getConstant(1, Loc, Op.getValueType());
	  SDValue Zero = DAG.getConstant(0, Loc, Op.getValueType());
	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MVT::Glue);
	  SDValue SelectOps[] = {One, Zero, AVRCond, Compare};

	  return DAG.getNode(AVRISD::SELECT_CC, Loc, ResultTys, SelectOps);
	}

	/// Custom-lower VASTART.
	///
	/// Operands read from \p Op: 0 = chain, 1 = address to store to,
	/// 2 = source value describing that address.
	SDValue AVRTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  const MachineFunction &MF = DAG.getMachineFunction();
	  const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  // Bind by reference: the data layout is only read here, so there is no
	  // reason to copy it.
	  const auto &DL = DAG.getDataLayout();
	  SDLoc dl(Op);

	  // Vastart just stores the address of the VarArgsFrameIndex slot into the
	  // memory location argument.
	  SDValue FI = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(), getPointerTy(DL));

	  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
	                      MachinePointerInfo(SV), 0);
	}

	/// Dispatch a node that was marked for custom lowering to the matching
	/// Lower* helper. Any opcode not listed here is a programming error.
	SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	  const unsigned Opcode = Op.getOpcode();

	  switch (Opcode) {
	  // Shifts and rotates share one lowering routine.
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::ROTL:
	  case ISD::ROTR:
	    return LowerShifts(Op, DAG);

	  // Addresses.
	  case ISD::GlobalAddress:
	    return LowerGlobalAddress(Op, DAG);
	  case ISD::BlockAddress:
	    return LowerBlockAddress(Op, DAG);

	  // Comparisons and their users.
	  case ISD::BR_CC:
	    return LowerBR_CC(Op, DAG);
	  case ISD::SELECT_CC:
	    return LowerSELECT_CC(Op, DAG);
	  case ISD::SETCC:
	    return LowerSETCC(Op, DAG);

	  case ISD::VASTART:
	    return LowerVASTART(Op, DAG);

	  // Combined division/remainder.
	  case ISD::SDIVREM:
	  case ISD::UDIVREM:
	    return LowerDivRem(Op, DAG);

	  default:
	    llvm_unreachable("Don't know how to custom lower this!");
	  }

	  return SDValue();
	}

	/// Replace a node whose result type is illegal with custom-built nodes.
	///
	/// An ADD with a constant right-hand operand becomes a SUB of the negated
	/// constant; an ADD with any other right-hand operand produces no results.
	/// Every other opcode is forwarded to LowerOperation and all values of the
	/// returned node are appended to \p Results.
	void AVRTargetLowering::ReplaceNodeResults(SDNode *N,
	                                           SmallVectorImpl<SDValue> &Results,
	                                           SelectionDAG &DAG) const {
	  SDLoc DL(N);

	  if (N->getOpcode() == ISD::ADD) {
	    // Convert add (x, imm) into sub (x, -imm).
	    const auto *Imm = dyn_cast<ConstantSDNode>(N->getOperand(1));
	    if (Imm) {
	      SDValue Negated =
	          DAG.getConstant(-Imm->getAPIntValue(), DL, Imm->getValueType(0));
	      Results.push_back(DAG.getNode(ISD::SUB, DL, N->getValueType(0),
	                                    N->getOperand(0), Negated));
	    }
	    return;
	  }

	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
	  const unsigned NumValues = Lowered->getNumValues();
	  for (unsigned Idx = 0; Idx != NumValues; ++Idx)
	    Results.push_back(Lowered.getValue(Idx));
	}

	/// Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS, Instruction *I) const {
	int64_t Offs = AM.BaseOffs;

	// Allow absolute addresses.
	if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && Offs == 0) {
	return true;
	}

	// Flash memory instructions only allow zero offsets.
	if (isa<PointerType>(Ty) && AS == AVR::ProgramMemory) {
	return false;
	}

	// Allow reg+<6bit> offset.
	if (Offs < 0)
	Offs = -Offs;
	if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) {
	return true;
	}

	return false;
	}

	/// Returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if the node's address
	/// can be legally represented as pre-indexed load / store address.
	///
	/// Only non-extending loads and stores of i8/i16 outside program memory are
	/// considered, and only when the base pointer is an ADD/SUB with a constant
	/// right operand that amounts to -1 (i8) or -2 (i16). On success \p AM is set
	/// to ISD::PRE_DEC.
	bool AVRTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	                                                  SDValue &Offset,
	                                                  ISD::MemIndexedMode &AM,
	                                                  SelectionDAG &DAG) const {
	  EVT VT;
	  const SDNode *Op;
	  SDLoc DL(N);

	  if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    VT = LD->getMemoryVT();
	    Op = LD->getBasePtr().getNode();
	    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
	      return false;
	    if (AVR::isProgramMemoryAccess(LD)) {
	      return false;
	    }
	  } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    VT = ST->getMemoryVT();
	    Op = ST->getBasePtr().getNode();
	    if (AVR::isProgramMemoryAccess(ST)) {
	      return false;
	    }
	  } else {
	    return false;
	  }

	  if (VT != MVT::i8 && VT != MVT::i16) {
	    return false;
	  }

	  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
	    return false;
	  }

	  if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
	    // Normalise to a signed displacement: a SUB of C is an ADD of -C.
	    int RHSC = RHS->getSExtValue();
	    if (Op->getOpcode() == ISD::SUB)
	      RHSC = -RHSC;

	    // The displacement must be exactly minus the access size in bytes.
	    if ((VT == MVT::i16 && RHSC != -2) || (VT == MVT::i8 && RHSC != -1)) {
	      return false;
	    }

	    Base = Op->getOperand(0);
	    Offset = DAG.getConstant(RHSC, DL, MVT::i8);
	    AM = ISD::PRE_DEC;

	    return true;
	  }

	  return false;
	}

	/// Returns true by value, base pointer and
	/// offset pointer and addressing mode by reference if this node can be
	/// combined with a load / store to form a post-indexed load / store.
	bool AVRTargetLowering::getPostIndexedAddressParts(SDNode N, SDNode Op,
	SDValue &Base,
	SDValue &Offset,
	ISD::MemIndexedMode &AM,
	SelectionDAG &DAG) const {
	EVT VT;
	SDLoc DL(N);

	if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	if (LD->getExtensionType() != ISD::NON_EXTLOAD)
	return false;
	} else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	if (AVR::isProgramMemoryAccess(ST)) {
	return false;
	}
	} else {
	return false;
	}

	if (VT != MVT::i8 && VT != MVT::i16) {
	return false;
	}

	if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) {
	return false;
	}

	if (const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
	int RHSC = RHS->getSExtValue();
	if (Op->getOpcode() == ISD::SUB)
	RHSC = -RHSC;
	if ((VT == MVT::i16 && RHSC != 2) \|\| (VT == MVT::i8 && RHSC != 1)) {
	return false;
	}

	Base = Op->getOperand(0);
	Offset = DAG.getConstant(RHSC, DL, MVT::i8);
	AM = ISD::POST_INC;

	return true;
	}

	return false;
	}

	/// Report whether an offset may be folded into a global address node.
	/// This implementation answers yes for every global, regardless of \p GA.
	bool
	AVRTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// Formal Arguments Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "AVRGenCallingConv.inc"

	/// For every incoming argument, append to \p Out the number of pieces it is
	/// made of: its size in bytes rounded up to a whole number of 2-byte units.
	/// Entries of \p Ins with a non-zero PartOffset are skipped, so a split
	/// argument is counted once.
	static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
	                              SmallVectorImpl<unsigned> &Out) {
	  for (const ISD::InputArg &Piece : Ins) {
	    const bool IsContinuation = Piece.PartOffset > 0;
	    if (IsContinuation)
	      continue;

	    // Bits -> bytes (rounding up), then bytes -> 2-byte units (rounding up).
	    unsigned ByteCount = ((Piece.ArgVT.getSizeInBits()) + 7) / 8;
	    unsigned PairCount = (ByteCount + 1) / 2;
	    Out.push_back(PairCount);
	  }
	}

	/// For external symbols there is no function prototype information, so the
	/// pieces of each outgoing argument are grouped purely by their offsets.
	///
	/// Consecutive entries of \p In belong to the same argument while each
	/// entry's PartOffset equals the summed store size of the entries before it
	/// in the group. The piece count of every group is appended to \p Out.
	static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
	                                    SmallVectorImpl<unsigned> &Out) {
	  const unsigned NumParts = In.size();
	  unsigned Idx = 0;

	  while (Idx != NumParts) {
	    unsigned PieceCount = 0;
	    unsigned ExpectedOffset = 0;

	    // Consume pieces for as long as they continue the current argument.
	    for (; (Idx != NumParts) && (In[Idx].PartOffset == ExpectedOffset);
	         ++Idx, ++PieceCount) {
	      ExpectedOffset += In[Idx].VT.getStoreSize();
	    }

	    Out.push_back(PieceCount);
	  }
	}

	/// Return the name of the function being called by \p CLI.
	///
	/// The callee must be either an external symbol or a global address node;
	/// any other kind of callee is a programming error.
	static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
	  SDValue Target = CLI.Callee;

	  if (const auto *Sym = dyn_cast<ExternalSymbolSDNode>(Target))
	    return Sym->getSymbol();

	  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Target))
	    return GA->getGlobal()->getName();

	  llvm_unreachable("don't know how to get the name for this callee");
	}

	/// Analyze incoming and outgoing function arguments. We need custom C++ code
	/// to handle special constraints in the ABI like reversing the order of the
	/// pieces of split arguments. In addition, all pieces of a certain argument
	/// have to be passed either using registers or the stack but never mixing both.
	///
	/// \param CLI      Call lowering info; not read by this function.
	/// \param F        Function being lowered; must be non-null when \p IsCall is
	///                 false.
	/// \param TD       Data layout used to size and align stack slots.
	/// \param Outs     Outgoing arguments; dereferenced only when \p IsCall is true.
	/// \param Ins      Incoming arguments; dereferenced only when \p IsCall is
	///                 false.
	/// \param ArgLocs  The location vector; register-passed pieces are reversed
	///                 in place per argument.
	/// \param CCInfo   Receives the register/stack assignments.
	/// \param IsVarArg When true the table-generated ArgCC_AVR_Vararg convention
	///                 is used and none of the custom analysis runs.
	static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
	                                     const Function *F, const DataLayout *TD,
	                                     const SmallVectorImpl<ISD::OutputArg> *Outs,
	                                     const SmallVectorImpl<ISD::InputArg> *Ins,
	                                     CallingConv::ID CallConv,
	                                     SmallVectorImpl<CCValAssign> &ArgLocs,
	                                     CCState &CCInfo, bool IsCall, bool IsVarArg) {
	  // Both lists have the same number of entries; the code below relies on that
	  // when it uses array_lengthof(RegList8) for either list.
	  static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
	                                       AVR::R18, AVR::R16, AVR::R14,
	                                       AVR::R12, AVR::R10, AVR::R8};
	  static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
	                                        AVR::R19R18, AVR::R17R16, AVR::R15R14,
	                                        AVR::R13R12, AVR::R11R10, AVR::R9R8};
	  if (IsVarArg) {
	    // Variadic functions do not need all the analysis below.
	    if (IsCall) {
	      CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
	    } else {
	      CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
	    }
	    return;
	  }

	  // Fill in the Args array which will contain original argument sizes.
	  SmallVector<unsigned, 8> Args;
	  if (IsCall) {
	    parseExternFuncCallArgs(*Outs, Args);
	  } else {
	    assert(F != nullptr && "function should not be null");
	    parseFunctionArgs(*Ins, Args);
	  }

	  unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
	  // Once one argument has been placed on the stack, every following argument
	  // goes on the stack as well.
	  bool UsesStack = false;
	  for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
	    unsigned Size = Args[i];

	    // If we have a zero-sized argument, don't attempt to lower it.
	    // AVR-GCC does not support zero-sized arguments and so we need not
	    // worry about ABI compatibility.
	    if (Size == 0) continue;

	    // The type of the argument's first piece is used for all of its pieces.
	    MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;

	    // If we have plenty of regs to pass the whole argument do it.
	    if (!UsesStack && (Size <= RegsLeft)) {
	      const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;

	      for (unsigned j = 0; j != Size; ++j) {
	        unsigned Reg = CCInfo.AllocateReg(
	            ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
	        CCInfo.addLoc(
	            CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
	        --RegsLeft;
	      }

	      // Reverse the order of the pieces to agree with the "big endian" format
	      // required in the calling convention ABI.
	      std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
	    } else {
	      // Pass the rest of arguments using the stack.
	      UsesStack = true;
	      for (unsigned j = 0; j != Size; ++j) {
	        unsigned Offset = CCInfo.AllocateStack(
	            TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
	            TD->getABITypeAlignment(
	                EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
	        CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
	                                          CCValAssign::Full));
	      }
	    }
	    pos += Size;
	  }
	}

	/// Analyze the arguments of a call made with the AVR_BUILTIN convention.
	///
	/// Callees whose name starts with "__udivmod" or "__divmod" use the
	/// table-generated ArgCC_AVR_BUILTIN_DIV convention; every other callee is
	/// handled by analyzeStandardArguments. \p CLI must describe a call whose
	/// callee has a name, because the name is looked up unconditionally.
	static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
	                                    const Function *F, const DataLayout *TD,
	                                    const SmallVectorImpl<ISD::OutputArg> *Outs,
	                                    const SmallVectorImpl<ISD::InputArg> *Ins,
	                                    CallingConv::ID CallConv,
	                                    SmallVectorImpl<CCValAssign> &ArgLocs,
	                                    CCState &CCInfo, bool IsCall, bool IsVarArg) {
	  StringRef FuncName = getFunctionName(CLI);

	  if (FuncName.startswith("__udivmod") || FuncName.startswith("__divmod")) {
	    CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
	  } else {
	    analyzeStandardArguments(&CLI, F, TD, Outs, Ins,
	                             CallConv, ArgLocs, CCInfo,
	                             IsCall, IsVarArg);
	  }
	}

	/// Entry point for argument analysis: pick the analysis routine that matches
	/// the calling convention.
	///
	/// AVR_BUILTIN goes through analyzeBuiltinArguments, which dereferences
	/// \p CLI; every other convention goes through analyzeStandardArguments.
	/// The remaining parameters are forwarded unchanged.
	static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
	                             const Function *F, const DataLayout *TD,
	                             const SmallVectorImpl<ISD::OutputArg> *Outs,
	                             const SmallVectorImpl<ISD::InputArg> *Ins,
	                             CallingConv::ID CallConv,
	                             SmallVectorImpl<CCValAssign> &ArgLocs,
	                             CCState &CCInfo, bool IsCall, bool IsVarArg) {
	  switch (CallConv) {
	  case CallingConv::AVR_BUILTIN: {
	    analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins,
	                            CallConv, ArgLocs, CCInfo,
	                            IsCall, IsVarArg);
	    return;
	  }
	  default: {
	    analyzeStandardArguments(CLI, F, TD, Outs, Ins,
	                             CallConv, ArgLocs, CCInfo,
	                             IsCall, IsVarArg);
	    return;
	  }
	  }
	}

	/// Lower the incoming (formal) arguments of the function being compiled.
	///
	/// Each assigned location yields one entry in \p InVals: register locations
	/// become a live-in register copy (plus assert-ext/truncate or bitcast when
	/// the location info asks for it), stack locations become a load from a
	/// fixed frame object. For variadic functions the frame index that marks
	/// the start of the variable arguments is recorded in the function info.
	///
	/// \returns the incoming \p Chain, unchanged.
	SDValue AVRTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
	    SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  // Bind by reference: the data layout is only read here, so there is no
	  // reason to copy it.
	  const auto &DL = DAG.getDataLayout();

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // There is no call lowering info and no outgoing argument list here.
	  analyzeArguments(nullptr, &MF.getFunction(), &DL, nullptr, &Ins, CallConv,
	                   ArgLocs, CCInfo, false, isVarArg);

	  SDValue ArgValue;
	  for (CCValAssign &VA : ArgLocs) {

	    // Arguments stored on registers.
	    if (VA.isRegLoc()) {
	      EVT RegVT = VA.getLocVT();
	      const TargetRegisterClass *RC;
	      if (RegVT == MVT::i8) {
	        RC = &AVR::GPR8RegClass;
	      } else if (RegVT == MVT::i16) {
	        RC = &AVR::DREGSRegClass;
	      } else {
	        llvm_unreachable("Unknown argument type!");
	      }

	      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

	      // :NOTE: Clang should not promote any i8 into i16 but for safety the
	      // following code will handle zexts or sexts generated by other
	      // front ends. Otherwise:
	      // If this is an 8 bit value, it is really passed promoted
	      // to 16 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default:
	        llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full:
	        break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::SExt:
	        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::ZExt:
	        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	        break;
	      }

	      InVals.push_back(ArgValue);
	    } else {
	      // Sanity check.
	      assert(VA.isMemLoc());

	      EVT LocVT = VA.getLocVT();

	      // Create the frame index object for this incoming parameter.
	      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
	                                     VA.getLocMemOffset(), true);

	      // Create the SelectionDAG nodes corresponding to a load
	      // from this parameter.
	      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DL));
	      InVals.push_back(DAG.getLoad(LocVT, dl, Chain, FIN,
	                                   MachinePointerInfo::getFixedStack(MF, FI),
	                                   0));
	    }
	  }

	  // If the function takes variable number of arguments, make a frame index for
	  // the start of the first vararg value... for expansion of llvm.va_start.
	  if (isVarArg) {
	    unsigned StackSize = CCInfo.getNextStackOffset();
	    AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();

	    AFI->setVarArgsFrameIndex(MFI.CreateFixedObject(2, StackSize, true));
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// Call Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Lower an outgoing call described by \p CLI into an AVRISD::CALL node
	/// bracketed by CALLSEQ_START / CALLSEQ_END.
	///
	/// Register arguments are copied into their assigned registers with glued
	/// CopyToReg nodes; stack arguments are written with chained stores, walked
	/// in reverse order. Tail calls are always disabled. The call's results are
	/// appended to \p InVals by LowerCallResult.
	///
	/// \returns the chain produced by LowerCallResult.
	SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	                                     SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDLoc &DL = CLI.DL;
	  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	  SDValue Chain = CLI.Chain;
	  SDValue Callee = CLI.Callee;
	  bool &isTailCall = CLI.IsTailCall;
	  CallingConv::ID CallConv = CLI.CallConv;
	  bool isVarArg = CLI.IsVarArg;

	  MachineFunction &MF = DAG.getMachineFunction();

	  // AVR does not yet support tail call optimization.
	  isTailCall = false;

	  // Analyze operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	  // node so that legalize doesn't hack it.
	  const Function *F = nullptr;
	  if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();

	    // The global is not necessarily a Function (it may be an alias, for
	    // example), so use a checked cast and leave F null in that case instead
	    // of asserting.
	    F = dyn_cast<Function>(GV);
	    Callee =
	        DAG.getTargetGlobalAddress(GV, DL, getPointerTy(DAG.getDataLayout()));
	  } else if (const ExternalSymbolSDNode *ES =
	                 dyn_cast<ExternalSymbolSDNode>(Callee)) {
	    Callee = DAG.getTargetExternalSymbol(ES->getSymbol(),
	                                         getPointerTy(DAG.getDataLayout()));
	  }

	  // There is no incoming argument list when analysing a call.
	  analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, nullptr, CallConv,
	                   ArgLocs, CCInfo, true, isVarArg);

	  // Get a count of how many bytes are to be pushed on the stack.
	  unsigned NumBytes = CCInfo.getNextStackOffset();

	  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	  // First, walk the register assignments, inserting copies.
	  unsigned AI, AE;
	  bool HasStackArgs = false;
	  for (AI = 0, AE = ArgLocs.size(); AI != AE; ++AI) {
	    CCValAssign &VA = ArgLocs[AI];
	    EVT RegVT = VA.getLocVT();
	    SDValue Arg = OutVals[AI];

	    // Promote the value if needed. With Clang this should not happen.
	    switch (VA.getLocInfo()) {
	    default:
	      llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::SExt:
	      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, RegVT, Arg);
	      break;
	    case CCValAssign::ZExt:
	      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, RegVT, Arg);
	      break;
	    case CCValAssign::AExt:
	      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, RegVT, Arg);
	      break;
	    case CCValAssign::BCvt:
	      Arg = DAG.getNode(ISD::BITCAST, DL, RegVT, Arg);
	      break;
	    }

	    // Stop when we encounter a stack argument, we need to process them
	    // in reverse order in the loop below.
	    if (VA.isMemLoc()) {
	      HasStackArgs = true;
	      break;
	    }

	    // Arguments that can be passed on registers must be kept in the RegsToPass
	    // vector.
	    RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	  }

	  // Second, stack arguments have to walked in reverse order by inserting
	  // chained stores, this ensures their order is not changed by the scheduler
	  // and that the push instruction sequence generated is correct, otherwise they
	  // can be freely intermixed.
	  if (HasStackArgs) {
	    // AI currently indexes the first stack argument; walk from the last
	    // location back down to it.
	    for (AE = AI, AI = ArgLocs.size(); AI != AE; --AI) {
	      unsigned Loc = AI - 1;
	      CCValAssign &VA = ArgLocs[Loc];
	      SDValue Arg = OutVals[Loc];

	      assert(VA.isMemLoc());

	      // SP points to one stack slot further so add one to adjust it.
	      SDValue PtrOff = DAG.getNode(
	          ISD::ADD, DL, getPointerTy(DAG.getDataLayout()),
	          DAG.getRegister(AVR::SP, getPointerTy(DAG.getDataLayout())),
	          DAG.getIntPtrConstant(VA.getLocMemOffset() + 1, DL));

	      Chain =
	          DAG.getStore(Chain, DL, Arg, PtrOff,
	                       MachinePointerInfo::getStack(MF, VA.getLocMemOffset()),
	                       0);
	    }
	  }

	  // Build a sequence of copy-to-reg nodes chained together with token chain and
	  // flag operands which copy the outgoing args into registers. The InFlag is
	  // necessary since all emitted instructions must be stuck together.
	  SDValue InFlag;
	  for (auto Reg : RegsToPass) {
	    Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // Returns a chain & a flag for retval copy to use.
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);

	  // Add argument registers to the end of the list so that they are known live
	  // into the call.
	  for (auto Reg : RegsToPass) {
	    Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	  }

	  // Add a register mask operand representing the call-preserved registers.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *Mask =
	      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  if (InFlag.getNode()) {
	    Ops.push_back(InFlag);
	  }

	  Chain = DAG.getNode(AVRISD::CALL, DL, NodeTys, Ops);
	  InFlag = Chain.getValue(1);

	  // Create the CALLSEQ_END node.
	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	                             DAG.getIntPtrConstant(0, DL, true), InFlag, DL);

	  if (!Ins.empty()) {
	    InFlag = Chain.getValue(1);
	  }

	  // Handle result values, copying them out of physregs into vregs that we
	  // return.
	  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, DL, DAG,
	                         InVals);
	}

	/// Copy the values returned by a call out of their physical registers.
	///
	/// Return locations are assigned with the return convention for
	/// \p CallConv. For every convention except AVR_BUILTIN, a result split
	/// over several locations has its locations reversed first. One value per
	/// location is appended to \p InVals.
	///
	/// \returns the chain of the last register copy, or \p Chain unchanged when
	/// there is nothing to copy.
	SDValue AVRTargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
	    SmallVectorImpl<SDValue> &InVals) const {
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> ResultLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ResultLocs,
	                 *DAG.getContext());

	  // Handle runtime calling convs.
	  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv));

	  // Reverse split return values to get the "big endian" format required
	  // to agree with the calling convention ABI.
	  const bool IsBuiltin = CallConv == CallingConv::AVR_BUILTIN;
	  if (!IsBuiltin && ResultLocs.size() > 1)
	    std::reverse(ResultLocs.begin(), ResultLocs.end());

	  // Copy all of the result registers out of their specified physreg. Each
	  // copy node yields: value 0 = the result, 1 = the chain, 2 = the glue.
	  for (const CCValAssign &Loc : ResultLocs) {
	    SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
	                                      Loc.getValVT(), InFlag);
	    Chain = Copy.getValue(1);
	    InFlag = Copy.getValue(2);
	    InVals.push_back(Copy.getValue(0));
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Select the return-value assignment function for a calling convention:
	/// RetCC_AVR_BUILTIN for AVR_BUILTIN, RetCC_AVR for everything else.
	CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
	  if (CC == CallingConv::AVR_BUILTIN)
	    return RetCC_AVR_BUILTIN;

	  return RetCC_AVR;
	}

	/// Check whether the return values in \p Outs can be assigned locations by
	/// the return convention that belongs to \p CallConv.
	bool
	AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
	                                  MachineFunction &MF, bool isVarArg,
	                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                  LLVMContext &Context) const
	{
	  // Scratch state; the computed locations are discarded after the check.
	  SmallVector<CCValAssign, 16> ScratchLocs;
	  CCState Checker(CallConv, isVarArg, MF, ScratchLocs, Context);

	  return Checker.CheckReturn(Outs, CCAssignFnForReturn(CallConv));
	}

	/// Lower a function return.
	///
	/// The return values are copied into the registers chosen by the return
	/// convention (with the locations of a split value reversed), glued
	/// together, and followed by RETI_FLAG for the AVR_INTR / AVR_SIGNAL
	/// conventions or RET_FLAG otherwise. For a function carrying the naked
	/// attribute no return node is created and the chain of the copies is
	/// returned instead.
	SDValue
	AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                               bool isVarArg,
	                               const SmallVectorImpl<ISD::OutputArg> &Outs,
	                               const SmallVectorImpl<SDValue> &OutVals,
	                               const SDLoc &dl, SelectionDAG &DAG) const {
	  // CCValAssign - represent the assignment of the return value to locations.
	  SmallVector<CCValAssign, 16> RVLocs;

	  // CCState - Info about the registers and stack slot.
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());

	  // Analyze return values.
	  auto CCFunction = CCAssignFnForReturn(CallConv);
	  CCInfo.AnalyzeReturn(Outs, CCFunction);

	  // If this is the first return lowered for this function, add the regs to
	  // the liveout set for the function.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned e = RVLocs.size();

	  // Reverse split return values to get the "big endian" format required
	  // to agree with the calling convention ABI.
	  if (e > 1) {
	    std::reverse(RVLocs.begin(), RVLocs.end());
	  }

	  SDValue Flag;
	  SmallVector<SDValue, 4> RetOps(1, Chain);
	  // Copy the result values into the output registers.
	  for (unsigned i = 0; i != e; ++i) {
	    CCValAssign &VA = RVLocs[i];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);

	    // Guarantee that all emitted copies are stuck together with flags.
	    Flag = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	  }

	  // Don't emit the ret/reti instruction when the naked attribute is present in
	  // the function being compiled.
	  if (MF.getFunction().getAttributes().hasAttribute(
	          AttributeList::FunctionIndex, Attribute::Naked)) {
	    return Chain;
	  }

	  unsigned RetOpc =
	      (CallConv == CallingConv::AVR_INTR || CallConv == CallingConv::AVR_SIGNAL)
	          ? AVRISD::RETI_FLAG
	          : AVRISD::RET_FLAG;

	  RetOps[0] = Chain; // Update chain.

	  if (Flag.getNode()) {
	    RetOps.push_back(Flag);
	  }

	  return DAG.getNode(RetOpc, dl, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Custom Inserters
	//===----------------------------------------------------------------------===//

	/// Expand a pseudo shift/rotate instruction into a loop that applies the
	/// single-step shift once per iteration.
	///
	/// The instructions following \p MI are moved into a new block (RemBB), a
	/// new loop block (LoopBB) is created, and \p BB is left ending in a test of
	/// the shift amount against zero that skips the loop. \p MI is erased.
	///
	/// \param MI The pseudo instruction; operand 0 is the destination register,
	///           operand 1 the value to shift and operand 2 the shift amount.
	/// \param BB The block containing \p MI.
	/// \returns RemBB, the block that now holds the instructions after \p MI.
	MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
	MachineBasicBlock *BB) const {
	unsigned Opc;
	const TargetRegisterClass *RC;
	// Set for the 8-bit forms that are emitted as a two-operand instruction with
	// the same register used for both sources (ADD Rd, Rd / ADC Rd, Rd).
	bool HasRepeatedOperand = false;
	MachineFunction *F = BB->getParent();
	MachineRegisterInfo &RI = F->getRegInfo();
	const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	DebugLoc dl = MI.getDebugLoc();

	// Pick the single-step instruction and the register class of the value
	// being shifted.
	switch (MI.getOpcode()) {
	default:
	llvm_unreachable("Invalid shift opcode!");
	case AVR::Lsl8:
	Opc = AVR::ADDRdRr; // LSL is an alias of ADD Rd, Rd
	RC = &AVR::GPR8RegClass;
	HasRepeatedOperand = true;
	break;
	case AVR::Lsl16:
	Opc = AVR::LSLWRd;
	RC = &AVR::DREGSRegClass;
	break;
	case AVR::Asr8:
	Opc = AVR::ASRRd;
	RC = &AVR::GPR8RegClass;
	break;
	case AVR::Asr16:
	Opc = AVR::ASRWRd;
	RC = &AVR::DREGSRegClass;
	break;
	case AVR::Lsr8:
	Opc = AVR::LSRRd;
	RC = &AVR::GPR8RegClass;
	break;
	case AVR::Lsr16:
	Opc = AVR::LSRWRd;
	RC = &AVR::DREGSRegClass;
	break;
	case AVR::Rol8:
	Opc = AVR::ADCRdRr; // ROL is an alias of ADC Rd, Rd
	RC = &AVR::GPR8RegClass;
	HasRepeatedOperand = true;
	break;
	case AVR::Rol16:
	Opc = AVR::ROLWRd;
	RC = &AVR::DREGSRegClass;
	break;
	case AVR::Ror8:
	Opc = AVR::RORRd;
	RC = &AVR::GPR8RegClass;
	break;
	case AVR::Ror16:
	Opc = AVR::RORWRd;
	RC = &AVR::DREGSRegClass;
	break;
	}

	const BasicBlock *LLVM_BB = BB->getBasicBlock();

	// Find the position right after BB in the function's block list; the new
	// blocks are inserted there. The scan starts at BB itself.
	MachineFunction::iterator I;
	for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I);
	if (I != F->end()) ++I;

	// Create loop block.
	MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);

	// Both insertions use the same position, so LoopBB ends up before RemBB.
	F->insert(I, LoopBB);
	F->insert(I, RemBB);

	// Update machine-CFG edges by transferring all successors of the current
	// block to the block containing instructions after shift.
	RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
	BB->end());
	RemBB->transferSuccessorsAndUpdatePHIs(BB);

	// Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.
	BB->addSuccessor(LoopBB);
	BB->addSuccessor(RemBB);
	LoopBB->addSuccessor(RemBB);
	LoopBB->addSuccessor(LoopBB);

	// ShiftAmtReg/ShiftReg are the loop PHI results; the "2" variants hold the
	// values computed by one loop iteration.
	unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
	unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
	unsigned ShiftReg = RI.createVirtualRegister(RC);
	unsigned ShiftReg2 = RI.createVirtualRegister(RC);
	unsigned ShiftAmtSrcReg = MI.getOperand(2).getReg();
	unsigned SrcReg = MI.getOperand(1).getReg();
	unsigned DstReg = MI.getOperand(0).getReg();

	// BB:
	// cpi N, 0
	// breq RemBB
	BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
	BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);

	// LoopBB:
	// ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
	// ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
	// ShiftReg2 = shift ShiftReg
	// ShiftAmt2 = ShiftAmt - 1;
	BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)
	.addReg(SrcReg)
	.addMBB(BB)
	.addReg(ShiftReg2)
	.addMBB(LoopBB);
	BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
	.addReg(ShiftAmtSrcReg)
	.addMBB(BB)
	.addReg(ShiftAmtReg2)
	.addMBB(LoopBB);

	// The ADD/ADC based forms take the shifted register as both source operands.
	auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
	if (HasRepeatedOperand)
	ShiftMI.addReg(ShiftReg);

	// Decrement the remaining count and branch back to LoopBB with BRNE.
	BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
	.addReg(ShiftAmtReg)
	.addImm(1);
	BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);

	// RemBB:
	// DestReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
	BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
	.addReg(SrcReg)
	.addMBB(BB)
	.addReg(ShiftReg2)
	.addMBB(LoopBB);

	MI.eraseFromParent(); // The pseudo instruction is gone now.
	return RemBB;
	}

	/// Return true if the instruction at \p I is a COPY whose source (operand 1)
	/// is R0 or R1.
	static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
	  if (I->getOpcode() == AVR::COPY) {
	    unsigned SrcReg = I->getOperand(1).getReg();
	    return (SrcReg == AVR::R0 || SrcReg == AVR::R1);
	  }

	  return false;
	}

	// The mul instructions wreak havoc on our zero_reg R1, so it has to be
	// cleared again once the result has been evacuated. An `eor r1, r1` is
	// inserted after the multiply and after up to two directly following copies
	// out of R0/R1. This is probably not the best way to do it, but it works for
	// now.
	MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
	                                                MachineBasicBlock *BB) const {
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();

	  // In any case the insertion point is after the mul instruction.
	  MachineBasicBlock::iterator InsertPt(MI);
	  ++InsertPt;

	  // Step over at most two consecutive copies of the multiplication result.
	  for (unsigned Skipped = 0; Skipped < 2 && isCopyMulResult(InsertPt);
	       ++Skipped)
	    ++InsertPt;

	  BuildMI(*BB, InsertPt, MI.getDebugLoc(), TII.get(AVR::EORRdRr), AVR::R1)
	      .addReg(AVR::R1)
	      .addReg(AVR::R1);

	  return BB;
	}

	// Expands pseudo instructions that need custom insertion. Shift/rotate pseudos
	// are forwarded to insertShift, MULRdRr/MULSRdRr to insertMul; the only other
	// opcodes accepted (checked by the assert) are Select8/Select16, which are
	// expanded below into a conditional-branch diamond joined by a PHI.
	// For selects the returned block is trueMBB, which receives every instruction
	// that followed MI as well as MBB's former successors.
	MachineBasicBlock *
	AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	int Opc = MI.getOpcode();

	// Pseudo shift instructions with a non constant shift amount are expanded
	// into a loop.
	switch (Opc) {
	case AVR::Lsl8:
	case AVR::Lsl16:
	case AVR::Lsr8:
	case AVR::Lsr16:
	case AVR::Rol8:
	case AVR::Rol16:
	case AVR::Ror8:
	case AVR::Ror16:
	case AVR::Asr8:
	case AVR::Asr16:
	return insertShift(MI, MBB);
	case AVR::MULRdRr:
	case AVR::MULSRdRr:
	return insertMul(MI, MBB);
	}

	assert((Opc == AVR::Select16 \|\| Opc == AVR::Select8) &&
	"Unexpected instr type to insert");

	const AVRInstrInfo &TII = (const AVRInstrInfo &)*MI.getParent()
	->getParent()
	->getSubtarget()
	.getInstrInfo();
	DebugLoc dl = MI.getDebugLoc();

	// To "insert" a SELECT instruction, we insert the diamond
	// control-flow pattern. The incoming instruction knows the
	// destination vreg to set, the condition code register to branch
	// on, the true/false values to select between, and a branch opcode
	// to use.

	MachineFunction *MF = MBB->getParent();
	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	MachineBasicBlock *FallThrough = MBB->getFallThrough();

	// If the current basic block falls through to another basic block,
	// we must insert an unconditional branch to the fallthrough destination
	// if we are to insert basic blocks at the prior fallthrough point.
	if (FallThrough != nullptr) {
	BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(FallThrough);
	}

	MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	// Find MBB in the function's block list and step one past it, so that both
	// new blocks are inserted directly after MBB (trueMBB first, then falseMBB).
	MachineFunction::iterator I;
	for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
	if (I != MF->end()) ++I;
	MF->insert(I, trueMBB);
	MF->insert(I, falseMBB);

	// Transfer remaining instructions and all successors of the current
	// block to the block which will contain the Phi node for the
	// select.
	trueMBB->splice(trueMBB->begin(), MBB,
	std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	trueMBB->transferSuccessorsAndUpdatePHIs(MBB);

	// Operand 3 of the select carries the condition code. MBB now ends with a
	// conditional branch to trueMBB followed by an unconditional jump to falseMBB.
	AVRCC::CondCodes CC = (AVRCC::CondCodes)MI.getOperand(3).getImm();
	BuildMI(MBB, dl, TII.getBrCond(CC)).addMBB(trueMBB);
	BuildMI(MBB, dl, TII.get(AVR::RJMPk)).addMBB(falseMBB);
	MBB->addSuccessor(falseMBB);
	MBB->addSuccessor(trueMBB);

	// Unconditionally flow back to the true block
	BuildMI(falseMBB, dl, TII.get(AVR::RJMPk)).addMBB(trueMBB);
	falseMBB->addSuccessor(trueMBB);

	// Set up the Phi node to determine where we came from
	// (operand 1 when arriving from MBB, operand 2 when arriving from falseMBB;
	// the result is written to the select's operand 0 register).
	BuildMI(*trueMBB, trueMBB->begin(), dl, TII.get(AVR::PHI), MI.getOperand(0).getReg())
	.addReg(MI.getOperand(1).getReg())
	.addMBB(MBB)
	.addReg(MI.getOperand(2).getReg())
	.addMBB(falseMBB) ;

	MI.eraseFromParent(); // The pseudo instruction is gone now.
	return trueMBB;
	}

	//===----------------------------------------------------------------------===//
	// Inline Asm Support
	//===----------------------------------------------------------------------===//

	// Classifies a one-character AVR inline-asm constraint letter as a register
	// class, a specific register, a memory operand or a constant operand.
	// Constraints that are not exactly one character long, and letters not listed
	// in the switch, are handed to TargetLowering::getConstraintType.
	AVRTargetLowering::ConstraintType
	AVRTargetLowering::getConstraintType(StringRef Constraint) const {
	if (Constraint.size() == 1) {
	// See http://www.nongnu.org/avr-libc/user-manual/inline_asm.html
	switch (Constraint[0]) {
	+ default:
	+ break;
	case 'a': // Simple upper registers
	case 'b': // Base pointer registers pairs
	case 'd': // Upper register
	case 'l': // Lower registers
	case 'e': // Pointer register pairs
	case 'q': // Stack pointer register
	case 'r': // Any register
	case 'w': // Special upper register pairs
	return C_RegisterClass;
	case 't': // Temporary register
	case 'x': case 'X': // Pointer register pair X
	case 'y': case 'Y': // Pointer register pair Y
	case 'z': case 'Z': // Pointer register pair Z
	return C_Register;
	case 'Q': // A memory address based on Y or Z pointer with displacement.
	return C_Memory;
	case 'G': // Floating point constant
	case 'I': // 6-bit positive integer constant
	case 'J': // 6-bit negative integer constant
	case 'K': // Integer constant (Range: 2)
	case 'L': // Integer constant (Range: 0)
	case 'M': // 8-bit integer constant
	case 'N': // Integer constant (Range: -1)
	case 'O': // Integer constant (Range: 8, 16, 24)
	case 'P': // Integer constant (Range: 1)
	case 'R': // Integer constant (Range: -6 to 5)
	- return C_Other;
	- default:
	- break;
	+ return C_Immediate;
	}
	}

	return TargetLowering::getConstraintType(Constraint);
	}

	// Maps a memory constraint code to its InlineAsm constraint id. Only a code
	// starting with 'Q' is handled here; everything else is left to the generic
	// TargetLowering implementation.
	unsigned
	AVRTargetLowering::getInlineAsmMemConstraint(StringRef ConstraintCode) const {
	  // Whether this is really the right mapping is an open question carried
	  // over from the original author, but something has to be returned [agnat].
	  const bool StartsWithQ = ConstraintCode[0] == 'Q';
	  if (StartsWithQ)
	    return InlineAsm::Constraint_Q;

	  return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
	}

	// Rates how well the call operand in `info` fits the single-letter constraint
	// that `constraint` points at.
	//  - No operand value at all: CW_Default.
	//  - Register-class letters (d, r, l): CW_Register.
	//  - Letters naming specific registers or small register sets: CW_SpecificReg.
	//  - Constant letters (G, I..R): CW_Constant when the operand is a constant
	//    satisfying the letter's range, otherwise CW_Invalid.
	//  - 'Q': CW_Memory.
	//  - Anything else: whatever TargetLowering reports.
	AVRTargetLowering::ConstraintWeight
	AVRTargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  Value *CallOperandVal = info.CallOperandVal;

	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  // (this behaviour has been copied from the ARM backend)
	  if (!CallOperandVal)
	    return CW_Default;

	  // Every integer-constant letter has the same shape: the operand must be a
	  // ConstantInt and its value must pass a per-letter range check.
	  const auto *CI = dyn_cast<ConstantInt>(CallOperandVal);
	  const auto ConstantIf = [](bool Matches) {
	    return Matches ? CW_Constant : CW_Invalid;
	  };

	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	  case 'd':
	  case 'r':
	  case 'l':
	    return CW_Register;
	  case 'a':
	  case 'b':
	  case 'e':
	  case 'q':
	  case 't':
	  case 'w':
	  case 'x': case 'X':
	  case 'y': case 'Y':
	  case 'z': case 'Z':
	    return CW_SpecificReg;
	  case 'G': { // Floating point zero.
	    const auto *CFP = dyn_cast<ConstantFP>(CallOperandVal);
	    return ConstantIf(CFP && CFP->isZero());
	  }
	  case 'I': // 0..63
	    return ConstantIf(CI && isUInt<6>(CI->getZExtValue()));
	  case 'J': // -63..0
	    return ConstantIf(CI && CI->getSExtValue() >= -63 &&
	                      CI->getSExtValue() <= 0);
	  case 'K': // 2
	    return ConstantIf(CI && CI->getZExtValue() == 2);
	  case 'L': // 0
	    return ConstantIf(CI && CI->getZExtValue() == 0);
	  case 'M': // 0..255
	    return ConstantIf(CI && isUInt<8>(CI->getZExtValue()));
	  case 'N': // -1
	    return ConstantIf(CI && CI->getSExtValue() == -1);
	  case 'O': // 8, 16, 24
	    return ConstantIf(CI && (CI->getZExtValue() == 8 ||
	                             CI->getZExtValue() == 16 ||
	                             CI->getZExtValue() == 24));
	  case 'P': // 1
	    return ConstantIf(CI && CI->getZExtValue() == 1);
	  case 'R': // -6..5
	    return ConstantIf(CI && CI->getSExtValue() >= -6 &&
	                      CI->getSExtValue() <= 5);
	  case 'Q':
	    return CW_Memory;
	  }
	}

	std::pair<unsigned, const TargetRegisterClass *>
	AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// We only support i8 and i16.
	//
	//:FIXME: remove this assert for now since it gets sometimes executed
	// assert((VT == MVT::i16 \|\| VT == MVT::i8) && "Wrong operand type.");

	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'a': // Simple upper registers r16..r23.
	return std::make_pair(0U, &AVR::LD8loRegClass);
	case 'b': // Base pointer registers: y, z.
	return std::make_pair(0U, &AVR::PTRDISPREGSRegClass);
	case 'd': // Upper registers r16..r31.
	return std::make_pair(0U, &AVR::LD8RegClass);
	case 'l': // Lower registers r0..r15.
	return std::make_pair(0U, &AVR::GPR8loRegClass);
	case 'e': // Pointer register pairs: x, y, z.
	return std::make_pair(0U, &AVR::PTRREGSRegClass);
	case 'q': // Stack pointer register: SPH:SPL.
	return std::make_pair(0U, &AVR::GPRSPRegClass);
	case 'r': // Any register: r0..r31.
	if (VT == MVT::i8)
	return std::make_pair(0U, &AVR::GPR8RegClass);

	assert(VT == MVT::i16 && "inline asm constraint too large");
	return std::make_pair(0U, &AVR::DREGSRegClass);
	case 't': // Temporary register: r0.
	return std::make_pair(unsigned(AVR::R0), &AVR::GPR8RegClass);
	case 'w': // Special upper register pairs: r24, r26, r28, r30.
	return std::make_pair(0U, &AVR::IWREGSRegClass);
	case 'x': // Pointer register pair X: r27:r26.
	case 'X':
	return std::make_pair(unsigned(AVR::R27R26), &AVR::PTRREGSRegClass);
	case 'y': // Pointer register pair Y: r29:r28.
	case 'Y':
	return std::make_pair(unsigned(AVR::R29R28), &AVR::PTRREGSRegClass);
	case 'z': // Pointer register pair Z: r31:r30.
	case 'Z':
	return std::make_pair(unsigned(AVR::R31R30), &AVR::PTRREGSRegClass);
	default:
	break;
	}
	}

	return TargetLowering::getRegForInlineAsmConstraint(
	Subtarget.getRegisterInfo(), Constraint, VT);
	}

	// Lowers an inline-asm operand for the constant constraints. For the integer
	// letters I..R and the floating-point letter G, a matching constant operand is
	// turned into a target constant and appended to Ops. When the operand is not a
	// constant, or is outside the letter's range, the function returns without
	// appending anything. Constraints longer than one character are ignored; any
	// other letter is forwarded to TargetLowering::LowerAsmOperandForConstraint.
	void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const {
	// Stays empty unless one of the cases below produces a constant.
	SDValue Result(0, 0);
	SDLoc DL(Op);
	EVT Ty = Op.getValueType();

	// Currently only support length 1 constraints.
	if (Constraint.length() != 1) {
	return;
	}

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default:
	break;
	// Deal with integers first:
	case 'I':
	case 'J':
	case 'K':
	case 'L':
	case 'M':
	case 'N':
	case 'O':
	case 'P':
	case 'R': {
	const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C) {
	return;
	}

	// Both views of the constant are kept: ranges that include negative numbers
	// are checked (and emitted) with the sign-extended value, the rest with the
	// zero-extended one.
	int64_t CVal64 = C->getSExtValue();
	uint64_t CUVal64 = C->getZExtValue();
	switch (ConstraintLetter) {
	case 'I': // 0..63
	if (!isUInt<6>(CUVal64))
	return;
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'J': // -63..0
	if (CVal64 < -63 \|\| CVal64 > 0)
	return;
	Result = DAG.getTargetConstant(CVal64, DL, Ty);
	break;
	case 'K': // 2
	if (CUVal64 != 2)
	return;
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'L': // 0
	if (CUVal64 != 0)
	return;
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'M': // 0..255
	if (!isUInt<8>(CUVal64))
	return;
	// i8 type may be printed as a negative number,
	// e.g. 254 would be printed as -2,
	// so we force it to i16 at least.
	if (Ty.getSimpleVT() == MVT::i8) {
	Ty = MVT::i16;
	}
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'N': // -1
	if (CVal64 != -1)
	return;
	Result = DAG.getTargetConstant(CVal64, DL, Ty);
	break;
	case 'O': // 8, 16, 24
	if (CUVal64 != 8 && CUVal64 != 16 && CUVal64 != 24)
	return;
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'P': // 1
	if (CUVal64 != 1)
	return;
	Result = DAG.getTargetConstant(CUVal64, DL, Ty);
	break;
	case 'R': // -6..5
	if (CVal64 < -6 \|\| CVal64 > 5)
	return;
	Result = DAG.getTargetConstant(CVal64, DL, Ty);
	break;
	}

	break;
	}
	// Floating-point constraint: only a constant zero is accepted.
	case 'G':
	const ConstantFPSDNode *FC = dyn_cast<ConstantFPSDNode>(Op);
	if (!FC \|\| !FC->isZero())
	return;
	// Soften float to i8 0
	Result = DAG.getTargetConstant(0, DL, MVT::i8);
	break;
	}

	// A constant was produced above: hand it back and stop here.
	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}

	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
	EVT VT,
	SelectionDAG &DAG) const {
	unsigned Reg;

	if (VT == MVT::i8) {
	Reg = StringSwitch<unsigned>(RegName)
	.Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2)
	.Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5)
	.Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8)
	.Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11)
	.Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14)
	.Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17)
	.Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20)
	.Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23)
	.Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26)
	.Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29)
	.Case("r30", AVR::R30).Case("r31", AVR::R31)
	.Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
	.Default(0);
	} else {
	Reg = StringSwitch<unsigned>(RegName)
	.Case("r0", AVR::R1R0).Case("r2", AVR::R3R2)
	.Case("r4", AVR::R5R4).Case("r6", AVR::R7R6)
	.Case("r8", AVR::R9R8).Case("r10", AVR::R11R10)
	.Case("r12", AVR::R13R12).Case("r14", AVR::R15R14)
	.Case("r16", AVR::R17R16).Case("r18", AVR::R19R18)
	.Case("r20", AVR::R21R20).Case("r22", AVR::R23R22)
	.Case("r24", AVR::R25R24).Case("r26", AVR::R27R26)
	.Case("r28", AVR::R29R28).Case("r30", AVR::R31R30)
	.Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
	.Default(0);
	}

	if (Reg)
	return Reg;

	report_fatal_error("Invalid register name global variable");
	}

	} // end of namespace llvm
	Index: vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/BPF/BPFAbstractMemberAccess.cpp (revision 351303)
	@@ -1,482 +1,480 @@
	//===------ BPFAbstractMemberAccess.cpp - Abstracting Member Accesses -----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass abstracts struct/union member accesses in order to support
	// compile-once run-everywhere (CO-RE). CO-RE aims to compile a program once
	// so that it can run on different kernels. In particular, if a bpf program
	// tries to access a particular kernel data structure member, the details of
	// the intermediate member access are remembered so that the bpf loader can
	// make the necessary adjustment right before program loading.
	//
	// For example,
	//
	// struct s {
	// int a;
	// int b;
	// };
	// struct t {
	// struct s c;
	// int d;
	// };
	// struct t e;
	//
	// For the member access e.c.b, the compiler will generate code
	// &e + 4
	//
	// The compile-once run-everywhere instead generates the following code
	// r = 4
	// &e + r
	// The "4" in "r = 4" can be changed based on a particular kernel version.
	// For example, on a particular kernel version, if struct s is changed to
	//
	// struct s {
	// int new_field;
	// int a;
	// int b;
	// }
	//
	// By repeating the member access on the host, the bpf loader can
	// adjust "r = 4" as "r = 8".
	//
	// This feature relies on the following three intrinsic calls:
	// addr = preserve_array_access_index(base, dimension, index)
	// addr = preserve_union_access_index(base, di_index)
	// !llvm.preserve.access.index <union_ditype>
	// addr = preserve_struct_access_index(base, gep_index, di_index)
	// !llvm.preserve.access.index <struct_ditype>
	//
	//===----------------------------------------------------------------------===//

	#include "BPF.h"
	#include "BPFCORE.h"
	#include "BPFTargetMachine.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"

	#define DEBUG_TYPE "bpf-abstract-member-access"

	namespace llvm {
	// Out-of-line definitions of the string constants declared in
	// BPFCoreSharedInfo. AmaAttr is the attribute this pass attaches to the
	// globals it creates for relocatable member accesses.
	const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
	// Section name constant; presumably the section holding patchable externs,
	// its use is not visible in this file -- TODO confirm.
	const std::string BPFCoreSharedInfo::PatchableExtSecName =
	".BPF.patchable_externs";
	} // namespace llvm

	using namespace llvm;

	namespace {

	class BPFAbstractMemberAccess final : public ModulePass {
	StringRef getPassName() const override {
	return "BPF Abstract Member Access";
	}

	bool runOnModule(Module &M) override;

	public:
	static char ID;
	BPFAbstractMemberAccess() : ModulePass(ID) {}

	private:
	enum : uint32_t {
	BPFPreserveArrayAI = 1,
	BPFPreserveUnionAI = 2,
	BPFPreserveStructAI = 3,
	};

	std::map<std::string, GlobalVariable *> GEPGlobals;
	// A map to link preserve_*_access_index instrinsic calls.
	std::map<CallInst , std::pair<CallInst , uint32_t>> AIChain;
	// A map to hold all the base preserve_*_access_index instrinsic calls.
	// The base call is not an input of any other preserve_*_access_index
	// intrinsics.
	std::map<CallInst *, uint32_t> BaseAICalls;

	bool doTransformation(Module &M);

	void traceAICall(CallInst *Call, uint32_t Kind);
	void traceBitCast(BitCastInst BitCast, CallInst Parent, uint32_t Kind);
	void traceGEP(GetElementPtrInst GEP, CallInst Parent, uint32_t Kind);
	void collectAICallChains(Module &M, Function &F);

	bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind);
	bool removePreserveAccessIndexIntrinsic(Module &M);
	void replaceWithGEP(std::vector<CallInst *> &CallList,
	uint32_t NumOfZerosIndex, uint32_t DIIndex);

	- Value computeBaseAndAccessStr(CallInst Call, std::string &AccessStr,
	- std::string &AccessKey, uint32_t Kind,
	- MDNode *&TypeMeta);
	+ Value computeBaseAndAccessKey(CallInst Call, std::string &AccessKey,
	+ uint32_t Kind, MDNode *&TypeMeta);
	bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex);
	bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind);
	};
	} // End anonymous namespace

	// Storage for the pass ID (the constructor hands it to ModulePass), followed
	// by the pass registration under the DEBUG_TYPE name.
	char BPFAbstractMemberAccess::ID = 0;
	INITIALIZE_PASS(BPFAbstractMemberAccess, DEBUG_TYPE,
	"abstracting struct/union member accessees", false, false)

	// Factory: returns a newly allocated BPFAbstractMemberAccess pass.
	ModulePass *llvm::createBPFAbstractMemberAccess() {
	return new BPFAbstractMemberAccess();
	}

	// Pass entry point. Returns the result of doTransformation, or false without
	// touching the module when it has no debug-info compile units.
	bool BPFAbstractMemberAccess::runOnModule(Module &M) {
	  LLVM_DEBUG(dbgs() << "******** Abstract Member Accesses ********\n");

	  // Nothing to do without debug info.
	  const bool HasDebugInfo = !empty(M.debug_compile_units());
	  return HasDebugInfo ? doTransformation(M) : false;
	}

	/// Check whether a call is a preserve_*_access_index intrinsic call or not.
	bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
	uint32_t &Kind) {
	if (!Call)
	return false;

	const auto *GV = dyn_cast<GlobalValue>(Call->getCalledValue());
	if (!GV)
	return false;
	if (GV->getName().startswith("llvm.preserve.array.access.index")) {
	Kind = BPFPreserveArrayAI;
	return true;
	}
	if (GV->getName().startswith("llvm.preserve.union.access.index")) {
	Kind = BPFPreserveUnionAI;
	return true;
	}
	if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
	Kind = BPFPreserveStructAI;
	return true;
	}

	return false;
	}

	// Replaces every call in CallList with an inbounds GEP on the call's first
	// argument and erases the call. The GEP's indices are a run of i32 zeros
	// followed by the call argument at position GEPIndex. The run has length 1
	// when DimensionIndex is 0; otherwise its length is the constant found in the
	// call argument at position DimensionIndex.
	void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
	                                             uint32_t DimensionIndex,
	                                             uint32_t GEPIndex) {
	  for (CallInst *Call : CallList) {
	    uint32_t NumLeadingZeros = 1;
	    if (DimensionIndex > 0) {
	      const auto *DimArg =
	          cast<ConstantInt>(Call->getArgOperand(DimensionIndex));
	      NumLeadingZeros = DimArg->getZExtValue();
	    }

	    LLVMContext &Ctx = Call->getParent()->getContext();
	    Constant *Zero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);

	    SmallVector<Value *, 4> Indices(NumLeadingZeros, Zero);
	    Indices.push_back(Call->getArgOperand(GEPIndex));

	    // The GEP is inserted right before the call it replaces.
	    auto *NewGEP = GetElementPtrInst::CreateInBounds(Call->getArgOperand(0),
	                                                     Indices, "", Call);
	    Call->replaceAllUsesWith(NewGEP);
	    Call->eraseFromParent();
	  }
	}

	// Lowers every preserve_*_access_index call in the module to ordinary IR and
	// returns true if at least one such call was found.
	bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
	  std::vector<CallInst *> ArrayCalls;
	  std::vector<CallInst *> UnionCalls;
	  std::vector<CallInst *> StructCalls;
	  bool SawIntrinsic = false;

	  // First only collect the calls; they are rewritten (and erased) afterwards.
	  for (Function &F : M) {
	    for (auto &BB : F) {
	      for (auto &I : BB) {
	        auto *Call = dyn_cast<CallInst>(&I);
	        uint32_t Kind;
	        if (!IsPreserveDIAccessIndexCall(Call, Kind))
	          continue;

	        SawIntrinsic = true;
	        switch (Kind) {
	        case BPFPreserveArrayAI:
	          ArrayCalls.push_back(Call);
	          break;
	        case BPFPreserveUnionAI:
	          UnionCalls.push_back(Call);
	          break;
	        default:
	          StructCalls.push_back(Call);
	          break;
	        }
	      }
	    }
	  }

	  // The rewrite performed per intrinsic:
	  //  . addr = preserve_array_access_index(base, dimension, index)
	  //      becomes  addr = GEP(base, <dimension> zeros, index)
	  //  . addr = preserve_union_access_index(base, di_index)
	  //      becomes  addr = base, i.e. every use of "addr" is replaced by "base"
	  //  . addr = preserve_struct_access_index(base, gep_index, di_index)
	  //      becomes  addr = GEP(base, 0, gep_index)
	  replaceWithGEP(ArrayCalls, 1, 2);
	  replaceWithGEP(StructCalls, 0, 1);
	  for (CallInst *Call : UnionCalls) {
	    Call->replaceAllUsesWith(Call->getArgOperand(0));
	    Call->eraseFromParent();
	  }

	  return SawIntrinsic;
	}

	// Walks the users of an access-index call. Bitcasts and all-zero-index GEPs
	// are looked through, another access-index call is linked to Call in AIChain
	// and traced in turn, and any other call or GEP user marks Call as a base
	// call. Users that are neither instructions nor one of these kinds are
	// ignored.
	void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind) {
	  for (User *U : Call->users()) {
	    auto *UserInst = dyn_cast<Instruction>(U);
	    if (!UserInst)
	      continue;

	    if (auto *Cast = dyn_cast<BitCastInst>(UserInst)) {
	      traceBitCast(Cast, Call, Kind);
	      continue;
	    }

	    if (auto *UserCall = dyn_cast<CallInst>(UserInst)) {
	      uint32_t UserKind;
	      if (!IsPreserveDIAccessIndexCall(UserCall, UserKind)) {
	        BaseAICalls[Call] = Kind;
	        continue;
	      }
	      AIChain[UserCall] = std::make_pair(Call, Kind);
	      traceAICall(UserCall, UserKind);
	      continue;
	    }

	    if (auto *UserGEP = dyn_cast<GetElementPtrInst>(UserInst)) {
	      if (!UserGEP->hasAllZeroIndices())
	        BaseAICalls[Call] = Kind;
	      else
	        traceGEP(UserGEP, Call, Kind);
	    }
	  }
	}

	// Same walk as traceAICall, but over the users of a bitcast that sits between
	// the access-index call Parent and its eventual users. Everything recorded is
	// attributed to Parent/Kind.
	void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
	                                           CallInst *Parent, uint32_t Kind) {
	  for (User *U : BitCast->users()) {
	    auto *UserInst = dyn_cast<Instruction>(U);
	    if (!UserInst)
	      continue;

	    if (auto *Cast = dyn_cast<BitCastInst>(UserInst)) {
	      traceBitCast(Cast, Parent, Kind);
	      continue;
	    }

	    if (auto *UserCall = dyn_cast<CallInst>(UserInst)) {
	      uint32_t UserKind;
	      if (!IsPreserveDIAccessIndexCall(UserCall, UserKind)) {
	        BaseAICalls[Parent] = Kind;
	        continue;
	      }
	      AIChain[UserCall] = std::make_pair(Parent, Kind);
	      traceAICall(UserCall, UserKind);
	      continue;
	    }

	    if (auto *UserGEP = dyn_cast<GetElementPtrInst>(UserInst)) {
	      if (!UserGEP->hasAllZeroIndices())
	        BaseAICalls[Parent] = Kind;
	      else
	        traceGEP(UserGEP, Parent, Kind);
	    }
	  }
	}

	void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst GEP, CallInst Parent,
	uint32_t Kind) {
	for (User *U : GEP->users()) {
	Instruction *Inst = dyn_cast<Instruction>(U);
	if (!Inst)
	continue;

	if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
	traceBitCast(BI, Parent, Kind);
	} else if (auto *CI = dyn_cast<CallInst>(Inst)) {
	uint32_t CIKind;
	if (IsPreserveDIAccessIndexCall(CI, CIKind)) {
	AIChain[CI] = std::make_pair(Parent, Kind);
	traceAICall(CI, CIKind);
	} else {
	BaseAICalls[Parent] = Kind;
	}
	} else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
	if (GI->hasAllZeroIndices())
	traceGEP(GI, Parent, Kind);
	else
	BaseAICalls[Parent] = Kind;
	}
	}
	}

	// Rebuilds AIChain and BaseAICalls for function F: both maps are cleared, then
	// every access-index call in F that is not yet a key of AIChain is traced.
	void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
	  AIChain.clear();
	  BaseAICalls.clear();

	  for (auto &BB : F) {
	    for (auto &I : BB) {
	      uint32_t Kind;
	      auto *Call = dyn_cast<CallInst>(&I);
	      if (!IsPreserveDIAccessIndexCall(Call, Kind))
	        continue;

	      // Already reached while tracing an earlier call.
	      if (AIChain.find(Call) != AIChain.end())
	        continue;

	      traceAICall(Call, Kind);
	    }
	  }
	}

	/// Get access index from the preserve_*_access_index intrinsic calls.
	bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
	uint64_t &AccessIndex) {
	const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
	if (!CV)
	return false;

	AccessIndex = CV->getValue().getZExtValue();
	return true;
	}

	/// Compute the base of the whole preserve_*_access_index chains, i.e., the base
	/// pointer of the first preserve_*_access_index call, and construct the access
	/// string, which will be the name of a global variable.
	///
	/// Starting at Call, the chain is followed through AIChain towards its first
	/// call. Returns that first call's base pointer, or nullptr when required
	/// debug-info metadata is missing, an index is not a constant, or the chain
	/// shape is not supported. TypeMeta receives the metadata of the last
	/// struct/union call visited that has a non-empty type name.
	-Value *BPFAbstractMemberAccess::computeBaseAndAccessStr(CallInst *Call,
	-                                                        std::string &AccessStr,
	+Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
	                                                        std::string &AccessKey,
	                                                        uint32_t Kind,
	                                                        MDNode *&TypeMeta) {
	  Value *Base = nullptr;
	  std::vector<uint64_t> AccessIndices;
	  uint64_t TypeNameIndex = 0;
	  std::string LastTypeName;

	  while (Call) {
	    // Base of original corresponding GEP
	    Base = Call->getArgOperand(0);

	    // Type Name (only struct/union calls carry the metadata)
	    std::string TypeName;
	    MDNode *MDN = nullptr;
	    if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
	      MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
	      if (!MDN)
	        return nullptr;

	      DIType *Ty = dyn_cast<DIType>(MDN);
	      if (!Ty)
	        return nullptr;

	      TypeName = Ty->getName();
	    }

	    // Access Index: argument 1 for union calls, argument 2 otherwise.
	    uint64_t AccessIndex;
	    uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 1 : 2;
	    if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex))
	      return nullptr;

	    AccessIndices.push_back(AccessIndex);
	    if (TypeName.size()) {
	      TypeNameIndex = AccessIndices.size() - 1;
	      LastTypeName = TypeName;
	      TypeMeta = MDN;
	    }

	    // Step to the call that produced this call's base. For the first call
	    // of the chain AIChain has no entry, which yields a null Call and ends
	    // the loop.
	    Kind = AIChain[Call].second;
	    Call = AIChain[Call].first;
	  }

	  // The initial type name is required.
	  // FIXME: if the initial type access is an array index, e.g.,
	  // &a[3].b.c, only one dimensional array is supported.
	  if (!LastTypeName.size() || AccessIndices.size() > TypeNameIndex + 2)
	    return nullptr;

	-  // Construct the type string AccessStr.
	+  // Construct the type string AccessKey.
	  for (unsigned I = 0; I < AccessIndices.size(); ++I)
	-    AccessStr = std::to_string(AccessIndices[I]) + ":" + AccessStr;
	+    AccessKey = std::to_string(AccessIndices[I]) + ":" + AccessKey;

	  if (TypeNameIndex == AccessIndices.size() - 1)
	-    AccessStr = "0:" + AccessStr;
	+    AccessKey = "0:" + AccessKey;

	  // Access key is the type name + access string, uniquely identifying
	  // one kernel memory access.
	-  AccessKey = LastTypeName + ":" + AccessStr;
	+  AccessKey = LastTypeName + ":" + AccessKey;

	  return Base;
	}

	/// Call/Kind is the base preserve_*_access_index() call. Attempts to do
	/// transformation to a chain of relocable GEPs.
	/// Returns false, leaving the IR untouched, when no base/access key can be
	/// computed for the chain. Otherwise Call is replaced by a byte-offset GEP
	/// whose offset is loaded from a global variable, Call is erased, and true is
	/// returned.
	bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
	uint32_t Kind) {
	- std::string AccessStr, AccessKey;
	+ std::string AccessKey;
	MDNode *TypeMeta = nullptr;
	Value *Base =
	- computeBaseAndAccessStr(Call, AccessStr, AccessKey, Kind, TypeMeta);
	+ computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta);
	if (!Base)
	return false;

	// Do the transformation
	// For any original GEP Call and Base %2 like
	// %4 = bitcast %struct.net_device** %dev1 to i64*
	// it is transformed to:
	// %6 = load __BTF_0:sk_buff:0:0:2:0:
	// %7 = bitcast %struct.sk_buff* %2 to i8*
	// %8 = getelementptr i8, i8* %7, %6
	// %9 = bitcast i8* %8 to i64*
	// using %9 instead of %4
	// The original Call inst is removed.
	BasicBlock *BB = Call->getParent();
	GlobalVariable *GV;

	// One external i64 global per distinct access key; GEPGlobals caches them so
	// a repeated key reuses the global created earlier.
	if (GEPGlobals.find(AccessKey) == GEPGlobals.end()) {
	GV = new GlobalVariable(M, Type::getInt64Ty(BB->getContext()), false,
	- GlobalVariable::ExternalLinkage, NULL, AccessStr);
	+ GlobalVariable::ExternalLinkage, NULL, AccessKey);
	GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
	// Set the metadata (debuginfo types) for the global.
	if (TypeMeta)
	GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
	GEPGlobals[AccessKey] = GV;
	} else {
	GV = GEPGlobals[AccessKey];
	}

	// All four new instructions are inserted immediately before Call, in order.

	// Load the global variable.
	auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV);
	BB->getInstList().insert(Call->getIterator(), LDInst);

	// Generate a BitCast
	auto *BCInst = new BitCastInst(Base, Type::getInt8PtrTy(BB->getContext()));
	BB->getInstList().insert(Call->getIterator(), BCInst);

	// Generate a GetElementPtr
	auto *GEP = GetElementPtrInst::Create(Type::getInt8Ty(BB->getContext()),
	BCInst, LDInst);
	BB->getInstList().insert(Call->getIterator(), GEP);

	// Generate a BitCast
	auto *BCInst2 = new BitCastInst(GEP, Call->getType());
	BB->getInstList().insert(Call->getIterator(), BCInst2);

	Call->replaceAllUsesWith(BCInst2);
	Call->eraseFromParent();

	return true;
	}

	// Runs the whole transformation on M. For each function, the access-index
	// call chains are collected and every base call is handed to
	// transformGEPChain. Afterwards all remaining intrinsic calls are lowered.
	// Returns true if either step changed (or found) anything.
	bool BPFAbstractMemberAccess::doTransformation(Module &M) {
	  bool Transformed = false;

	  for (Function &F : M) {
	    // Collect PreserveDIAccessIndex Intrinsic call chains.
	    // The call chains will be used to generate the access
	    // patterns similar to GEP.
	    collectAICallChains(M, F);

	    // The call stays on the left of `||` so that it runs for every base
	    // call, whatever the earlier results were.
	    for (auto &C : BaseAICalls)
	      Transformed = transformGEPChain(M, C.first, C.second) || Transformed;
	  }

	  // Same here: the removal step must always run.
	  return removePreserveAccessIndexIntrinsic(M) || Transformed;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.cpp (revision 351303)
	@@ -1,1300 +1,1326 @@
	//===- BTFDebug.cpp - BTF Generator ---------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains support for writing BTF debug info.
	//
	//===----------------------------------------------------------------------===//

	#include "BTFDebug.h"
	#include "BPF.h"
	#include "BPFCORE.h"
	#include "MCTargetDesc/BPFMCTargetDesc.h"
	#include "llvm/BinaryFormat/ELF.h"
	#include "llvm/CodeGen/AsmPrinter.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/Support/LineIterator.h"

	using namespace llvm;

	// Printable "BTF_KIND_<NAME>" strings, one per HANDLE_BTF_KIND entry of
	// BTF.def in the order they are listed there. Indexed by a type's Kind when
	// emitting comments.
	static const char *BTFKindStr[] = {
	#define HANDLE_BTF_KIND(ID, NAME) "BTF_KIND_" #NAME,
	#include "BTF.def"
	};

	// Peels typedef, const, volatile and restrict wrappers off Ty and returns the
	// first type underneath that is not one of those.
	// NOTE(review): a wrapper whose base type is null would reach dyn_cast with a
	// null pointer -- confirm callers never pass such a type.
	+static const DIType * stripQualifiers(const DIType *Ty) {
	+ while (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
	+ unsigned Tag = DTy->getTag();
	+ if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
	+ Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_restrict_type)
	+ break;
	+ Ty = DTy->getBaseType();
	+ }
	+
	+ return Ty;
	+}
	+
	/// Emit the part shared by all BTF types: three 4-byte values (NameOff, Info,
	/// Size), with a comment giving the kind name and id before the first value
	/// and a comment giving Info in hex before the second.
	void BTFTypeBase::emitType(MCStreamer &OS) {
	  const std::string KindAndId =
	      std::string(BTFKindStr[Kind]) + "(id = " + std::to_string(Id) + ")";
	  OS.AddComment(KindAndId);
	  OS.EmitIntValue(BTFType.NameOff, 4);

	  OS.AddComment("0x" + Twine::utohexstr(BTFType.Info));
	  OS.EmitIntValue(BTFType.Info, 4);

	  OS.EmitIntValue(BTFType.Size, 4);
	}

	// Builds the BTF entry for a derived debug-info type. The DWARF tag selects
	// the BTF kind (pointer, const, volatile, typedef or restrict); any other tag
	// is unreachable. The kind is stored in bits 24 and up of Info.
	BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
	                               bool NeedsFixup)
	    : DTy(DTy), NeedsFixup(NeedsFixup) {
	  if (Tag == dwarf::DW_TAG_pointer_type)
	    Kind = BTF::BTF_KIND_PTR;
	  else if (Tag == dwarf::DW_TAG_const_type)
	    Kind = BTF::BTF_KIND_CONST;
	  else if (Tag == dwarf::DW_TAG_volatile_type)
	    Kind = BTF::BTF_KIND_VOLATILE;
	  else if (Tag == dwarf::DW_TAG_typedef)
	    Kind = BTF::BTF_KIND_TYPEDEF;
	  else if (Tag == dwarf::DW_TAG_restrict_type)
	    Kind = BTF::BTF_KIND_RESTRICT;
	  else
	    llvm_unreachable("Unknown DIDerivedType Tag");

	  BTFType.Info = Kind << 24;
	}

	// Fills in the name offset and, unless the entry is waiting for a fixup, the
	// id of the base type. Does nothing on a second call.
	void BTFTypeDerived::completeType(BTFDebug &BDebug) {
	  if (IsCompleted)
	    return;
	  IsCompleted = true;

	  BTFType.NameOff = BDebug.addString(DTy->getName());

	  // With NeedsFixup set the Type field is left alone here; setPointeeType
	  // can store it later.
	  if (NeedsFixup)
	    return;

	  const DIType *ResolvedType = DTy->getBaseType();
	  if (ResolvedType) {
	    BTFType.Type = BDebug.getTypeId(ResolvedType);
	    return;
	  }

	  // The base type for PTR/CONST/VOLATILE could be void, encoded as type 0.
	  assert((Kind == BTF::BTF_KIND_PTR || Kind == BTF::BTF_KIND_CONST ||
	          Kind == BTF::BTF_KIND_VOLATILE) &&
	         "Invalid null basetype");
	  BTFType.Type = 0;
	}

	// A derived type has no payload beyond the common header, so emission is
	// just the base-class emitType.
	void BTFTypeDerived::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }

	// Stores PointeeType in the entry's Type field.
	void BTFTypeDerived::setPointeeType(uint32_t PointeeType) {
	BTFType.Type = PointeeType;
	}

	/// Represent a struct/union forward declaration.
	/// Info holds the union flag in bit 31 and the kind from bit 24 upwards; Type
	/// is 0.
	BTFTypeFwd::BTFTypeFwd(StringRef Name, bool IsUnion) : Name(Name) {
	  Kind = BTF::BTF_KIND_FWD;
	  // Widen to uint32_t before shifting so the flag never lands in the sign
	  // bit of a signed int.
	  const uint32_t UnionFlag = static_cast<uint32_t>(IsUnion) << 31;
	  const uint32_t KindBits = static_cast<uint32_t>(Kind) << 24;
	  BTFType.Info = UnionFlag | KindBits;
	  BTFType.Type = 0;
	}

	// Registers the declaration's name in the string table and records its
	// offset. Only the first call has any effect.
	void BTFTypeFwd::completeType(BTFDebug &BDebug) {
	  if (!IsCompleted) {
	    IsCompleted = true;
	    BTFType.NameOff = BDebug.addString(Name);
	  }
	}

	// A forward declaration has no payload beyond the common header, so emission
	// is just the base-class emitType.
	void BTFTypeFwd::emitType(MCStreamer &OS) { BTFTypeBase::emitType(OS); }

	// Builds a BTF_KIND_INT entry. The DWARF base-type encoding is mapped to a
	// BTF int encoding (bool, signed, or 0 for unsigned); any other encoding is
	// unreachable. IntVal packs the encoding from bit 24 upwards, OffsetInBits
	// from bit 16 upwards, and SizeInBits in the low bits.
	BTFTypeInt::BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits,
	                       uint32_t OffsetInBits, StringRef TypeName)
	    : Name(TypeName) {
	  // Translate IR int encoding to BTF int encoding.
	  uint8_t BTFEncoding;
	  switch (Encoding) {
	  case dwarf::DW_ATE_boolean:
	    BTFEncoding = BTF::INT_BOOL;
	    break;
	  case dwarf::DW_ATE_signed:
	  case dwarf::DW_ATE_signed_char:
	    BTFEncoding = BTF::INT_SIGNED;
	    break;
	  case dwarf::DW_ATE_unsigned:
	  case dwarf::DW_ATE_unsigned_char:
	    BTFEncoding = 0;
	    break;
	  default:
	    llvm_unreachable("Unknown BTFTypeInt Encoding");
	  }

	  Kind = BTF::BTF_KIND_INT;
	  BTFType.Info = Kind << 24;
	  BTFType.Size = roundupToBytes(SizeInBits);
	  IntVal = (BTFEncoding << 24) | (OffsetInBits << 16) | SizeInBits;
	}

	/// Record the string offset of the integer type's name. Calls after the
	/// first one are no-ops.
	void BTFTypeInt::completeType(BTFDebug &BDebug) {
	  if (!IsCompleted) {
	    IsCompleted = true;
	    BTFType.NameOff = BDebug.addString(Name);
	  }
	}

	/// Emit the base type record followed by the 4-byte int description word.
	void BTFTypeInt::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);

	  // Annotate the word with its hexadecimal value.
	  OS.AddComment("0x" + Twine::utohexstr(IntVal));
	  OS.EmitIntValue(IntVal, 4);
	}

	/// Build a BTF ENUM type for \p ETy with \p VLen enumerators.
	BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen)
	    : ETy(ETy) {
	  Kind = BTF::BTF_KIND_ENUM;
	  // Info holds the kind in bits 24+ and the enumerator count below it.
	  BTFType.Info = Kind << 24 | VLen;
	  BTFType.Size = roundupToBytes(ETy->getSizeInBits());
	}

	/// Record the enum's name and collect one BTFEnum record per enumerator.
	/// Calls after the first one are no-ops.
	void BTFTypeEnum::completeType(BTFDebug &BDebug) {
	  if (IsCompleted)
	    return;
	  IsCompleted = true;

	  BTFType.NameOff = BDebug.addString(ETy->getName());

	  DINodeArray Enumerators = ETy->getElements();
	  for (const auto Node : Enumerators) {
	    const auto *Enumerator = cast<DIEnumerator>(Node);

	    struct BTF::BTFEnum Record;
	    Record.NameOff = BDebug.addString(Enumerator->getName());
	    // BTF enum values are 32 bits wide, so truncate explicitly.
	    Record.Val = static_cast<uint32_t>(Enumerator->getValue());
	    EnumValues.push_back(Record);
	  }
	}

	/// Emit the base type record, then a (name offset, value) pair of 4-byte
	/// words for every collected enumerator.
	void BTFTypeEnum::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);
	  for (const auto &Value : EnumValues) {
	    OS.EmitIntValue(Value.NameOff, 4);
	    OS.EmitIntValue(Value.Val, 4);
	  }
	}

	// Build a BTF ARRAY type. After this change the constructor additionally
	// takes the element DIType (\p Ty, stored as ElemTyNoQual) next to the
	// element type id, the element size and the element count.
	// The array has no name and a zero size field; element type and count go
	// into ArrayInfo.
	-BTFTypeArray::BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize,
	- uint32_t NumElems)
	- : ElemSize(ElemSize) {
	+BTFTypeArray::BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
	+ uint32_t ElemSize, uint32_t NumElems)
	+ : ElemTyNoQual(Ty), ElemSize(ElemSize) {
	Kind = BTF::BTF_KIND_ARRAY;
	BTFType.NameOff = 0;
	BTFType.Info = Kind << 24;
	BTFType.Size = 0;

	ArrayInfo.ElemType = ElemTypeId;
	ArrayInfo.Nelems = NumElems;
	}

	/// Represent a BTF array.
	/// Completing the array fills in the index type id and the element type
	/// id used for location lookups. Calls after the first one are no-ops.
	void BTFTypeArray::completeType(BTFDebug &BDebug) {
	if (IsCompleted)
	return;
	IsCompleted = true;

	// The IR does not really have a type for the index.
	// A special type for array index should have been
	// created during initial type traversal. Just
	// retrieve that type id.
	ArrayInfo.IndexType = BDebug.getArrayIndexTypeId();
	+
	// When an element DIType was supplied to the constructor, use its type
	// id; otherwise fall back to the element type id stored in ArrayInfo.
	+ ElemTypeNoQual = ElemTyNoQual ? BDebug.getTypeId(ElemTyNoQual)
	+ : ArrayInfo.ElemType;
	}

	/// Emit the base type record followed by three 4-byte words: element
	/// type id, index type id and element count.
	void BTFTypeArray::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);

	  const auto &Info = ArrayInfo;
	  OS.EmitIntValue(Info.ElemType, 4);
	  OS.EmitIntValue(Info.IndexType, 4);
	  OS.EmitIntValue(Info.Nelems, 4);
	}

	// Report, for element index \p Loc, the offset of that element
	// (Loc * ElemSize) and the element type id. After this change the type id
	// returned is ElemTypeNoQual rather than ArrayInfo.ElemType.
	void BTFTypeArray::getLocInfo(uint32_t Loc, uint32_t &LocOffset,
	uint32_t &ElementTypeId) {
	- ElementTypeId = ArrayInfo.ElemType;
	+ ElementTypeId = ElemTypeNoQual;
	LocOffset = Loc * ElemSize;
	}

	/// Represent either a struct or a union.
	///
	/// \param STy         the composite debug-info type being described.
	/// \param IsStruct    true selects BTF_KIND_STRUCT, false BTF_KIND_UNION.
	/// \param HasBitField true if any member is a bitfield; stored in bit 31
	///                    of the Info word.
	/// \param Vlen        number of members, stored in the low bits of Info.
	BTFTypeStruct::BTFTypeStruct(const DICompositeType *STy, bool IsStruct,
	                             bool HasBitField, uint32_t Vlen)
	    : STy(STy), HasBitField(HasBitField) {
	  Kind = IsStruct ? BTF::BTF_KIND_STRUCT : BTF::BTF_KIND_UNION;
	  BTFType.Size = roundupToBytes(STy->getSizeInBits());
	  // Widen to uint32_t before shifting. A bare `HasBitField << 31` is
	  // evaluated on a promoted signed int and would shift a 1 into the sign
	  // bit.
	  BTFType.Info = (static_cast<uint32_t>(HasBitField) << 31) |
	                 (static_cast<uint32_t>(Kind) << 24) | Vlen;
	}

	// Record the struct/union name and build one BTFMember per element.
	// Calls after the first one are no-ops.
	void BTFTypeStruct::completeType(BTFDebug &BDebug) {
	if (IsCompleted)
	return;
	IsCompleted = true;

	BTFType.NameOff = BDebug.addString(STy->getName());

	// Add struct/union members.
	const DINodeArray Elements = STy->getElements();
	for (const auto *Element : Elements) {
	struct BTF::BTFMember BTFMember;
	const auto *DDTy = cast<DIDerivedType>(Element);

	BTFMember.NameOff = BDebug.addString(DDTy->getName());
	// With bitfields present, the bitfield size is packed into the top
	// 8 bits of Offset and the bit offset into the rest.
	// NOTE(review): BitFieldSize is a uint8_t promoted to int before the
	// shift by 24; sizes of 128 or more would overflow a signed int - confirm.
	if (HasBitField) {
	uint8_t BitFieldSize = DDTy->isBitField() ? DDTy->getSizeInBits() : 0;
	BTFMember.Offset = BitFieldSize << 24 \| DDTy->getOffsetInBits();
	} else {
	BTFMember.Offset = DDTy->getOffsetInBits();
	}
	// After this change the member's type id is recorded twice: once as
	// emitted (BTFMember.Type) and once for the type returned by
	// stripQualifiers (MemberTypeNoQual), which getMemberInfo reports.
	- BTFMember.Type = BDebug.getTypeId(DDTy->getBaseType());
	+ const auto *BaseTy = DDTy->getBaseType();
	+ BTFMember.Type = BDebug.getTypeId(BaseTy);
	+ MemberTypeNoQual.push_back(BDebug.getTypeId(stripQualifiers(BaseTy)));
	Members.push_back(BTFMember);
	}
	}

	/// Emit the base type record, then three 4-byte words per member: name
	/// offset, type id and offset (annotated in hexadecimal).
	void BTFTypeStruct::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);
	  for (const auto &M : Members) {
	    OS.EmitIntValue(M.NameOff, 4);
	    OS.EmitIntValue(M.Type, 4);
	    OS.AddComment("0x" + Twine::utohexstr(M.Offset));
	    OS.EmitIntValue(M.Offset, 4);
	  }
	}

	/// Return the name of the underlying composite type as a std::string.
	std::string BTFTypeStruct::getName() {
	  return STy->getName();
	}

	// Report the offset and type id of member number \p Loc. After this
	// change the type id comes from MemberTypeNoQual instead of the emitted
	// member record. When the struct has bitfields, only the low 24 bits of
	// the stored Offset are returned.
	void BTFTypeStruct::getMemberInfo(uint32_t Loc, uint32_t &MemberOffset,
	uint32_t &MemberType) {
	- MemberType = Members[Loc].Type;
	+ MemberType = MemberTypeNoQual[Loc];
	MemberOffset =
	HasBitField ? Members[Loc].Offset & 0xffffff : Members[Loc].Offset;
	}

	/// Return the size of the underlying composite type in whole bytes
	/// (bit size divided by 8, rounded down).
	uint32_t BTFTypeStruct::getStructSize() {
	  return STy->getSizeInBits() >> 3;
	}

	/// A function prototype describes either a subprogram or the pointee of
	/// a function pointer. An empty FuncName denotes the function-pointer
	/// pointee, whose argument names are empty as well; a subprogram carries
	/// valid argument names.
	///
	/// \param STy          the subroutine type being described.
	/// \param VLen         parameter count, stored in the low bits of Info.
	/// \param FuncArgNames argument names keyed by argument number.
	BTFTypeFuncProto::BTFTypeFuncProto(
	    const DISubroutineType *STy, uint32_t VLen,
	    const std::unordered_map<uint32_t, StringRef> &FuncArgNames)
	    : STy(STy), FuncArgNames(FuncArgNames) {
	  Kind = BTF::BTF_KIND_FUNC_PROTO;
	  BTFType.Info = (Kind << 24) | VLen;
	}

	/// Resolve the return type id and build one BTFParam per parameter.
	/// Calls after the first one are no-ops.
	void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
	  if (IsCompleted)
	    return;
	  IsCompleted = true;

	  // Element 0 of the type array is the return type; null is encoded as 0.
	  DITypeRefArray Types = STy->getTypeArray();
	  auto ReturnTy = Types[0];
	  if (ReturnTy)
	    BTFType.Type = BDebug.getTypeId(ReturnTy);
	  else
	    BTFType.Type = 0;
	  BTFType.NameOff = 0;

	  // A null parameter (typically the trailing one that stands for a
	  // vararg) keeps both NameOff and Type at 0.
	  const unsigned NumTypes = Types.size();
	  for (unsigned Idx = 1; Idx < NumTypes; ++Idx) {
	    struct BTF::BTFParam Param;
	    Param.NameOff = 0;
	    Param.Type = 0;
	    if (auto ParamTy = Types[Idx]) {
	      Param.NameOff = BDebug.addString(FuncArgNames[Idx]);
	      Param.Type = BDebug.getTypeId(ParamTy);
	    }
	    Parameters.push_back(Param);
	  }
	}

	/// Emit the base type record, then a (name offset, type id) pair of
	/// 4-byte words for every parameter.
	void BTFTypeFuncProto::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);
	  for (const auto &P : Parameters) {
	    OS.EmitIntValue(P.NameOff, 4);
	    OS.EmitIntValue(P.Type, 4);
	  }
	}

	/// Build a BTF FUNC type named \p FuncName that refers to the prototype
	/// with id \p ProtoTypeId.
	BTFTypeFunc::BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId)
	    : Name(FuncName) {
	  Kind = BTF::BTF_KIND_FUNC;
	  BTFType.Info = Kind << 24;
	  BTFType.Type = ProtoTypeId;
	}

	/// Record the string offset of the function's name. Calls after the first
	/// one are no-ops.
	void BTFTypeFunc::completeType(BTFDebug &BDebug) {
	  if (!IsCompleted) {
	    IsCompleted = true;
	    BTFType.NameOff = BDebug.addString(Name);
	  }
	}

	/// Emit this type; nothing is emitted beyond what the base class emits.
	void BTFTypeFunc::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);
	}

	/// Build a BTF VAR entry named \p VarName of type \p TypeId; \p VarInfo
	/// is kept as the extra word emitted after the base record.
	BTFKindVar::BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo)
	    : Name(VarName) {
	  Kind = BTF::BTF_KIND_VAR;
	  BTFType.Info = Kind << 24;
	  BTFType.Type = TypeId;
	  Info = VarInfo;
	}

	/// Record the string offset of the variable's name. Unlike most other
	/// completeType implementations here, this one has no IsCompleted guard.
	void BTFKindVar::completeType(BTFDebug &BDebug) {
	  BTFType.NameOff = BDebug.addString(Name);
	}

	/// Emit the base type record followed by the 4-byte Info word.
	void BTFKindVar::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);
	  OS.EmitIntValue(Info, 4);
	}

	/// Build a BTF DATASEC entry for the section named \p SecName. The
	/// variable count is not known yet and is added by completeType.
	BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName)
	    : Asm(AsmPrt), Name(SecName) {
	  Kind = BTF::BTF_KIND_DATASEC;
	  BTFType.Info = Kind << 24;
	  BTFType.Size = 0;
	}

	/// Record the section name's string offset and OR the current number of
	/// variables into the Info word. There is no IsCompleted guard here.
	void BTFKindDataSec::completeType(BTFDebug &BDebug) {
	  BTFType.NameOff = BDebug.addString(Name);
	  BTFType.Info |= Vars.size();
	}

	/// Emit the base type record, then one triple per variable: tuple
	/// element 0 as a 4-byte value, element 1 as a 4-byte label reference and
	/// element 2 as a 4-byte value.
	void BTFKindDataSec::emitType(MCStreamer &OS) {
	  BTFTypeBase::emitType(OS);

	  for (const auto &Var : Vars) {
	    OS.EmitIntValue(std::get<0>(Var), 4);
	    Asm->EmitLabelReference(std::get<1>(Var), 4);
	    OS.EmitIntValue(std::get<2>(Var), 4);
	  }
	}

	/// Return the offset of \p S in the string table, appending it first if
	/// it is not there yet.
	uint32_t BTFStringTable::addString(StringRef S) {
	  // Linear search for an existing entry with the same contents.
	  for (auto &Entry : OffsetToIdMap) {
	    if (Table[Entry.second] == S)
	      return Entry.first;
	  }

	  // Not found: the new string starts at the current end of the table and
	  // takes its length plus one more byte.
	  const uint32_t NewOffset = Size;
	  OffsetToIdMap[NewOffset] = Table.size();
	  Table.push_back(S);
	  Size += S.size() + 1;
	  return NewOffset;
	}

	/// Set up the handler with cleared per-function state and seed the string
	/// table with a first string.
	BTFDebug::BTFDebug(AsmPrinter *AP)
	    : DebugHandlerBase(AP), OS(*Asm->OutStreamer), SkipInstruction(false),
	      LineInfoGenerated(false), SecNameOff(0), ArrayIndexTypeId(0),
	      MapDefNotCollected(true) {
	  addString("\0");
	}

	/// Append \p TypeEntry to the type table and remember its id for \p Ty in
	/// DIToIdMap. Ids are 1-based positions in the table.
	uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry,
	                           const DIType *Ty) {
	  const uint32_t NewId = TypeEntries.size() + 1;
	  TypeEntry->setId(NewId);
	  uint32_t Id = TypeEntry->getId();
	  DIToIdMap[Ty] = Id;
	  TypeEntries.push_back(std::move(TypeEntry));
	  return Id;
	}

	/// Append \p TypeEntry to the type table without registering it in
	/// DIToIdMap. Ids are 1-based positions in the table.
	uint32_t BTFDebug::addType(std::unique_ptr<BTFTypeBase> TypeEntry) {
	  const uint32_t NewId = TypeEntries.size() + 1;
	  TypeEntry->setId(NewId);
	  uint32_t Id = TypeEntry->getId();
	  TypeEntries.push_back(std::move(TypeEntry));
	  return Id;
	}

	/// Create a BTF INT entry for \p BTy. Basic types with any other encoding
	/// are ignored and \p TypeId is left untouched.
	void BTFDebug::visitBasicType(const DIBasicType *BTy, uint32_t &TypeId) {
	  // Only int types are supported in BTF.
	  const uint32_t Encoding = BTy->getEncoding();
	  switch (Encoding) {
	  case dwarf::DW_ATE_boolean:
	  case dwarf::DW_ATE_signed:
	  case dwarf::DW_ATE_signed_char:
	  case dwarf::DW_ATE_unsigned:
	  case dwarf::DW_ATE_unsigned_char:
	    break;
	  default:
	    return;
	  }

	  // Register the new entry under BTy so that later references to the
	  // same DIBasicType resolve through DIToIdMap.
	  TypeId = addType(
	      llvm::make_unique<BTFTypeInt>(Encoding, BTy->getSizeInBits(),
	                                    BTy->getOffsetInBits(), BTy->getName()),
	      BTy);
	}

	/// Handle subprogram or subroutine types.
	void BTFDebug::visitSubroutineType(
	const DISubroutineType *STy, bool ForSubprog,
	const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
	uint32_t &TypeId) {
	DITypeRefArray Elements = STy->getTypeArray();
	uint32_t VLen = Elements.size() - 1;
	if (VLen > BTF::MAX_VLEN)
	return;

	// Subprogram has a valid non-zero-length name, and the pointee of
	// a function pointer has an empty name. The subprogram type will
	// not be added to DIToIdMap as it should not be referenced by
	// any other types.
	auto TypeEntry = llvm::make_unique<BTFTypeFuncProto>(STy, VLen, FuncArgNames);
	if (ForSubprog)
	TypeId = addType(std::move(TypeEntry)); // For subprogram
	else
	TypeId = addType(std::move(TypeEntry), STy); // For func ptr

	// Visit return type and func arg types.
	for (const auto Element : Elements) {
	visitTypeEntry(Element);
	}
	}

	/// Handle structure/union types.
	void BTFDebug::visitStructType(const DICompositeType *CTy, bool IsStruct,
	uint32_t &TypeId) {
	const DINodeArray Elements = CTy->getElements();
	uint32_t VLen = Elements.size();
	if (VLen > BTF::MAX_VLEN)
	return;

	// Check whether we have any bitfield members or not
	bool HasBitField = false;
	for (const auto *Element : Elements) {
	auto E = cast<DIDerivedType>(Element);
	if (E->isBitField()) {
	HasBitField = true;
	break;
	}
	}

	auto TypeEntry =
	llvm::make_unique<BTFTypeStruct>(CTy, IsStruct, HasBitField, VLen);
	StructTypes.push_back(TypeEntry.get());
	TypeId = addType(std::move(TypeEntry), CTy);

	// Visit all struct members.
	for (const auto *Element : Elements)
	visitTypeEntry(cast<DIDerivedType>(Element));
	}

	// Create BTF array entries for \p CTy, one per dimension, and return the
	// id of the outermost dimension in \p TypeId. Also creates the shared
	// array index type on first use.
	void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) {
	// Visit array element type.
	// NOTE(review): ElemTypeId is not initialized here; it stays unset if the
	// visit below does not assign it - confirm that cannot happen.
	uint32_t ElemTypeId, ElemSize;
	const DIType *ElemType = CTy->getBaseType();
	visitTypeEntry(ElemType, ElemTypeId, false, false);
	+
	+ // Strip qualifiers from element type to get accurate element size.
	+ ElemType = stripQualifiers(ElemType);
	ElemSize = ElemType->getSizeInBits() >> 3;

	// An array with zero total size gets a single entry with zero element
	// size and zero elements.
	if (!CTy->getSizeInBits()) {
	- auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemTypeId, 0, 0);
	+ auto TypeEntry = llvm::make_unique<BTFTypeArray>(ElemType, ElemTypeId, 0, 0);
	ArrayTypes.push_back(TypeEntry.get());
	ElemTypeId = addType(std::move(TypeEntry), CTy);
	} else {
	// Visit array dimensions.
	// Dimensions are walked from last to first, so each new entry uses the
	// previously created one as its element type.
	DINodeArray Elements = CTy->getElements();
	for (int I = Elements.size() - 1; I >= 0; --I) {
	if (auto *Element = dyn_cast_or_null<DINode>(Elements[I]))
	if (Element->getTag() == dwarf::DW_TAG_subrange_type) {
	const DISubrange *SR = cast<DISubrange>(Element);
	// NOTE(review): CI is dereferenced without a null check; presumably the
	// count is always a ConstantInt here - confirm.
	auto CI = SR->getCount().dyn_cast<ConstantInt >();
	int64_t Count = CI->getSExtValue();
	// Only the entry for dimension 0 receives the element DIType.
	+ const DIType *ArrayElemTy = (I == 0) ? ElemType : nullptr;

	auto TypeEntry =
	- llvm::make_unique<BTFTypeArray>(ElemTypeId, ElemSize, Count);
	+ llvm::make_unique<BTFTypeArray>(ArrayElemTy, ElemTypeId,
	+ ElemSize, Count);
	ArrayTypes.push_back(TypeEntry.get());
	// Only dimension 0 is registered under CTy in DIToIdMap.
	if (I == 0)
	ElemTypeId = addType(std::move(TypeEntry), CTy);
	else
	ElemTypeId = addType(std::move(TypeEntry));
	// The element size of the next dimension is this whole dimension.
	ElemSize = ElemSize * Count;
	}
	}
	}

	// The array TypeId is the type id of the outermost dimension.
	TypeId = ElemTypeId;

	// The IR does not have a type for array index while BTF wants one.
	// So create an array index type if there is none.
	if (!ArrayIndexTypeId) {
	auto TypeEntry = llvm::make_unique<BTFTypeInt>(dwarf::DW_ATE_unsigned, 32,
	0, "__ARRAY_SIZE_TYPE__");
	ArrayIndexTypeId = addType(std::move(TypeEntry));
	}
	}

	void BTFDebug::visitEnumType(const DICompositeType *CTy, uint32_t &TypeId) {
	DINodeArray Elements = CTy->getElements();
	uint32_t VLen = Elements.size();
	if (VLen > BTF::MAX_VLEN)
	return;

	auto TypeEntry = llvm::make_unique<BTFTypeEnum>(CTy, VLen);
	TypeId = addType(std::move(TypeEntry), CTy);
	// No need to visit base type as BTF does not encode it.
	}

	/// Handle structure/union forward declarations by creating a BTF FWD
	/// entry registered under \p CTy.
	void BTFDebug::visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
	                                uint32_t &TypeId) {
	  TypeId = addType(llvm::make_unique<BTFTypeFwd>(CTy->getName(), IsUnion),
	                   CTy);
	}

	/// Handle structure, union, array and enumeration types by dispatching on
	/// the DWARF tag. Composite types with any other tag are ignored.
	void BTFDebug::visitCompositeType(const DICompositeType *CTy,
	                                  uint32_t &TypeId) {
	  const auto Tag = CTy->getTag();
	  switch (Tag) {
	  case dwarf::DW_TAG_structure_type:
	  case dwarf::DW_TAG_union_type:
	    // A forward declaration has no members and gets its own entry kind.
	    if (CTy->isForwardDecl())
	      visitFwdDeclType(CTy, Tag == dwarf::DW_TAG_union_type, TypeId);
	    else
	      visitStructType(CTy, Tag == dwarf::DW_TAG_structure_type, TypeId);
	    break;
	  case dwarf::DW_TAG_array_type:
	    visitArrayType(CTy, TypeId);
	    break;
	  case dwarf::DW_TAG_enumeration_type:
	    visitEnumType(CTy, TypeId);
	    break;
	  default:
	    break;
	  }
	}

	/// Handle pointer, typedef, const, volatile, restrict and member types.
	void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
	bool CheckPointer, bool SeenPointer) {
	unsigned Tag = DTy->getTag();

	/// Try to avoid chasing pointees, esp. structure pointees which may
	/// unnecessary bring in a lot of types.
	if (CheckPointer && !SeenPointer) {
	SeenPointer = Tag == dwarf::DW_TAG_pointer_type;
	}

	if (CheckPointer && SeenPointer) {
	const DIType *Base = DTy->getBaseType();
	if (Base) {
	if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
	auto CTag = CTy->getTag();
	if ((CTag == dwarf::DW_TAG_structure_type \|\|
	CTag == dwarf::DW_TAG_union_type) &&
	!CTy->isForwardDecl()) {
	/// Find a candidate, generate a fixup. Later on the struct/union
	/// pointee type will be replaced with either a real type or
	/// a forward declaration.
	auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, true);
	auto &Fixup = FixupDerivedTypes[CTy->getName()];
	Fixup.first = CTag == dwarf::DW_TAG_union_type;
	Fixup.second.push_back(TypeEntry.get());
	TypeId = addType(std::move(TypeEntry), DTy);
	return;
	}
	}
	}
	}

	if (Tag == dwarf::DW_TAG_pointer_type \|\| Tag == dwarf::DW_TAG_typedef \|\|
	Tag == dwarf::DW_TAG_const_type \|\| Tag == dwarf::DW_TAG_volatile_type \|\|
	Tag == dwarf::DW_TAG_restrict_type) {
	auto TypeEntry = llvm::make_unique<BTFTypeDerived>(DTy, Tag, false);
	TypeId = addType(std::move(TypeEntry), DTy);
	} else if (Tag != dwarf::DW_TAG_member) {
	return;
	}

	// Visit base type of pointer, typedef, const, volatile, restrict or
	// struct/union member.
	uint32_t TempTypeId = 0;
	if (Tag == dwarf::DW_TAG_member)
	visitTypeEntry(DTy->getBaseType(), TempTypeId, true, false);
	else
	visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer);
	}

	/// Ensure a BTF entry exists for \p Ty and report its id in \p TypeId.
	/// A null type or a type already in DIToIdMap is answered from the map;
	/// everything else is dispatched on the concrete DIType class.
	void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
	                              bool CheckPointer, bool SeenPointer) {
	  // A null type is looked up with operator[], as before.
	  if (!Ty) {
	    TypeId = DIToIdMap[Ty];
	    return;
	  }

	  auto Known = DIToIdMap.find(Ty);
	  if (Known != DIToIdMap.end()) {
	    TypeId = Known->second;
	    return;
	  }

	  if (const auto *BTy = dyn_cast<DIBasicType>(Ty)) {
	    visitBasicType(BTy, TypeId);
	    return;
	  }
	  if (const auto *STy = dyn_cast<DISubroutineType>(Ty)) {
	    visitSubroutineType(STy, false, std::unordered_map<uint32_t, StringRef>(),
	                        TypeId);
	    return;
	  }
	  if (const auto *CTy = dyn_cast<DICompositeType>(Ty)) {
	    visitCompositeType(CTy, TypeId);
	    return;
	  }
	  if (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
	    visitDerivedType(DTy, TypeId, CheckPointer, SeenPointer);
	    return;
	  }
	  llvm_unreachable("Unknown DIType");
	}

	/// Convenience overload: visit \p Ty with pointer checking off and
	/// discard the resulting type id.
	void BTFDebug::visitTypeEntry(const DIType *Ty) {
	  uint32_t IgnoredId;
	  visitTypeEntry(Ty, IgnoredId, false, false);
	}

	void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
	if (!Ty \|\| DIToIdMap.find(Ty) != DIToIdMap.end()) {
	TypeId = DIToIdMap[Ty];
	return;
	}

	// MapDef type is a struct type
	const auto *CTy = dyn_cast<DICompositeType>(Ty);
	if (!CTy)
	return;

	auto Tag = CTy->getTag();
	if (Tag != dwarf::DW_TAG_structure_type \|\| CTy->isForwardDecl())
	return;

	// Record this type
	const DINodeArray Elements = CTy->getElements();
	bool HasBitField = false;
	for (const auto *Element : Elements) {
	auto E = cast<DIDerivedType>(Element);
	if (E->isBitField()) {
	HasBitField = true;
	break;
	}
	}

	auto TypeEntry =
	llvm::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
	StructTypes.push_back(TypeEntry.get());
	TypeId = addType(std::move(TypeEntry), CTy);

	// Visit all struct members
	for (const auto *Element : Elements) {
	const auto *MemberType = cast<DIDerivedType>(Element);
	visitTypeEntry(MemberType->getBaseType());
	}
	}

	/// Read file contents from the actual file or from the source embedded in
	/// the debug info, and cache them line by line in FileContent.
	///
	/// \param SP subprogram whose file is to be loaded.
	/// \returns the file name used as the FileContent key: the file name
	///          itself, or directory + "/" + file name when the file name does
	///          not start with "/" and a directory is present.
	std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
	  auto File = SP->getFile();
	  std::string FileName;

	  if (!File->getFilename().startswith("/") && File->getDirectory().size())
	    FileName = File->getDirectory().str() + "/" + File->getFilename().str();
	  else
	    FileName = File->getFilename();

	  // No need to populate the contents if they have been populated already.
	  if (FileContent.find(FileName) != FileContent.end())
	    return FileName;

	  std::vector<std::string> Content;
	  std::string Line;
	  Content.push_back(Line); // Line 0 for empty string

	  // Prefer the embedded source; otherwise try to read the file from disk.
	  // If neither is available only the empty line 0 is cached.
	  std::unique_ptr<MemoryBuffer> Buf;
	  auto Source = File->getSource();
	  if (Source)
	    Buf = MemoryBuffer::getMemBufferCopy(*Source);
	  else if (ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
	               MemoryBuffer::getFile(FileName))
	    Buf = std::move(*BufOrErr);
	  if (Buf)
	    for (line_iterator I(*Buf, false), E; I != E; ++I)
	      Content.push_back(*I);

	  // Content is not used again, so move it into the cache rather than
	  // copying every line of the file.
	  FileContent[FileName] = std::move(Content);
	  return FileName;
	}

	/// Append a line-info record for \p Label to the table of the current
	/// section (SecNameOff).
	///
	/// \param SP     subprogram whose source file supplies the line text.
	/// \param Label  symbol that marks the instruction.
	/// \param Line   source line number, also used as index into the cached
	///               file content.
	/// \param Column source column number.
	void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
	                                 uint32_t Line, uint32_t Column) {
	  std::string FileName = populateFileContent(SP);
	  // populateFileContent guarantees an entry for FileName, so one lookup
	  // serves both the bounds check and the line access.
	  const auto &Lines = FileContent[FileName];

	  BTFLineInfo LineInfo;
	  LineInfo.Label = Label;
	  LineInfo.FileNameOff = addString(FileName);
	  // If file content is not available, let LineOff = 0.
	  if (Line < Lines.size())
	    LineInfo.LineOff = addString(Lines[Line]);
	  else
	    LineInfo.LineOff = 0;
	  LineInfo.LineNum = Line;
	  LineInfo.ColumnNum = Column;
	  LineInfoTable[SecNameOff].push_back(LineInfo);
	}

	/// Emit the four header bytes shared by both sections: the 2-byte magic
	/// (annotated in hexadecimal), the 1-byte version and a 1-byte zero.
	void BTFDebug::emitCommonHeader() {
	  OS.AddComment("0x" + Twine::utohexstr(BTF::MAGIC));
	  OS.EmitIntValue(BTF::MAGIC, 2);

	  OS.EmitIntValue(BTF::VERSION, 1);

	  OS.EmitIntValue(0, 1);
	}

	void BTFDebug::emitBTFSection() {
	// Do not emit section if no types and only "" string.
	if (!TypeEntries.size() && StringTable.getSize() == 1)
	return;

	MCContext &Ctx = OS.getContext();
	OS.SwitchSection(Ctx.getELFSection(".BTF", ELF::SHT_PROGBITS, 0));

	// Emit header.
	emitCommonHeader();
	OS.EmitIntValue(BTF::HeaderSize, 4);

	uint32_t TypeLen = 0, StrLen;
	for (const auto &TypeEntry : TypeEntries)
	TypeLen += TypeEntry->getSize();
	StrLen = StringTable.getSize();

	OS.EmitIntValue(0, 4);
	OS.EmitIntValue(TypeLen, 4);
	OS.EmitIntValue(TypeLen, 4);
	OS.EmitIntValue(StrLen, 4);

	// Emit type table.
	for (const auto &TypeEntry : TypeEntries)
	TypeEntry->emitType(OS);

	// Emit string table.
	uint32_t StringOffset = 0;
	for (const auto &S : StringTable.getTable()) {
	OS.AddComment("string offset=" + std::to_string(StringOffset));
	OS.EmitBytes(S);
	OS.EmitBytes(StringRef("\0", 1));
	StringOffset += S.size() + 1;
	}
	}

	// Emit the .BTF.ext section: header, func_info table, line_info table
	// and, when present, the offset-reloc and extern-reloc tables. Each table
	// is grouped by the section-name string offset used as the map key.
	void BTFDebug::emitBTFExtSection() {
	// Do not emit the section if all four tables (FuncInfo, LineInfo,
	// OffsetReloc, ExternReloc) are empty.
	if (!FuncInfoTable.size() && !LineInfoTable.size() &&
	!OffsetRelocTable.size() && !ExternRelocTable.size())
	return;

	MCContext &Ctx = OS.getContext();
	OS.SwitchSection(Ctx.getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0));

	// Emit header.
	emitCommonHeader();
	OS.EmitIntValue(BTF::ExtHeaderSize, 4);

	// Account for FuncInfo/LineInfo record size as well.
	uint32_t FuncLen = 4, LineLen = 4;
	// Do not account for optional OffsetReloc/ExternReloc.
	uint32_t OffsetRelocLen = 0, ExternRelocLen = 0;
	// Each table contributes one per-section header plus its records.
	for (const auto &FuncSec : FuncInfoTable) {
	FuncLen += BTF::SecFuncInfoSize;
	FuncLen += FuncSec.second.size() * BTF::BPFFuncInfoSize;
	}
	for (const auto &LineSec : LineInfoTable) {
	LineLen += BTF::SecLineInfoSize;
	LineLen += LineSec.second.size() * BTF::BPFLineInfoSize;
	}
	for (const auto &OffsetRelocSec : OffsetRelocTable) {
	OffsetRelocLen += BTF::SecOffsetRelocSize;
	OffsetRelocLen += OffsetRelocSec.second.size() * BTF::BPFOffsetRelocSize;
	}
	for (const auto &ExternRelocSec : ExternRelocTable) {
	ExternRelocLen += BTF::SecExternRelocSize;
	ExternRelocLen += ExternRelocSec.second.size() * BTF::BPFExternRelocSize;
	}

	// The optional tables only get their 4-byte record-size word when they
	// are non-empty.
	if (OffsetRelocLen)
	OffsetRelocLen += 4;
	if (ExternRelocLen)
	ExternRelocLen += 4;

	// Offset/length pairs for the four tables, laid out back to back.
	OS.EmitIntValue(0, 4);
	OS.EmitIntValue(FuncLen, 4);
	OS.EmitIntValue(FuncLen, 4);
	OS.EmitIntValue(LineLen, 4);
	OS.EmitIntValue(FuncLen + LineLen, 4);
	OS.EmitIntValue(OffsetRelocLen, 4);
	OS.EmitIntValue(FuncLen + LineLen + OffsetRelocLen, 4);
	OS.EmitIntValue(ExternRelocLen, 4);

	// Emit func_info table.
	OS.AddComment("FuncInfo");
	OS.EmitIntValue(BTF::BPFFuncInfoSize, 4);
	for (const auto &FuncSec : FuncInfoTable) {
	OS.AddComment("FuncInfo section string offset=" +
	std::to_string(FuncSec.first));
	OS.EmitIntValue(FuncSec.first, 4);
	OS.EmitIntValue(FuncSec.second.size(), 4);
	for (const auto &FuncInfo : FuncSec.second) {
	Asm->EmitLabelReference(FuncInfo.Label, 4);
	OS.EmitIntValue(FuncInfo.TypeId, 4);
	}
	}

	// Emit line_info table.
	OS.AddComment("LineInfo");
	OS.EmitIntValue(BTF::BPFLineInfoSize, 4);
	for (const auto &LineSec : LineInfoTable) {
	OS.AddComment("LineInfo section string offset=" +
	std::to_string(LineSec.first));
	OS.EmitIntValue(LineSec.first, 4);
	OS.EmitIntValue(LineSec.second.size(), 4);
	for (const auto &LineInfo : LineSec.second) {
	Asm->EmitLabelReference(LineInfo.Label, 4);
	OS.EmitIntValue(LineInfo.FileNameOff, 4);
	OS.EmitIntValue(LineInfo.LineOff, 4);
	OS.AddComment("Line " + std::to_string(LineInfo.LineNum) + " Col " +
	std::to_string(LineInfo.ColumnNum));
	// Line number occupies the bits above the low 10; the column is ORed
	// into the low bits.
	OS.EmitIntValue(LineInfo.LineNum << 10 \| LineInfo.ColumnNum, 4);
	}
	}

	// Emit offset reloc table.
	if (OffsetRelocLen) {
	OS.AddComment("OffsetReloc");
	OS.EmitIntValue(BTF::BPFOffsetRelocSize, 4);
	for (const auto &OffsetRelocSec : OffsetRelocTable) {
	OS.AddComment("Offset reloc section string offset=" +
	std::to_string(OffsetRelocSec.first));
	OS.EmitIntValue(OffsetRelocSec.first, 4);
	OS.EmitIntValue(OffsetRelocSec.second.size(), 4);
	for (const auto &OffsetRelocInfo : OffsetRelocSec.second) {
	Asm->EmitLabelReference(OffsetRelocInfo.Label, 4);
	OS.EmitIntValue(OffsetRelocInfo.TypeID, 4);
	OS.EmitIntValue(OffsetRelocInfo.OffsetNameOff, 4);
	}
	}
	}

	// Emit extern reloc table.
	if (ExternRelocLen) {
	OS.AddComment("ExternReloc");
	OS.EmitIntValue(BTF::BPFExternRelocSize, 4);
	for (const auto &ExternRelocSec : ExternRelocTable) {
	OS.AddComment("Extern reloc section string offset=" +
	std::to_string(ExternRelocSec.first));
	OS.EmitIntValue(ExternRelocSec.first, 4);
	OS.EmitIntValue(ExternRelocSec.second.size(), 4);
	for (const auto &ExternRelocInfo : ExternRelocSec.second) {
	Asm->EmitLabelReference(ExternRelocInfo.Label, 4);
	OS.EmitIntValue(ExternRelocInfo.ExternNameOff, 4);
	}
	}
	}
	}

	/// Per-function setup: collect the types the function references, create
	/// its FUNC_PROTO and FUNC entries, complete all type entries and record
	/// a func-info entry under the function's section name.
	///
	/// Functions whose compile unit has emission kind NoDebug are skipped and
	/// SkipInstruction is set for them.
	void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
	  auto *SP = MF->getFunction().getSubprogram();
	  auto *Unit = SP->getUnit();

	  if (Unit->getEmissionKind() == DICompileUnit::NoDebug) {
	    SkipInstruction = true;
	    return;
	  }
	  SkipInstruction = false;

	  // Collect MapDef types. Map definition needs to collect
	  // pointee types. Do it first. Otherwise, for the following
	  // case:
	  //    struct m { ...};
	  //    struct t {
	  //      struct m *key;
	  //    };
	  //    foo(struct t *arg);
	  //
	  //    struct mapdef {
	  //      ...
	  //      struct m *key;
	  //      ...
	  //    } __attribute__((section(".maps"))) hash_map;
	  //
	  // If subroutine foo is traversed first, a type chain
	  // "ptr->struct m(fwd)" will be created and later on
	  // when traversing mapdef, since "ptr->struct m" exists,
	  // the traversal of "struct m" will be omitted.
	  if (MapDefNotCollected) {
	    processGlobals(true);
	    MapDefNotCollected = false;
	  }

	  // Collect all types locally referenced in this function.
	  // Use RetainedNodes so we can collect all argument names
	  // even if the argument is not used.
	  std::unordered_map<uint32_t, StringRef> FuncArgNames;
	  for (const DINode *DN : SP->getRetainedNodes()) {
	    if (const auto *DV = dyn_cast<DILocalVariable>(DN)) {
	      // Collect function arguments for subprogram func type.
	      uint32_t Arg = DV->getArg();
	      if (Arg) {
	        visitTypeEntry(DV->getType());
	        FuncArgNames[Arg] = DV->getName();
	      }
	    }
	  }

	  // Construct subprogram func proto type. visitSubroutineType leaves its
	  // output untouched when the prototype has more than BTF::MAX_VLEN
	  // parameters, so start from 0 instead of an indeterminate value.
	  uint32_t ProtoTypeId = 0;
	  visitSubroutineType(SP->getType(), true, FuncArgNames, ProtoTypeId);

	  // Construct subprogram func type
	  auto FuncTypeEntry =
	      llvm::make_unique<BTFTypeFunc>(SP->getName(), ProtoTypeId);
	  uint32_t FuncTypeId = addType(std::move(FuncTypeEntry));

	  for (const auto &TypeEntry : TypeEntries)
	    TypeEntry->completeType(*this);

	  // Construct funcinfo and the first lineinfo for the function.
	  MCSymbol *FuncLabel = Asm->getFunctionBegin();
	  BTFFuncInfo FuncInfo;
	  FuncInfo.Label = FuncLabel;
	  FuncInfo.TypeId = FuncTypeId;
	  // Use the label's ELF section name when it has one, ".text" otherwise.
	  if (FuncLabel->isInSection()) {
	    MCSection &Section = FuncLabel->getSection();
	    const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
	    assert(SectionELF && "Null section for Function Label");
	    SecNameOff = addString(SectionELF->getSectionName());
	  } else {
	    SecNameOff = addString(".text");
	  }
	  FuncInfoTable[SecNameOff].push_back(FuncInfo);
	}

	/// Reset the per-function state once a function has been processed.
	void BTFDebug::endFunctionImpl(const MachineFunction *MF) {
	  SecNameOff = 0;
	  LineInfoGenerated = false;
	  SkipInstruction = false;
	}

	/// On-demand populate struct types as requested from abstract member
	/// accessing.
	///
	/// Visits \p Ty, completes every type entry and returns the id assigned
	/// to \p Ty.
	unsigned BTFDebug::populateStructType(const DIType *Ty) {
	  // Start from 0 so that a visit which does not assign an id (for example
	  // an unsupported type) yields 0 instead of an indeterminate value.
	  unsigned Id = 0;
	  visitTypeEntry(Ty, Id, false, false);
	  for (const auto &TypeEntry : TypeEntries)
	    TypeEntry->completeType(*this);
	  return Id;
	}

	// Find struct/array debuginfo types given a type id.
	void BTFDebug::setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
	BTFTypeArray **PrevArrayType) {
	for (const auto &StructType : StructTypes) {
	if (StructType->getId() == TypeId) {
	*PrevStructType = StructType;
	return;
	}
	}
	for (const auto &ArrayType : ArrayTypes) {
	if (ArrayType->getId() == TypeId) {
	*PrevArrayType = ArrayType;
	return;
	}
	}
	}

	/// Generate a struct member offset relocation.
	// Walks the colon-separated indices of \p AccessPattern through the
	// struct/array entries starting at \p RootTy, accumulates the byte
	// offset, and records both the relocation and the computed offset.
	// NOTE(review): ORSym and RootTy are used as pointers below (RootTy is
	// passed to populateStructType, ORSym stored as a label); the pointer
	// declarators appear to be missing from this rendering - confirm.
	void BTFDebug::generateOffsetReloc(const MachineInstr *MI,
	const MCSymbol ORSym, DIType RootTy,
	StringRef AccessPattern) {
	BTFTypeStruct *PrevStructType = nullptr;
	BTFTypeArray *PrevArrayType = nullptr;
	unsigned RootId = populateStructType(RootTy);
	setTypeFromId(RootId, &PrevStructType, &PrevArrayType);
	// NOTE(review): PrevStructType is dereferenced without a null check;
	// presumably RootTy always resolves to a struct entry - confirm.
	unsigned RootTySize = PrevStructType->getStructSize();
	// After this change only the part of AccessPattern after the first ':'
	// is treated as the index list.
	+ StringRef IndexPattern = AccessPattern.substr(AccessPattern.find_first_of(':') + 1);

	BTFOffsetReloc OffsetReloc;
	OffsetReloc.Label = ORSym;
	// The stored name is the index list without its last character.
	- OffsetReloc.OffsetNameOff = addString(AccessPattern.drop_back());
	+ OffsetReloc.OffsetNameOff = addString(IndexPattern.drop_back());
	OffsetReloc.TypeID = RootId;

	// [Start, End) delimits the current index; a ':' terminates it.
	uint32_t Start = 0, End = 0, Offset = 0;
	bool FirstAccess = true;
	- for (auto C : AccessPattern) {
	+ for (auto C : IndexPattern) {
	if (C != ':') {
	End++;
	} else {
	- std::string SubStr = AccessPattern.substr(Start, End - Start);
	+ std::string SubStr = IndexPattern.substr(Start, End - Start);
	int Loc = std::stoi(SubStr);

	// The first index scales by the root struct size; later indices step
	// into the current struct member or array element.
	if (FirstAccess) {
	Offset = Loc * RootTySize;
	FirstAccess = false;
	} else if (PrevStructType) {
	uint32_t MemberOffset, MemberTypeId;
	PrevStructType->getMemberInfo(Loc, MemberOffset, MemberTypeId);

	// Member offsets are in bits; convert to bytes.
	Offset += MemberOffset >> 3;
	PrevStructType = nullptr;
	setTypeFromId(MemberTypeId, &PrevStructType, &PrevArrayType);
	} else if (PrevArrayType) {
	uint32_t LocOffset, ElementTypeId;
	PrevArrayType->getLocInfo(Loc, LocOffset, ElementTypeId);

	Offset += LocOffset;
	PrevArrayType = nullptr;
	setTypeFromId(ElementTypeId, &PrevStructType, &PrevArrayType);
	+ } else {
	+ llvm_unreachable("Internal Error: BTF offset relocation type traversal error");
	}
	+
	Start = End + 1;
	End = Start;
	}
	}
	// After this change the offset is keyed by the full access pattern only.
	- AccessOffsets[RootTy->getName().str() + ":" + AccessPattern.str()] = Offset;
	+ AccessOffsets[AccessPattern.str()] = Offset;
	OffsetRelocTable[SecNameOff].push_back(OffsetReloc);
	}

	/// Inspect an LD_imm64 instruction and, for the two cases below, record a
	/// .BTF.ext relocation at a fresh temporary label.
	void BTFDebug::processLDimm64(const MachineInstr *MI) {
	  // Two forms of LD_imm64 produce a .BTF.ext record.
	  //
	  // 1. "r2 = LD_imm64 @__BTF_...": the insn goes into the OffsetReloc
	  //    subsection. The relocation looks like:
	  //      . SecName:
	  //      .   InstOffset
	  //      .   TypeID
	  //      .   OffSetNameOff
	  //    Later, the insn is replaced with "r2 = <offset>" where "<offset>"
	  //    equals the offset based on current type definitions.
	  //
	  // 2. "r2 = LD_imm64 @VAR" where VAR is a patchable external global:
	  //    the insn goes into the ExternReloc subsection. The relocation
	  //    looks like:
	  //      . SecName:
	  //      .   InstOffset
	  //      .   ExternNameOff
	  //    Later, the insn is replaced with "r2 = <value>" or
	  //    "LD_imm64 r2, <value>" where "<value>" = 0.

	  // Only a global-variable operand can be a candidate.
	  const MachineOperand &MO = MI->getOperand(1);
	  if (!MO.isGlobal())
	    return;

	  const GlobalValue *GVal = MO.getGlobal();
	  auto *GVar = dyn_cast<GlobalVariable>(GVal);
	  if (!GVar)
	    return;

	  if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
	    MCSymbol *ORSym = OS.getContext().createTempSymbol();
	    OS.EmitLabel(ORSym);

	    MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
	    DIType *Ty = dyn_cast<DIType>(MDN);
	    generateOffsetReloc(MI, ORSym, Ty, GVar->getName());
	    return;
	  }

	  if (!GVar->hasInitializer() && GVar->hasExternalLinkage() &&
	      GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
	    MCSymbol *ORSym = OS.getContext().createTempSymbol();
	    OS.EmitLabel(ORSym);

	    BTFExternReloc ExternReloc;
	    ExternReloc.Label = ORSym;
	    ExternReloc.ExternNameOff = addString(GVar->getName());
	    ExternRelocTable[SecNameOff].push_back(ExternReloc);
	  }
	}

	/// Per-instruction hook: process LD_imm64 relocations and record a
	/// line-info entry whenever the debug location changes.
	void BTFDebug::beginInstruction(const MachineInstr *MI) {
	  DebugHandlerBase::beginInstruction(MI);

	  if (SkipInstruction)
	    return;
	  if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup))
	    return;

	  if (MI->isInlineAsm()) {
	    // The asm string is the first operand after the register definitions,
	    // so count those first.
	    unsigned NumDefs = 0;
	    while (MI->getOperand(NumDefs).isReg() &&
	           MI->getOperand(NumDefs).isDef())
	      ++NumDefs;

	    // Skip this inline asm instruction if the asmstr is empty.
	    const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
	    if (AsmStr[0] == 0)
	      return;
	  }

	  if (MI->getOpcode() == BPF::LD_imm64)
	    processLDimm64(MI);

	  // Skip this instruction if it has no DebugLoc or if its DebugLoc is the
	  // same as that of the previous instruction.
	  const DebugLoc &DL = MI->getDebugLoc();
	  if (!DL || PrevInstLoc == DL) {
	    // Nothing is emitted for this instruction. If the function has no
	    // LineInfo yet, construct one based on the function signature.
	    if (!LineInfoGenerated) {
	      auto *S = MI->getMF()->getFunction().getSubprogram();
	      MCSymbol *FuncLabel = Asm->getFunctionBegin();
	      constructLineInfo(S, FuncLabel, S->getLine(), 0);
	      LineInfoGenerated = true;
	    }
	    return;
	  }

	  // Create a temporary label to remember the insn for lineinfo.
	  MCSymbol *LineSym = OS.getContext().createTempSymbol();
	  OS.EmitLabel(LineSym);

	  // Construct the lineinfo from the scope's subprogram.
	  auto SP = DL.get()->getScope()->getSubprogram();
	  constructLineInfo(SP, LineSym, DL.getLine(), DL.getCol());

	  LineInfoGenerated = true;
	  PrevInstLoc = DL;
	}

	/// Generate BTF types, VAR entries and DataSec entries for module globals.
	///
	/// \param ProcessingMapDef when true, only globals whose section name starts
	/// with ".maps" are handled; when false, only the remaining globals are.
	void BTFDebug::processGlobals(bool ProcessingMapDef) {
	  // Collect all types referenced by globals.
	  const Module *M = MMI->getModule();
	  for (const GlobalVariable &Global : M->globals()) {
	    // Ignore external globals for now.
	    if (!Global.hasInitializer() && Global.hasExternalLinkage())
	      continue;

	    // Decide the section name.
	    StringRef SecName;
	    if (Global.hasSection()) {
	      SecName = Global.getSection();
	    } else if (Global.isConstant()) {
	      // readonly section
	      SecName = ".rodata";
	    } else if (Global.hasInitializer()) {
	      // data or bss section
	      SecName = Global.getInitializer()->isZeroValue() ? ".bss" : ".data";
	    }
	    // A declaration that survived the filter above (not plain external
	    // linkage) has no initializer to inspect; querying getInitializer() on
	    // it is illegal, so SecName stays empty and the linkage filter below
	    // drops the global.

	    const bool IsMapDef = SecName.startswith(".maps");
	    if (ProcessingMapDef != IsMapDef)
	      continue;

	    // Only the first debug-info expression attached to the global is used.
	    SmallVector<DIGlobalVariableExpression *, 1> GVs;
	    Global.getDebugInfo(GVs);
	    uint32_t GVTypeId = 0;
	    if (!GVs.empty()) {
	      if (IsMapDef)
	        visitMapDefType(GVs.front()->getVariable()->getType(), GVTypeId);
	      else
	        visitTypeEntry(GVs.front()->getVariable()->getType(), GVTypeId, false,
	                       false);
	    }

	    // Only support the following globals:
	    //  . static variables
	    //  . non-static global variables with section attributes
	    // Essentially means:
	    //  . .bss/.data/.rodata DataSec entities only contain static data
	    //  . Other DataSec entities contain static or initialized global data.
	    //    Initialized global data are mostly used for finding map key/value
	    //    type id's. Whether DataSec is readonly or not can be found from
	    //    corresponding ELF section flags.
	    auto Linkage = Global.getLinkage();
	    if (Linkage != GlobalValue::InternalLinkage &&
	        (Linkage != GlobalValue::ExternalLinkage || !Global.hasSection()))
	      continue;

	    uint32_t GVarInfo = Linkage == GlobalValue::ExternalLinkage
	                            ? BTF::VAR_GLOBAL_ALLOCATED
	                            : BTF::VAR_STATIC;
	    auto VarEntry =
	        llvm::make_unique<BTFKindVar>(Global.getName(), GVTypeId, GVarInfo);
	    uint32_t VarId = addType(std::move(VarEntry));

	    // Find or create a DataSec (single map lookup).
	    std::unique_ptr<BTFKindDataSec> &DataSec = DataSecEntries[SecName];
	    if (!DataSec)
	      DataSec = llvm::make_unique<BTFKindDataSec>(Asm, SecName);

	    // Calculate symbol size
	    const DataLayout &DL = Global.getParent()->getDataLayout();
	    uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());

	    DataSec->addVar(VarId, Asm->getSymbol(&Global), Size);
	  }
	}

	/// Emit proper patchable instructions.
	///
	/// Only LD_imm64 instructions whose operand 1 is a global are considered.
	/// Returns true when \p OutMI has been given an opcode and operands here,
	/// false when \p MI was left untouched.
	bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
	if (MI->getOpcode() == BPF::LD_imm64) {
	const MachineOperand &MO = MI->getOperand(1);
	if (MO.isGlobal()) {
	const GlobalValue *GVal = MO.getGlobal();
	auto *GVar = dyn_cast<GlobalVariable>(GVal);
	if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
	// NOTE(review): the dyn_cast result is dereferenced without a null check;
	// presumably the metadata is always a DIType here -- confirm.
	MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
	DIType *Ty = dyn_cast<DIType>(MDN);
	std::string TypeName = Ty->getName();
	// AccessOffsets is a std::map, so operator[] yields 0 (and inserts the key)
	// when no offset was recorded under this name.
	- int64_t Imm = AccessOffsets[TypeName + ":" + GVar->getName().str()];
	+ int64_t Imm = AccessOffsets[GVar->getName().str()];

	// Emit "mov ri, <imm>" for abstract member accesses.
	OutMI.setOpcode(BPF::MOV_ri);
	OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
	OutMI.addOperand(MCOperand::createImm(Imm));
	return true;
	} else if (GVar && !GVar->hasInitializer() &&
	GVar->hasExternalLinkage() &&
	GVar->getSection() == BPFCoreSharedInfo::PatchableExtSecName) {
	// Patchable externals are expected to be integer typed (asserted below).
	const IntegerType *IntTy = dyn_cast<IntegerType>(GVar->getValueType());
	assert(IntTy);
	// For patchable externals, emit "LD_imm64, ri, 0" if the external
	// variable is 64bit width, emit "mov ri, 0" otherwise.
	if (IntTy->getBitWidth() == 64)
	OutMI.setOpcode(BPF::LD_imm64);
	else
	OutMI.setOpcode(BPF::MOV_ri);
	OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
	OutMI.addOperand(MCOperand::createImm(0));
	return true;
	}
	}
	}
	return false;
	}

	void BTFDebug::endModule() {
	// Collect MapDef globals if not collected yet.
	if (MapDefNotCollected) {
	processGlobals(true);
	MapDefNotCollected = false;
	}

	// Collect global types/variables except MapDef globals.
	processGlobals(false);
	for (auto &DataSec : DataSecEntries)
	addType(std::move(DataSec.second));

	// Fixups
	for (auto &Fixup : FixupDerivedTypes) {
	StringRef TypeName = Fixup.first;
	bool IsUnion = Fixup.second.first;

	// Search through struct types
	uint32_t StructTypeId = 0;
	for (const auto &StructType : StructTypes) {
	if (StructType->getName() == TypeName) {
	StructTypeId = StructType->getId();
	break;
	}
	}

	if (StructTypeId == 0) {
	auto FwdTypeEntry = llvm::make_unique<BTFTypeFwd>(TypeName, IsUnion);
	StructTypeId = addType(std::move(FwdTypeEntry));
	}

	for (auto &DType : Fixup.second.second) {
	DType->setPointeeType(StructTypeId);
	}
	}

	// Complete BTF type cross refereences.
	for (const auto &TypeEntry : TypeEntries)
	TypeEntry->completeType(*this);

	// Emit BTF sections.
	emitBTFSection();
	emitBTFExtSection();
	}
	Index: vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/BPF/BTFDebug.h (revision 351303)
	@@ -1,371 +1,375 @@
	//===- BTFDebug.h ------------------------------------------------ C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file contains support for writing BTF debug info.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_BPF_BTFDEBUG_H
	#define LLVM_LIB_TARGET_BPF_BTFDEBUG_H

	#include "llvm/ADT/StringMap.h"
	#include "llvm/CodeGen/DebugHandlerBase.h"
	#include <unordered_map>
	#include "BTF.h"

	namespace llvm {

	class AsmPrinter;
	class BTFDebug;
	class DIType;
	class MCStreamer;
	class MCSymbol;
	class MachineFunction;

	/// The base class for BTF type generation.
	class BTFTypeBase {
	protected:
	  // Kind and Id get defined values up front so that reading them before a
	  // derived class or setId() assigns them is not an indeterminate read.
	  uint8_t Kind = 0;
	  bool IsCompleted = false;
	  uint32_t Id = 0;
	  struct BTF::CommonType BTFType;

	public:
	  BTFTypeBase() = default;
	  virtual ~BTFTypeBase() = default;
	  void setId(uint32_t Id) { this->Id = Id; }
	  uint32_t getId() const { return Id; }
	  /// Number of bytes needed to hold \p NumBits bits (rounded up).
	  uint32_t roundupToBytes(uint32_t NumBits) const { return (NumBits + 7) >> 3; }
	  /// Get the size of this BTF type entry.
	  virtual uint32_t getSize() { return BTF::CommonTypeSize; }
	  /// Complete BTF type generation after all related DebugInfo types
	  /// have been visited so their BTF type id's are available
	  /// for cross reference.
	  virtual void completeType(BTFDebug &BDebug) {}
	  /// Emit types for this BTF type entry.
	  virtual void emitType(MCStreamer &OS);
	};

	/// Handle several derived types include pointer, const,
	/// volatile, typedef and restrict.
	class BTFTypeDerived : public BTFTypeBase {
	const DIDerivedType *DTy;
	bool NeedsFixup;

	public:
	BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup);
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	void setPointeeType(uint32_t PointeeType);
	};

	/// Handle struct or union forward declaration.
	class BTFTypeFwd : public BTFTypeBase {
	StringRef Name;

	public:
	BTFTypeFwd(StringRef Name, bool IsUnion);
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle int type.
	class BTFTypeInt : public BTFTypeBase {
	StringRef Name;
	uint32_t IntVal; ///< Encoding, offset, bits

	public:
	BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits, uint32_t OffsetInBits,
	StringRef TypeName);
	uint32_t getSize() { return BTFTypeBase::getSize() + sizeof(uint32_t); }
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle enumerate type.
	class BTFTypeEnum : public BTFTypeBase {
	const DICompositeType *ETy;
	std::vector<struct BTF::BTFEnum> EnumValues;

	public:
	BTFTypeEnum(const DICompositeType *ETy, uint32_t NumValues);
	uint32_t getSize() {
	return BTFTypeBase::getSize() + EnumValues.size() * BTF::BTFEnumSize;
	}
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle array type.
	class BTFTypeArray : public BTFTypeBase {
	// NOTE(review): presumably the element type with qualifiers stripped,
	// judging by the name -- confirm against the .cpp.
	+ const DIType *ElemTyNoQual;
	uint32_t ElemSize;
	struct BTF::BTFArray ArrayInfo;
	+ uint32_t ElemTypeNoQual;

	public:
	- BTFTypeArray(uint32_t ElemTypeId, uint32_t ElemSize, uint32_t NumElems);
	+ BTFTypeArray(const DIType *Ty, uint32_t ElemTypeId,
	+ uint32_t ElemSize, uint32_t NumElems);
	/// Common header plus one BTFArray record.
	uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	/// Outputs \p LocOffset and \p ElementTypeId for location \p Loc.
	void getLocInfo(uint32_t Loc, uint32_t &LocOffset, uint32_t &ElementTypeId);
	};

	/// Handle struct/union type.
	class BTFTypeStruct : public BTFTypeBase {
	const DICompositeType *STy;
	bool HasBitField;
	std::vector<struct BTF::BTFMember> Members;
	// NOTE(review): presumably one entry per member, parallel to Members --
	// confirm against the .cpp.
	+ std::vector<uint32_t> MemberTypeNoQual;

	public:
	BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField,
	uint32_t NumMembers);
	/// Common header plus one BTFMember record per collected member.
	uint32_t getSize() {
	return BTFTypeBase::getSize() + Members.size() * BTF::BTFMemberSize;
	}
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	/// Name of the type; BTFDebug::endModule() compares it against the names
	/// of types awaiting fixup.
	std::string getName();
	/// Outputs \p Offset and \p MemberType for member location \p Loc.
	void getMemberInfo(uint32_t Loc, uint32_t &Offset, uint32_t &MemberType);
	uint32_t getStructSize();
	};

	/// Handle function pointer.
	class BTFTypeFuncProto : public BTFTypeBase {
	const DISubroutineType *STy;
	std::unordered_map<uint32_t, StringRef> FuncArgNames;
	std::vector<struct BTF::BTFParam> Parameters;

	public:
	BTFTypeFuncProto(const DISubroutineType *STy, uint32_t NumParams,
	const std::unordered_map<uint32_t, StringRef> &FuncArgNames);
	uint32_t getSize() {
	return BTFTypeBase::getSize() + Parameters.size() * BTF::BTFParamSize;
	}
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle subprogram
	class BTFTypeFunc : public BTFTypeBase {
	StringRef Name;

	public:
	BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId);
	uint32_t getSize() { return BTFTypeBase::getSize(); }
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle variable instances
	class BTFKindVar : public BTFTypeBase {
	StringRef Name;
	uint32_t Info;

	public:
	BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo);
	uint32_t getSize() { return BTFTypeBase::getSize() + 4; }
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// Handle data sections
	class BTFKindDataSec : public BTFTypeBase {
	AsmPrinter *Asm;
	std::string Name;
	std::vector<std::tuple<uint32_t, const MCSymbol *, uint32_t>> Vars;

	public:
	BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName);
	uint32_t getSize() {
	return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size();
	}
	void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
	Vars.push_back(std::make_tuple(Id, Sym, Size));
	}
	std::string getName() { return Name; }
	void completeType(BTFDebug &BDebug);
	void emitType(MCStreamer &OS);
	};

	/// String table.
	class BTFStringTable {
	  /// Total size of the string table, in bytes.
	  uint32_t Size = 0;
	  /// Maps a string table offset to the index of that string in Table;
	  /// used so that the same string is not stored twice.
	  std::unordered_map<uint32_t, uint32_t> OffsetToIdMap;
	  /// The strings that make up the table.
	  std::vector<std::string> Table;

	public:
	  BTFStringTable() = default;
	  uint32_t getSize() { return Size; }
	  std::vector<std::string> &getTable() { return Table; }
	  /// Add \p S to the string table and return its offset in the table.
	  uint32_t addString(StringRef S);
	};

	/// Represent one func and its type id.
	struct BTFFuncInfo {
	  /// Func MCSymbol
	  const MCSymbol *Label;
	  /// Type id referring to .BTF type section
	  uint32_t TypeId;
	};

	/// Represent one line info.
	struct BTFLineInfo {
	  /// MCSymbol identifying insn for the lineinfo
	  MCSymbol *Label;
	  /// file name offset in the .BTF string table
	  uint32_t FileNameOff;
	  /// line offset in the .BTF string table
	  uint32_t LineOff;
	  /// the line number
	  uint32_t LineNum;
	  /// the column number
	  uint32_t ColumnNum;
	};

	/// Represent one offset relocation.
	struct BTFOffsetReloc {
	  /// MCSymbol identifying insn for the reloc
	  const MCSymbol *Label;
	  /// Type ID
	  uint32_t TypeID;
	  /// The string to traverse types
	  uint32_t OffsetNameOff;
	};

	/// Represent one extern relocation.
	struct BTFExternReloc {
	  /// MCSymbol identifying insn for the reloc
	  const MCSymbol *Label;
	  /// The extern variable name
	  uint32_t ExternNameOff;
	};

	/// Collect and emit BTF information.
	class BTFDebug : public DebugHandlerBase {
	  MCStreamer &OS;
	  bool SkipInstruction;
	  bool LineInfoGenerated;
	  uint32_t SecNameOff;
	  uint32_t ArrayIndexTypeId;
	  bool MapDefNotCollected;
	  BTFStringTable StringTable;
	  std::vector<std::unique_ptr<BTFTypeBase>> TypeEntries;
	  std::unordered_map<const DIType *, uint32_t> DIToIdMap;
	  std::map<uint32_t, std::vector<BTFFuncInfo>> FuncInfoTable;
	  std::map<uint32_t, std::vector<BTFLineInfo>> LineInfoTable;
	  std::map<uint32_t, std::vector<BTFOffsetReloc>> OffsetRelocTable;
	  std::map<uint32_t, std::vector<BTFExternReloc>> ExternRelocTable;
	  StringMap<std::vector<std::string>> FileContent;
	  std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
	  std::vector<BTFTypeStruct *> StructTypes;
	  std::vector<BTFTypeArray *> ArrayTypes;
	  std::map<std::string, int64_t> AccessOffsets;
	  /// Keyed by type name; the bool says whether the type is a union, the
	  /// vector holds the derived types whose pointee is set in endModule().
	  std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
	      FixupDerivedTypes;

	  /// Add types to TypeEntries.
	  /// @{
	  /// Add types to TypeEntries and DIToIdMap.
	  uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry, const DIType *Ty);
	  /// Add types to TypeEntries only and return type id.
	  uint32_t addType(std::unique_ptr<BTFTypeBase> TypeEntry);
	  /// @}

	  /// IR type visiting functions.
	  /// @{
	  void visitTypeEntry(const DIType *Ty);
	  void visitTypeEntry(const DIType *Ty, uint32_t &TypeId, bool CheckPointer,
	                      bool SeenPointer);
	  void visitBasicType(const DIBasicType *BTy, uint32_t &TypeId);
	  void visitSubroutineType(
	      const DISubroutineType *STy, bool ForSubprog,
	      const std::unordered_map<uint32_t, StringRef> &FuncArgNames,
	      uint32_t &TypeId);
	  void visitFwdDeclType(const DICompositeType *CTy, bool IsUnion,
	                        uint32_t &TypeId);
	  void visitCompositeType(const DICompositeType *CTy, uint32_t &TypeId);
	  void visitStructType(const DICompositeType *STy, bool IsStruct,
	                       uint32_t &TypeId);
	  void visitArrayType(const DICompositeType *ATy, uint32_t &TypeId);
	  void visitEnumType(const DICompositeType *ETy, uint32_t &TypeId);
	  void visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
	                        bool CheckPointer, bool SeenPointer);
	  void visitMapDefType(const DIType *Ty, uint32_t &TypeId);
	  /// @}

	  /// Get the file content for the subprogram. Certain lines of the file
	  /// later may be put into string table and referenced by line info.
	  std::string populateFileContent(const DISubprogram *SP);

	  /// Construct a line info.
	  /// Callers pass the subprogram and the label by pointer.
	  void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
	                         uint32_t Column);

	  /// Generate types and variables for globals.
	  void processGlobals(bool ProcessingMapDef);

	  /// Generate one offset relocation record.
	  /// Callers pass the instruction and the label by pointer.
	  void generateOffsetReloc(const MachineInstr *MI, const MCSymbol *ORSym,
	                           DIType *RootTy, StringRef AccessPattern);

	  /// Set the to-be-traversed Struct/Array Type based on TypeId.
	  void setTypeFromId(uint32_t TypeId, BTFTypeStruct **PrevStructType,
	                     BTFTypeArray **PrevArrayType);

	  /// Populating unprocessed struct type.
	  unsigned populateStructType(const DIType *Ty);

	  /// Process LD_imm64 instructions.
	  void processLDimm64(const MachineInstr *MI);

	  /// Emit common header of .BTF and .BTF.ext sections.
	  void emitCommonHeader();

	  /// Emit the .BTF section.
	  void emitBTFSection();

	  /// Emit the .BTF.ext section.
	  void emitBTFExtSection();

	protected:
	  /// Gather pre-function debug information.
	  void beginFunctionImpl(const MachineFunction *MF) override;

	  /// Post process after all instructions in this function are processed.
	  void endFunctionImpl(const MachineFunction *MF) override;

	public:
	  BTFDebug(AsmPrinter *AP);

	  /// Emit proper patchable instructions. Returns true when \p OutMI was
	  /// filled in for \p MI, false otherwise.
	  bool InstLower(const MachineInstr *MI, MCInst &OutMI);

	  /// Get the special array index type id.
	  uint32_t getArrayIndexTypeId() {
	    assert(ArrayIndexTypeId);
	    return ArrayIndexTypeId;
	  }

	  /// Add string to the string table.
	  size_t addString(StringRef S) { return StringTable.addString(S); }

	  /// Get the type id for a particular DIType.
	  uint32_t getTypeId(const DIType *Ty) {
	    assert(Ty && "Invalid null Type");
	    assert(DIToIdMap.find(Ty) != DIToIdMap.end() &&
	           "DIType not added in the BDIToIdMap");
	    return DIToIdMap[Ty];
	  }

	  void setSymbolSize(const MCSymbol *Symbol, uint64_t Size) override {}

	  /// Process beginning of an instruction.
	  void beginInstruction(const MachineInstr *MI) override;

	  /// Complete all the types and emit the BTF sections.
	  void endModule() override;
	};

	} // end namespace llvm

	#endif
	Index: vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp (revision 351303)
	@@ -1,1775 +1,1793 @@
	//===-- RISCVAsmParser.cpp - Parse RISCV assembly to MCInst instructions --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "MCTargetDesc/RISCVAsmBackend.h"
	#include "MCTargetDesc/RISCVMCExpr.h"
	#include "MCTargetDesc/RISCVMCTargetDesc.h"
	#include "MCTargetDesc/RISCVTargetStreamer.h"
	#include "TargetInfo/RISCVTargetInfo.h"
	#include "Utils/RISCVBaseInfo.h"
	#include "Utils/RISCVMatInt.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/MC/MCAssembler.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCInstBuilder.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCParser/MCAsmLexer.h"
	#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/TargetRegistry.h"

	#include <limits>

	using namespace llvm;

	// Include the auto-generated portion of the compress emitter.
	#define GEN_COMPRESS_INSTR
	#include "RISCVGenCompressInstEmitter.inc"

	namespace {
	struct RISCVOperand;

	/// Target assembly parser for RISC-V, built on MCTargetAsmParser.
	class RISCVAsmParser : public MCTargetAsmParser {
	// Saved feature-bit sets, managed by pushFeatureBits()/popFeatureBits().
	SmallVector<FeatureBitset, 4> FeatureBitStack;

	SMLoc getLoc() const { return getParser().getTok().getLoc(); }
	bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
	bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); }

	RISCVTargetStreamer &getTargetStreamer() {
	MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
	return static_cast<RISCVTargetStreamer &>(TS);
	}

	unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
	unsigned Kind) override;

	bool generateImmOutOfRangeError(OperandVector &Operands, uint64_t ErrorInfo,
	int64_t Lower, int64_t Upper, Twine Msg);

	bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	OperandVector &Operands, MCStreamer &Out,
	uint64_t &ErrorInfo,
	bool MatchingInlineAsm) override;

	bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;

	bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
	SMLoc NameLoc, OperandVector &Operands) override;

	bool ParseDirective(AsmToken DirectiveID) override;

	// Helper to actually emit an instruction to the MCStreamer. Also, when
	// possible, compression of the instruction is performed.
	void emitToStreamer(MCStreamer &S, const MCInst &Inst);

	// Helper to emit a combination of LUI, ADDI(W), and SLLI instructions that
	// synthesize the desired immediate value into the destination register.
	void emitLoadImm(unsigned DestReg, int64_t Value, MCStreamer &Out);

	// Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
	// helpers such as emitLoadLocalAddress and emitLoadAddress.
	void emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
	const MCExpr *Symbol, RISCVMCExpr::VariantKind VKHi,
	unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out);

	// Helper to emit pseudo instruction "lla" used in PC-rel addressing.
	void emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);

	// Helper to emit pseudo instruction "la" used in GOT/PC-rel addressing.
	void emitLoadAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);

	// Helper to emit pseudo instruction "la.tls.ie" used in initial-exec TLS
	// addressing.
	void emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);

	// Helper to emit pseudo instruction "la.tls.gd" used in global-dynamic TLS
	// addressing.
	void emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);

	// Helper to emit pseudo load/store instruction with a symbol.
	void emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode, SMLoc IDLoc,
	MCStreamer &Out, bool HasTmpReg);

	// Checks that a PseudoAddTPRel is using x4/tp in its second input operand.
	// Enforcing this using a restricted register class for the second input
	// operand of PseudoAddTPRel results in a poor diagnostic due to the fact
	// 'add' is an overloaded mnemonic.
	bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands);

	/// Helper for processing MC instructions that have been successfully matched
	/// by MatchAndEmitInstruction. Modifications to the emitted instructions,
	/// like the expansion of pseudo instructions (e.g., "li"), can be performed
	/// in this method.
	bool processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands,
	MCStreamer &Out);

	// Auto-generated instruction matching functions
	#define GET_ASSEMBLER_HEADER
	#include "RISCVGenAsmMatcher.inc"

	OperandMatchResultTy parseCSRSystemRegister(OperandVector &Operands);
	OperandMatchResultTy parseImmediate(OperandVector &Operands);
	OperandMatchResultTy parseRegister(OperandVector &Operands,
	bool AllowParens = false);
	OperandMatchResultTy parseMemOpBaseReg(OperandVector &Operands);
	OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
	OperandMatchResultTy parseBareSymbol(OperandVector &Operands);
	OperandMatchResultTy parseCallSymbol(OperandVector &Operands);
	OperandMatchResultTy parseJALOffset(OperandVector &Operands);

	bool parseOperand(OperandVector &Operands, StringRef Mnemonic);

	bool parseDirectiveOption();

	// Turn on Feature: only toggles FeatureString when the bit is currently
	// clear, then recomputes the available features.
	void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
	if (!(getSTI().getFeatureBits()[Feature])) {
	MCSubtargetInfo &STI = copySTI();
	setAvailableFeatures(
	ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
	}
	}

	// Turn off Feature: only toggles FeatureString when the bit is currently
	// set, then recomputes the available features.
	void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
	if (getSTI().getFeatureBits()[Feature]) {
	MCSubtargetInfo &STI = copySTI();
	setAvailableFeatures(
	ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
	}
	}

	// Save the current feature bits on FeatureBitStack.
	void pushFeatureBits() {
	FeatureBitStack.push_back(getSTI().getFeatureBits());
	}

	// Restore the most recently saved feature bits. Returns true when the stack
	// is empty (nothing restored) and false after a successful restore.
	bool popFeatureBits() {
	if (FeatureBitStack.empty())
	return true;

	FeatureBitset FeatureBits = FeatureBitStack.pop_back_val();
	copySTI().setFeatureBits(FeatureBits);
	setAvailableFeatures(ComputeAvailableFeatures(FeatureBits));

	return false;
	}
	public:
	enum RISCVMatchResultTy {
	Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
	#define GET_OPERAND_DIAGNOSTIC_TYPES
	#include "RISCVGenAsmMatcher.inc"
	#undef GET_OPERAND_DIAGNOSTIC_TYPES
	};

	static bool classifySymbolRef(const MCExpr *Expr,
	RISCVMCExpr::VariantKind &Kind,
	int64_t &Addend);

	// Registers the .half/.hword/.word/.dword directive aliases and computes the
	// initially available features from STI.
	RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
	const MCInstrInfo &MII, const MCTargetOptions &Options)
	: MCTargetAsmParser(Options, STI, MII) {
	Parser.addAliasForDirective(".half", ".2byte");
	Parser.addAliasForDirective(".hword", ".2byte");
	Parser.addAliasForDirective(".word", ".4byte");
	Parser.addAliasForDirective(".dword", ".8byte");
	setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
	}
	};

	/// RISCVOperand - Instances of this class represent a parsed machine
	/// instruction
	struct RISCVOperand : public MCParsedAsmOperand {

	enum KindTy {
	Token,
	Register,
	Immediate,
	SystemRegister
	} Kind;

	bool IsRV64;

	struct RegOp {
	unsigned RegNum;
	};

	struct ImmOp {
	const MCExpr *Val;
	};

	struct SysRegOp {
	const char *Data;
	unsigned Length;
	unsigned Encoding;
	// FIXME: Add the Encoding parsed fields as needed for checks,
	// e.g.: read/write or user/supervisor/machine privileges.
	};

	SMLoc StartLoc, EndLoc;
	union {
	StringRef Tok;
	RegOp Reg;
	ImmOp Imm;
	struct SysRegOp SysReg;
	};

	RISCVOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}

	public:
	/// Copy constructor: copies the common fields, then only the union member
	/// selected by the source operand's Kind.
	RISCVOperand(const RISCVOperand &o) : MCParsedAsmOperand() {
	  Kind = o.Kind;
	  IsRV64 = o.IsRV64;
	  StartLoc = o.StartLoc;
	  EndLoc = o.EndLoc;

	  // Cases follow the declaration order of KindTy.
	  switch (Kind) {
	  case Token:
	    Tok = o.Tok;
	    break;
	  case Register:
	    Reg = o.Reg;
	    break;
	  case Immediate:
	    Imm = o.Imm;
	    break;
	  case SystemRegister:
	    SysReg = o.SysReg;
	    break;
	  }
	}

	// Kind predicates: each compares Kind against one KindTy value. This
	// operand class never reports itself as a memory operand.
	bool isToken() const override { return Kind == Token; }
	bool isReg() const override { return Kind == Register; }
	bool isImm() const override { return Kind == Immediate; }
	bool isMem() const override { return false; }
	bool isSystemRegister() const { return Kind == SystemRegister; }

	/// Try to fold \p Expr to a constant.
	///
	/// For a RISCVMCExpr, \p VK receives its kind and the result is whatever
	/// evaluateAsConstant() reports for \p Imm. For an MCConstantExpr, \p VK is
	/// VK_RISCV_None and \p Imm is the constant's value. For anything else the
	/// function returns false and writes neither output.
	static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
	                                RISCVMCExpr::VariantKind &VK) {
	  if (auto *TargetExpr = dyn_cast<RISCVMCExpr>(Expr)) {
	    VK = TargetExpr->getKind();
	    return TargetExpr->evaluateAsConstant(Imm);
	  }

	  auto Constant = dyn_cast<MCConstantExpr>(Expr);
	  if (!Constant)
	    return false;

	  VK = RISCVMCExpr::VK_RISCV_None;
	  Imm = Constant->getValue();
	  return true;
	}

	// True if operand is a symbol with no modifiers, or a constant with no
	// modifiers and isShiftedInt<N-1, 1>(Op).
	template <int N> bool isBareSimmNLsb0() const {
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	bool IsValid;
	if (!IsConstantImm)
	IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
	else
	IsValid = isShiftedInt<N - 1, 1>(Imm);
	return IsValid && VK == RISCVMCExpr::VK_RISCV_None;
	}

	// Predicate methods for AsmOperands defined in RISCVInstrInfo.td

	/// True for a non-constant immediate that classifies as a symbol reference
	/// with no variant kind.
	bool isBareSymbol() const {
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  // Must be of 'immediate' type but not a constant.
	  if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
	    return false;
	  return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
	         VK == RISCVMCExpr::VK_RISCV_None;
	}

	/// True for a non-constant immediate that classifies as a symbol reference
	/// of kind VK_RISCV_CALL or VK_RISCV_CALL_PLT.
	bool isCallSymbol() const {
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  // Must be of 'immediate' type but not a constant.
	  if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
	    return false;
	  return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
	         (VK == RISCVMCExpr::VK_RISCV_CALL ||
	          VK == RISCVMCExpr::VK_RISCV_CALL_PLT);
	}

	/// True for a non-constant immediate that classifies as a symbol reference
	/// of kind VK_RISCV_TPREL_ADD.
	bool isTPRelAddSymbol() const {
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  // Must be of 'immediate' type but not a constant.
	  if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
	    return false;
	  return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
	         VK == RISCVMCExpr::VK_RISCV_TPREL_ADD;
	}

	// Same test as isSystemRegister().
	bool isCSRSystemRegister() const { return isSystemRegister(); }

	/// Return true if the operand is valid for the fence instruction, e.g.
	/// ('iorw').
	bool isFenceArg() const {
	  if (!isImm())
	    return false;
	  const MCExpr *Val = getImm();
	  auto *SVal = dyn_cast<MCSymbolRefExpr>(Val);
	  if (!SVal || SVal->getKind() != MCSymbolRefExpr::VK_None)
	    return false;

	  StringRef Str = SVal->getSymbol().getName();
	  // Letters must be unique, taken from 'iorw', and in ascending order. This
	  // holds as long as each individual character is one of 'iorw' and is
	  // greater than the previous character.
	  char Prev = '\0';
	  for (char c : Str) {
	    if (c != 'i' && c != 'o' && c != 'r' && c != 'w')
	      return false;
	    if (c <= Prev)
	      return false;
	    Prev = c;
	  }
	  return true;
	}

	/// Return true if the operand is a valid floating point rounding mode,
	/// i.e. a plain symbol whose name stringToRoundingMode() recognises.
	bool isFRMArg() const {
	  if (!isImm())
	    return false;
	  const MCExpr *Val = getImm();
	  auto *SVal = dyn_cast<MCSymbolRefExpr>(Val);
	  if (!SVal || SVal->getKind() != MCSymbolRefExpr::VK_None)
	    return false;

	  StringRef Str = SVal->getSymbol().getName();

	  return RISCVFPRndMode::stringToRoundingMode(Str) != RISCVFPRndMode::Invalid;
	}

	/// True for an immediate that is either an expression of kind VK_RISCV_LO
	/// or VK_RISCV_PCREL_LO, or an unmodified constant that fits the range
	/// check below.
	bool isImmXLenLI() const {
	  // evaluateConstantImm() writes neither output for an expression it does
	  // not recognise, so both start with defined values; otherwise the kind
	  // and range checks below would read indeterminate data.
	  int64_t Imm = 0;
	  RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
	  if (!isImm())
	    return false;
	  bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	  if (VK == RISCVMCExpr::VK_RISCV_LO || VK == RISCVMCExpr::VK_RISCV_PCREL_LO)
	    return true;
	  // Given only Imm, ensuring that the actually specified constant is either
	  // a signed or unsigned 64-bit number is unfortunately impossible.
	  bool IsInRange = isRV64() ? true : isInt<32>(Imm) || isUInt<32>(Imm);
	  return IsConstantImm && IsInRange && VK == RISCVMCExpr::VK_RISCV_None;
	}

	/// True for an unmodified constant that fits in 5 unsigned bits, or in
	/// 6 unsigned bits when isRV64() holds.
	bool isUImmLog2XLen() const {
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  if (!isImm())
	    return false;
	  if (!evaluateConstantImm(getImm(), Imm, VK) ||
	      VK != RISCVMCExpr::VK_RISCV_None)
	    return false;
	  return (isRV64() && isUInt<6>(Imm)) || isUInt<5>(Imm);
	}

	/// Same as isUImmLog2XLen(), but additionally rejects zero.
	bool isUImmLog2XLenNonZero() const {
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  if (!isImm())
	    return false;
	  if (!evaluateConstantImm(getImm(), Imm, VK) ||
	      VK != RISCVMCExpr::VK_RISCV_None)
	    return false;
	  if (Imm == 0)
	    return false;
	  return (isRV64() && isUInt<6>(Imm)) || isUInt<5>(Imm);
	}

	bool isUImm5() const {
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isUInt<5>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm5NonZero() const {
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isUInt<5>(Imm) && (Imm != 0) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isSImm6() const {
	if (!isImm())
	return false;
	RISCVMCExpr::VariantKind VK;
	int64_t Imm;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isInt<6>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isSImm6NonZero() const {
	if (!isImm())
	return false;
	RISCVMCExpr::VariantKind VK;
	int64_t Imm;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isInt<6>(Imm) && (Imm != 0) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	/// True for an unmodified, non-zero constant that either fits in 5 unsigned
	/// bits or lies in [0xfffe0, 0xfffff].
	bool isCLUIImm() const {
	  if (!isImm())
	    return false;
	  int64_t Imm;
	  RISCVMCExpr::VariantKind VK;
	  bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	  return IsConstantImm && (Imm != 0) &&
	         (isUInt<5>(Imm) || (Imm >= 0xfffe0 && Imm <= 0xfffff)) &&
	         VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm7Lsb00() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isShiftedUInt<5, 2>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm8Lsb00() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isShiftedUInt<6, 2>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm8Lsb000() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isShiftedUInt<5, 3>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	/// Forwards to isBareSimmNLsb0 with N = 9.
	bool isSImm9Lsb0() const {
	  return isBareSimmNLsb0<9>();
	}

	bool isUImm9Lsb000() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isShiftedUInt<6, 3>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm10Lsb00NonZero() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && isShiftedUInt<8, 2>(Imm) && (Imm != 0) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isSImm12() const {
	RISCVMCExpr::VariantKind VK;
	int64_t Imm;
	bool IsValid;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	if (!IsConstantImm)
	IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
	else
	IsValid = isInt<12>(Imm);
	return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) \|\|
	VK == RISCVMCExpr::VK_RISCV_LO \|\|
	VK == RISCVMCExpr::VK_RISCV_PCREL_LO \|\|
	VK == RISCVMCExpr::VK_RISCV_TPREL_LO);
	}

	/// Forwards to isBareSimmNLsb0 with N = 12.
	bool isSImm12Lsb0() const {
	  return isBareSimmNLsb0<12>();
	}

	/// Forwards to isBareSimmNLsb0 with N = 13.
	bool isSImm13Lsb0() const {
	  return isBareSimmNLsb0<13>();
	}

	bool isSImm10Lsb0000NonZero() const {
	if (!isImm())
	return false;
	int64_t Imm;
	RISCVMCExpr::VariantKind VK;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	return IsConstantImm && (Imm != 0) && isShiftedInt<6, 4>(Imm) &&
	VK == RISCVMCExpr::VK_RISCV_None;
	}

	bool isUImm20LUI() const {
	RISCVMCExpr::VariantKind VK;
	int64_t Imm;
	bool IsValid;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	if (!IsConstantImm) {
	IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
	return IsValid && (VK == RISCVMCExpr::VK_RISCV_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
	} else {
	return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None \|\|
	VK == RISCVMCExpr::VK_RISCV_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TPREL_HI);
	}
	}

	bool isUImm20AUIPC() const {
	RISCVMCExpr::VariantKind VK;
	int64_t Imm;
	bool IsValid;
	if (!isImm())
	return false;
	bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
	if (!IsConstantImm) {
	IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm);
	return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_GOT_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
	} else {
	return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None \|\|
	VK == RISCVMCExpr::VK_RISCV_PCREL_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_GOT_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI \|\|
	VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI);
	}
	}

	/// Forwards to isBareSimmNLsb0 with N = 21.
	bool isSImm21Lsb0JAL() const {
	  return isBareSimmNLsb0<21>();
	}

	/// Returns the stored start location of this operand.
	SMLoc getStartLoc() const override {
	  return StartLoc;
	}

	/// Returns the stored end location of this operand.
	SMLoc getEndLoc() const override {
	  return EndLoc;
	}

	/// Returns the IsRV64 flag recorded when this operand was created.
	bool isRV64() const {
	  return IsRV64;
	}

	/// Returns the register number held by this operand. Asserts that the
	/// operand kind is Register.
	unsigned getReg() const override {
	  assert(Kind == Register && "Invalid type access!");
	  const unsigned RegNum = Reg.RegNum;
	  return RegNum;
	}

	/// Returns the system register name as a StringRef over the stored data
	/// pointer and length. Asserts that the operand kind is SystemRegister.
	StringRef getSysReg() const {
	  assert(Kind == SystemRegister && "Invalid access!");
	  StringRef Name(SysReg.Data, SysReg.Length);
	  return Name;
	}

	/// Returns the expression held by this operand. Asserts that the operand
	/// kind is Immediate.
	const MCExpr *getImm() const {
	  assert(Kind == Immediate && "Invalid type access!");
	  const MCExpr *Expr = Imm.Val;
	  return Expr;
	}

	/// Returns the token text held by this operand. Asserts that the operand
	/// kind is Token.
	StringRef getToken() const {
	  assert(Kind == Token && "Invalid type access!");
	  StringRef Text = Tok;
	  return Text;
	}

	/// Writes a textual form of this operand to \p OS; the format depends on the
	/// operand kind.
	void print(raw_ostream &OS) const override {
	  switch (Kind) {
	  case Token:
	    OS << "'" << getToken() << "'";
	    return;
	  case Register:
	    OS << "<register x" << getReg() << ">";
	    return;
	  case SystemRegister:
	    OS << "<sysreg: " << getSysReg() << '>';
	    return;
	  case Immediate:
	    OS << *getImm();
	    return;
	  }
	}

	/// Builds a Token operand holding \p Str. Both the start and the end
	/// location are set to \p S.
	static std::unique_ptr<RISCVOperand> createToken(StringRef Str, SMLoc S,
	                                                 bool IsRV64) {
	  std::unique_ptr<RISCVOperand> TokenOp = make_unique<RISCVOperand>(Token);
	  TokenOp->IsRV64 = IsRV64;
	  TokenOp->StartLoc = S;
	  TokenOp->EndLoc = S;
	  TokenOp->Tok = Str;
	  return TokenOp;
	}

	/// Builds a Register operand for register number \p RegNo spanning the
	/// source range [\p S, \p E].
	static std::unique_ptr<RISCVOperand> createReg(unsigned RegNo, SMLoc S,
	                                               SMLoc E, bool IsRV64) {
	  std::unique_ptr<RISCVOperand> RegOp = make_unique<RISCVOperand>(Register);
	  RegOp->IsRV64 = IsRV64;
	  RegOp->StartLoc = S;
	  RegOp->EndLoc = E;
	  RegOp->Reg.RegNum = RegNo;
	  return RegOp;
	}

	/// Builds an Immediate operand wrapping the expression \p Val and spanning
	/// the source range [\p S, \p E].
	static std::unique_ptr<RISCVOperand> createImm(const MCExpr *Val, SMLoc S,
	                                               SMLoc E, bool IsRV64) {
	  std::unique_ptr<RISCVOperand> ImmOp = make_unique<RISCVOperand>(Immediate);
	  ImmOp->IsRV64 = IsRV64;
	  ImmOp->StartLoc = S;
	  ImmOp->EndLoc = E;
	  ImmOp->Imm.Val = Val;
	  return ImmOp;
	}

	/// Builds a SystemRegister operand.
	///
	/// \param Str      Name of the system register; only the pointer and length
	///                 are stored, so the underlying storage must outlive the
	///                 operand.
	/// \param S        Source location of the operand.
	/// \param Encoding Numeric encoding emitted for the register.
	/// \param IsRV64   Recorded on the operand and reported by isRV64().
	static std::unique_ptr<RISCVOperand>
	createSysReg(StringRef Str, SMLoc S, unsigned Encoding, bool IsRV64) {
	  auto Op = make_unique<RISCVOperand>(SystemRegister);
	  Op->SysReg.Data = Str.data();
	  Op->SysReg.Length = Str.size();
	  Op->SysReg.Encoding = Encoding;
	  Op->StartLoc = S;
	  // No end location is supplied by the caller; mirror createToken and use
	  // the start location so getEndLoc() never returns a default-constructed
	  // SMLoc for this operand kind.
	  Op->EndLoc = S;
	  Op->IsRV64 = IsRV64;
	  return Op;
	}

	/// Appends \p Expr to \p Inst as an immediate operand when it evaluates to a
	/// constant, and as an expression operand otherwise.
	void addExpr(MCInst &Inst, const MCExpr *Expr) const {
	  assert(Expr && "Expr shouldn't be null!");
	  int64_t Value = 0;
	  RISCVMCExpr::VariantKind Variant;
	  const bool Folded = evaluateConstantImm(Expr, Value, Variant);
	  Inst.addOperand(Folded ? MCOperand::createImm(Value)
	                         : MCOperand::createExpr(Expr));
	}

	// Called from the TableGen-generated code.
	/// Appends this operand's register to \p Inst. \p N must be 1.
	void addRegOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const MCOperand RegOperand = MCOperand::createReg(getReg());
	  Inst.addOperand(RegOperand);
	}

	/// Appends this operand's immediate expression to \p Inst via addExpr.
	/// \p N must be 1.
	void addImmOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const MCExpr *Expr = getImm();
	  addExpr(Inst, Expr);
	}

	/// Appends the fence argument to \p Inst as a single immediate operand. The
	/// operand's symbol name is scanned character by character and the matching
	/// RISCVFenceField bit is set for each of 'i', 'o', 'r' and 'w'. \p N must
	/// be 1.
	void addFenceArgOperands(MCInst &Inst, unsigned N) const {
	assert(N == 1 && "Invalid number of operands!");
	// isFenceArg has validated the operand, meaning this cast is safe
	auto SE = cast<MCSymbolRefExpr>(getImm());

	// Accumulated bit mask of the requested fence fields.
	unsigned Imm = 0;
	for (char c : SE->getSymbol().getName()) {
	switch (c) {
	default:
	// Any other character is treated as a programming error.
	llvm_unreachable("FenceArg must contain only [iorw]");
	case 'i': Imm \|= RISCVFenceField::I; break;
	case 'o': Imm \|= RISCVFenceField::O; break;
	case 'r': Imm \|= RISCVFenceField::R; break;
	case 'w': Imm \|= RISCVFenceField::W; break;
	}
	}
	Inst.addOperand(MCOperand::createImm(Imm));
	}

	/// Appends the stored system register encoding to \p Inst as an immediate
	/// operand. \p N must be 1.
	void addCSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const MCOperand EncodingOperand = MCOperand::createImm(SysReg.Encoding);
	  Inst.addOperand(EncodingOperand);
	}

	// Translates this operand's symbol name into a rounding mode. Callers are
	// expected to have checked isFRMArg first.
	RISCVFPRndMode::RoundingMode getRoundingMode() const {
	  // The cast cannot fail once isFRMArg has accepted the operand.
	  const auto *SymRef = cast<MCSymbolRefExpr>(getImm());
	  const StringRef ModeName = SymRef->getSymbol().getName();
	  const RISCVFPRndMode::RoundingMode Mode =
	      RISCVFPRndMode::stringToRoundingMode(ModeName);
	  assert(Mode != RISCVFPRndMode::Invalid && "Invalid rounding mode");
	  return Mode;
	}

	/// Appends the rounding mode returned by getRoundingMode() to \p Inst as an
	/// immediate operand. \p N must be 1.
	void addFRMArgOperands(MCInst &Inst, unsigned N) const {
	  assert(N == 1 && "Invalid number of operands!");
	  const MCOperand ModeOperand = MCOperand::createImm(getRoundingMode());
	  Inst.addOperand(ModeOperand);
	}
	};
	} // end anonymous namespace.

	#define GET_REGISTER_MATCHER
	#define GET_MATCHER_IMPLEMENTATION
	#include "RISCVGenAsmMatcher.inc"

	// Return the matching FPR64 register for the given FPR32.
	// Each Fn_32 register maps to the Fn_64 register with the same index n
	// (0..31); any other register number hits llvm_unreachable.
	// FIXME: Ideally this function could be removed in favour of using
	// information from TableGen.
	unsigned convertFPR32ToFPR64(unsigned Reg) {
	switch (Reg) {
	default:
	llvm_unreachable("Not a recognised FPR32 register");
	case RISCV::F0_32: return RISCV::F0_64;
	case RISCV::F1_32: return RISCV::F1_64;
	case RISCV::F2_32: return RISCV::F2_64;
	case RISCV::F3_32: return RISCV::F3_64;
	case RISCV::F4_32: return RISCV::F4_64;
	case RISCV::F5_32: return RISCV::F5_64;
	case RISCV::F6_32: return RISCV::F6_64;
	case RISCV::F7_32: return RISCV::F7_64;
	case RISCV::F8_32: return RISCV::F8_64;
	case RISCV::F9_32: return RISCV::F9_64;
	case RISCV::F10_32: return RISCV::F10_64;
	case RISCV::F11_32: return RISCV::F11_64;
	case RISCV::F12_32: return RISCV::F12_64;
	case RISCV::F13_32: return RISCV::F13_64;
	case RISCV::F14_32: return RISCV::F14_64;
	case RISCV::F15_32: return RISCV::F15_64;
	case RISCV::F16_32: return RISCV::F16_64;
	case RISCV::F17_32: return RISCV::F17_64;
	case RISCV::F18_32: return RISCV::F18_64;
	case RISCV::F19_32: return RISCV::F19_64;
	case RISCV::F20_32: return RISCV::F20_64;
	case RISCV::F21_32: return RISCV::F21_64;
	case RISCV::F22_32: return RISCV::F22_64;
	case RISCV::F23_32: return RISCV::F23_64;
	case RISCV::F24_32: return RISCV::F24_64;
	case RISCV::F25_32: return RISCV::F25_64;
	case RISCV::F26_32: return RISCV::F26_64;
	case RISCV::F27_32: return RISCV::F27_64;
	case RISCV::F28_32: return RISCV::F28_64;
	case RISCV::F29_32: return RISCV::F29_64;
	case RISCV::F30_32: return RISCV::F30_64;
	case RISCV::F31_32: return RISCV::F31_64;
	}
	}

	/// Gives register operands a second chance to match operand class \p Kind.
	/// When the operand holds an FPR32 (or FPR32C) register and \p Kind asks for
	/// FPR64 (or FPR64C), the operand's register is rewritten to the matching
	/// 64-bit register and Match_Success is returned. Everything else yields
	/// Match_InvalidOperand.
	unsigned RISCVAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
	                                                    unsigned Kind) {
	  auto &Operand = static_cast<RISCVOperand &>(AsmOp);
	  if (!Operand.isReg())
	    return Match_InvalidOperand;

	  const unsigned RegNum = Operand.getReg();

	  // The parser cannot tell an FPR32 from an FPR64 by name, so the coercion
	  // to the 64-bit register happens here when the matcher asks for one.
	  const bool CoerceToFPR64 =
	      Kind == MCK_FPR64 &&
	      RISCVMCRegisterClasses[RISCV::FPR32RegClassID].contains(RegNum);
	  const bool CoerceToFPR64C =
	      Kind == MCK_FPR64C &&
	      RISCVMCRegisterClasses[RISCV::FPR32CRegClassID].contains(RegNum);
	  if (!CoerceToFPR64 && !CoerceToFPR64C)
	    return Match_InvalidOperand;

	  Operand.Reg.RegNum = convertFPR32ToFPR64(RegNum);
	  return Match_Success;
	}

	bool RISCVAsmParser::generateImmOutOfRangeError(
	OperandVector &Operands, uint64_t ErrorInfo, int64_t Lower, int64_t Upper,
	Twine Msg = "immediate must be an integer in the range") {
	SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	return Error(ErrorLoc, Msg + " [" + Twine(Lower) + ", " + Twine(Upper) + "]");
	}

	/// Matches the parsed \p Operands against the generated instruction tables.
	/// On success the instruction is handed to processInstruction(); otherwise a
	/// diagnostic chosen from the match result is reported via Error().
	///
	/// \param IDLoc     Location used for diagnostics that are not tied to a
	///                  particular operand.
	/// \param ErrorInfo Filled in by MatchInstructionImpl; used here as the
	///                  index of the offending operand.
	/// \returns the result of processInstruction() or of Error().
	bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
	                                             OperandVector &Operands,
	                                             MCStreamer &Out,
	                                             uint64_t &ErrorInfo,
	                                             bool MatchingInlineAsm) {
	  MCInst Inst;

	  auto Result =
	      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
	  switch (Result) {
	  default:
	    break;
	  case Match_Success:
	    return processInstruction(Inst, IDLoc, Operands, Out);
	  case Match_MissingFeature:
	    return Error(IDLoc, "instruction use requires an option to be enabled");
	  case Match_MnemonicFail:
	    return Error(IDLoc, "unrecognized instruction mnemonic");
	  case Match_InvalidOperand: {
	    SMLoc ErrorLoc = IDLoc;
	    // ErrorInfo is 64 bits wide, so the "no operand" sentinel has to be
	    // compared at full width: ~0U would be zero-extended to 0xFFFFFFFF and
	    // never equal an all-ones 64-bit value.
	    // NOTE(review): assumes the generated matcher uses ~0ULL as its
	    // sentinel - confirm against RISCVGenAsmMatcher.inc.
	    if (ErrorInfo != ~0ULL) {
	      if (ErrorInfo >= Operands.size())
	        return Error(ErrorLoc, "too few operands for instruction");

	      ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	      if (ErrorLoc == SMLoc())
	        ErrorLoc = IDLoc;
	    }
	    return Error(ErrorLoc, "invalid operand for instruction");
	  }
	  }

	  // Handle the case when the error message is of specific type
	  // other than the generic Match_InvalidOperand, and the
	  // corresponding operand is missing.
	  if (Result > FIRST_TARGET_MATCH_RESULT_TY) {
	    // A plain bounds check also covers an all-ones ErrorInfo, which keeps
	    // every Operands[ErrorInfo] access below in range.
	    if (ErrorInfo >= Operands.size())
	      return Error(IDLoc, "too few operands for instruction");
	  }

	  switch (Result) {
	  default:
	    break;
	  case Match_InvalidImmXLenLI:
	    if (isRV64()) {
	      SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	      return Error(ErrorLoc, "operand must be a constant 64-bit integer");
	    }
	    return generateImmOutOfRangeError(Operands, ErrorInfo,
	                                      std::numeric_limits<int32_t>::min(),
	                                      std::numeric_limits<uint32_t>::max());
	  case Match_InvalidUImmLog2XLen:
	    if (isRV64())
	      return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1);
	    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
	  case Match_InvalidUImmLog2XLenNonZero:
	    if (isRV64())
	      return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6) - 1);
	    return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5) - 1);
	  case Match_InvalidUImm5:
	    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
	  case Match_InvalidSImm6:
	    return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 5),
	                                      (1 << 5) - 1);
	  case Match_InvalidSImm6NonZero:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 5), (1 << 5) - 1,
	        "immediate must be non-zero in the range");
	  case Match_InvalidCLUIImm:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 1, (1 << 5) - 1,
	        "immediate must be in [0xfffe0, 0xfffff] or");
	  case Match_InvalidUImm7Lsb00:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 0, (1 << 7) - 4,
	        "immediate must be a multiple of 4 bytes in the range");
	  case Match_InvalidUImm8Lsb00:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 0, (1 << 8) - 4,
	        "immediate must be a multiple of 4 bytes in the range");
	  case Match_InvalidUImm8Lsb000:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 0, (1 << 8) - 8,
	        "immediate must be a multiple of 8 bytes in the range");
	  case Match_InvalidSImm9Lsb0:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 8), (1 << 8) - 2,
	        "immediate must be a multiple of 2 bytes in the range");
	  case Match_InvalidUImm9Lsb000:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 0, (1 << 9) - 8,
	        "immediate must be a multiple of 8 bytes in the range");
	  case Match_InvalidUImm10Lsb00NonZero:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 4, (1 << 10) - 4,
	        "immediate must be a multiple of 4 bytes in the range");
	  case Match_InvalidSImm10Lsb0000NonZero:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 9), (1 << 9) - 16,
	        "immediate must be a multiple of 16 bytes and non-zero in the range");
	  case Match_InvalidSImm12:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 11), (1 << 11) - 1,
	        "operand must be a symbol with %lo/%pcrel_lo/%tprel_lo modifier or an "
	        "integer in the range");
	  case Match_InvalidSImm12Lsb0:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 11), (1 << 11) - 2,
	        "immediate must be a multiple of 2 bytes in the range");
	  case Match_InvalidSImm13Lsb0:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2,
	        "immediate must be a multiple of 2 bytes in the range");
	  case Match_InvalidUImm20LUI:
	    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 20) - 1,
	                                      "operand must be a symbol with "
	                                      "%hi/%tprel_hi modifier or an integer in "
	                                      "the range");
	  case Match_InvalidUImm20AUIPC:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, 0, (1 << 20) - 1,
	        "operand must be a symbol with a "
	        "%pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or "
	        "an integer in the range");
	  case Match_InvalidSImm21Lsb0JAL:
	    return generateImmOutOfRangeError(
	        Operands, ErrorInfo, -(1 << 20), (1 << 20) - 2,
	        "immediate must be a multiple of 2 bytes in the range");
	  case Match_InvalidCSRSystemRegister: {
	    return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 12) - 1,
	                                      "operand must be a valid system register "
	                                      "name or an integer in the range");
	  }
	  case Match_InvalidFenceArg: {
	    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	    return Error(
	        ErrorLoc,
	        "operand must be formed of letters selected in-order from 'iorw'");
	  }
	  case Match_InvalidFRMArg: {
	    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	    return Error(
	        ErrorLoc,
	        "operand must be a valid floating point rounding mode mnemonic");
	  }
	  case Match_InvalidBareSymbol: {
	    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	    return Error(ErrorLoc, "operand must be a bare symbol name");
	  }
	  case Match_InvalidCallSymbol: {
	    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	    return Error(ErrorLoc, "operand must be a bare symbol name");
	  }
	  case Match_InvalidTPRelAddSymbol: {
	    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
	    return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier");
	  }
	  }

	  llvm_unreachable("Unknown match type detected!");
	}

	// Looks \p Name up first among the default register names and then among
	// the alternative (ABI) names, storing the result in RegNo. When IsRV32E is
	// set, x16-x31 are treated as unknown. Returns true on failure, in which
	// case RegNo is 0.
	static bool matchRegisterNameHelper(bool IsRV32E, unsigned &RegNo,
	                                    StringRef Name) {
	  unsigned Matched = MatchRegisterName(Name);
	  if (Matched == 0)
	    Matched = MatchRegisterAltName(Name);

	  const bool InUpperHalf = Matched >= RISCV::X16 && Matched <= RISCV::X31;
	  RegNo = (IsRV32E && InUpperHalf) ? 0 : Matched;
	  return RegNo == 0;
	}

	bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
	SMLoc &EndLoc) {
	const AsmToken &Tok = getParser().getTok();
	StartLoc = Tok.getLoc();
	EndLoc = Tok.getEndLoc();
	RegNo = 0;
	StringRef Name = getLexer().getTok().getIdentifier();

	if (matchRegisterNameHelper(isRV32E(), RegNo, Name))
	return Error(StartLoc, "invalid register name");

	getParser().Lex(); // Eat identifier token.
	return false;
	}

	/// Tries to parse a register operand, optionally wrapped in parentheses when
	/// \p AllowParens is set. On success the register (and the "(" / ")" tokens
	/// if it was parenthesised) is appended to \p Operands. When the current
	/// token does not name a register, any consumed '(' is pushed back and
	/// MatchOperand_NoMatch is returned.
	OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
	                                                   bool AllowParens) {
	  const SMLoc OpenLoc = getLoc();
	  AsmToken SavedLParen;
	  bool InParens = false;

	  // Only consume '(' when the token after the next one is ')', so that the
	  // parenthesised register can be handled atomically.
	  if (AllowParens && getLexer().is(AsmToken::LParen)) {
	    AsmToken Lookahead[2];
	    const size_t NumPeeked = getLexer().peekTokens(Lookahead);
	    if (NumPeeked == 2 && Lookahead[1].getKind() == AsmToken::RParen) {
	      InParens = true;
	      SavedLParen = getParser().getTok();
	      getParser().Lex(); // Eat '('
	    }
	  }

	  // Pushes a consumed '(' back to the lexer and reports "no match".
	  const auto BailOut = [&]() {
	    if (InParens)
	      getLexer().UnLex(SavedLParen);
	    return MatchOperand_NoMatch;
	  };

	  if (getLexer().getKind() != AsmToken::Identifier)
	    return BailOut();

	  const StringRef RegName = getLexer().getTok().getIdentifier();
	  unsigned RegNo;
	  matchRegisterNameHelper(isRV32E(), RegNo, RegName);
	  if (RegNo == 0)
	    return BailOut();

	  if (InParens)
	    Operands.push_back(RISCVOperand::createToken("(", OpenLoc, isRV64()));
	  const SMLoc RegStart = getLoc();
	  const SMLoc RegEnd = SMLoc::getFromPointer(RegStart.getPointer() - 1);
	  getLexer().Lex();
	  Operands.push_back(
	      RISCVOperand::createReg(RegNo, RegStart, RegEnd, isRV64()));

	  if (InParens) {
	    getParser().Lex(); // Eat ')'
	    Operands.push_back(RISCVOperand::createToken(")", getLoc(), isRV64()));
	  }

	  return MatchOperand_Success;
	}

	/// Parses a CSR operand given either as an integer expression or as a system
	/// register name, and appends a SystemRegister operand to Operands on
	/// success. A diagnostic is emitted and MatchOperand_ParseFail returned for
	/// out-of-range values, unknown names, names whose required features are not
	/// enabled, and operands starting with '%'. Any other leading token yields
	/// MatchOperand_NoMatch.
	OperandMatchResultTy
	RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
	SMLoc S = getLoc();
	const MCExpr *Res;

	switch (getLexer().getKind()) {
	default:
	return MatchOperand_NoMatch;
	// Tokens that can start an integer expression.
	case AsmToken::LParen:
	case AsmToken::Minus:
	case AsmToken::Plus:
	case AsmToken::Exclaim:
	case AsmToken::Tilde:
	case AsmToken::Integer:
	case AsmToken::String: {
	if (getParser().parseExpression(Res))
	return MatchOperand_ParseFail;

	// Only a constant that fits in 12 unsigned bits is accepted.
	auto *CE = dyn_cast<MCConstantExpr>(Res);
	if (CE) {
	int64_t Imm = CE->getValue();
	if (isUInt<12>(Imm)) {
	auto SysReg = RISCVSysReg::lookupSysRegByEncoding(Imm);
	// Accept an immediate representing a named or un-named Sys Reg
	// if the range is valid, regardless of the required features.
	Operands.push_back(RISCVOperand::createSysReg(
	SysReg ? SysReg->Name : "", S, Imm, isRV64()));
	return MatchOperand_Success;
	}
	}

	Twine Msg = "immediate must be an integer in the range";
	Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
	return MatchOperand_ParseFail;
	}
	case AsmToken::Identifier: {
	StringRef Identifier;
	if (getParser().parseIdentifier(Identifier))
	return MatchOperand_ParseFail;

	auto SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
	// Accept a named Sys Reg if the required features are present.
	if (SysReg) {
	if (!SysReg->haveRequiredFeatures(getSTI().getFeatureBits())) {
	Error(S, "system register use requires an option to be enabled");
	return MatchOperand_ParseFail;
	}
	Operands.push_back(RISCVOperand::createSysReg(
	Identifier, S, SysReg->Encoding, isRV64()));
	return MatchOperand_Success;
	}

	// The identifier is not a known system register name.
	Twine Msg = "operand must be a valid system register name "
	"or an integer in the range";
	Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
	return MatchOperand_ParseFail;
	}
	case AsmToken::Percent: {
	// Discard operand with modifier.
	Twine Msg = "immediate must be an integer in the range";
	Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 12) - 1) + "]");
	return MatchOperand_ParseFail;
	}
	}

	return MatchOperand_NoMatch;
	}

	/// Parses an immediate operand. A leading '%' is delegated to
	/// parseOperandWithModifier; the other accepted leading tokens are parsed as
	/// a generic expression and appended to \p Operands as an immediate. Any
	/// other leading token yields MatchOperand_NoMatch.
	OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) {
	  const SMLoc StartLoc = getLoc();
	  const SMLoc EndLoc = SMLoc::getFromPointer(StartLoc.getPointer() - 1);

	  switch (getLexer().getKind()) {
	  case AsmToken::Percent:
	    return parseOperandWithModifier(Operands);
	  case AsmToken::LParen:
	  case AsmToken::Dot:
	  case AsmToken::Minus:
	  case AsmToken::Plus:
	  case AsmToken::Exclaim:
	  case AsmToken::Tilde:
	  case AsmToken::Integer:
	  case AsmToken::String:
	  case AsmToken::Identifier:
	    break;
	  default:
	    return MatchOperand_NoMatch;
	  }

	  const MCExpr *Parsed;
	  if (getParser().parseExpression(Parsed))
	    return MatchOperand_ParseFail;

	  Operands.push_back(
	      RISCVOperand::createImm(Parsed, StartLoc, EndLoc, isRV64()));
	  return MatchOperand_Success;
	}

	/// Parses an operand of the form %modifier(expression) and appends it to
	/// \p Operands as an immediate wrapping a RISCVMCExpr of the matching
	/// variant kind. Emits a diagnostic and returns MatchOperand_ParseFail when
	/// the '%', the modifier name, or the '(' is missing or unrecognised.
	OperandMatchResultTy
	RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) {
	  const SMLoc StartLoc = getLoc();
	  SMLoc EndLoc = SMLoc::getFromPointer(StartLoc.getPointer() - 1);

	  // Reports a diagnostic at the current lexer location and fails the parse.
	  const auto Fail = [this](const char *Msg) {
	    Error(getLoc(), Msg);
	    return MatchOperand_ParseFail;
	  };

	  if (getLexer().getKind() != AsmToken::Percent)
	    return Fail("expected '%' for operand modifier");
	  getParser().Lex(); // Eat '%'

	  if (getLexer().getKind() != AsmToken::Identifier)
	    return Fail("expected valid identifier for operand modifier");

	  const StringRef ModifierName = getParser().getTok().getIdentifier();
	  const RISCVMCExpr::VariantKind Variant =
	      RISCVMCExpr::getVariantKindForName(ModifierName);
	  if (Variant == RISCVMCExpr::VK_RISCV_Invalid)
	    return Fail("unrecognized operand modifier");
	  getParser().Lex(); // Eat the identifier

	  if (getLexer().getKind() != AsmToken::LParen)
	    return Fail("expected '('");
	  getParser().Lex(); // Eat '('

	  // parseParenExpression receives EndLoc by reference.
	  const MCExpr *Inner;
	  if (getParser().parseParenExpression(Inner, EndLoc))
	    return MatchOperand_ParseFail;

	  const MCExpr *Wrapped = RISCVMCExpr::create(Inner, Variant, getContext());
	  Operands.push_back(
	      RISCVOperand::createImm(Wrapped, StartLoc, EndLoc, isRV64()));
	  return MatchOperand_Success;
	}

	/// Parses a bare symbol name, optionally followed by '+' or '-' and a
	/// right-hand expression, and appends it to Operands as an immediate.
	/// Returns MatchOperand_NoMatch when the current token is not an identifier
	/// or when the symbol is a variable whose value is not a symbol reference.
	/// A trailing "@plt" suffix is rejected with a diagnostic.
	OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) {
	SMLoc S = getLoc();
	SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
	const MCExpr *Res;

	if (getLexer().getKind() != AsmToken::Identifier)
	return MatchOperand_NoMatch;

	StringRef Identifier;
	AsmToken Tok = getLexer().getTok();

	if (getParser().parseIdentifier(Identifier))
	return MatchOperand_ParseFail;

	if (Identifier.consume_back("@plt")) {
	Error(getLoc(), "'@plt' operand not valid for instruction");
	return MatchOperand_ParseFail;
	}

	MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);

	if (Sym->isVariable()) {
	const MCExpr *V = Sym->getVariableValue(/*SetUsed=*/false);
	if (!isa<MCSymbolRefExpr>(V)) {
	getLexer().UnLex(Tok); // Put back if it's not a bare symbol.
	return MatchOperand_NoMatch;
	}
	Res = V;
	} else
	Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
	+
	+ MCBinaryExpr::Opcode Opcode;
	+ switch (getLexer().getKind()) {
	+ default:
	+ Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
	+ return MatchOperand_Success;
	+ case AsmToken::Plus:
	+ Opcode = MCBinaryExpr::Add;
	+ // Consume the operator so it is not re-read as a unary sign of the
	+ // right-hand expression.
	+ getLexer().Lex(); // Eat '+'
	+ break;
	+ case AsmToken::Minus:
	+ Opcode = MCBinaryExpr::Sub;
	+ // Without this, "sym - 4" would parse "-4" and build sym - (-4).
	+ getLexer().Lex(); // Eat '-'
	+ break;
	+ }
	+
	+ const MCExpr *Expr;
	+ if (getParser().parseExpression(Expr))
	+ return MatchOperand_ParseFail;
	+ Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext());
	Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
	return MatchOperand_Success;
	}

	/// Parses the symbol operand of a call: a single identifier that is the
	/// last token of the statement. The symbol is wrapped in a RISCVMCExpr of
	/// kind VK_RISCV_CALL, or VK_RISCV_CALL_PLT when the name ends in "@plt"
	/// (the suffix is stripped), and appended to \p Operands as an immediate.
	OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
	  const SMLoc StartLoc = getLoc();
	  const SMLoc EndLoc = SMLoc::getFromPointer(StartLoc.getPointer() - 1);

	  if (getLexer().getKind() != AsmToken::Identifier)
	    return MatchOperand_NoMatch;

	  // If anything follows the identifier, this is the register in
	  // `call rd, foo` rather than the call target.
	  if (getLexer().peekTok().getKind() != AsmToken::EndOfStatement)
	    return MatchOperand_NoMatch;

	  StringRef Identifier;
	  if (getParser().parseIdentifier(Identifier))
	    return MatchOperand_ParseFail;

	  const RISCVMCExpr::VariantKind Kind = Identifier.consume_back("@plt")
	                                            ? RISCVMCExpr::VK_RISCV_CALL_PLT
	                                            : RISCVMCExpr::VK_RISCV_CALL;

	  MCSymbol *Target = getContext().getOrCreateSymbol(Identifier);
	  const MCExpr *SymRef =
	      MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, getContext());
	  const MCExpr *CallExpr = RISCVMCExpr::create(SymRef, Kind, getContext());
	  Operands.push_back(
	      RISCVOperand::createImm(CallExpr, StartLoc, EndLoc, isRV64()));
	  return MatchOperand_Success;
	}

	/// Parses the offset operand of a jal instruction.
	OperandMatchResultTy RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
	  // Both `jal foo` and `jal ra, foo` are accepted. While matching the
	  // single-operand alias this function is also tried on the `ra` of the
	  // two-operand form and has to reject it. Parsing with parseImmediate and
	  // then checking for a comma is not an option because the lexer state
	  // cannot be rewound, so an identifier followed by a comma is rejected up
	  // front instead.
	  const bool IdentifierThenComma =
	      getLexer().is(AsmToken::Identifier) &&
	      getLexer().peekTok().is(AsmToken::Comma);
	  return IdentifierThenComma ? MatchOperand_NoMatch : parseImmediate(Operands);
	}

	/// Parses a parenthesised base register, "(reg)", appending the "(" token,
	/// the register and the ")" token to \p Operands. Emits a diagnostic and
	/// returns MatchOperand_ParseFail if any of the three pieces is missing.
	OperandMatchResultTy
	RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
	  if (!getLexer().is(AsmToken::LParen)) {
	    Error(getLoc(), "expected '('");
	    return MatchOperand_ParseFail;
	  }
	  getParser().Lex(); // Eat '('
	  Operands.push_back(RISCVOperand::createToken("(", getLoc(), isRV64()));

	  const OperandMatchResultTy RegResult = parseRegister(Operands);
	  if (RegResult != MatchOperand_Success) {
	    Error(getLoc(), "expected register");
	    return MatchOperand_ParseFail;
	  }

	  if (!getLexer().is(AsmToken::RParen)) {
	    Error(getLoc(), "expected ')'");
	    return MatchOperand_ParseFail;
	  }
	  getParser().Lex(); // Eat ')'
	  Operands.push_back(RISCVOperand::createToken(")", getLoc(), isRV64()));

	  return MatchOperand_Success;
	}

	/// Looks at a token type and creates the relevant operand from this
	/// information, adding to Operands. If operand was parsed, returns false, else
	/// true.
	///
	/// The custom operand parsers for \p Mnemonic are tried first; if none of
	/// them matches, the operand is parsed as a register and then as an
	/// immediate, which may be followed by a parenthesised base register.
	bool RISCVAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
	  // Check if the current operand has a custom associated parser, if so, try to
	  // custom parse the operand, or fallback to the general approach.
	  OperandMatchResultTy Result =
	      MatchOperandParserImpl(Operands, Mnemonic, /*ParseForAllFeatures=*/true);
	  if (Result == MatchOperand_Success)
	    return false;
	  if (Result == MatchOperand_ParseFail)
	    return true;

	  // Attempt to parse token as a register.
	  if (parseRegister(Operands, true) == MatchOperand_Success)
	    return false;

	  // Attempt to parse token as an immediate
	  if (parseImmediate(Operands) == MatchOperand_Success) {
	    // Parse memory base register if present
	    if (getLexer().is(AsmToken::LParen))
	      return parseMemOpBaseReg(Operands) != MatchOperand_Success;
	    return false;
	  }

	  // Finally we have exhausted all options and must declare defeat.
	  Error(getLoc(), "unknown operand");
	  return true;
	}

	/// Parses one instruction statement: pushes the mnemonic \p Name as the
	/// first operand token, then parses a comma-separated operand list up to the
	/// end of the statement. Returns true on error and false on success.
	bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
	                                      StringRef Name, SMLoc NameLoc,
	                                      OperandVector &Operands) {
	  // Ensure that if the instruction occurs when relaxation is enabled,
	  // relocations are forced for the file. Ideally this would be done when there
	  // is enough information to reliably determine if the instruction itself may
	  // cause relaxations. Unfortunately instruction processing stage occurs in the
	  // same pass as relocation emission, so it's too late to set a 'sticky bit'
	  // for the entire file.
	  if (getSTI().getFeatureBits()[RISCV::FeatureRelax]) {
	    auto *Assembler = getTargetStreamer().getStreamer().getAssemblerPtr();
	    if (Assembler != nullptr) {
	      RISCVAsmBackend &MAB =
	          static_cast<RISCVAsmBackend &>(Assembler->getBackend());
	      MAB.setForceRelocs();
	    }
	  }

	  // First operand is token for instruction
	  Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64()));

	  // If there are no more operands, then finish
	  if (getLexer().is(AsmToken::EndOfStatement))
	    return false;

	  // Parse first operand
	  if (parseOperand(Operands, Name))
	    return true;

	  // Parse until end of statement, consuming commas between operands
	  while (getLexer().is(AsmToken::Comma)) {
	    // Consume comma token
	    getLexer().Lex();

	    // Parse next operand
	    if (parseOperand(Operands, Name))
	      return true;
	  }

	  // Anything left on the line is an error; skip it so parsing can resume at
	  // the next statement.
	  if (getLexer().isNot(AsmToken::EndOfStatement)) {
	    SMLoc Loc = getLexer().getLoc();
	    getParser().eatToEndOfStatement();
	    return Error(Loc, "unexpected token");
	  }

	  getParser().Lex(); // Consume the EndOfStatement.
	  return false;
	}

	bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
	RISCVMCExpr::VariantKind &Kind,
	int64_t &Addend) {
	Kind = RISCVMCExpr::VK_RISCV_None;
	Addend = 0;

	if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
	Kind = RE->getKind();
	Expr = RE->getSubExpr();
	}

	// It's a simple symbol reference or constant with no addend.
	if (isa<MCConstantExpr>(Expr) \|\| isa<MCSymbolRefExpr>(Expr))
	return true;

	const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
	if (!BE)
	return false;

	if (!isa<MCSymbolRefExpr>(BE->getLHS()))
	return false;

	if (BE->getOpcode() != MCBinaryExpr::Add &&
	BE->getOpcode() != MCBinaryExpr::Sub)
	return false;

	// We are able to support the subtraction of two symbol references
	if (BE->getOpcode() == MCBinaryExpr::Sub &&
	isa<MCSymbolRefExpr>(BE->getRHS()))
	return true;

	// See if the addend is a constant, otherwise there's more going
	// on here than we can deal with.
	auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
	if (!AddendExpr)
	return false;

	Addend = AddendExpr->getValue();
	if (BE->getOpcode() == MCBinaryExpr::Sub)
	Addend = -Addend;

	// It's some symbol reference + a constant addend
	return Kind != RISCVMCExpr::VK_RISCV_Invalid;
	}

	/// Handles target-specific directives. Only ".option" is recognised; it is
	/// forwarded to parseDirectiveOption() and that call's result is returned.
	bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
	  // Returning true tells the generic parser that the directive was not
	  // recognised here, giving it a chance to handle the directive itself.
	  if (DirectiveID.getString() != ".option")
	    return true;

	  return parseDirectiveOption();
	}

	/// Parses the argument of a ".option" directive. The recognised options are
	/// "push", "pop", "rvc", "norvc", "relax" and "norelax". Each one is emitted
	/// to the target streamer, must be followed by the end of the statement, and
	/// then updates the parser's feature bits. Returns the result of Error() on
	/// malformed input and false otherwise; an unknown option only produces a
	/// warning and the rest of the statement is skipped.
	bool RISCVAsmParser::parseDirectiveOption() {
	MCAsmParser &Parser = getParser();
	// Get the option token.
	AsmToken Tok = Parser.getTok();
	// At the moment only identifiers are supported.
	if (Tok.isNot(AsmToken::Identifier))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected identifier");

	StringRef Option = Tok.getIdentifier();

	if (Option == "push") {
	getTargetStreamer().emitDirectiveOptionPush();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	pushFeatureBits();
	return false;
	}

	if (Option == "pop") {
	// Remember where "pop" was so an unmatched pop is reported at the option.
	SMLoc StartLoc = Parser.getTok().getLoc();
	getTargetStreamer().emitDirectiveOptionPop();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	// A true result from popFeatureBits() is reported as a pop without push.
	if (popFeatureBits())
	return Error(StartLoc, ".option pop with no .option push");

	return false;
	}

	if (Option == "rvc") {
	getTargetStreamer().emitDirectiveOptionRVC();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	setFeatureBits(RISCV::FeatureStdExtC, "c");
	return false;
	}

	if (Option == "norvc") {
	getTargetStreamer().emitDirectiveOptionNoRVC();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	clearFeatureBits(RISCV::FeatureStdExtC, "c");
	return false;
	}

	if (Option == "relax") {
	getTargetStreamer().emitDirectiveOptionRelax();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	setFeatureBits(RISCV::FeatureRelax, "relax");
	return false;
	}

	if (Option == "norelax") {
	getTargetStreamer().emitDirectiveOptionNoRelax();

	Parser.Lex();
	if (Parser.getTok().isNot(AsmToken::EndOfStatement))
	return Error(Parser.getTok().getLoc(),
	"unexpected token, expected end of statement");

	clearFeatureBits(RISCV::FeatureRelax, "relax");
	return false;
	}

	// Unknown option.
	Warning(Parser.getTok().getLoc(),
	"unknown option, expected 'push', 'pop', 'rvc', 'norvc', 'relax' or "
	"'norelax'");
	Parser.eatToEndOfStatement();
	return false;
	}

	/// Emits \p Inst to \p S, substituting the instruction produced by
	/// compressInst whenever compressInst reports success.
	void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
	  MCInst Compressed;
	  const bool WasCompressed =
	      compressInst(Compressed, Inst, getSTI(), S.getContext());
	  // The compressed candidate inherits the source location of the original.
	  Compressed.setLoc(Inst.getLoc());

	  if (WasCompressed)
	    S.EmitInstruction(Compressed, getSTI());
	  else
	    S.EmitInstruction(Inst, getSTI());
	}

	void RISCVAsmParser::emitLoadImm(unsigned DestReg, int64_t Value,
	MCStreamer &Out) {
	RISCVMatInt::InstSeq Seq;
	RISCVMatInt::generateInstSeq(Value, isRV64(), Seq);

	unsigned SrcReg = RISCV::X0;
	for (RISCVMatInt::Inst &Inst : Seq) {
	if (Inst.Opc == RISCV::LUI) {
	emitToStreamer(
	Out, MCInstBuilder(RISCV::LUI).addReg(DestReg).addImm(Inst.Imm));
	} else {
	emitToStreamer(
	Out, MCInstBuilder(Inst.Opc).addReg(DestReg).addReg(SrcReg).addImm(
	Inst.Imm));
	}

	// Only the first instruction has X0 as its source.
	SrcReg = DestReg;
	}
	}

	/// Emits a pair of instructions for PC-relative addressing:
	///   TmpLabel: AUIPC TmpReg, VKHi(symbol)
	///             OP    DestReg, TmpReg, %pcrel_lo(TmpLabel)
	///
	/// \param DestReg      Destination operand of the second instruction.
	/// \param TmpReg       Register written by AUIPC and read by the second
	///                     instruction.
	/// \param Symbol       Expression wrapped with \p VKHi for the AUIPC.
	/// \param VKHi         Variant kind applied to \p Symbol in the AUIPC.
	/// \param SecondOpcode Opcode of the second instruction (OP above).
	/// \param IDLoc        Not used by this function.
	/// \param Out          Streamer receiving the label and both instructions.
	void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
	                                       const MCExpr *Symbol,
	                                       RISCVMCExpr::VariantKind VKHi,
	                                       unsigned SecondOpcode, SMLoc IDLoc,
	                                       MCStreamer &Out) {
	  MCContext &Ctx = getContext();

	  // The %pcrel_lo operand refers to this label rather than to the symbol.
	  // Both flags are passed explicitly.
	  MCSymbol *TmpLabel = Ctx.createTempSymbol(
	      "pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false);
	  Out.EmitLabel(TmpLabel);

	  const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
	  emitToStreamer(
	      Out, MCInstBuilder(RISCV::AUIPC).addOperand(TmpReg).addExpr(SymbolHi));

	  const MCExpr *RefToLinkTmpLabel =
	      RISCVMCExpr::create(MCSymbolRefExpr::create(TmpLabel, Ctx),
	                          RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx);

	  emitToStreamer(Out, MCInstBuilder(SecondOpcode)
	                          .addOperand(DestReg)
	                          .addOperand(TmpReg)
	                          .addExpr(RefToLinkTmpLabel));
	}

	/// Expands the "lla" (load local address) pseudo-instruction, used for
	/// PC-relative addressing of local symbols:
	///   lla rdest, symbol
	/// becomes
	///   TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
	///             ADDI  rdest, rdest, %pcrel_lo(TmpLabel)
	void RISCVAsmParser::emitLoadLocalAddress(MCInst &Inst, SMLoc IDLoc,
	                                          MCStreamer &Out) {
	  const MCOperand Rd = Inst.getOperand(0);
	  // rdest serves as both the AUIPC temporary and the final destination.
	  emitAuipcInstPair(Rd, Rd, Inst.getOperand(1).getExpr(),
	                    RISCVMCExpr::VK_RISCV_PCREL_HI, RISCV::ADDI, IDLoc, Out);
	}

	/// Expands the "la" (load address) pseudo-instruction, used for PC-relative
	/// and GOT-indirect addressing of global symbols:
	///   la rdest, symbol
	/// becomes, for non-PIC,
	///   TmpLabel: AUIPC rdest, %pcrel_hi(symbol)
	///             ADDI  rdest, rdest, %pcrel_lo(TmpLabel)
	/// or, for PIC,
	///   TmpLabel: AUIPC rdest, %got_pcrel_hi(symbol)
	///             Lx    rdest, %pcrel_lo(TmpLabel)(rdest)
	void RISCVAsmParser::emitLoadAddress(MCInst &Inst, SMLoc IDLoc,
	                                     MCStreamer &Out) {
	  // FIXME: Should check .option (no)pic when implemented
	  const bool IsPIC = getContext().getObjectFileInfo()->isPositionIndependent();

	  // PIC loads the address from the GOT (LD on RV64, LW otherwise);
	  // non-PIC computes it directly with ADDI.
	  const unsigned SecondOpcode =
	      IsPIC ? (isRV64() ? RISCV::LD : RISCV::LW) : RISCV::ADDI;
	  const RISCVMCExpr::VariantKind VKHi =
	      IsPIC ? RISCVMCExpr::VK_RISCV_GOT_HI : RISCVMCExpr::VK_RISCV_PCREL_HI;

	  const MCOperand Rd = Inst.getOperand(0);
	  emitAuipcInstPair(Rd, Rd, Inst.getOperand(1).getExpr(), VKHi, SecondOpcode,
	                    IDLoc, Out);
	}

	/// Expands the "la.tls.ie" pseudo-instruction, used for initial-exec TLS
	/// model addressing of global symbols:
	///   la.tls.ie rdest, symbol
	/// becomes
	///   TmpLabel: AUIPC rdest, %tls_ie_pcrel_hi(symbol)
	///             Lx    rdest, %pcrel_lo(TmpLabel)(rdest)
	void RISCVAsmParser::emitLoadTLSIEAddress(MCInst &Inst, SMLoc IDLoc,
	                                          MCStreamer &Out) {
	  const MCOperand Rd = Inst.getOperand(0);
	  const MCExpr *Sym = Inst.getOperand(1).getExpr();

	  // The load is LD on RV64 and LW otherwise.
	  unsigned LoadOpcode = RISCV::LW;
	  if (isRV64())
	    LoadOpcode = RISCV::LD;

	  emitAuipcInstPair(Rd, Rd, Sym, RISCVMCExpr::VK_RISCV_TLS_GOT_HI, LoadOpcode,
	                    IDLoc, Out);
	}

	/// Expands the "la.tls.gd" pseudo-instruction, used for global-dynamic TLS
	/// model addressing of global symbols:
	///   la.tls.gd rdest, symbol
	/// becomes
	///   TmpLabel: AUIPC rdest, %tls_gd_pcrel_hi(symbol)
	///             ADDI  rdest, rdest, %pcrel_lo(TmpLabel)
	void RISCVAsmParser::emitLoadTLSGDAddress(MCInst &Inst, SMLoc IDLoc,
	                                          MCStreamer &Out) {
	  const MCOperand Rd = Inst.getOperand(0);
	  // rdest serves as both the AUIPC temporary and the final destination.
	  emitAuipcInstPair(Rd, Rd, Inst.getOperand(1).getExpr(),
	                    RISCVMCExpr::VK_RISCV_TLS_GD_HI, RISCV::ADDI, IDLoc, Out);
	}

	/// Expands a load/store pseudo-instruction that addresses a symbol
	/// PC-relatively. The emitted sequence is
	///   TmpLabel: AUIPC tmp, %pcrel_hi(symbol)
	///             [S|L]X rd, %pcrel_lo(TmpLabel)(tmp)
	///
	/// When \p HasTmpReg is false, operand 0 doubles as the temporary and the
	/// symbol is operand 1; otherwise the temporary is operand 1 and the symbol
	/// is operand 2.
	void RISCVAsmParser::emitLoadStoreSymbol(MCInst &Inst, unsigned Opcode,
	                                         SMLoc IDLoc, MCStreamer &Out,
	                                         bool HasTmpReg) {
	  unsigned TmpIdx = 0;
	  unsigned SymIdx = 1;
	  if (HasTmpReg) {
	    TmpIdx = 1;
	    SymIdx = 2;
	  }

	  const MCOperand Rd = Inst.getOperand(0);
	  const MCOperand Tmp = Inst.getOperand(TmpIdx);
	  const MCExpr *Sym = Inst.getOperand(SymIdx).getExpr();
	  emitAuipcInstPair(Rd, Tmp, Sym, RISCVMCExpr::VK_RISCV_PCREL_HI, Opcode,
	                    IDLoc, Out);
	}

	/// Validates a PseudoAddTPRel instruction: its operand 2 must be the
	/// register X4 (tp). Returns false when the instruction is acceptable and
	/// the result of Error() otherwise.
	bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst,
	                                         OperandVector &Operands) {
	  assert(Inst.getOpcode() == RISCV::PseudoAddTPRel && "Invalid instruction");
	  assert(Inst.getOperand(2).isReg() && "Unexpected second operand kind");

	  const bool UsesThreadPointer = Inst.getOperand(2).getReg() == RISCV::X4;
	  if (UsesThreadPointer)
	    return false;

	  // The diagnostic is anchored at parsed operand 3.
	  SMLoc ErrorLoc = static_cast<RISCVOperand &>(*Operands[3]).getStartLoc();
	  return Error(ErrorLoc, "the second input operand must be tp/x4 when using "
	                         "%tprel_add modifier");
	}

	/// Post-processes a matched instruction before it is emitted. Pseudo
	/// instructions handled here are expanded into real instruction sequences;
	/// PseudoAddTPRel is validated; everything else is emitted unchanged.
	///
	/// \param Inst     The matched instruction; its location is set to \p IDLoc.
	/// \param IDLoc    Source location of the instruction.
	/// \param Operands The parsed operands, used for diagnostics.
	/// \param Out      Streamer receiving the emitted instruction(s).
	/// \returns true if an error was reported, false otherwise.
	bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
	                                        OperandVector &Operands,
	                                        MCStreamer &Out) {
	  Inst.setLoc(IDLoc);

	  switch (Inst.getOpcode()) {
	  default:
	    break;
	  case RISCV::PseudoLI: {
	    unsigned Reg = Inst.getOperand(0).getReg();
	    const MCOperand &Op1 = Inst.getOperand(1);
	    if (Op1.isExpr()) {
	      // We must have li reg, %lo(sym) or li reg, %pcrel_lo(sym) or similar.
	      // Just convert to an addi. This allows compatibility with gas.
	      emitToStreamer(Out, MCInstBuilder(RISCV::ADDI)
	                              .addReg(Reg)
	                              .addReg(RISCV::X0)
	                              .addExpr(Op1.getExpr()));
	      return false;
	    }
	    int64_t Imm = Inst.getOperand(1).getImm();
	    // On RV32 the immediate here can either be a signed or an unsigned
	    // 32-bit number. Sign extension has to be performed to ensure that Imm
	    // represents the expected signed 64-bit number.
	    if (!isRV64())
	      Imm = SignExtend64<32>(Imm);
	    emitLoadImm(Reg, Imm, Out);
	    return false;
	  }
	  case RISCV::PseudoLLA:
	    emitLoadLocalAddress(Inst, IDLoc, Out);
	    return false;
	  case RISCV::PseudoLA:
	    emitLoadAddress(Inst, IDLoc, Out);
	    return false;
	  case RISCV::PseudoLA_TLS_IE:
	    emitLoadTLSIEAddress(Inst, IDLoc, Out);
	    return false;
	  case RISCV::PseudoLA_TLS_GD:
	    emitLoadTLSGDAddress(Inst, IDLoc, Out);
	    return false;
	  // Integer loads are expanded without a separate temporary register;
	  // floating-point loads and all stores are expanded with one.
	  case RISCV::PseudoLB:
	    emitLoadStoreSymbol(Inst, RISCV::LB, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLBU:
	    emitLoadStoreSymbol(Inst, RISCV::LBU, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLH:
	    emitLoadStoreSymbol(Inst, RISCV::LH, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLHU:
	    emitLoadStoreSymbol(Inst, RISCV::LHU, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLW:
	    emitLoadStoreSymbol(Inst, RISCV::LW, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLWU:
	    emitLoadStoreSymbol(Inst, RISCV::LWU, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoLD:
	    emitLoadStoreSymbol(Inst, RISCV::LD, IDLoc, Out, /*HasTmpReg=*/false);
	    return false;
	  case RISCV::PseudoFLW:
	    emitLoadStoreSymbol(Inst, RISCV::FLW, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoFLD:
	    emitLoadStoreSymbol(Inst, RISCV::FLD, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoSB:
	    emitLoadStoreSymbol(Inst, RISCV::SB, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoSH:
	    emitLoadStoreSymbol(Inst, RISCV::SH, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoSW:
	    emitLoadStoreSymbol(Inst, RISCV::SW, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoSD:
	    emitLoadStoreSymbol(Inst, RISCV::SD, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoFSW:
	    emitLoadStoreSymbol(Inst, RISCV::FSW, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoFSD:
	    emitLoadStoreSymbol(Inst, RISCV::FSD, IDLoc, Out, /*HasTmpReg=*/true);
	    return false;
	  case RISCV::PseudoAddTPRel:
	    // Only validated here; on success it is emitted like any other
	    // instruction below.
	    if (checkPseudoAddTPRel(Inst, Operands))
	      return true;
	    break;
	  }

	  emitToStreamer(Out, Inst);
	  return false;
	}

	extern "C" void LLVMInitializeRISCVAsmParser() {
	RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
	RegisterMCAsmParser<RISCVAsmParser> Y(getTheRISCV64Target());
	}
	Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVFrameLowering.cpp (revision 351303)
	@@ -1,364 +1,408 @@
	//===-- RISCVFrameLowering.cpp - RISCV Frame Information ------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the RISCV implementation of TargetFrameLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "RISCVFrameLowering.h"
	#include "RISCVMachineFunctionInfo.h"
	#include "RISCVSubtarget.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/MC/MCDwarf.h"

	using namespace llvm;

	/// Returns true when the function must keep a frame pointer: frame pointer
	/// elimination is disabled for it, its stack needs realignment, it has
	/// variable sized objects, or its frame address is taken.
	bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
	  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();

	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
	         RegInfo->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
	         MFI.isFrameAddressTaken();
	}

	// Determines the size of the frame and maximum call frame size.
	//
	// Reads the stack size recorded in the MachineFrameInfo, rounds it up to the
	// chosen stack alignment and writes it back with setStackSize().
	//
	// NOTE(review): this function is shown as a diff hunk. Lines marked '-' are
	// the previous revision and lines marked '+' are the new one. In the new
	// revision the alignment starts at getStackAlignment(); when realignment is
	// needed it is raised to the maximum object alignment and the frame grows by
	// the difference. The maximum call frame size is also rounded up and stored.
	void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const RISCVRegisterInfo *RI = STI.getRegisterInfo();

	// Get the number of bytes to allocate from the FrameInfo.
	uint64_t FrameSize = MFI.getStackSize();

	// Get the alignment.
	- uint64_t StackAlign = RI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
	- : getStackAlignment();
	+ unsigned StackAlign = getStackAlignment();
	+ if (RI->needsStackRealignment(MF)) {
	+ unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment());
	+ FrameSize += (MaxStackAlign - StackAlign);
	+ StackAlign = MaxStackAlign;
	+ }

	+ // Set Max Call Frame Size
	+ uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
	+ MFI.setMaxCallFrameSize(MaxCallSize);
	+
	// Make sure the frame is aligned.
	FrameSize = alignTo(FrameSize, StackAlign);

	// Update frame info.
	MFI.setStackSize(FrameSize);
	}

	/// Inserts, before \p MBBI, code computing DestReg = SrcReg + Val, tagging
	/// every built instruction with \p Flag.
	///
	/// Nothing is emitted when \p DestReg equals \p SrcReg and \p Val is zero.
	/// A value fitting a signed 12-bit immediate becomes a single ADDI. Any
	/// other value fitting 32 bits is loaded into a fresh virtual register and
	/// combined with ADD or SUB. Larger values are a fatal error.
	void RISCVFrameLowering::adjustReg(MachineBasicBlock &MBB,
	                                   MachineBasicBlock::iterator MBBI,
	                                   const DebugLoc &DL, unsigned DestReg,
	                                   unsigned SrcReg, int64_t Val,
	                                   MachineInstr::MIFlag Flag) const {
	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  const RISCVInstrInfo *TII = STI.getInstrInfo();

	  const bool IsNoOp = (Val == 0) && (DestReg == SrcReg);
	  if (IsNoOp)
	    return;

	  if (isInt<12>(Val)) {
	    BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), DestReg)
	        .addReg(SrcReg)
	        .addImm(Val)
	        .setMIFlag(Flag);
	    return;
	  }

	  if (!isInt<32>(Val))
	    report_fatal_error("adjustReg cannot yet handle adjustments >32 bits");

	  // Load the magnitude, then add or subtract it depending on the sign.
	  const bool IsNegative = Val < 0;
	  const unsigned Opc = IsNegative ? RISCV::SUB : RISCV::ADD;
	  const int64_t Magnitude = IsNegative ? -Val : Val;

	  unsigned ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass);
	  TII->movImm32(MBB, MBBI, DL, ScratchReg, Magnitude, Flag);
	  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
	      .addReg(SrcReg)
	      .addReg(ScratchReg, RegState::Kill)
	      .setMIFlag(Flag);
	}

	// Register holding the frame pointer: always X8. The subtarget argument is
	// not consulted.
	static unsigned getFPReg(const RISCVSubtarget &STI) {
	  return RISCV::X8;
	}

	// Register holding the stack pointer: always X2. The subtarget argument is
	// not consulted.
	static unsigned getSPReg(const RISCVSubtarget &STI) {
	  return RISCV::X2;
	}

	// Emits the function prologue into MBB, which must be the first block of MF.
	//
	// Steps, in order: finalize the frame layout; return early when there is
	// nothing to allocate; decrement SP by the stack size and emit the matching
	// .cfi_def_cfa_offset; skip over the callee-saved register stores and emit
	// one .cfi_offset per saved register; and, if the function has a frame
	// pointer, set FP and emit .cfi_def_cfa for it.
	//
	// NOTE(review): this function is shown as a diff hunk; lines marked '+' are
	// additions in the new revision. They add a fatal error for functions that
	// need both stack realignment and variable sized objects, and realign SP
	// after FP has been set up. SP is masked with ANDI when the negated
	// alignment fits a signed 12-bit immediate, and otherwise rounded down with
	// an SRLI/SLLI pair.
	void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

	MachineFrameInfo &MFI = MF.getFrameInfo();
	auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
	const RISCVRegisterInfo *RI = STI.getRegisterInfo();
	const RISCVInstrInfo *TII = STI.getInstrInfo();
	MachineBasicBlock::iterator MBBI = MBB.begin();

	+ if (RI->needsStackRealignment(MF) && MFI.hasVarSizedObjects()) {
	+ report_fatal_error(
	+ "RISC-V backend can't currently handle functions that need stack "
	+ "realignment and have variable sized objects");
	+ }
	+
	unsigned FPReg = getFPReg(STI);
	unsigned SPReg = getSPReg(STI);

	// Debug location must be unknown since the first debug location is used
	// to determine the end of the prologue.
	DebugLoc DL;

	// Determine the correct frame layout
	determineFrameLayout(MF);

	// FIXME (note copied from Lanai): This appears to be overallocating. Needs
	// investigation. Get the number of bytes to allocate from the FrameInfo.
	uint64_t StackSize = MFI.getStackSize();

	// Early exit if there is no need to allocate on the stack
	if (StackSize == 0 && !MFI.adjustsStack())
	return;

	// Allocate space on the stack if necessary.
	adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);

	// Emit ".cfi_def_cfa_offset StackSize"
	unsigned CFIIndex = MF.addFrameInst(
	MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);

	// The frame pointer is callee-saved, and code has been generated for us to
	// save it to the stack. We need to skip over the storing of callee-saved
	// registers as the frame pointer must be modified after it has been saved
	// to the stack, not before.
	// FIXME: assumes exactly one instruction is used to save each callee-saved
	// register.
	const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	std::advance(MBBI, CSI.size());

	// Iterate over list of callee-saved registers and emit .cfi_offset
	// directives.
	for (const auto &Entry : CSI) {
	int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
	unsigned Reg = Entry.getReg();
	unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
	nullptr, RI->getDwarfRegNum(Reg, true), Offset));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);
	}

	// Generate new FP.
	if (hasFP(MF)) {
	adjustReg(MBB, MBBI, DL, FPReg, SPReg,
	StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);

	// Emit ".cfi_def_cfa $fp, 0"
	unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
	nullptr, RI->getDwarfRegNum(FPReg, true), 0));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex);
	+
	+ // Realign Stack
	+ const RISCVRegisterInfo *RI = STI.getRegisterInfo();
	+ if (RI->needsStackRealignment(MF)) {
	+ unsigned MaxAlignment = MFI.getMaxAlignment();
	+
	+ const RISCVInstrInfo *TII = STI.getInstrInfo();
	+ if (isInt<12>(-(int)MaxAlignment)) {
	+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
	+ .addReg(SPReg)
	+ .addImm(-(int)MaxAlignment);
	+ } else {
	+ unsigned ShiftAmount = countTrailingZeros(MaxAlignment);
	+ unsigned VR =
	+ MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
	+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
	+ .addReg(SPReg)
	+ .addImm(ShiftAmount);
	+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SLLI), SPReg)
	+ .addReg(VR)
	+ .addImm(ShiftAmount);
	+ }
	+ }
	}
	}

	/// Emits the function epilogue into \p MBB, working backwards from its last
	/// non-debug instruction.
	///
	/// If SP was modified (stack realignment or variable sized objects), it is
	/// first recomputed from FP. CFI directives are then emitted for the FP
	/// restore and for every callee-saved register, the stack is deallocated,
	/// and the CFA offset is reset to 0.
	void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
	                                      MachineBasicBlock &MBB) const {
	  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
	  const RISCVRegisterInfo *RI = STI.getRegisterInfo();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
	  DebugLoc DL = MBBI->getDebugLoc();
	  const RISCVInstrInfo *TII = STI.getInstrInfo();
	  unsigned FPReg = getFPReg(STI);
	  unsigned SPReg = getSPReg(STI);

	  // Skip to before the restores of callee-saved registers
	  // FIXME: assumes exactly one instruction is used to restore each
	  // callee-saved register.
	  auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size());

	  uint64_t StackSize = MFI.getStackSize();
	  uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize();

	  // Restore the stack pointer using the value of the frame pointer. Only
	  // necessary if the stack pointer was modified, meaning the stack size is
	  // unknown.
	  if (RI->needsStackRealignment(MF) || MFI.hasVarSizedObjects()) {
	    assert(hasFP(MF) && "frame pointer should not have been eliminated");
	    adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg, -FPOffset,
	              MachineInstr::FrameDestroy);
	  }

	  if (hasFP(MF)) {
	    // To find the instruction restoring FP from stack.
	    // Note: I is a reference, so this loop advances LastFrameDestroy itself.
	    for (auto &I = LastFrameDestroy; I != MBBI; ++I) {
	      if (I->mayLoad() && I->getOperand(0).isReg()) {
	        unsigned DestReg = I->getOperand(0).getReg();
	        if (DestReg == FPReg) {
	          // If there is frame pointer, after restoring $fp registers, we
	          // need adjust CFA to ($sp - FPOffset).
	          // Emit ".cfi_def_cfa $sp, -FPOffset"
	          unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
	              nullptr, RI->getDwarfRegNum(SPReg, true), -FPOffset));
	          BuildMI(MBB, std::next(I), DL,
	                  TII->get(TargetOpcode::CFI_INSTRUCTION))
	              .addCFIIndex(CFIIndex);
	          break;
	        }
	      }
	    }
	  }

	  // Add CFI directives for callee-saved registers.
	  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	  // Iterate over list of callee-saved registers and emit .cfi_restore
	  // directives.
	  for (const auto &Entry : CSI) {
	    unsigned Reg = Entry.getReg();
	    unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
	        nullptr, RI->getDwarfRegNum(Reg, true)));
	    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	        .addCFIIndex(CFIIndex);
	  }

	  // Deallocate stack
	  adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);

	  // After restoring $sp, we need to adjust CFA to $(sp + 0)
	  // Emit ".cfi_def_cfa_offset 0"
	  unsigned CFIIndex =
	      MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
	  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	      .addCFIIndex(CFIIndex);
	}

	// Computes how to address frame index FI: stores the base register in
	// FrameReg and returns the offset from that register.
	//
	// The starting offset is the object offset minus the local area offset plus
	// the frame info's offset adjustment. Frame indices within the callee-saved
	// range are addressed from SP (X2) with the stack size added. All other
	// indices use the frame register, adding the varargs save size when the
	// function has a frame pointer and the stack size otherwise.
	//
	// NOTE(review): this function is shown as a diff hunk; lines marked '+' are
	// additions in the new revision. They make objects in a realigned stack
	// SP-relative as well, and assert that realignment is not combined with
	// variable sized objects.
	int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
	int FI,
	unsigned &FrameReg) const {
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
	const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();

	// Callee-saved registers should be referenced relative to the stack
	// pointer (positive offset), otherwise use the frame pointer (negative
	// offset).
	const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	int MinCSFI = 0;
	int MaxCSFI = -1;

	int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() +
	MFI.getOffsetAdjustment();

	if (CSI.size()) {
	MinCSFI = CSI[0].getFrameIdx();
	MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
	}

	if (FI >= MinCSFI && FI <= MaxCSFI) {
	+ FrameReg = RISCV::X2;
	+ Offset += MF.getFrameInfo().getStackSize();
	+ } else if (RI->needsStackRealignment(MF)) {
	+ assert(!MFI.hasVarSizedObjects() &&
	+ "Unexpected combination of stack realignment and varsized objects");
	+ // If the stack was realigned, the frame pointer is set in order to allow
	+ // SP to be restored, but we still access stack objects using SP.
	FrameReg = RISCV::X2;
	Offset += MF.getFrameInfo().getStackSize();
	} else {
	FrameReg = RI->getFrameRegister(MF);
	if (hasFP(MF))
	Offset += RVFI->getVarArgsSaveSize();
	else
	Offset += MF.getFrameInfo().getStackSize();
	}
	return Offset;
	}

	/// Extends the generic callee-save computation with RISC-V specific rules.
	///
	/// RA (X1) and FP (X8) are always saved when the function uses a frame
	/// pointer. For functions carrying the "interrupt" attribute that contain
	/// calls, the caller-saved registers listed below are saved as well, plus
	/// (with the F or D extension) every floating-point register in the
	/// callee-saved list.
	void RISCVFrameLowering::determineCalleeSaves(MachineFunction &MF,
	                                              BitVector &SavedRegs,
	                                              RegScavenger *RS) const {
	  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
	  // Unconditionally spill RA and FP only if the function uses a frame
	  // pointer.
	  if (hasFP(MF)) {
	    SavedRegs.set(RISCV::X1);
	    SavedRegs.set(RISCV::X8);
	  }

	  // If interrupt is enabled and there are calls in the handler,
	  // unconditionally save all Caller-saved registers and
	  // all FP registers, regardless whether they are used.
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  if (MF.getFunction().hasFnAttribute("interrupt") && MFI.hasCalls()) {

	    // Zero-terminated list; the loop below stops at the trailing 0.
	    static const MCPhysReg CSRegs[] = { RISCV::X1,      /* ra */
	      RISCV::X5, RISCV::X6, RISCV::X7,                  /* t0-t2 */
	      RISCV::X10, RISCV::X11,                           /* a0-a1, a2-a7 */
	      RISCV::X12, RISCV::X13, RISCV::X14, RISCV::X15, RISCV::X16, RISCV::X17,
	      RISCV::X28, RISCV::X29, RISCV::X30, RISCV::X31, 0 /* t3-t6 */
	    };

	    for (unsigned i = 0; CSRegs[i]; ++i)
	      SavedRegs.set(CSRegs[i]);

	    if (MF.getSubtarget<RISCVSubtarget>().hasStdExtD() ||
	        MF.getSubtarget<RISCVSubtarget>().hasStdExtF()) {

	      // If interrupt is enabled, this list contains all FP registers.
	      const MCPhysReg * Regs = MF.getRegInfo().getCalleeSavedRegs();

	      for (unsigned i = 0; Regs[i]; ++i)
	        if (RISCV::FPR32RegClass.contains(Regs[i]) ||
	            RISCV::FPR64RegClass.contains(Regs[i]))
	          SavedRegs.set(Regs[i]);
	    }
	  }
	}

	/// Reserves an emergency spill slot for the register scavenger when the
	/// estimated stack size does not fit a signed 11-bit field.
	void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
	    MachineFunction &MF, RegScavenger *RS) const {
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  // estimateStackSize has been observed to under-estimate the final stack
	  // size, so leave some wiggle-room by testing against an 11-bit signed
	  // field rather than a 12-bit one.
	  // FIXME: It may be possible to craft a function with a small stack that
	  // still needs an emergency spill slot for branch relaxation. This case
	  // would currently be missed.
	  if (isInt<11>(MFI.estimateStackSize(MF)))
	    return;

	  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
	  const TargetRegisterClass *RC = &RISCV::GPRRegClass;
	  const int ScavengeSlot = MFI.CreateStackObject(
	      RegInfo->getSpillSize(RC), RegInfo->getSpillAlignment(RC), false);
	  RS->addScavengingFrameIndex(ScavengeSlot);
	}

	// Not preserve stack space within prologue for outgoing variables when the
	// function contains variable size objects and let eliminateCallFramePseudoInstr
	// preserve stack space for it.
	bool RISCVFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
	return !MF.getFrameInfo().hasVarSizedObjects();
	}

	// Eliminates the ADJCALLSTACKDOWN and ADJCALLSTACKUP pseudo instructions.
	// The pseudo is always erased. When no call frame was reserved, it is first
	// replaced by an explicit stack pointer adjustment.
	MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
	    MachineFunction &MF, MachineBasicBlock &MBB,
	    MachineBasicBlock::iterator MI) const {
	  unsigned SPReg = RISCV::X2;
	  DebugLoc DL = MI->getDebugLoc();

	  // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
	  // ADJCALLSTACKUP must be converted to instructions manipulating the stack
	  // pointer. This is necessary when there is a variable length stack
	  // allocation (e.g. alloca), which means it's not possible to allocate
	  // space for outgoing arguments from within the function prologue.
	  const bool NeedsExplicitAdjust = !hasReservedCallFrame(MF);
	  if (NeedsExplicitAdjust) {
	    const int64_t Requested = MI->getOperand(0).getImm();
	    if (Requested != 0) {
	      // Ensure the stack remains aligned after adjustment.
	      int64_t Delta = alignSPAdjust(Requested);

	      // ADJCALLSTACKDOWN moves SP down; ADJCALLSTACKUP moves it back up.
	      const bool IsDown = MI->getOpcode() == RISCV::ADJCALLSTACKDOWN;
	      if (IsDown)
	        Delta = -Delta;

	      adjustReg(MBB, MI, DL, SPReg, SPReg, Delta, MachineInstr::NoFlags);
	    }
	  }

	  return MBB.erase(MI);
	}
	Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.cpp (revision 351303)
	@@ -1,2621 +1,2648 @@
	//===-- RISCVISelLowering.cpp - RISCV DAG Lowering Implementation --------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that RISCV uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "RISCVISelLowering.h"
	#include "RISCV.h"
	#include "RISCVMachineFunctionInfo.h"
	#include "RISCVRegisterInfo.h"
	#include "RISCVSubtarget.h"
	#include "RISCVTargetMachine.h"
	#include "Utils/RISCVMatInt.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAGISel.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/DiagnosticPrinter.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"

	using namespace llvm;

	#define DEBUG_TYPE "riscv-lower"

	STATISTIC(NumTailCalls, "Number of tail calls");

	// Configures SelectionDAG lowering for the given RISC-V subtarget.
	//
	// Rejects RV32E and unknown ABIs with a fatal error. It then registers the
	// GPR register class for XLenVT, plus FPR32 / FPR64 when the F / D
	// extensions are present, and declares how each ISD operation, load
	// extension and condition code is legalized. Finally it sets the atomic
	// size limits, boolean contents, function alignment and jump-table
	// threshold.
	RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
	const RISCVSubtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {

	if (Subtarget.isRV32E())
	report_fatal_error("Codegen not yet implemented for RV32E");

	RISCVABI::ABI ABI = Subtarget.getTargetABI();
	assert(ABI != RISCVABI::ABI_Unknown && "Improperly initialised target ABI");

	// Only the ABIs listed below are accepted; anything else is fatal.
	switch (ABI) {
	default:
	report_fatal_error("Don't know how to lower this ABI");
	case RISCVABI::ABI_ILP32:
	case RISCVABI::ABI_ILP32F:
	case RISCVABI::ABI_ILP32D:
	case RISCVABI::ABI_LP64:
	case RISCVABI::ABI_LP64F:
	case RISCVABI::ABI_LP64D:
	break;
	}

	MVT XLenVT = Subtarget.getXLenVT();

	// Set up the register classes.
	addRegisterClass(XLenVT, &RISCV::GPRRegClass);

	if (Subtarget.hasStdExtF())
	addRegisterClass(MVT::f32, &RISCV::FPR32RegClass);
	if (Subtarget.hasStdExtD())
	addRegisterClass(MVT::f64, &RISCV::FPR64RegClass);

	// Compute derived properties from the register classes.
	computeRegisterProperties(STI.getRegisterInfo());

	setStackPointerRegisterToSaveRestore(RISCV::X2);

	for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD})
	setLoadExtAction(N, XLenVT, MVT::i1, Promote);

	// TODO: add all necessary setOperationAction calls.
	setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);

	setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, XLenVT, Expand);
	setOperationAction(ISD::SELECT, XLenVT, Custom);
	setOperationAction(ISD::SELECT_CC, XLenVT, Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Expand);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	for (auto VT : {MVT::i1, MVT::i8, MVT::i16})
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

	// On 64-bit targets, i32 shifts get custom lowering.
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::SHL, MVT::i32, Custom);
	setOperationAction(ISD::SRA, MVT::i32, Custom);
	setOperationAction(ISD::SRL, MVT::i32, Custom);
	}

	// Without the M extension, multiply/divide/remainder are expanded.
	if (!Subtarget.hasStdExtM()) {
	setOperationAction(ISD::MUL, XLenVT, Expand);
	setOperationAction(ISD::MULHS, XLenVT, Expand);
	setOperationAction(ISD::MULHU, XLenVT, Expand);
	setOperationAction(ISD::SDIV, XLenVT, Expand);
	setOperationAction(ISD::UDIV, XLenVT, Expand);
	setOperationAction(ISD::SREM, XLenVT, Expand);
	setOperationAction(ISD::UREM, XLenVT, Expand);
	}

	if (Subtarget.is64Bit() && Subtarget.hasStdExtM()) {
	setOperationAction(ISD::SDIV, MVT::i32, Custom);
	setOperationAction(ISD::UDIV, MVT::i32, Custom);
	setOperationAction(ISD::UREM, MVT::i32, Custom);
	}

	setOperationAction(ISD::SDIVREM, XLenVT, Expand);
	setOperationAction(ISD::UDIVREM, XLenVT, Expand);
	setOperationAction(ISD::SMUL_LOHI, XLenVT, Expand);
	setOperationAction(ISD::UMUL_LOHI, XLenVT, Expand);

	setOperationAction(ISD::SHL_PARTS, XLenVT, Custom);
	setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
	setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);

	setOperationAction(ISD::ROTL, XLenVT, Expand);
	setOperationAction(ISD::ROTR, XLenVT, Expand);
	setOperationAction(ISD::BSWAP, XLenVT, Expand);
	setOperationAction(ISD::CTTZ, XLenVT, Expand);
	setOperationAction(ISD::CTLZ, XLenVT, Expand);
	setOperationAction(ISD::CTPOP, XLenVT, Expand);

	// Condition codes and operations set to Expand for each enabled FP type.
	ISD::CondCode FPCCToExtend[] = {
	ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
	ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
	ISD::SETGE, ISD::SETNE};

	ISD::NodeType FPOpToExtend[] = {
	ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM};

	if (Subtarget.hasStdExtF()) {
	setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
	for (auto CC : FPCCToExtend)
	setCondCodeAction(CC, MVT::f32, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Expand);
	for (auto Op : FPOpToExtend)
	setOperationAction(Op, MVT::f32, Expand);
	}

	if (Subtarget.hasStdExtF() && Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST, MVT::i32, Custom);

	if (Subtarget.hasStdExtD()) {
	setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
	for (auto CC : FPCCToExtend)
	setCondCodeAction(CC, MVT::f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	for (auto Op : FPOpToExtend)
	setOperationAction(Op, MVT::f64, Expand);
	}

	setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
	setOperationAction(ISD::BlockAddress, XLenVT, Custom);
	setOperationAction(ISD::ConstantPool, XLenVT, Custom);

	setOperationAction(ISD::GlobalTLSAddress, XLenVT, Custom);

	// TODO: On M-mode only targets, the cycle[h] CSR may not be present.
	// Unfortunately this can't be determined just from the ISA naming string.
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,
	Subtarget.is64Bit() ? Legal : Custom);

	// With the A extension, atomics up to XLEN bits are supported; without it
	// the supported atomic size is set to 0.
	if (Subtarget.hasStdExtA()) {
	setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
	setMinCmpXchgSizeInBits(32);
	} else {
	setMaxAtomicSizeInBitsSupported(0);
	}

	setBooleanContents(ZeroOrOneBooleanContent);

	// Function alignments (log2).
	unsigned FunctionAlignment = Subtarget.hasStdExtC() ? 1 : 2;
	setMinFunctionAlignment(FunctionAlignment);
	setPrefFunctionAlignment(FunctionAlignment);

	// Effectively disable jump table generation.
	setMinimumJumpTableEntries(INT_MAX);
	}

	// Result type of a SETCC node: vector comparisons produce VT with its
	// elements changed to integers; every other comparison produces the pointer
	// type of the given data layout.
	EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
	                                            EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();
	  return getPointerTy(DL);
	}

	// Fills in Info for the RISC-V masked atomic intrinsics (the i32
	// atomicrmw/cmpxchg variants listed below) and returns true; returns false
	// for every other intrinsic ID.
	//
	// For the handled intrinsics the access is described as a chained intrinsic
	// that both loads and stores, volatile, 4-byte aligned, at offset 0 from the
	// pointer passed as argument 0. The memory type is the pointee type of that
	// pointer.
	bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                             const CallInst &I,
	                                             MachineFunction &MF,
	                                             unsigned Intrinsic) const {
	  switch (Intrinsic) {
	  default:
	    return false;
	  case Intrinsic::riscv_masked_atomicrmw_xchg_i32:
	  case Intrinsic::riscv_masked_atomicrmw_add_i32:
	  case Intrinsic::riscv_masked_atomicrmw_sub_i32:
	  case Intrinsic::riscv_masked_atomicrmw_nand_i32:
	  case Intrinsic::riscv_masked_atomicrmw_max_i32:
	  case Intrinsic::riscv_masked_atomicrmw_min_i32:
	  case Intrinsic::riscv_masked_atomicrmw_umax_i32:
	  case Intrinsic::riscv_masked_atomicrmw_umin_i32:
	  case Intrinsic::riscv_masked_cmpxchg_i32:
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 4;
	    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
	                 MachineMemOperand::MOVolatile;
	    return true;
	  }
	}

	// Decides whether the addressing mode AM can be used directly.
	//
	// Accepted forms are a 12-bit signed offset, optionally combined with a
	// single register: either Scale == 0 ("r+i" or just "i"), or Scale == 1 with
	// no separate base register (the scaled register then acts as the base).
	// Global bases, out-of-range offsets, "r+r" forms and any other scale are
	// rejected.
	bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                                const AddrMode &AM, Type *Ty,
	                                                unsigned AS,
	                                                Instruction *I) const {
	  // Neither a global base nor an offset outside the signed 12-bit range is
	  // representable.
	  if (AM.BaseGV || !isInt<12>(AM.BaseOffs))
	    return false;

	  // No scaled register at all: "r+i" or just "i".
	  if (AM.Scale == 0)
	    return true;

	  // A scale of one is only fine when it is the sole register ("r+i");
	  // "r+r" and "r+r+i" are not supported, nor is any larger scale.
	  return AM.Scale == 1 && !AM.HasBaseReg;
	}

	// An integer-compare immediate is legal when it fits in a signed 12-bit
	// field.
	bool RISCVTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsSImm12 = isInt<12>(Imm);
	  return FitsSImm12;
	}

	// An add immediate is legal when it fits in a signed 12-bit field.
	bool RISCVTargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  const bool FitsSImm12 = isInt<12>(Imm);
	  return FitsSImm12;
	}

	// On RV32, 64-bit integers are split into their high and low parts and held
	// in two different registers, so the trunc is free since the low register can
	// just be used.
	bool RISCVTargetLowering::isTruncateFree(Type SrcTy, Type DstTy) const {
	if (Subtarget.is64Bit() \|\| !SrcTy->isIntegerTy() \|\| !DstTy->isIntegerTy())
	return false;
	unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
	unsigned DestBits = DstTy->getPrimitiveSizeInBits();
	return (SrcBits == 64 && DestBits == 32);
	}

	// EVT flavour of isTruncateFree: true only for a scalar integer 64-bit ->
	// 32-bit truncation on a non-64-bit subtarget. Vector types, non-integer
	// types and any truncation on RV64 return false.
	bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
	  if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() ||
	      !SrcVT.isInteger() || !DstVT.isInteger())
	    return false;
	  unsigned SrcBits = SrcVT.getSizeInBits();
	  unsigned DestBits = DstVT.getSizeInBits();
	  return (SrcBits == 64 && DestBits == 32);
	}

	// Reports whether zero-extending Val to VT2 is free.
	//
	// The target-specific case is a load of i8 or i16 (or i32 on a 64-bit
	// subtarget) that is either non-extending or already zero-extending. All
	// other values are deferred to the generic TargetLowering implementation.
	bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  // Zexts are free if they can be combined with a load.
	  if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
	    EVT MemVT = LD->getMemoryVT();
	    if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
	         (Subtarget.is64Bit() && MemVT == MVT::i32)) &&
	        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
	         LD->getExtensionType() == ISD::ZEXTLOAD))
	      return true;
	  }

	  return TargetLowering::isZExtFree(Val, VT2);
	}

	// Sign extension is reported as cheaper than zero extension only for an
	// i32 -> i64 extension on a 64-bit subtarget.
	bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
	  if (!Subtarget.is64Bit())
	    return false;
	  return SrcVT == MVT::i32 && DstVT == MVT::i64;
	}

	// True for f32 when the F extension is available and for f64 when the D
	// extension is available; false for every other type.
	bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
	  return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
	         (VT == MVT::f64 && Subtarget.hasStdExtD());
	}

	// Rewrites a comparison in place so that it uses one of the condition codes
	// the RISC-V ISA supports directly. GT, LE, UGT and ULE are turned into
	// their operand-swapped counterparts (and LHS/RHS are exchanged); every
	// other condition code is left untouched.
	static void normaliseSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode &CC) {
	  const bool NeedsSwap = CC == ISD::SETGT || CC == ISD::SETLE ||
	                         CC == ISD::SETUGT || CC == ISD::SETULE;
	  if (!NeedsSwap)
	    return;

	  CC = ISD::getSetCCSwappedOperands(CC);
	  std::swap(LHS, RHS);
	}

	// Maps a DAG integer condition code onto the corresponding RISC-V
	// conditional-branch opcode. Only the codes that remain after
	// normaliseSetCC (EQ, NE, LT, GE, ULT, UGE) are accepted; anything else is
	// unreachable.
	static unsigned getBranchOpcodeForIntCondCode(ISD::CondCode CC) {
	  switch (CC) {
	  case ISD::SETEQ:
	    return RISCV::BEQ;
	  case ISD::SETNE:
	    return RISCV::BNE;
	  case ISD::SETLT:
	    return RISCV::BLT;
	  case ISD::SETGE:
	    return RISCV::BGE;
	  case ISD::SETULT:
	    return RISCV::BLTU;
	  case ISD::SETUGE:
	    return RISCV::BGEU;
	  default:
	    llvm_unreachable("Unsupported CondCode");
	  }
	}

	// Entry point for custom lowering: dispatches on the opcode of Op to the
	// matching lower* helper. Opcodes without a case here are reported as a
	// fatal error.
	//
	// BITCAST is handled inline: on RV64 with the F extension, an i32 -> f32
	// bitcast becomes FMV_W_X_RV64 of the any-extended operand. Any other
	// bitcast returns an empty SDValue.
	SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  default:
	    report_fatal_error("unimplemented operand");
	  case ISD::GlobalAddress:
	    return lowerGlobalAddress(Op, DAG);
	  case ISD::BlockAddress:
	    return lowerBlockAddress(Op, DAG);
	  case ISD::ConstantPool:
	    return lowerConstantPool(Op, DAG);
	  case ISD::GlobalTLSAddress:
	    return lowerGlobalTLSAddress(Op, DAG);
	  case ISD::SELECT:
	    return lowerSELECT(Op, DAG);
	  case ISD::VASTART:
	    return lowerVASTART(Op, DAG);
	  case ISD::FRAMEADDR:
	    return lowerFRAMEADDR(Op, DAG);
	  case ISD::RETURNADDR:
	    return lowerRETURNADDR(Op, DAG);
	  case ISD::SHL_PARTS:
	    return lowerShiftLeftParts(Op, DAG);
	  case ISD::SRA_PARTS:
	    return lowerShiftRightParts(Op, DAG, true);
	  case ISD::SRL_PARTS:
	    return lowerShiftRightParts(Op, DAG, false);
	  case ISD::BITCAST: {
	    assert(Subtarget.is64Bit() && Subtarget.hasStdExtF() &&
	           "Unexpected custom legalisation");
	    SDLoc DL(Op);
	    SDValue Op0 = Op.getOperand(0);
	    // Only the i32 -> f32 direction is lowered here.
	    if (Op.getValueType() != MVT::f32 || Op0.getValueType() != MVT::i32)
	      return SDValue();
	    SDValue NewOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
	    SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
	    return FPConv;
	  }
	  }
	}

	// Wraps the global referenced by N in a target global-address node of type
	// Ty with a zero offset and the given operand Flags.
	static SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
	                             SelectionDAG &DAG, unsigned Flags) {
	  const GlobalValue *GV = N->getGlobal();
	  return DAG.getTargetGlobalAddress(GV, DL, Ty, 0, Flags);
	}

	// Wraps the block address referenced by N in a target block-address node of
	// type Ty, keeping N's offset and applying the given operand Flags. DL is
	// accepted for signature parity with the other overloads and is not used.
	static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
	                             SelectionDAG &DAG, unsigned Flags) {
	  const BlockAddress *BA = N->getBlockAddress();
	  return DAG.getTargetBlockAddress(BA, Ty, N->getOffset(), Flags);
	}

	// Wraps the constant referenced by N in a target constant-pool node of type
	// Ty, keeping N's alignment and offset and applying the given operand Flags.
	// DL is accepted for signature parity with the other overloads and is not
	// used.
	static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
	                             SelectionDAG &DAG, unsigned Flags) {
	  const Constant *C = N->getConstVal();
	  return DAG.getTargetConstantPool(C, Ty, N->getAlignment(), N->getOffset(),
	                                   Flags);
	}

	// Builds the machine-node sequence that materialises the address of the
	// symbol referenced by N. NodeTy selects which getTargetNode overload wraps
	// the symbol.
	//
	//  - Position-independent code: PseudoLLA when IsLocal is set, PseudoLA
	//    (GOT load) otherwise.
	//  - CodeModel::Small: LUI + ADDI using the MO_HI / MO_LO operand flags.
	//  - CodeModel::Medium: PseudoLLA.
	//
	// Any other code model is reported as a fatal error.
	template <class NodeTy>
	SDValue RISCVTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
	bool IsLocal) const {
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());

	if (isPositionIndependent()) {
	SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
	if (IsLocal)
	// Use PC-relative addressing to access the symbol. This generates the
	// pattern (PseudoLLA sym), which expands to (addi (auipc %pcrel_hi(sym))
	// %pcrel_lo(auipc)).
	return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);

	// Use PC-relative addressing to access the GOT for this symbol, then load
	// the address from the GOT. This generates the pattern (PseudoLA sym),
	// which expands to (ld (addi (auipc %got_pcrel_hi(sym)) %pcrel_lo(auipc))).
	return SDValue(DAG.getMachineNode(RISCV::PseudoLA, DL, Ty, Addr), 0);
	}

	// Not position independent: the sequence depends on the code model.
	switch (getTargetMachine().getCodeModel()) {
	default:
	report_fatal_error("Unsupported code model for lowering");
	case CodeModel::Small: {
	// Generate a sequence for accessing addresses within the first 2 GiB of
	// address space. This generates the pattern (addi (lui %hi(sym)) %lo(sym)).
	SDValue AddrHi = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_HI);
	SDValue AddrLo = getTargetNode(N, DL, Ty, DAG, RISCVII::MO_LO);
	SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
	return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNHi, AddrLo), 0);
	}
	case CodeModel::Medium: {
	// Generate a sequence for accessing addresses within any 2GiB range within
	// the address space. This generates the pattern (PseudoLLA sym), which
	// expands to (addi (auipc %pcrel_hi(sym)) %pcrel_lo(auipc)).
	SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0);
	return SDValue(DAG.getMachineNode(RISCV::PseudoLLA, DL, Ty, Addr), 0);
	}
	}
	}

	// Lowers an ISD::GlobalAddress node. The symbol address is produced by
	// getAddr (using whether the global may be assumed DSO-local), and a
	// non-zero offset carried by the node is applied with a separate ADD.
	SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = N->getGlobal();
	  const bool IsLocal =
	      getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
	  SDValue Addr = getAddr(N, DAG, IsLocal);

	  const int64_t Offset = N->getOffset();
	  if (Offset == 0)
	    return Addr;

	  // Keeping the offset in its own ADD node (rather than folding it into the
	  // global address node) maximises the opportunity for common subexpression
	  // elimination. Later peephole optimisations may fold it back in when that
	  // is profitable.
	  SDLoc DL(Op);
	  SDValue OffsetC = DAG.getConstant(Offset, DL, Subtarget.getXLenVT());
	  return DAG.getNode(ISD::ADD, DL, Op.getValueType(), Addr, OffsetC);
	}

	// Lowers an ISD::BlockAddress node by delegating to getAddr with its
	// default IsLocal argument.
	SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  return getAddr(cast<BlockAddressSDNode>(Op), DAG);
	}

	// Lowers an ISD::ConstantPool node by delegating to getAddr with its
	// default IsLocal argument.
	SDValue RISCVTargetLowering::lowerConstantPool(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  return getAddr(cast<ConstantPoolSDNode>(Op), DAG);
	}

	// Builds the address of the thread-local global referenced by N for the
	// static TLS models.
	//
	// With UseGOT set, the value produced by PseudoLA_TLS_IE is added to the
	// thread pointer (X4). Otherwise the address is formed from the
	// MO_TPREL_HI / MO_TPREL_ADD / MO_TPREL_LO pieces using LUI,
	// PseudoAddTPRel and ADDI. The offset stored in N is not applied here.
	SDValue RISCVTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
	SelectionDAG &DAG,
	bool UseGOT) const {
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	const GlobalValue *GV = N->getGlobal();
	MVT XLenVT = Subtarget.getXLenVT();

	if (UseGOT) {
	// Use PC-relative addressing to access the GOT for this TLS symbol, then
	// load the address from the GOT and add the thread pointer. This generates
	// the pattern (PseudoLA_TLS_IE sym), which expands to
	// (ld (auipc %tls_ie_pcrel_hi(sym)) %pcrel_lo(auipc)).
	SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
	SDValue Load =
	SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_IE, DL, Ty, Addr), 0);

	// Add the thread pointer.
	SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
	return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
	}

	// Generate a sequence for accessing the address relative to the thread
	// pointer, with the appropriate adjustment for the thread pointer offset.
	// This generates the pattern
	// (add (add_tprel (lui %tprel_hi(sym)) tp %tprel_add(sym)) %tprel_lo(sym))
	SDValue AddrHi =
	DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_HI);
	SDValue AddrAdd =
	DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_ADD);
	SDValue AddrLo =
	DAG.getTargetGlobalAddress(GV, DL, Ty, 0, RISCVII::MO_TPREL_LO);

	// lui -> add with the thread pointer -> addi of the low part.
	SDValue MNHi = SDValue(DAG.getMachineNode(RISCV::LUI, DL, Ty, AddrHi), 0);
	SDValue TPReg = DAG.getRegister(RISCV::X4, XLenVT);
	SDValue MNAdd = SDValue(
	DAG.getMachineNode(RISCV::PseudoAddTPRel, DL, Ty, MNHi, TPReg, AddrAdd),
	0);
	return SDValue(DAG.getMachineNode(RISCV::ADDI, DL, Ty, MNAdd, AddrLo), 0);
	}

	// Builds the address of the thread-local global referenced by N for the
	// dynamic TLS models: the value produced by PseudoLA_TLS_GD is passed as
	// the single argument of a C-calling-convention library call to
	// __tls_get_addr, and the call's result is returned. The offset stored in N
	// is not applied here.
	SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(N);
	  EVT Ty = getPointerTy(DAG.getDataLayout());
	  // Integer type as wide as a pointer, used for both the argument and the
	  // return value of the call.
	  IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
	  const GlobalValue *GV = N->getGlobal();

	  // Use a PC-relative addressing mode to access the global dynamic GOT address.
	  // This generates the pattern (PseudoLA_TLS_GD sym), which expands to
	  // (addi (auipc %tls_gd_pcrel_hi(sym)) %pcrel_lo(auipc)).
	  SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
	  SDValue Load =
	      SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLS_GD, DL, Ty, Addr), 0);

	  // Prepare argument list to generate call.
	  ArgListTy Args;
	  ArgListEntry Entry;
	  Entry.Node = Load;
	  Entry.Ty = CallTy;
	  Args.push_back(Entry);

	  // Setup call to __tls_get_addr.
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(DL)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, CallTy,
	                    DAG.getExternalSymbol("__tls_get_addr", Ty),
	                    std::move(Args));

	  return LowerCallTo(CLI).first;
	}

	// Lowers an ISD::GlobalTLSAddress node. The TLS model is taken from the
	// target machine for position-independent code and forced to LocalExec
	// otherwise. LocalExec and InitialExec use getStaticTLSAddr (without and
	// with the GOT respectively); LocalDynamic and GeneralDynamic use
	// getDynamicTLSAddr. A non-zero offset carried by the node is applied with
	// a separate ADD.
	SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT Ty = Op.getValueType();
	  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
	  int64_t Offset = N->getOffset();
	  MVT XLenVT = Subtarget.getXLenVT();

	  // Non-PIC TLS lowering should always use the LocalExec model.
	  TLSModel::Model Model = isPositionIndependent()
	                              ? getTargetMachine().getTLSModel(N->getGlobal())
	                              : TLSModel::LocalExec;

	  SDValue Addr;
	  switch (Model) {
	  case TLSModel::LocalExec:
	    Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
	    break;
	  case TLSModel::InitialExec:
	    Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
	    break;
	  case TLSModel::LocalDynamic:
	  case TLSModel::GeneralDynamic:
	    Addr = getDynamicTLSAddr(N, DAG);
	    break;
	  }

	  // In order to maximise the opportunity for common subexpression elimination,
	  // emit a separate ADD node for the global address offset instead of folding
	  // it in the global address node. Later peephole optimisations may choose to
	  // fold it back in when profitable.
	  if (Offset != 0)
	    return DAG.getNode(ISD::ADD, DL, Ty, Addr,
	                       DAG.getConstant(Offset, DL, XLenVT));
	  return Addr;
	}

	// Lowers an ISD::SELECT node to RISCVISD::SELECT_CC, whose operands are
	// (lhs, rhs, condition code, true value, false value) and whose results are
	// the selected value plus a glue value.
	//
	// When the select yields XLenVT and its condition is a SETCC on XLenVT
	// operands, that comparison is folded into the SELECT_CC after being
	// normalised with normaliseSetCC. Otherwise the condition is compared
	// against zero using SETNE.
	SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	SDValue CondV = Op.getOperand(0);
	SDValue TrueV = Op.getOperand(1);
	SDValue FalseV = Op.getOperand(2);
	SDLoc DL(Op);
	MVT XLenVT = Subtarget.getXLenVT();

	// If the result type is XLenVT and CondV is the output of a SETCC node
	// which also operated on XLenVT inputs, then merge the SETCC node into the
	// lowered RISCVISD::SELECT_CC to take advantage of the integer
	// compare+branch instructions. i.e.:
	// (select (setcc lhs, rhs, cc), truev, falsev)
	// -> (riscvisd::select_cc lhs, rhs, cc, truev, falsev)
	if (Op.getSimpleValueType() == XLenVT && CondV.getOpcode() == ISD::SETCC &&
	CondV.getOperand(0).getSimpleValueType() == XLenVT) {
	SDValue LHS = CondV.getOperand(0);
	SDValue RHS = CondV.getOperand(1);
	auto CC = cast<CondCodeSDNode>(CondV.getOperand(2));
	ISD::CondCode CCVal = CC->get();

	// May swap LHS/RHS and rewrite CCVal to a directly supported comparison.
	normaliseSetCC(LHS, RHS, CCVal);

	// The condition code travels as an XLenVT constant operand.
	SDValue TargetCC = DAG.getConstant(CCVal, DL, XLenVT);
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
	SDValue Ops[] = {LHS, RHS, TargetCC, TrueV, FalseV};
	return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
	}

	// Otherwise:
	// (select condv, truev, falsev)
	// -> (riscvisd::select_cc condv, zero, setne, truev, falsev)
	SDValue Zero = DAG.getConstant(0, DL, XLenVT);
	SDValue SetNE = DAG.getConstant(ISD::SETNE, DL, XLenVT);

	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
	SDValue Ops[] = {CondV, Zero, SetNE, TrueV, FalseV};

	return DAG.getNode(RISCVISD::SELECT_CC, DL, VTs, Ops);
	}

	// Lowers ISD::VASTART: stores the address of the function's VarArgs frame
	// index slot into the memory location given by operand 1, chained on
	// operand 0. Operand 2 supplies the source value for the pointer info.
	SDValue RISCVTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const int VarArgsFI =
	      MF.getInfo<RISCVMachineFunctionInfo>()->getVarArgsFrameIndex();

	  SDLoc DL(Op);
	  SDValue FI = DAG.getFrameIndex(VarArgsFI, getPointerTy(MF.getDataLayout()));

	  SDValue Chain = Op.getOperand(0);
	  SDValue DestPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  return DAG.getStore(Chain, DL, FI, DestPtr, MachinePointerInfo(SV));
	}

	// Lowers ISD::FRAMEADDR. Marks the frame address as taken, reads the frame
	// register, and then, once per level of the constant depth in operand 0,
	// loads a new frame address from (current frame address - 2 * XLEN bytes).
	SDValue RISCVTargetLowering::lowerFRAMEADDR(SDValue Op,
	SelectionDAG &DAG) const {
	const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	MFI.setFrameAddressIsTaken(true);
	unsigned FrameReg = RI.getFrameRegister(MF);
	int XLenInBytes = Subtarget.getXLen() / 8;

	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, VT);
	unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	// Walk up Depth frames; each step reloads the frame address from two
	// XLEN-sized slots below the current one.
	while (Depth--) {
	int Offset = -(XLenInBytes * 2);
	SDValue Ptr = DAG.getNode(ISD::ADD, DL, VT, FrameAddr,
	DAG.getIntPtrConstant(Offset, DL));
	FrameAddr =
	DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
	}
	return FrameAddr;
	}

	// Lowers ISD::RETURNADDR. Marks the return address as taken. For a
	// non-zero depth the value is loaded from (frame address - XLEN bytes),
	// where the frame address comes from lowerFRAMEADDR for the same operand.
	// For depth zero the return-address register is added as a function
	// live-in and copied out. Returns an empty SDValue when
	// verifyReturnAddressArgumentIsConstant returns true.
	SDValue RISCVTargetLowering::lowerRETURNADDR(SDValue Op,
	SelectionDAG &DAG) const {
	const RISCVRegisterInfo &RI = *Subtarget.getRegisterInfo();
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	MFI.setReturnAddressIsTaken(true);
	MVT XLenVT = Subtarget.getXLenVT();
	int XLenInBytes = Subtarget.getXLen() / 8;

	if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	return SDValue();

	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	if (Depth) {
	// The return address is read from one XLEN-sized slot below the frame
	// address computed for the requested depth.
	int Off = -XLenInBytes;
	SDValue FrameAddr = lowerFRAMEADDR(Op, DAG);
	SDValue Offset = DAG.getConstant(Off, DL, VT);
	return DAG.getLoad(VT, DL, DAG.getEntryNode(),
	DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
	MachinePointerInfo());
	}

	// Return the value of the return address register, marking it an implicit
	// live-in.
	unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(XLenVT));
	return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, XLenVT);
	}

	// Lowers ISD::SHL_PARTS: shifts the double-width value held in (Lo, Hi) =
	// (operand 0, operand 1) left by operand 2 and returns the two result
	// halves as merged values {Lo, Hi}. Both candidate results are computed and
	// the final halves are picked with SELECT nodes on (Shamt - XLEN < 0).
	SDValue RISCVTargetLowering::lowerShiftLeftParts(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc DL(Op);
	SDValue Lo = Op.getOperand(0);
	SDValue Hi = Op.getOperand(1);
	SDValue Shamt = Op.getOperand(2);
	EVT VT = Lo.getValueType();

	// if Shamt-XLEN < 0: // Shamt < XLEN
	//   Lo = Lo << Shamt
	//   Hi = (Hi << Shamt) | ((Lo >>u 1) >>u (XLEN-1 - Shamt))
	// else:
	//   Lo = 0
	//   Hi = Lo << (Shamt-XLEN)

	SDValue Zero = DAG.getConstant(0, DL, VT);
	SDValue One = DAG.getConstant(1, DL, VT);
	SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
	SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
	SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
	SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);

	// Candidate results for the Shamt < XLEN case (*True) and the
	// Shamt >= XLEN case (*False).
	SDValue LoTrue = DAG.getNode(ISD::SHL, DL, VT, Lo, Shamt);
	SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, VT, Lo, One);
	SDValue ShiftRightLo =
	DAG.getNode(ISD::SRL, DL, VT, ShiftRight1Lo, XLenMinus1Shamt);
	SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, VT, Hi, Shamt);
	SDValue HiTrue = DAG.getNode(ISD::OR, DL, VT, ShiftLeftHi, ShiftRightLo);
	SDValue HiFalse = DAG.getNode(ISD::SHL, DL, VT, Lo, ShamtMinusXLen);

	// CC is true when Shamt - XLEN is negative, i.e. Shamt < XLEN.
	SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);

	Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, Zero);
	Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);

	SDValue Parts[2] = {Lo, Hi};
	return DAG.getMergeValues(Parts, DL);
	}

	// Lowers ISD::SRA_PARTS (IsSRA == true) or ISD::SRL_PARTS (IsSRA == false):
	// shifts the double-width value held in (Lo, Hi) = (operand 0, operand 1)
	// right by operand 2 and returns the two result halves as merged values
	// {Lo, Hi}. Both candidate results are computed and the final halves are
	// picked with SELECT nodes on (Shamt - XLEN < 0).
	SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
	bool IsSRA) const {
	SDLoc DL(Op);
	SDValue Lo = Op.getOperand(0);
	SDValue Hi = Op.getOperand(1);
	SDValue Shamt = Op.getOperand(2);
	EVT VT = Lo.getValueType();

	// SRA expansion:
	//   if Shamt-XLEN < 0: // Shamt < XLEN
	//     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
	//     Hi = Hi >>s Shamt
	//   else:
	//     Lo = Hi >>s (Shamt-XLEN);
	//     Hi = Hi >>s (XLEN-1)
	//
	// SRL expansion:
	//   if Shamt-XLEN < 0: // Shamt < XLEN
	//     Lo = (Lo >>u Shamt) | ((Hi << 1) << (XLEN-1 - Shamt))
	//     Hi = Hi >>u Shamt
	//   else:
	//     Lo = Hi >>u (Shamt-XLEN);
	//     Hi = 0;

	// Shift opcode applied to the high half: arithmetic for SRA, logical for SRL.
	unsigned ShiftRightOp = IsSRA ? ISD::SRA : ISD::SRL;

	SDValue Zero = DAG.getConstant(0, DL, VT);
	SDValue One = DAG.getConstant(1, DL, VT);
	SDValue MinusXLen = DAG.getConstant(-(int)Subtarget.getXLen(), DL, VT);
	SDValue XLenMinus1 = DAG.getConstant(Subtarget.getXLen() - 1, DL, VT);
	SDValue ShamtMinusXLen = DAG.getNode(ISD::ADD, DL, VT, Shamt, MinusXLen);
	SDValue XLenMinus1Shamt = DAG.getNode(ISD::SUB, DL, VT, XLenMinus1, Shamt);

	// Candidate results for the Shamt < XLEN case (*True) and the
	// Shamt >= XLEN case (*False).
	SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, VT, Lo, Shamt);
	SDValue ShiftLeftHi1 = DAG.getNode(ISD::SHL, DL, VT, Hi, One);
	SDValue ShiftLeftHi =
	DAG.getNode(ISD::SHL, DL, VT, ShiftLeftHi1, XLenMinus1Shamt);
	SDValue LoTrue = DAG.getNode(ISD::OR, DL, VT, ShiftRightLo, ShiftLeftHi);
	SDValue HiTrue = DAG.getNode(ShiftRightOp, DL, VT, Hi, Shamt);
	SDValue LoFalse = DAG.getNode(ShiftRightOp, DL, VT, Hi, ShamtMinusXLen);
	SDValue HiFalse =
	IsSRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, XLenMinus1) : Zero;

	// CC is true when Shamt - XLEN is negative, i.e. Shamt < XLEN.
	SDValue CC = DAG.getSetCC(DL, VT, ShamtMinusXLen, Zero, ISD::SETLT);

	Lo = DAG.getNode(ISD::SELECT, DL, VT, CC, LoTrue, LoFalse);
	Hi = DAG.getNode(ISD::SELECT, DL, VT, CC, HiTrue, HiFalse);

	SDValue Parts[2] = {Lo, Hi};
	return DAG.getMergeValues(Parts, DL);
	}

	// Maps a generic shift/divide/remainder opcode to the RISCVISD node that
	// implements its 32-bit ("W") form. Only SHL, SRA, SRL, SDIV, UDIV and UREM
	// are accepted; any other opcode is unreachable.
	static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
	  switch (Opcode) {
	  case ISD::SHL:
	    return RISCVISD::SLLW;
	  case ISD::SRA:
	    return RISCVISD::SRAW;
	  case ISD::SRL:
	    return RISCVISD::SRLW;
	  case ISD::SDIV:
	    return RISCVISD::DIVW;
	  case ISD::UDIV:
	    return RISCVISD::DIVUW;
	  case ISD::UREM:
	    return RISCVISD::REMUW;
	  default:
	    llvm_unreachable("Unexpected opcode");
	  }
	}

	// Rewrites the two-operand i32 node N as the matching RISC-V *W node on
	// i64. i32 is not a legal type on RV64, so without this the operation would
	// simply be promoted to i64 and the fact that it started out as a 32-bit
	// operation would be lost, making the SLLW/DIVUW/.../*W forms hard to
	// select later on.
	static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  const RISCVISD::NodeType WOpcode = getRISCVWOpcode(N->getOpcode());

	  // Widen both inputs, leaving the upper bits unspecified.
	  SDValue WideLHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
	  SDValue WideRHS = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
	  SDValue WideRes = DAG.getNode(WOpcode, DL, MVT::i64, WideLHS, WideRHS);

	  // ReplaceNodeResults requires the replacement to keep the original result
	  // type, so narrow back to i32.
	  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, WideRes);
	}

	// Custom type legalisation. Replacement values for the results of N are
	// appended to Results; returning with Results left empty means no custom
	// replacement was produced.
	//
	//  - READCYCLECOUNTER (riscv32 only): replaced by READ_CYCLE_WIDE, whose
	//    three results (two i32 values and the chain) are pushed in order.
	//  - SHL/SRA/SRL on i32 (RV64): converted to the *W node unless the shift
	//    amount is a constant.
	//  - SDIV/UDIV/UREM on i32 (RV64 with M): converted to the *W node unless
	//    either operand is a constant.
	//  - BITCAST f32 -> i32 (RV64 with F): FMV_X_ANYEXTW_RV64 followed by a
	//    truncate to i32.
	//
	// Any other opcode is unreachable.
	void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
	                                             SmallVectorImpl<SDValue> &Results,
	                                             SelectionDAG &DAG) const {
	  SDLoc DL(N);
	  switch (N->getOpcode()) {
	  default:
	    llvm_unreachable("Don't know how to custom type legalize this operation!");
	  case ISD::READCYCLECOUNTER: {
	    assert(!Subtarget.is64Bit() &&
	           "READCYCLECOUNTER only has custom type legalization on riscv32");

	    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
	    SDValue RCW =
	        DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));

	    Results.push_back(RCW);
	    Results.push_back(RCW.getValue(1));
	    Results.push_back(RCW.getValue(2));
	    break;
	  }
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL:
	    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
	           "Unexpected custom legalisation");
	    // Shifts by a constant amount are left alone.
	    if (N->getOperand(1).getOpcode() == ISD::Constant)
	      return;
	    Results.push_back(customLegalizeToWOp(N, DAG));
	    break;
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::UREM:
	    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
	           Subtarget.hasStdExtM() && "Unexpected custom legalisation");
	    // Operations with a constant operand are left alone.
	    if (N->getOperand(0).getOpcode() == ISD::Constant ||
	        N->getOperand(1).getOpcode() == ISD::Constant)
	      return;
	    Results.push_back(customLegalizeToWOp(N, DAG));
	    break;
	  case ISD::BITCAST: {
	    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
	           Subtarget.hasStdExtF() && "Unexpected custom legalisation");
	    SDValue Op0 = N->getOperand(0);
	    // Only the f32 -> i32 direction is handled here.
	    if (Op0.getValueType() != MVT::f32)
	      return;
	    SDValue FPConv =
	        DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
	    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
	    break;
	  }
	  }
	}

	// Target-specific DAG combines. Returns the combined value (or an empty
	// SDValue when nothing further needs to be done for N).
	//
	//  - SplitF64: forwards BuildPairF64 operands, splits FP constants into two
	//    i32 constants, and folds single-use FNEG/FABS inputs into integer
	//    XOR/AND on the high word.
	//  - SLLW/SRAW/SRLW: simplifies the operands knowing that only the low 32
	//    bits of the LHS and the low 5 bits of the RHS are read.
	//  - FMV_X_ANYEXTW_RV64: cancels against FMV_W_X_RV64 and folds single-use
	//    FNEG/FABS inputs into integer XOR/AND.
	SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
	                                               DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;

	  switch (N->getOpcode()) {
	  default:
	    break;
	  case RISCVISD::SplitF64: {
	    SDValue Op0 = N->getOperand(0);
	    // If the input to SplitF64 is just BuildPairF64 then the operation is
	    // redundant. Instead, use BuildPairF64's operands directly.
	    if (Op0->getOpcode() == RISCVISD::BuildPairF64)
	      return DCI.CombineTo(N, Op0.getOperand(0), Op0.getOperand(1));

	    SDLoc DL(N);

	    // It's cheaper to materialise two 32-bit integers than to load a double
	    // from the constant pool and transfer it to integer registers through the
	    // stack.
	    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op0)) {
	      APInt V = C->getValueAPF().bitcastToAPInt();
	      SDValue Lo = DAG.getConstant(V.trunc(32), DL, MVT::i32);
	      SDValue Hi = DAG.getConstant(V.lshr(32).trunc(32), DL, MVT::i32);
	      return DCI.CombineTo(N, Lo, Hi);
	    }

	    // This is a target-specific version of a DAGCombine performed in
	    // DAGCombiner::visitBITCAST. It performs the equivalent of:
	    // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
	    // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
	    if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
	        !Op0.getNode()->hasOneUse())
	      break;
	    SDValue NewSplitF64 =
	        DAG.getNode(RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32),
	                    Op0.getOperand(0));
	    SDValue Lo = NewSplitF64.getValue(0);
	    SDValue Hi = NewSplitF64.getValue(1);
	    // The sign bit of the double lives in the top bit of the high word.
	    APInt SignBit = APInt::getSignMask(32);
	    if (Op0.getOpcode() == ISD::FNEG) {
	      SDValue NewHi = DAG.getNode(ISD::XOR, DL, MVT::i32, Hi,
	                                  DAG.getConstant(SignBit, DL, MVT::i32));
	      return DCI.CombineTo(N, Lo, NewHi);
	    }
	    assert(Op0.getOpcode() == ISD::FABS);
	    SDValue NewHi = DAG.getNode(ISD::AND, DL, MVT::i32, Hi,
	                                DAG.getConstant(~SignBit, DL, MVT::i32));
	    return DCI.CombineTo(N, Lo, NewHi);
	  }
	  case RISCVISD::SLLW:
	  case RISCVISD::SRAW:
	  case RISCVISD::SRLW: {
	    // Only the lower 32 bits of LHS and lower 5 bits of RHS are read.
	    SDValue LHS = N->getOperand(0);
	    SDValue RHS = N->getOperand(1);
	    APInt LHSMask = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 32);
	    APInt RHSMask = APInt::getLowBitsSet(RHS.getValueSizeInBits(), 5);
	    if ((SimplifyDemandedBits(N->getOperand(0), LHSMask, DCI)) ||
	        (SimplifyDemandedBits(N->getOperand(1), RHSMask, DCI)))
	      return SDValue();
	    break;
	  }
	  case RISCVISD::FMV_X_ANYEXTW_RV64: {
	    SDLoc DL(N);
	    SDValue Op0 = N->getOperand(0);
	    // If the input to FMV_X_ANYEXTW_RV64 is just FMV_W_X_RV64 then the
	    // conversion is unnecessary and can be replaced with an ANY_EXTEND
	    // of the FMV_W_X_RV64 operand.
	    if (Op0->getOpcode() == RISCVISD::FMV_W_X_RV64) {
	      SDValue AExtOp =
	          DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0.getOperand(0));
	      return DCI.CombineTo(N, AExtOp);
	    }

	    // This is a target-specific version of a DAGCombine performed in
	    // DAGCombiner::visitBITCAST. It performs the equivalent of:
	    // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
	    // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
	    if (!(Op0.getOpcode() == ISD::FNEG || Op0.getOpcode() == ISD::FABS) ||
	        !Op0.getNode()->hasOneUse())
	      break;
	    SDValue NewFMV = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64,
	                                 Op0.getOperand(0));
	    // 32-bit sign mask, sign-extended to the 64-bit result width.
	    APInt SignBit = APInt::getSignMask(32).sext(64);
	    if (Op0.getOpcode() == ISD::FNEG) {
	      return DCI.CombineTo(N,
	                           DAG.getNode(ISD::XOR, DL, MVT::i64, NewFMV,
	                                       DAG.getConstant(SignBit, DL, MVT::i64)));
	    }
	    assert(Op0.getOpcode() == ISD::FABS);
	    return DCI.CombineTo(N,
	                         DAG.getNode(ISD::AND, DL, MVT::i64, NewFMV,
	                                     DAG.getConstant(~SignBit, DL, MVT::i64)));
	  }
	  }

	  return SDValue();
	}

	// Decides whether a shift N of an ADD/OR with a constant should be commuted
	// so that the shift is applied first. Returns true (commute) by default.
	// When both the inner constant c1 and the shift amount c2 are constants of
	// a scalar integer type, the decision is:
	//  - true  if c1 << c2 is a legal add immediate;
	//  - false if c1 is a legal add immediate;
	//  - false if materialising c1 costs less than materialising c1 << c2;
	//  - true  otherwise.
	bool RISCVTargetLowering::isDesirableToCommuteWithShift(
	const SDNode *N, CombineLevel Level) const {
	// The following folds are only desirable if `(OP _, c1 << c2)` can be
	// materialised in fewer instructions than `(OP _, c1)`:
	//
	// (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
	// (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
	SDValue N0 = N->getOperand(0);
	EVT Ty = N0.getValueType();
	if (Ty.isScalarInteger() &&
	(N0.getOpcode() == ISD::ADD \|\| N0.getOpcode() == ISD::OR)) {
	auto *C1 = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (C1 && C2) {
	APInt C1Int = C1->getAPIntValue();
	APInt ShiftedC1Int = C1Int << C2->getAPIntValue();

	// We can materialise `c1 << c2` into an add immediate, so it's "free",
	// and the combine should happen, to potentially allow further combines
	// later.
	// The revision shown here ('-' old, '+' new) only calls getSExtValue()
	// once getMinSignedBits() reports that the value fits in 64 bits.
	- if (isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
	+ if (ShiftedC1Int.getMinSignedBits() <= 64 &&
	+ isLegalAddImmediate(ShiftedC1Int.getSExtValue()))
	return true;

	// We can materialise `c1` in an add immediate, so it's "free", and the
	// combine should be prevented.
	// Same 64-bit guard as above, applied to the unshifted constant.
	- if (isLegalAddImmediate(C1Int.getSExtValue()))
	+ if (C1Int.getMinSignedBits() <= 64 &&
	+ isLegalAddImmediate(C1Int.getSExtValue()))
	return false;

	// Neither constant will fit into an immediate, so find materialisation
	// costs.
	int C1Cost = RISCVMatInt::getIntMatCost(C1Int, Ty.getSizeInBits(),
	Subtarget.is64Bit());
	int ShiftedC1Cost = RISCVMatInt::getIntMatCost(
	ShiftedC1Int, Ty.getSizeInBits(), Subtarget.is64Bit());

	// Materialising `c1` is cheaper than materialising `c1 << c2`, so the
	// combine should be prevented.
	if (C1Cost < ShiftedC1Cost)
	return false;
	}
	}
	return true;
	}

	// Number of known sign bits for target-specific nodes: 33 for the *W
	// shift/divide/remainder nodes, 1 (nothing known) for everything else.
	unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  switch (Op.getOpcode()) {
	  case RISCVISD::SLLW:
	  case RISCVISD::SRAW:
	  case RISCVISD::SRLW:
	  case RISCVISD::DIVW:
	  case RISCVISD::DIVUW:
	  case RISCVISD::REMUW:
	    // TODO: As the result is sign-extended, this is conservatively correct. A
	    // more precise answer could be calculated for SRAW depending on known
	    // bits in the shift amount.
	    return 33;
	  default:
	    return 1;
	  }
	}

	// Expands the ReadCycleWide pseudo MI into a retry loop that reads the
	// 64-bit cycle counter as two halves. The code following MI is moved into a
	// new DoneMBB, a new LoopMBB performing the reads is placed between BB and
	// DoneMBB, MI is erased, and DoneMBB is returned.
	// NOTE(review): unlike the neighbouring emit*Pseudo helpers this function
	// is not declared static - confirm whether external linkage is intended.
	MachineBasicBlock *emitReadCycleWidePseudo(MachineInstr &MI,
	MachineBasicBlock *BB) {
	assert(MI.getOpcode() == RISCV::ReadCycleWide && "Unexpected instruction");

	// To read the 64-bit cycle CSR on a 32-bit target, we read the two halves.
	// Should the count have wrapped while it was being read, we need to try
	// again.
	// ...
	// read:
	// rdcycleh x3 # load high word of cycle
	// rdcycle x2 # load low word of cycle
	// rdcycleh x4 # load high word of cycle
	// bne x3, x4, read # check if high word reads match, otherwise try again
	// ...

	MachineFunction &MF = *BB->getParent();
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	MachineFunction::iterator It = ++BB->getIterator();

	// Both new blocks are inserted right after BB, LoopMBB first.
	MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB);
	MF.insert(It, LoopMBB);

	MachineBasicBlock *DoneMBB = MF.CreateMachineBasicBlock(LLVM_BB);
	MF.insert(It, DoneMBB);

	// Transfer the remainder of BB and its successor edges to DoneMBB.
	DoneMBB->splice(DoneMBB->begin(), BB,
	std::next(MachineBasicBlock::iterator(MI)), BB->end());
	DoneMBB->transferSuccessorsAndUpdatePHIs(BB);

	BB->addSuccessor(LoopMBB);

	// LoReg/HiReg are the pseudo's two result operands; ReadAgainReg holds the
	// second read of the high half.
	MachineRegisterInfo &RegInfo = MF.getRegInfo();
	unsigned ReadAgainReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
	unsigned LoReg = MI.getOperand(0).getReg();
	unsigned HiReg = MI.getOperand(1).getReg();
	DebugLoc DL = MI.getDebugLoc();

	// Three CSRRS reads with X0 as the source register: CYCLEH, CYCLE, CYCLEH.
	const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
	BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), HiReg)
	.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
	.addReg(RISCV::X0);
	BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), LoReg)
	.addImm(RISCVSysReg::lookupSysRegByName("CYCLE")->Encoding)
	.addReg(RISCV::X0);
	BuildMI(LoopMBB, DL, TII->get(RISCV::CSRRS), ReadAgainReg)
	.addImm(RISCVSysReg::lookupSysRegByName("CYCLEH")->Encoding)
	.addReg(RISCV::X0);

	// Retry when the two reads of the high half differ.
	BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
	.addReg(HiReg)
	.addReg(ReadAgainReg)
	.addMBB(LoopMBB);

	LoopMBB->addSuccessor(LoopMBB);
	LoopMBB->addSuccessor(DoneMBB);

	MI.eraseFromParent();

	return DoneMBB;
	}

	// Expands SplitF64Pseudo: spills the FPR64 source (operand 2) to the
	// function's MoveF64 frame index and reloads it as two 32-bit words into the
	// low (operand 0) and high (operand 1) destination registers. Erases MI and
	// returns BB.
	static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
	                                             MachineBasicBlock *BB) {
	  assert(MI.getOpcode() == RISCV::SplitF64Pseudo && "Unexpected instruction");

	  MachineFunction &MF = *BB->getParent();
	  DebugLoc DL = MI.getDebugLoc();
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
	  unsigned LoReg = MI.getOperand(0).getReg();
	  unsigned HiReg = MI.getOperand(1).getReg();
	  unsigned SrcReg = MI.getOperand(2).getReg();
	  const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
	  int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();

	  TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
	                          RI);

	  // Each LW touches only four bytes of the slot, so give every load its own
	  // memory operand with the matching size and offset rather than sharing one
	  // 8-byte operand at offset 0. The base alignment of the slot stays 8.
	  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
	  MachineMemOperand *MMOLo =
	      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, 8);
	  MachineMemOperand *MMOHi = MF.getMachineMemOperand(
	      MPI.getWithOffset(4), MachineMemOperand::MOLoad, 4, 8);
	  BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
	      .addFrameIndex(FI)
	      .addImm(0)
	      .addMemOperand(MMOLo);
	  BuildMI(*BB, MI, DL, TII.get(RISCV::LW), HiReg)
	      .addFrameIndex(FI)
	      .addImm(4)
	      .addMemOperand(MMOHi);
	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	// Expands BuildPairF64Pseudo: stores the low (operand 1) and high (operand 2)
	// 32-bit registers to the function's MoveF64 frame index and reloads the slot
	// as an FPR64 into the destination (operand 0). Erases MI and returns BB.
	static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
	                                                 MachineBasicBlock *BB) {
	  assert(MI.getOpcode() == RISCV::BuildPairF64Pseudo &&
	         "Unexpected instruction");

	  MachineFunction &MF = *BB->getParent();
	  DebugLoc DL = MI.getDebugLoc();
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  unsigned LoReg = MI.getOperand(1).getReg();
	  unsigned HiReg = MI.getOperand(2).getReg();
	  const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
	  int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();

	  // Each SW writes only four bytes of the slot, so give every store its own
	  // memory operand with the matching size and offset rather than sharing one
	  // 8-byte operand at offset 0. The base alignment of the slot stays 8.
	  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
	  MachineMemOperand *MMOLo =
	      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 8);
	  MachineMemOperand *MMOHi = MF.getMachineMemOperand(
	      MPI.getWithOffset(4), MachineMemOperand::MOStore, 4, 8);
	  BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
	      .addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
	      .addFrameIndex(FI)
	      .addImm(0)
	      .addMemOperand(MMOLo);
	  BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
	      .addReg(HiReg, getKillRegState(MI.getOperand(2).isKill()))
	      .addFrameIndex(FI)
	      .addImm(4)
	      .addMemOperand(MMOHi);
	  TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI);
	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	// Returns true when MI is one of the Select_*_Using_CC_GPR pseudo
	// instructions.
	static bool isSelectPseudo(MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();
	  return Opc == RISCV::Select_GPR_Using_CC_GPR ||
	         Opc == RISCV::Select_FPR32_Using_CC_GPR ||
	         Opc == RISCV::Select_FPR64_Using_CC_GPR;
	}

	// Expands a run of Select_* pseudos that share one condition into a single
	// branch plus PHI nodes. Splits BB into HeadMBB (BB itself), IfFalseMBB and
	// TailMBB, erases the select pseudos and returns TailMBB.
	static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
	                                           MachineBasicBlock *BB) {
	  // To "insert" Select_* instructions, we actually have to insert the triangle
	  // control-flow pattern. The incoming instructions know the destination vreg
	  // to set, the condition code register to branch on, the true/false values to
	  // select between, and the condcode to use to select the appropriate branch.
	  //
	  // We produce the following control flow (HeadMBB branches straight to
	  // TailMBB when the condition holds and falls through to IfFalseMBB
	  // otherwise):
	  //     HeadMBB
	  //     |  \.
	  //     |  IfFalseMBB
	  //     | /
	  //    TailMBB
	  //
	  // When we find a sequence of selects we attempt to optimize their emission
	  // by sharing the control flow. Currently we only handle cases where we have
	  // multiple selects with the exact same condition (same LHS, RHS and CC).
	  // The selects may be interleaved with other instructions if the other
	  // instructions meet some requirements we deem safe:
	  // - They are debug instructions. Otherwise,
	  // - They do not have side-effects, do not access memory and their inputs do
	  //   not depend on the results of the select pseudo-instructions.
	  // The TrueV/FalseV operands of the selects cannot depend on the result of
	  // previous selects in the sequence.
	  // These conditions could be further relaxed. See the X86 target for a
	  // related approach and more information.
	  unsigned LHS = MI.getOperand(1).getReg();
	  unsigned RHS = MI.getOperand(2).getReg();
	  auto CC = static_cast<ISD::CondCode>(MI.getOperand(3).getImm());

	  SmallVector<MachineInstr *, 4> SelectDebugValues;
	  SmallSet<unsigned, 4> SelectDests;
	  SelectDests.insert(MI.getOperand(0).getReg());

	  MachineInstr *LastSelectPseudo = &MI;

	  // Scan forward from MI (inclusive) to find the last select that can share
	  // the control flow.
	  for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI);
	       SequenceMBBI != E; ++SequenceMBBI) {
	    if (SequenceMBBI->isDebugInstr())
	      continue;
	    else if (isSelectPseudo(*SequenceMBBI)) {
	      // Stop at a select with a different condition, or whose inputs are
	      // produced by an earlier select of this sequence.
	      if (SequenceMBBI->getOperand(1).getReg() != LHS ||
	          SequenceMBBI->getOperand(2).getReg() != RHS ||
	          SequenceMBBI->getOperand(3).getImm() != CC ||
	          SelectDests.count(SequenceMBBI->getOperand(4).getReg()) ||
	          SelectDests.count(SequenceMBBI->getOperand(5).getReg()))
	        break;
	      LastSelectPseudo = &*SequenceMBBI;
	      SequenceMBBI->collectDebugValues(SelectDebugValues);
	      SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
	    } else {
	      if (SequenceMBBI->hasUnmodeledSideEffects() ||
	          SequenceMBBI->mayLoadOrStore())
	        break;
	      if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
	            return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
	          }))
	        break;
	    }
	  }

	  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction::iterator I = ++BB->getIterator();

	  MachineBasicBlock *HeadMBB = BB;
	  MachineFunction *F = BB->getParent();
	  MachineBasicBlock *TailMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  F->insert(I, IfFalseMBB);
	  F->insert(I, TailMBB);

	  // Transfer debug instructions associated with the selects to TailMBB.
	  for (MachineInstr *DebugInstr : SelectDebugValues) {
	    TailMBB->push_back(DebugInstr->removeFromParent());
	  }

	  // Move all instructions after the sequence to TailMBB.
	  TailMBB->splice(TailMBB->end(), HeadMBB,
	                  std::next(LastSelectPseudo->getIterator()), HeadMBB->end());
	  // Update machine-CFG edges by transferring all successors of the current
	  // block to the new block which will contain the Phi nodes for the selects.
	  TailMBB->transferSuccessorsAndUpdatePHIs(HeadMBB);
	  // Set the successors for HeadMBB.
	  HeadMBB->addSuccessor(IfFalseMBB);
	  HeadMBB->addSuccessor(TailMBB);

	  // Insert appropriate branch.
	  unsigned Opcode = getBranchOpcodeForIntCondCode(CC);

	  BuildMI(HeadMBB, DL, TII.get(Opcode))
	      .addReg(LHS)
	      .addReg(RHS)
	      .addMBB(TailMBB);

	  // IfFalseMBB just falls through to TailMBB.
	  IfFalseMBB->addSuccessor(TailMBB);

	  // Create PHIs for all of the select pseudo-instructions.
	  auto SelectMBBI = MI.getIterator();
	  auto SelectEnd = std::next(LastSelectPseudo->getIterator());
	  auto InsertionPoint = TailMBB->begin();
	  while (SelectMBBI != SelectEnd) {
	    // Grab the successor first: the current instruction may be erased below.
	    auto Next = std::next(SelectMBBI);
	    if (isSelectPseudo(*SelectMBBI)) {
	      // %Result = phi [ %TrueValue, HeadMBB ], [ %FalseValue, IfFalseMBB ]
	      BuildMI(*TailMBB, InsertionPoint, SelectMBBI->getDebugLoc(),
	              TII.get(RISCV::PHI), SelectMBBI->getOperand(0).getReg())
	          .addReg(SelectMBBI->getOperand(4).getReg())
	          .addMBB(HeadMBB)
	          .addReg(SelectMBBI->getOperand(5).getReg())
	          .addMBB(IfFalseMBB);
	      SelectMBBI->eraseFromParent();
	    }
	    SelectMBBI = Next;
	  }

	  F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
	  return TailMBB;
	}

	// Dispatches each pseudo instruction that needs custom insertion to its
	// expansion helper and returns the block in which insertion should continue.
	// Any other opcode is treated as unreachable.
	MachineBasicBlock *
	RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                                 MachineBasicBlock *BB) const {
	  switch (MI.getOpcode()) {
	  default:
	    llvm_unreachable("Unexpected instr type to insert");
	  case RISCV::ReadCycleWide:
	    assert(!Subtarget.is64Bit() &&
	           "ReadCycleWide is only to be used on riscv32");
	    return emitReadCycleWidePseudo(MI, BB);
	  case RISCV::Select_GPR_Using_CC_GPR:
	  case RISCV::Select_FPR32_Using_CC_GPR:
	  case RISCV::Select_FPR64_Using_CC_GPR:
	    return emitSelectPseudo(MI, BB);
	  case RISCV::BuildPairF64Pseudo:
	    return emitBuildPairF64Pseudo(MI, BB);
	  case RISCV::SplitF64Pseudo:
	    return emitSplitF64Pseudo(MI, BB);
	  }
	}

	// Calling Convention Implementation.
	// The expectations for frontend ABI lowering vary from target to target.
	// Ideally, an LLVM frontend would be able to avoid worrying about many ABI
	// details, but this is a longer term goal. For now, we simply try to keep the
	// role of the frontend as simple and well-defined as possible. The rules can
	// be summarised as:
	// * Never split up large scalar arguments. We handle them here.
	// * If a hardfloat calling convention is being used, and the struct may be
	// passed in a pair of registers (fp+fp, int+fp), and both registers are
	// available, then pass as two separate arguments. If either the GPRs or FPRs
	// are exhausted, then pass according to the rule below.
	// * If a struct could never be passed in registers or directly in a stack
	// slot (as it is larger than 2*XLEN and the floating point rules don't
	// apply), then pass it using a pointer with the byval attribute.
	// * If a struct is less than 2*XLEN, then coerce to either a two-element
	// word-sized array or a 2*XLEN scalar (depending on alignment).
	// * The frontend can determine whether a struct is returned by reference or
	// not based on its size and fields. If it will be returned by reference, the
	// frontend must modify the prototype so a pointer with the sret annotation is
	// passed as the first argument. This is not necessary for large scalar
	// returns.
	// * Struct return values and varargs should be coerced to structs containing
	// register-size fields in the same situations they would be for fixed
	// arguments.

	// Registers handed out to arguments by the calling-convention code below,
	// listed in allocation order.

	// Integer argument registers.
	static const MCPhysReg ArgGPRs[] = {
	    RISCV::X10, RISCV::X11,
	    RISCV::X12, RISCV::X13,
	    RISCV::X14, RISCV::X15,
	    RISCV::X16, RISCV::X17};

	// Single-precision views of the floating-point argument registers.
	static const MCPhysReg ArgFPR32s[] = {
	    RISCV::F10_32, RISCV::F11_32,
	    RISCV::F12_32, RISCV::F13_32,
	    RISCV::F14_32, RISCV::F15_32,
	    RISCV::F16_32, RISCV::F17_32};

	// Double-precision views of the floating-point argument registers.
	static const MCPhysReg ArgFPR64s[] = {
	    RISCV::F10_64, RISCV::F11_64,
	    RISCV::F12_64, RISCV::F13_64,
	    RISCV::F14_64, RISCV::F15_64,
	    RISCV::F16_64, RISCV::F17_64};

	// Assigns locations to the two XLEN-sized halves of a split 2*XLEN argument.
	// VA1/ArgFlags1 describe the first half; ValNo2/ValVT2/LocVT2 describe the
	// second. Each half goes to the next free argument GPR when there is one and
	// to the stack otherwise. Always returns false.
	static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
	                                ISD::ArgFlagsTy ArgFlags1, unsigned ValNo2,
	                                MVT ValVT2, MVT LocVT2,
	                                ISD::ArgFlagsTy ArgFlags2) {
	  const unsigned XLenInBytes = XLen / 8;

	  const unsigned FirstReg = State.AllocateReg(ArgGPRs);
	  if (!FirstReg) {
	    // No GPR left: both halves must be passed on the stack, the first one
	    // with proper alignment.
	    const unsigned StackAlign =
	        std::max(XLenInBytes, ArgFlags1.getOrigAlign());
	    const unsigned FirstOffset = State.AllocateStack(XLenInBytes, StackAlign);
	    State.addLoc(CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
	                                     FirstOffset, VA1.getLocVT(),
	                                     CCValAssign::Full));
	    const unsigned SecondOffset =
	        State.AllocateStack(XLenInBytes, XLenInBytes);
	    State.addLoc(CCValAssign::getMem(ValNo2, ValVT2, SecondOffset, LocVT2,
	                                     CCValAssign::Full));
	    return false;
	  }

	  // At least the first half can be passed via register.
	  State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), FirstReg,
	                                   VA1.getLocVT(), CCValAssign::Full));

	  const unsigned SecondReg = State.AllocateReg(ArgGPRs);
	  if (SecondReg) {
	    // The second half can also be passed via register.
	    State.addLoc(CCValAssign::getReg(ValNo2, ValVT2, SecondReg, LocVT2,
	                                     CCValAssign::Full));
	    return false;
	  }

	  // The second half is passed via the stack, without additional alignment.
	  const unsigned SecondOffset = State.AllocateStack(XLenInBytes, XLenInBytes);
	  State.addLoc(CCValAssign::getMem(ValNo2, ValVT2, SecondOffset, LocVT2,
	                                   CCValAssign::Full));
	  return false;
	}

	// Implements the RISC-V calling convention. Returns true upon failure.
	static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
	MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed,
	bool IsRet, Type *OrigTy) {
	unsigned XLen = DL.getLargestLegalIntTypeSizeInBits();
	assert(XLen == 32 \|\| XLen == 64);
	MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;

	// Any return value split in to more than two values can't be returned
	// directly.
	if (IsRet && ValNo > 1)
	return true;

	// UseGPRForF32 if targeting one of the soft-float ABIs, if passing a
	// variadic argument, or if no F32 argument registers are available.
	bool UseGPRForF32 = true;
	// UseGPRForF64 if targeting soft-float ABIs or an FLEN=32 ABI, if passing a
	// variadic argument, or if no F64 argument registers are available.
	bool UseGPRForF64 = true;

	switch (ABI) {
	default:
	llvm_unreachable("Unexpected ABI");
	case RISCVABI::ABI_ILP32:
	case RISCVABI::ABI_LP64:
	break;
	case RISCVABI::ABI_ILP32F:
	case RISCVABI::ABI_LP64F:
	UseGPRForF32 = !IsFixed;
	break;
	case RISCVABI::ABI_ILP32D:
	case RISCVABI::ABI_LP64D:
	UseGPRForF32 = !IsFixed;
	UseGPRForF64 = !IsFixed;
	break;
	}

	if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s))
	UseGPRForF32 = true;
	if (State.getFirstUnallocated(ArgFPR64s) == array_lengthof(ArgFPR64s))
	UseGPRForF64 = true;

	// From this point on, rely on UseGPRForF32, UseGPRForF64 and similar local
	// variables rather than directly checking against the target ABI.

	if (UseGPRForF32 && ValVT == MVT::f32) {
	LocVT = XLenVT;
	LocInfo = CCValAssign::BCvt;
	} else if (UseGPRForF64 && XLen == 64 && ValVT == MVT::f64) {
	LocVT = MVT::i64;
	LocInfo = CCValAssign::BCvt;
	}

	// If this is a variadic argument, the RISC-V calling convention requires
	// that it is assigned an 'even' or 'aligned' register if it has 8-byte
	// alignment (RV32) or 16-byte alignment (RV64). An aligned register should
	// be used regardless of whether the original argument was split during
	// legalisation or not. The argument will not be passed by registers if the
	// original type is larger than 2*XLEN, so the register alignment rule does
	// not apply.
	unsigned TwoXLenInBytes = (2 * XLen) / 8;
	if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
	DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
	unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
	// Skip 'odd' register if necessary.
	if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1)
	State.AllocateReg(ArgGPRs);
	}

	SmallVectorImpl<CCValAssign> &PendingLocs = State.getPendingLocs();
	SmallVectorImpl<ISD::ArgFlagsTy> &PendingArgFlags =
	State.getPendingArgFlags();

	assert(PendingLocs.size() == PendingArgFlags.size() &&
	"PendingLocs and PendingArgFlags out of sync");

	// Handle passing f64 on RV32D with a soft float ABI or when floating point
	// registers are exhausted.
	if (UseGPRForF64 && XLen == 32 && ValVT == MVT::f64) {
	assert(!ArgFlags.isSplit() && PendingLocs.empty() &&
	"Can't lower f64 if it is split");
	// Depending on available argument GPRS, f64 may be passed in a pair of
	// GPRs, split between a GPR and the stack, or passed completely on the
	// stack. LowerCall/LowerFormalArguments/LowerReturn must recognise these
	// cases.
	unsigned Reg = State.AllocateReg(ArgGPRs);
	LocVT = MVT::i32;
	if (!Reg) {
	unsigned StackOffset = State.AllocateStack(8, 8);
	State.addLoc(
	CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
	return false;
	}
	if (!State.AllocateReg(ArgGPRs))
	State.AllocateStack(4, 4);
	State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
	return false;
	}

	// Split arguments might be passed indirectly, so keep track of the pending
	// values.
	if (ArgFlags.isSplit() \|\| !PendingLocs.empty()) {
	LocVT = XLenVT;
	LocInfo = CCValAssign::Indirect;
	PendingLocs.push_back(
	CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
	PendingArgFlags.push_back(ArgFlags);
	if (!ArgFlags.isSplitEnd()) {
	return false;
	}
	}

	// If the split argument only had two elements, it should be passed directly
	// in registers or on the stack.
	if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
	assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
	// Apply the normal calling convention rules to the first half of the
	// split argument.
	CCValAssign VA = PendingLocs[0];
	ISD::ArgFlagsTy AF = PendingArgFlags[0];
	PendingLocs.clear();
	PendingArgFlags.clear();
	return CC_RISCVAssign2XLen(XLen, State, VA, AF, ValNo, ValVT, LocVT,
	ArgFlags);
	}

	// Allocate to a register if possible, or else a stack slot.
	unsigned Reg;
	if (ValVT == MVT::f32 && !UseGPRForF32)
	Reg = State.AllocateReg(ArgFPR32s, ArgFPR64s);
	else if (ValVT == MVT::f64 && !UseGPRForF64)
	Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
	else
	Reg = State.AllocateReg(ArgGPRs);
	unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8);

	// If we reach this point and PendingLocs is non-empty, we must be at the
	// end of a split argument that must be passed indirectly.
	if (!PendingLocs.empty()) {
	assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()");
	assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()");

	for (auto &It : PendingLocs) {
	if (Reg)
	It.convertToReg(Reg);
	else
	It.convertToMem(StackOffset);
	State.addLoc(It);
	}
	PendingLocs.clear();
	PendingArgFlags.clear();
	return false;
	}

	assert((!UseGPRForF32 \|\| !UseGPRForF64 \|\| LocVT == XLenVT) &&
	"Expected an XLenVT at this stage");

	if (Reg) {
	State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
	return false;
	}

	// When an f32 or f64 is passed on the stack, no bit-conversion is needed.
	if (ValVT == MVT::f32 \|\| ValVT == MVT::f64) {
	LocVT = ValVT;
	LocInfo = CCValAssign::Full;
	}
	State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
	return false;
	}

	// Runs CC_RISCV over every entry of Ins, recording the assigned locations in
	// CCInfo. IsRet selects between analysing incoming arguments (false) and
	// values returned by a call (true). A value that CC_RISCV cannot handle is
	// treated as unreachable.
	void RISCVTargetLowering::analyzeInputArgs(
	    MachineFunction &MF, CCState &CCInfo,
	    const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
	  unsigned NumArgs = Ins.size();
	  FunctionType *FType = MF.getFunction().getFunctionType();
	  // The target ABI does not change between arguments; query it once.
	  RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();

	  for (unsigned i = 0; i != NumArgs; ++i) {
	    MVT ArgVT = Ins[i].VT;
	    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;

	    // ArgTy stays null for inputs that do not map to an original argument.
	    Type *ArgTy = nullptr;
	    if (IsRet)
	      ArgTy = FType->getReturnType();
	    else if (Ins[i].isOrigArg())
	      ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());

	    // Inputs are always analysed as fixed (non-variadic) values.
	    if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
	                 ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
	      LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
	                        << EVT(ArgVT).getEVTString() << '\n');
	      llvm_unreachable(nullptr);
	    }
	  }
	}

	// Runs CC_RISCV over every entry of Outs, recording the assigned locations
	// in CCInfo. When CLI is non-null the original IR type of each argument is
	// taken from its argument list; otherwise a null type is passed. A value that
	// CC_RISCV cannot handle is treated as unreachable.
	void RISCVTargetLowering::analyzeOutputArgs(
	    MachineFunction &MF, CCState &CCInfo,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
	    CallLoweringInfo *CLI) const {
	  for (unsigned Idx = 0, NumOuts = Outs.size(); Idx != NumOuts; ++Idx) {
	    const ISD::OutputArg &Out = Outs[Idx];
	    const MVT OutVT = Out.VT;

	    Type *OrigTy = nullptr;
	    if (CLI)
	      OrigTy = CLI->getArgs()[Out.OrigArgIndex].Ty;

	    const RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
	    const bool Failed =
	        CC_RISCV(MF.getDataLayout(), ABI, Idx, OutVT, OutVT, CCValAssign::Full,
	                 Out.Flags, CCInfo, Out.IsFixed, IsRet, OrigTy);
	    if (!Failed)
	      continue;

	    LLVM_DEBUG(dbgs() << "OutputArg #" << Idx << " has unhandled type "
	                      << EVT(OutVT).getEVTString() << "\n");
	    llvm_unreachable(nullptr);
	  }
	}

	// Converts Val from the location type of VA back to its value type. Must not
	// be called for CCValAssign::Indirect values.
	static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
	                                   const CCValAssign &VA, const SDLoc &DL) {
	  const CCValAssign::LocInfo Info = VA.getLocInfo();

	  // A Full location already carries the value type.
	  if (Info == CCValAssign::Full)
	    return Val;

	  if (Info != CCValAssign::BCvt)
	    llvm_unreachable("Unexpected CCValAssign::LocInfo");

	  // An f32 carried in an i64 location needs the dedicated RV64 move node; any
	  // other bit-converted location is a plain bitcast.
	  const bool IsF32InI64 =
	      VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32;
	  if (IsF32InI64)
	    return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val);
	  return DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
	}

	// Copies an incoming argument out of the physical register assigned by VA
	// into a fresh virtual register and converts it to its value type.
	// For CCValAssign::Indirect the raw copy is returned; the caller is
	// responsible for loading the full value.
	static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain,
	                                const CCValAssign &VA, const SDLoc &DL) {
	  EVT LocVT = VA.getLocVT();

	  // Pick the register class that matches the location type.
	  const TargetRegisterClass *RC;
	  switch (LocVT.getSimpleVT().SimpleTy) {
	  case MVT::f64:
	    RC = &RISCV::FPR64RegClass;
	    break;
	  case MVT::f32:
	    RC = &RISCV::FPR32RegClass;
	    break;
	  case MVT::i32:
	  case MVT::i64:
	    RC = &RISCV::GPRRegClass;
	    break;
	  default:
	    llvm_unreachable("Unexpected register type");
	  }

	  // Mark the physical register live-in and read it through a virtual one.
	  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	  unsigned VReg = MRI.createVirtualRegister(RC);
	  MRI.addLiveIn(VA.getLocReg(), VReg);
	  SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);

	  if (VA.getLocInfo() != CCValAssign::Indirect)
	    return convertLocVTToValVT(DAG, Copy, VA, DL);
	  return Copy;
	}

	// Converts Val from its value type to the location type of VA.
	static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
	                                   const CCValAssign &VA, const SDLoc &DL) {
	  const CCValAssign::LocInfo Info = VA.getLocInfo();

	  // A Full location takes the value unchanged.
	  if (Info == CCValAssign::Full)
	    return Val;

	  if (Info != CCValAssign::BCvt)
	    llvm_unreachable("Unexpected CCValAssign::LocInfo");

	  // An f32 placed in an i64 location needs the dedicated RV64 move node; any
	  // other bit-converted location is a plain bitcast.
	  const bool IsF32InI64 =
	      VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32;
	  if (IsF32InI64)
	    return DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);

	  EVT LocVT = VA.getLocVT();
	  return DAG.getNode(ISD::BITCAST, DL, LocVT, Val);
	}

	// Loads an incoming argument from the fixed stack slot described by VA.
	// The caller is responsible for loading the full value if the argument is
	// passed with CCValAssign::Indirect.
	static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
	                                const CCValAssign &VA, const SDLoc &DL) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  EVT LocVT = VA.getLocVT();
	  EVT ValVT = VA.getValVT();
	  EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
	  // The fixed object covers the value type's size at the assigned offset.
	  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	                                 VA.getLocMemOffset(), /*Immutable=*/true);
	  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	  SDValue Val;

	  // Every supported LocInfo is loaded without extension.
	  ISD::LoadExtType ExtType;
	  switch (VA.getLocInfo()) {
	  default:
	    llvm_unreachable("Unexpected CCValAssign::LocInfo");
	  case CCValAssign::Full:
	  case CCValAssign::Indirect:
	  case CCValAssign::BCvt:
	    ExtType = ISD::NON_EXTLOAD;
	    break;
	  }
	  Val = DAG.getExtLoad(
	      ExtType, DL, LocVT, Chain, FIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT);
	  return Val;
	}

	// Rebuilds an incoming f64 argument whose location type is i32. The value is
	// either entirely on the stack, in two consecutive GPRs, or split between
	// X17 (low half) and the stack (high half).
	static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
	                                       const CCValAssign &VA, const SDLoc &DL) {
	  assert(VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64 &&
	         "Unexpected VA");
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  MachineRegisterInfo &RegInfo = MF.getRegInfo();

	  if (VA.isMemLoc()) {
	    // f64 is passed on the stack.
	    int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
	    SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
	    return DAG.getLoad(MVT::f64, DL, Chain, FIN,
	                       MachinePointerInfo::getFixedStack(MF, FI));
	  }

	  assert(VA.isRegLoc() && "Expected register VA assignment");

	  // The low half always arrives in the assigned register.
	  unsigned LoVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
	  RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
	  SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
	  SDValue Hi;
	  if (VA.getLocReg() == RISCV::X17) {
	    // Second half of f64 is passed on the stack (X17 is the last entry of
	    // ArgGPRs); it is read from fixed offset 0.
	    int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
	    SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
	    Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
	                     MachinePointerInfo::getFixedStack(MF, FI));
	  } else {
	    // Second half of f64 is passed in another GPR.
	    unsigned HiVReg = RegInfo.createVirtualRegister(&RISCV::GPRRegClass);
	    RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
	    Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
	  }
	  return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
	}

	// Transform physical registers into virtual registers.
	//
	// Lowers the incoming arguments described by Ins: assigns each a location,
	// appends one SDValue per entry of Ins to InVals, and for variadic functions
	// spills the unallocated argument GPRs to a save area. Returns the updated
	// chain. Reports a fatal error for unsupported calling conventions and for
	// invalid uses of the "interrupt" attribute.
	SDValue RISCVTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

	  switch (CallConv) {
	  default:
	    report_fatal_error("Unsupported calling convention");
	  case CallingConv::C:
	  case CallingConv::Fast:
	    break;
	  }

	  MachineFunction &MF = DAG.getMachineFunction();

	  const Function &Func = MF.getFunction();
	  if (Func.hasFnAttribute("interrupt")) {
	    if (!Func.arg_empty())
	      report_fatal_error(
	          "Functions with the interrupt attribute cannot have arguments!");

	    StringRef Kind =
	        MF.getFunction().getFnAttribute("interrupt").getValueAsString();

	    if (!(Kind == "user" || Kind == "supervisor" || Kind == "machine"))
	      report_fatal_error(
	          "Function interrupt attribute argument not supported!");
	  }

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  MVT XLenVT = Subtarget.getXLenVT();
	  unsigned XLenInBytes = Subtarget.getXLen() / 8;
	  // Used with vargs to accumulate store chains.
	  std::vector<SDValue> OutChains;

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
	  analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);

	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];
	    SDValue ArgValue;
	    // Passing f64 on RV32D with a soft float ABI must be handled as a special
	    // case.
	    if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64)
	      ArgValue = unpackF64OnRV32DSoftABI(DAG, Chain, VA, DL);
	    else if (VA.isRegLoc())
	      ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL);
	    else
	      ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);

	    if (VA.getLocInfo() == CCValAssign::Indirect) {
	      // If the original argument was split and passed by reference (e.g. i128
	      // on RV32), we need to load all parts of it here (using the same
	      // address).
	      InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
	                                   MachinePointerInfo()));
	      unsigned ArgIndex = Ins[i].OrigArgIndex;
	      assert(Ins[i].PartOffset == 0);
	      // Consume the remaining parts of the same original argument; this
	      // advances the outer loop index as well.
	      while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
	        CCValAssign &PartVA = ArgLocs[i + 1];
	        unsigned PartOffset = Ins[i + 1].PartOffset;
	        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
	                                      DAG.getIntPtrConstant(PartOffset, DL));
	        InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
	                                     MachinePointerInfo()));
	        ++i;
	      }
	      continue;
	    }
	    InVals.push_back(ArgValue);
	  }

	  if (IsVarArg) {
	    ArrayRef<MCPhysReg> ArgRegs = makeArrayRef(ArgGPRs);
	    unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs);
	    const TargetRegisterClass *RC = &RISCV::GPRRegClass;
	    MachineFrameInfo &MFI = MF.getFrameInfo();
	    MachineRegisterInfo &RegInfo = MF.getRegInfo();
	    RISCVMachineFunctionInfo *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();

	    // Offset of the first variable argument from stack pointer, and size of
	    // the vararg save area. For now, the varargs save area is either zero or
	    // large enough to hold a0-a7.
	    int VaArgOffset, VarArgsSaveSize;

	    // If all registers are allocated, then all varargs must be passed on the
	    // stack and we don't need to save any argregs.
	    if (ArgRegs.size() == Idx) {
	      VaArgOffset = CCInfo.getNextStackOffset();
	      VarArgsSaveSize = 0;
	    } else {
	      VarArgsSaveSize = XLenInBytes * (ArgRegs.size() - Idx);
	      VaArgOffset = -VarArgsSaveSize;
	    }

	    // Record the frame index of the first variable argument
	    // which is a value necessary to VASTART.
	    int FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
	    RVFI->setVarArgsFrameIndex(FI);

	    // If saving an odd number of registers then create an extra stack slot to
	    // ensure that the frame pointer is 2*XLEN-aligned, which in turn ensures
	    // offsets to even-numbered registered remain 2*XLEN-aligned.
	    if (Idx % 2) {
	      FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset - (int)XLenInBytes,
	                                 true);
	      VarArgsSaveSize += XLenInBytes;
	    }

	    // Copy the integer registers that may have been used for passing varargs
	    // to the vararg save area.
	    for (unsigned I = Idx; I < ArgRegs.size();
	         ++I, VaArgOffset += XLenInBytes) {
	      const unsigned Reg = RegInfo.createVirtualRegister(RC);
	      RegInfo.addLiveIn(ArgRegs[I], Reg);
	      SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, XLenVT);
	      FI = MFI.CreateFixedObject(XLenInBytes, VaArgOffset, true);
	      SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	      SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
	                                   MachinePointerInfo::getFixedStack(MF, FI));
	      // Clear the IR value attached to the store's memory operand.
	      cast<StoreSDNode>(Store.getNode())
	          ->getMemOperand()
	          ->setValue(static_cast<Value *>(nullptr));
	      OutChains.push_back(Store);
	    }
	    RVFI->setVarArgsSaveSize(VarArgsSaveSize);
	  }

	  // All stores are grouped in one node to allow the matching between
	  // the size of Ins and InVals. This only happens for vararg functions.
	  if (!OutChains.empty()) {
	    OutChains.push_back(Chain);
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
	  }

	  return Chain;
	}

	/// isEligibleForTailCallOptimization - Check whether the call is eligible
	/// for tail call optimization.
	/// Note: This is modelled after ARM's IsEligibleForTailCallOptimization.
	///
	/// CCInfo and ArgLocs hold the already-computed argument assignment for the
	/// call described by CLI; MF is the calling function. Returns true only if
	/// none of the disqualifying conditions checked below applies.
	bool RISCVTargetLowering::isEligibleForTailCallOptimization(
	    CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
	    const SmallVector<CCValAssign, 16> &ArgLocs) const {

	  auto &Callee = CLI.Callee;
	  auto CalleeCC = CLI.CallConv;
	  auto IsVarArg = CLI.IsVarArg;
	  auto &Outs = CLI.Outs;
	  auto &Caller = MF.getFunction();
	  auto CallerCC = Caller.getCallingConv();

	  // Do not tail call opt functions with "disable-tail-calls" attribute.
	  if (Caller.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
	    return false;

	  // Exception-handling functions need a special set of instructions to
	  // indicate a return to the hardware. Tail-calling another function would
	  // probably break this.
	  // TODO: The "interrupt" attribute isn't currently defined by RISC-V. This
	  // should be expanded as new function attributes are introduced.
	  if (Caller.hasFnAttribute("interrupt"))
	    return false;

	  // Do not tail call opt functions with varargs.
	  if (IsVarArg)
	    return false;

	  // Do not tail call opt if the stack is used to pass parameters.
	  if (CCInfo.getNextStackOffset() != 0)
	    return false;

	  // Do not tail call opt if any parameters need to be passed indirectly.
	  // Since long doubles (fp128) and i128 are larger than 2*XLEN, they are
	  // passed indirectly. So the address of the value will be passed in a
	  // register, or if not available, then the address is put on the stack. In
	  // order to pass indirectly, space on the stack often needs to be allocated
	  // in order to store the value. In this case the CCInfo.getNextStackOffset()
	  // != 0 check is not enough and we need to check if any CCValAssign ArgsLocs
	  // are passed CCValAssign::Indirect.
	  for (auto &VA : ArgLocs)
	    if (VA.getLocInfo() == CCValAssign::Indirect)
	      return false;

	  // Do not tail call opt if either caller or callee uses struct return
	  // semantics.
	  auto IsCallerStructRet = Caller.hasStructRetAttr();
	  auto IsCalleeStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
	  if (IsCallerStructRet || IsCalleeStructRet)
	    return false;

	  // Externally-defined functions with weak linkage should not be
	  // tail-called. The behaviour of branch instructions in this situation (as
	  // used for tail calls) is implementation-defined, so we cannot rely on the
	  // linker replacing the tail call with a return.
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();
	    if (GV->hasExternalWeakLinkage())
	      return false;
	  }

	  // The callee has to preserve all registers the caller needs to preserve.
	  const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (CalleeCC != CallerCC) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Byval parameters hand the function a pointer directly into the stack area
	  // we want to reuse during a tail call. Working around this is possible
	  // but less efficient and uglier in LowerCall.
	  for (auto &Arg : Outs)
	    if (Arg.Flags.isByVal())
	      return false;

	  return true;
	}

	// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
	// and output parameter nodes.
	//
	// When the call is lowered as a tail call, no callseq_start/callseq_end is
	// emitted: the function returns a RISCVISD::TAIL node directly and does not
	// populate InVals. Otherwise the values returned by the callee are appended
	// to InVals and the updated chain is returned.
	SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &DL = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	// Reference into CLI: clearing it below is visible through CLI.IsTailCall.
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	MVT XLenVT = Subtarget.getXLenVT();

	MachineFunction &MF = DAG.getMachineFunction();

	// Analyze the operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
	analyzeOutputArgs(MF, ArgCCInfo, Outs, /IsRet=/false, &CLI);

	// Check if it's really possible to do a tail call.
	if (IsTailCall)
	IsTailCall = isEligibleForTailCallOptimization(ArgCCInfo, CLI, MF, ArgLocs);

	// A musttail call site that was found ineligible is a hard error rather
	// than a silent fallback to a normal call.
	if (IsTailCall)
	++NumTailCalls;
	else if (CLI.CS && CLI.CS.isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = ArgCCInfo.getNextStackOffset();

	// Create local copies for byval args
	// Each byval argument is copied with a memcpy into a freshly created stack
	// object; the frame-index pointers are collected in ByValArgs in argument
	// order and consumed in the same order by the main loop below.
	SmallVector<SDValue, 8> ByValArgs;
	for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	if (!Flags.isByVal())
	continue;

	SDValue Arg = OutVals[i];
	unsigned Size = Flags.getByValSize();
	unsigned Align = Flags.getByValAlign();

	int FI = MF.getFrameInfo().CreateStackObject(Size, Align, /isSS=/false);
	SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);

	Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
	/IsVolatile=/false,
	/AlwaysInline=/false,
	IsTailCall, MachinePointerInfo(),
	MachinePointerInfo());
	ByValArgs.push_back(FIPtr);
	}

	// The call sequence is only opened for non-tail calls.
	if (!IsTailCall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);

	// Copy argument values to their designated locations.
	// RegsToPass collects (physreg, value) pairs; MemOpChains collects the
	// stores for stack-passed and spilled values. StackPtr is created lazily,
	// the first time a stack address is needed.
	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;
	// i indexes ArgLocs, Outs and OutVals together; j indexes ByValArgs and only
	// advances when a byval argument is consumed.
	for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	SDValue ArgValue = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;

	// Handle passing f64 on RV32D with a soft float ABI as a special case.
	bool IsF64OnRV32DSoftABI =
	VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
	if (IsF64OnRV32DSoftABI && VA.isRegLoc()) {
	// Split the f64 into two i32 halves (result 0 = Lo, result 1 = Hi).
	SDValue SplitF64 = DAG.getNode(
	RISCVISD::SplitF64, DL, DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
	SDValue Lo = SplitF64.getValue(0);
	SDValue Hi = SplitF64.getValue(1);

	unsigned RegLo = VA.getLocReg();
	RegsToPass.push_back(std::make_pair(RegLo, Lo));

	// NOTE(review): X17 is presumably the last argument GPR, leaving no
	// register for the high half - confirm against the ArgGPRs table.
	if (RegLo == RISCV::X17) {
	// Second half of f64 is passed on the stack.
	// Work out the address of the stack slot.
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
	// Emit the store.
	// The store goes to StackPtr itself; no offset is added.
	MemOpChains.push_back(
	DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
	} else {
	// Second half of f64 is passed in another GPR.
	unsigned RegHigh = RegLo + 1;
	RegsToPass.push_back(std::make_pair(RegHigh, Hi));
	}
	continue;
	}

	// IsF64OnRV32DSoftABI && VA.isMemLoc() is handled below in the same way
	// as any other MemLoc.

	// Promote the value if needed.
	// For now, only handle fully promoted and indirect arguments.
	if (VA.getLocInfo() == CCValAssign::Indirect) {
	// Store the argument in a stack slot and pass its address.
	SDValue SpillSlot = DAG.CreateStackTemporary(Outs[i].ArgVT);
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	MemOpChains.push_back(
	DAG.getStore(Chain, DL, ArgValue, SpillSlot,
	MachinePointerInfo::getFixedStack(MF, FI)));
	// If the original argument was split (e.g. i128), we need
	// to store all parts of it here (and pass just one address).
	unsigned ArgIndex = Outs[i].OrigArgIndex;
	assert(Outs[i].PartOffset == 0);
	// Consume every following part that belongs to the same original
	// argument; the inner ++i advances the outer loop past those parts.
	while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) {
	SDValue PartValue = OutVals[i + 1];
	unsigned PartOffset = Outs[i + 1].PartOffset;
	SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
	DAG.getIntPtrConstant(PartOffset, DL));
	MemOpChains.push_back(
	DAG.getStore(Chain, DL, PartValue, Address,
	MachinePointerInfo::getFixedStack(MF, FI)));
	++i;
	}
	// From here on the argument is the address of the spill slot.
	ArgValue = SpillSlot;
	} else {
	ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
	}

	// Use local copy if it is a byval arg.
	if (Flags.isByVal())
	ArgValue = ByValArgs[j++];

	if (VA.isRegLoc()) {
	// Queue up the argument copies and emit them at the end.
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
	} else {
	assert(VA.isMemLoc() && "Argument not register or memory");
	assert(!IsTailCall && "Tail call not allowed if stack is used "
	"for passing parameters");

	// Work out the address of the stack slot.
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, DL, RISCV::X2, PtrVT);
	SDValue Address =
	DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
	DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));

	// Emit the store.
	MemOpChains.push_back(
	DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
	}
	}

	// Join the stores, which are independent of one another.
	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	SDValue Glue;

	// Build a sequence of copy-to-reg nodes, chained and glued together.
	for (auto &Reg : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
	Glue = Chain.getValue(1);
	}

	// If the callee is a GlobalAddress/ExternalSymbol node, turn it into a
	// TargetGlobalAddress/TargetExternalSymbol node so that legalize won't
	// split it and then direct call can be matched by PseudoCALL.
	// In both cases the operand flag is MO_CALL unless shouldAssumeDSOLocal
	// returns false, in which case MO_PLT is used.
	if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
	const GlobalValue *GV = S->getGlobal();

	unsigned OpFlags = RISCVII::MO_CALL;
	if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))
	OpFlags = RISCVII::MO_PLT;

	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	unsigned OpFlags = RISCVII::MO_CALL;

	// No GlobalValue is available for an external symbol, so nullptr is
	// passed together with the module of the calling function.
	if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(),
	nullptr))
	OpFlags = RISCVII::MO_PLT;

	Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
	}

	// The first call operand is the chain and the second is the target address.
	SmallVector<SDValue, 8> Ops;
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	// Add argument registers to the end of the list so that they are
	// known live into the call.
	for (auto &Reg : RegsToPass)
	Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));

	// The register mask operand is only added for non-tail calls.
	if (!IsTailCall) {
	// Add a register mask operand representing the call-preserved registers.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	// Glue the call to the argument copies, if any.
	if (Glue.getNode())
	Ops.push_back(Glue);

	// Emit the call.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	// Tail calls end here: the frame is flagged as containing a tail call and
	// the TAIL node is returned without callseq_end or result copies.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(RISCVISD::TAIL, DL, NodeTys, Ops);
	}

	Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
	Glue = Chain.getValue(1);

	// Mark the end of the call, which is glued to the call itself.
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getConstant(NumBytes, DL, PtrVT, true),
	DAG.getConstant(0, DL, PtrVT, true),
	Glue, DL);
	Glue = Chain.getValue(1);

	// Assign locations to each value returned by this call.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
	analyzeInputArgs(MF, RetCCInfo, Ins, /IsRet=/true);

	// Copy all of the result registers out of their specified physreg.
	for (auto &VA : RVLocs) {
	// Copy the value out
	SDValue RetValue =
	DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
	// Glue the RetValue to the end of the call sequence
	Chain = RetValue.getValue(1);
	Glue = RetValue.getValue(2);

	// An f64 returned in i32 locations is read back as two halves from
	// ArgGPRs[0] and ArgGPRs[1] and recombined with BuildPairF64.
	if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
	assert(VA.getLocReg() == ArgGPRs[0] && "Unexpected reg assignment");
	SDValue RetValue2 =
	DAG.getCopyFromReg(Chain, DL, ArgGPRs[1], MVT::i32, Glue);
	Chain = RetValue2.getValue(1);
	Glue = RetValue2.getValue(2);
	RetValue = DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, RetValue,
	RetValue2);
	}

	RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);

	InVals.push_back(RetValue);
	}

	return Chain;
	}

	// Report whether every value in Outs can be assigned a return location.
	// Each value is run through CC_RISCV with IsRet set; the first value for
	// which CC_RISCV returns true makes this function return false. Otherwise
	// it returns true (including when Outs is empty).
	bool RISCVTargetLowering::CanLowerReturn(
	CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
	for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
	MVT VT = Outs[i].VT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
	// The same VT is passed as both value type and location type, with
	// CCValAssign::Full as the location info.
	if (CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full,
	ArgFlags, CCInfo, /IsFixed=/true, /IsRet=/true, nullptr))
	return false;
	}
	return true;
	}

	// Lower a function return: copy each value in OutVals into the register
	// assigned to it, then emit the return node. Functions carrying the
	// "interrupt" attribute get URET_FLAG, SRET_FLAG or MRET_FLAG depending on
	// the attribute's string value; all other functions get RET_FLAG.
	SDValue
	RISCVTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	// Stores the assignment of the return value to a location.
	SmallVector<CCValAssign, 16> RVLocs;

	// Info about the registers and stack slot.
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());

	analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /IsRet=/true,
	nullptr);

	SDValue Glue;
	// Operand 0 is a placeholder for the chain; it is overwritten with the
	// final chain once all copies have been emitted.
	SmallVector<SDValue, 4> RetOps(1, Chain);

	// Copy the result values into the output registers.
	for (unsigned i = 0, e = RVLocs.size(); i < e; ++i) {
	SDValue Val = OutVals[i];
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");

	if (VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64) {
	// Handle returning f64 on RV32D with a soft float ABI.
	// The value is split into two i32 halves that go to the assigned
	// register and the register numbered one above it.
	assert(VA.isRegLoc() && "Expected return via registers");
	SDValue SplitF64 = DAG.getNode(RISCVISD::SplitF64, DL,
	DAG.getVTList(MVT::i32, MVT::i32), Val);
	SDValue Lo = SplitF64.getValue(0);
	SDValue Hi = SplitF64.getValue(1);
	unsigned RegLo = VA.getLocReg();
	unsigned RegHi = RegLo + 1;
	Chain = DAG.getCopyToReg(Chain, DL, RegLo, Lo, Glue);
	Glue = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(RegLo, MVT::i32));
	Chain = DAG.getCopyToReg(Chain, DL, RegHi, Hi, Glue);
	Glue = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(RegHi, MVT::i32));
	} else {
	// Handle a 'normal' return.
	Val = convertValVTToLocVT(DAG, Val, VA, DL);
	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);

	// Guarantee that all emitted copies are stuck together.
	Glue = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	}
	}

	RetOps[0] = Chain; // Update chain.

	// Add the glue node if we have it.
	if (Glue.getNode()) {
	RetOps.push_back(Glue);
	}

	// Interrupt service routines use different return instructions.
	const Function &Func = DAG.getMachineFunction().getFunction();
	if (Func.hasFnAttribute("interrupt")) {
	// A non-void interrupt function is rejected with a fatal error.
	if (!Func.getReturnType()->isVoidTy())
	report_fatal_error(
	"Functions with the interrupt attribute must have void return type!");

	MachineFunction &MF = DAG.getMachineFunction();
	StringRef Kind =
	MF.getFunction().getFnAttribute("interrupt").getValueAsString();

	// "user" -> URET_FLAG, "supervisor" -> SRET_FLAG; any other attribute
	// value (including an empty one) selects MRET_FLAG.
	unsigned RetOpc;
	if (Kind == "user")
	RetOpc = RISCVISD::URET_FLAG;
	else if (Kind == "supervisor")
	RetOpc = RISCVISD::SRET_FLAG;
	else
	RetOpc = RISCVISD::MRET_FLAG;

	return DAG.getNode(RetOpc, DL, MVT::Other, RetOps);
	}

	return DAG.getNode(RISCVISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	// Return the printable name ("RISCVISD::<NAME>") of a RISCV-specific DAG
	// node opcode. RISCVISD::FIRST_NUMBER breaks out of the switch and, like any
	// value not listed as a case, yields nullptr.
	const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((RISCVISD::NodeType)Opcode) {
	case RISCVISD::FIRST_NUMBER:
	break;
	case RISCVISD::RET_FLAG:
	return "RISCVISD::RET_FLAG";
	case RISCVISD::URET_FLAG:
	return "RISCVISD::URET_FLAG";
	case RISCVISD::SRET_FLAG:
	return "RISCVISD::SRET_FLAG";
	case RISCVISD::MRET_FLAG:
	return "RISCVISD::MRET_FLAG";
	case RISCVISD::CALL:
	return "RISCVISD::CALL";
	case RISCVISD::SELECT_CC:
	return "RISCVISD::SELECT_CC";
	case RISCVISD::BuildPairF64:
	return "RISCVISD::BuildPairF64";
	case RISCVISD::SplitF64:
	return "RISCVISD::SplitF64";
	case RISCVISD::TAIL:
	return "RISCVISD::TAIL";
	case RISCVISD::SLLW:
	return "RISCVISD::SLLW";
	case RISCVISD::SRAW:
	return "RISCVISD::SRAW";
	case RISCVISD::SRLW:
	return "RISCVISD::SRLW";
	case RISCVISD::DIVW:
	return "RISCVISD::DIVW";
	case RISCVISD::DIVUW:
	return "RISCVISD::DIVUW";
	case RISCVISD::REMUW:
	return "RISCVISD::REMUW";
	case RISCVISD::FMV_W_X_RV64:
	return "RISCVISD::FMV_W_X_RV64";
	case RISCVISD::FMV_X_ANYEXTW_RV64:
	return "RISCVISD::FMV_X_ANYEXTW_RV64";
	case RISCVISD::READ_CYCLE_WIDE:
	return "RISCVISD::READ_CYCLE_WIDE";
	}
	return nullptr;
	}

	+/// getConstraintType - Classify an inline-asm constraint string for this
	+/// target. Only single-character constraints are handled here: 'f' is a
	+/// register class and 'I', 'J' and 'K' are immediates. Every other
	+/// constraint is forwarded to TargetLowering::getConstraintType.
	+RISCVTargetLowering::ConstraintType
	+RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
	+ if (Constraint.size() == 1) {
	+ switch (Constraint[0]) {
	+ default:
	+ break; // Not handled here; falls through to the generic implementation.
	+ case 'f':
	+ return C_RegisterClass;
	+ case 'I':
	+ case 'J':
	+ case 'K':
	+ return C_Immediate;
	+ }
	+ }
	+ return TargetLowering::getConstraintType(Constraint);
	+}
	+
	// Map an inline-asm register constraint to a (register, register class)
	// pair. For the single-letter constraints handled here the register number
	// is always 0U and only the class is selected:
	//   'r' -> GPR
	//   'f' -> FPR32 when the subtarget has the F extension and VT is f32,
	//          FPR64 when it has the D extension and VT is f64
	// Anything else, including an 'f' that matches neither case, is forwarded
	// to TargetLowering::getRegForInlineAsmConstraint.
	std::pair<unsigned, const TargetRegisterClass *>
	RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to a
	// RISCV register class.
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'r':
	return std::make_pair(0U, &RISCV::GPRRegClass);
	+ case 'f':
	+ if (Subtarget.hasStdExtF() && VT == MVT::f32)
	+ return std::make_pair(0U, &RISCV::FPR32RegClass);
	+ if (Subtarget.hasStdExtD() && VT == MVT::f64)
	+ return std::make_pair(0U, &RISCV::FPR64RegClass);
	+ break;
	default:
	break;
	}
	}

	return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	// Lower an inline-asm operand for the immediate constraints 'I', 'J' and
	// 'K'. When Op is a constant that satisfies the constraint, a target
	// constant of XLen type is appended to Ops. When it is not, the function
	// returns without appending anything. All other constraints are forwarded
	// to TargetLowering::LowerAsmOperandForConstraint.
	void RISCVTargetLowering::LowerAsmOperandForConstraint(
	SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const {
	// Currently only support length 1 constraints.
	if (Constraint.length() == 1) {
	switch (Constraint[0]) {
	case 'I':
	// Validate & create a 12-bit signed immediate operand.
	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	// The sign-extended value is held in an unsigned variable here.
	uint64_t CVal = C->getSExtValue();
	if (isInt<12>(CVal))
	Ops.push_back(
	DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
	}
	return;
	case 'J':
	// Validate & create an integer zero operand.
	if (auto *C = dyn_cast<ConstantSDNode>(Op))
	if (C->getZExtValue() == 0)
	Ops.push_back(
	DAG.getTargetConstant(0, SDLoc(Op), Subtarget.getXLenVT()));
	return;
	case 'K':
	// Validate & create a 5-bit unsigned immediate operand.
	if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	uint64_t CVal = C->getZExtValue();
	if (isUInt<5>(CVal))
	Ops.push_back(
	DAG.getTargetConstant(CVal, SDLoc(Op), Subtarget.getXLenVT()));
	}
	return;
	default:
	break;
	}
	}
	TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	// Create the fence, if any, that must precede an atomic load or store:
	//   - sequentially consistent load: a fence with the same ordering (Ord)
	//   - store with release-or-stronger ordering: a release fence
	// Returns nullptr when no leading fence is needed.
	Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
	Instruction *Inst,
	AtomicOrdering Ord) const {
	if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
	return Builder.CreateFence(Ord);
	if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
	return Builder.CreateFence(AtomicOrdering::Release);
	return nullptr;
	}

	// Create the fence, if any, that must follow an atomic load or store. Only
	// a load with acquire-or-stronger ordering gets one (an acquire fence);
	// every other case, stores included, returns nullptr.
	Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
	Instruction *Inst,
	AtomicOrdering Ord) const {
	if (isa<LoadInst>(Inst) && isAcquireOrStronger(Ord))
	return Builder.CreateFence(AtomicOrdering::Acquire);
	return nullptr;
	}

	// Choose how an atomicrmw instruction is expanded at the IR level:
	// floating-point operations use CmpXChg, 8- and 16-bit operations use
	// MaskedIntrinsic, and everything else needs no IR-level expansion (None).
	TargetLowering::AtomicExpansionKind
	RISCVTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	// atomicrmw {fadd,fsub} must be expanded to use compare-exchange, as floating
	// point operations can't be used in an lr/sc sequence without breaking the
	// forward-progress guarantee.
	if (AI->isFloatingPointOperation())
	return AtomicExpansionKind::CmpXChg;

	unsigned Size = AI->getType()->getPrimitiveSizeInBits();
	if (Size == 8 \|\| Size == 16)
	return AtomicExpansionKind::MaskedIntrinsic;
	return AtomicExpansionKind::None;
	}

	// Return the riscv_masked_atomicrmw_* intrinsic for the given XLen (32 or
	// 64) and atomicrmw operation. The handled operations are Xchg, Add, Sub,
	// Nand, Max, Min, UMax and UMin; any other operation, or any other XLen,
	// reaches llvm_unreachable.
	static Intrinsic::ID
	getIntrinsicForMaskedAtomicRMWBinOp(unsigned XLen, AtomicRMWInst::BinOp BinOp) {
	if (XLen == 32) {
	switch (BinOp) {
	default:
	llvm_unreachable("Unexpected AtomicRMW BinOp");
	case AtomicRMWInst::Xchg:
	return Intrinsic::riscv_masked_atomicrmw_xchg_i32;
	case AtomicRMWInst::Add:
	return Intrinsic::riscv_masked_atomicrmw_add_i32;
	case AtomicRMWInst::Sub:
	return Intrinsic::riscv_masked_atomicrmw_sub_i32;
	case AtomicRMWInst::Nand:
	return Intrinsic::riscv_masked_atomicrmw_nand_i32;
	case AtomicRMWInst::Max:
	return Intrinsic::riscv_masked_atomicrmw_max_i32;
	case AtomicRMWInst::Min:
	return Intrinsic::riscv_masked_atomicrmw_min_i32;
	case AtomicRMWInst::UMax:
	return Intrinsic::riscv_masked_atomicrmw_umax_i32;
	case AtomicRMWInst::UMin:
	return Intrinsic::riscv_masked_atomicrmw_umin_i32;
	}
	}

	// Same mapping as above, selecting the _i64 variants.
	if (XLen == 64) {
	switch (BinOp) {
	default:
	llvm_unreachable("Unexpected AtomicRMW BinOp");
	case AtomicRMWInst::Xchg:
	return Intrinsic::riscv_masked_atomicrmw_xchg_i64;
	case AtomicRMWInst::Add:
	return Intrinsic::riscv_masked_atomicrmw_add_i64;
	case AtomicRMWInst::Sub:
	return Intrinsic::riscv_masked_atomicrmw_sub_i64;
	case AtomicRMWInst::Nand:
	return Intrinsic::riscv_masked_atomicrmw_nand_i64;
	case AtomicRMWInst::Max:
	return Intrinsic::riscv_masked_atomicrmw_max_i64;
	case AtomicRMWInst::Min:
	return Intrinsic::riscv_masked_atomicrmw_min_i64;
	case AtomicRMWInst::UMax:
	return Intrinsic::riscv_masked_atomicrmw_umax_i64;
	case AtomicRMWInst::UMin:
	return Intrinsic::riscv_masked_atomicrmw_umin_i64;
	}
	}

	llvm_unreachable("Unexpected XLen\n");
	}

	// Emit a call to the masked atomicrmw intrinsic that matches AI's operation
	// and the subtarget's XLen. On XLen == 64 the Incr, Mask and ShiftAmt
	// operands are sign-extended to i64 before the call and the result is
	// truncated to i32 afterwards. Signed Min/Max pass one extra operand
	// (SextShamt) to the intrinsic.
	// Note: the ordering operand is built from AI->getOrdering(); the Ord
	// parameter is not read in this body.
	Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic(
	IRBuilder<> &Builder, AtomicRMWInst AI, Value AlignedAddr, Value *Incr,
	Value Mask, Value ShiftAmt, AtomicOrdering Ord) const {
	unsigned XLen = Subtarget.getXLen();
	Value *Ordering =
	Builder.getIntN(XLen, static_cast<uint64_t>(AI->getOrdering()));
	// The intrinsic is overloaded on the type of the aligned address.
	Type *Tys[] = {AlignedAddr->getType()};
	Function *LrwOpScwLoop = Intrinsic::getDeclaration(
	AI->getModule(),
	getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys);

	if (XLen == 64) {
	Incr = Builder.CreateSExt(Incr, Builder.getInt64Ty());
	Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
	ShiftAmt = Builder.CreateSExt(ShiftAmt, Builder.getInt64Ty());
	}

	Value *Result;

	// Must pass the shift amount needed to sign extend the loaded value prior
	// to performing a signed comparison for min/max. ShiftAmt is the number of
	// bits to shift the value into position. Pass XLen-ShiftAmt-ValWidth, which
	// is the number of bits to left+right shift the value in order to
	// sign-extend.
	if (AI->getOperation() == AtomicRMWInst::Min \|\|
	AI->getOperation() == AtomicRMWInst::Max) {
	const DataLayout &DL = AI->getModule()->getDataLayout();
	unsigned ValWidth =
	DL.getTypeStoreSizeInBits(AI->getValOperand()->getType());
	// Computed as (XLen - ValWidth) - ShiftAmt.
	Value *SextShamt =
	Builder.CreateSub(Builder.getIntN(XLen, XLen - ValWidth), ShiftAmt);
	Result = Builder.CreateCall(LrwOpScwLoop,
	{AlignedAddr, Incr, Mask, SextShamt, Ordering});
	} else {
	Result =
	Builder.CreateCall(LrwOpScwLoop, {AlignedAddr, Incr, Mask, Ordering});
	}

	if (XLen == 64)
	Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
	return Result;
	}

	// Choose how a cmpxchg instruction is expanded at the IR level, based on
	// the bit width of its compare operand: 8- and 16-bit operations use
	// MaskedIntrinsic, all other widths need no IR-level expansion (None).
	TargetLowering::AtomicExpansionKind
	RISCVTargetLowering::shouldExpandAtomicCmpXchgInIR(
	AtomicCmpXchgInst *CI) const {
	unsigned Size = CI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
	if (Size == 8 \|\| Size == 16)
	return AtomicExpansionKind::MaskedIntrinsic;
	return AtomicExpansionKind::None;
	}

	// Emit a call to the masked cmpxchg intrinsic: riscv_masked_cmpxchg_i32 by
	// default, or riscv_masked_cmpxchg_i64 when XLen == 64. On XLen == 64 the
	// CmpVal, NewVal and Mask operands are sign-extended to i64 before the call
	// and the result is truncated to i32 afterwards. The ordering operand is
	// built from Ord as an XLen-wide integer.
	Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
	IRBuilder<> &Builder, AtomicCmpXchgInst CI, Value AlignedAddr,
	Value CmpVal, Value NewVal, Value *Mask, AtomicOrdering Ord) const {
	unsigned XLen = Subtarget.getXLen();
	Value *Ordering = Builder.getIntN(XLen, static_cast<uint64_t>(Ord));
	Intrinsic::ID CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i32;
	if (XLen == 64) {
	CmpVal = Builder.CreateSExt(CmpVal, Builder.getInt64Ty());
	NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
	Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
	CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
	}
	// The intrinsic is overloaded on the type of the aligned address.
	Type *Tys[] = {AlignedAddr->getType()};
	Function *MaskedCmpXchg =
	Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
	Value *Result = Builder.CreateCall(
	MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
	if (XLen == 64)
	Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
	return Result;
	}

	// Return the exception pointer register, which is always RISCV::X10.
	// PersonalityFn is not consulted.
	unsigned RISCVTargetLowering::getExceptionPointerRegister(
	const Constant *PersonalityFn) const {
	return RISCV::X10;
	}

	// Return the exception selector register, which is always RISCV::X11.
	// PersonalityFn is not consulted.
	unsigned RISCVTargetLowering::getExceptionSelectorRegister(
	const Constant *PersonalityFn) const {
	return RISCV::X11;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/RISCV/RISCVISelLowering.h (revision 351303)
	@@ -1,210 +1,211 @@
	//===-- RISCVISelLowering.h - RISCV DAG Lowering Interface ------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that RISCV uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H
	#define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H

	#include "RISCV.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetLowering.h"

	namespace llvm {
	class RISCVSubtarget;
	// RISCV-specific SelectionDAG node opcodes. Numbering starts at
	// ISD::BUILTIN_OP_END (FIRST_NUMBER); each following enumerator takes the
	// next value.
	namespace RISCVISD {
	enum NodeType : unsigned {
	FIRST_NUMBER = ISD::BUILTIN_OP_END,
	// Return nodes. URET_FLAG, SRET_FLAG and MRET_FLAG are the variants emitted
	// for functions carrying the "interrupt" attribute.
	RET_FLAG,
	URET_FLAG,
	SRET_FLAG,
	MRET_FLAG,
	CALL,
	SELECT_CC,
	// BuildPairF64 combines two i32 values into an f64; SplitF64 produces two
	// i32 results (Lo, Hi) from an f64.
	BuildPairF64,
	SplitF64,
	// Tail-call node, emitted in place of CALL when a call is lowered as a
	// tail call.
	TAIL,
	// RV64I shifts, directly matching the semantics of the named RISC-V
	// instructions.
	SLLW,
	SRAW,
	SRLW,
	// 32-bit operations from RV64M that can't be simply matched with a pattern
	// at instruction selection time.
	DIVW,
	DIVUW,
	REMUW,
	// FPR32<->GPR transfer operations for RV64. Needed as an i32<->f32 bitcast
	// is not legal on RV64. FMV_W_X_RV64 matches the semantics of the FMV.W.X.
	// FMV_X_ANYEXTW_RV64 is similar to FMV.X.W but has an any-extended result.
	// This is a more convenient semantic for producing dagcombines that remove
	// unnecessary GPR->FPR->GPR moves.
	FMV_W_X_RV64,
	FMV_X_ANYEXTW_RV64,
	// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
	// (returns (Lo, Hi)). It takes a chain operand.
	READ_CYCLE_WIDE
	};
	}

	// TargetLowering implementation for RISCV: declares the hooks used to lower
	// LLVM code into a selection DAG for this target (calls, returns, formal
	// arguments, inline-asm constraints, atomics, exception registers).
	class RISCVTargetLowering : public TargetLowering {
	// Subtarget this lowering object was created for; held by reference.
	const RISCVSubtarget &Subtarget;

	public:
	explicit RISCVTargetLowering(const TargetMachine &TM,
	const RISCVSubtarget &STI);

	bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
	MachineFunction &MF,
	unsigned Intrinsic) const override;
	bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
	unsigned AS,
	Instruction *I = nullptr) const override;
	bool isLegalICmpImmediate(int64_t Imm) const override;
	bool isLegalAddImmediate(int64_t Imm) const override;
	bool isTruncateFree(Type SrcTy, Type DstTy) const override;
	bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
	bool isZExtFree(SDValue Val, EVT VT2) const override;
	bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;

	bool hasBitPreservingFPLogic(EVT VT) const override;

	// Provide custom lowering hooks for some operations.
	SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
	void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const override;

	SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

	unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const override;

	// This method returns the name of a target specific DAG node.
	const char *getTargetNodeName(unsigned Opcode) const override;

	// Inline-asm constraint handling: classify a constraint, map a register
	// constraint to a register class, and lower immediate-constraint operands.
	+ ConstraintType getConstraintType(StringRef Constraint) const override;
	std::pair<unsigned, const TargetRegisterClass *>
	getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint, MVT VT) const override;

	void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
	std::vector<SDValue> &Ops,
	SelectionDAG &DAG) const override;

	MachineBasicBlock *
	EmitInstrWithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *BB) const override;

	EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
	EVT VT) const override;

	// True for scalar integer types only.
	bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
	return VT.isScalarInteger();
	}

	// Fences are requested for atomic loads and stores only.
	bool shouldInsertFencesForAtomic(const Instruction *I) const override {
	return isa<LoadInst>(I) \|\| isa<StoreInst>(I);
	}
	Instruction emitLeadingFence(IRBuilder<> &Builder, Instruction Inst,
	AtomicOrdering Ord) const override;
	Instruction emitTrailingFence(IRBuilder<> &Builder, Instruction Inst,
	AtomicOrdering Ord) const override;

	// Atomic operations extend their result with a sign extension.
	ISD::NodeType getExtendForAtomicOps() const override {
	return ISD::SIGN_EXTEND;
	}

	// Shifts are expanded unless the function is marked as minimum size.
	bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
	if (DAG.getMachineFunction().getFunction().hasMinSize())
	return false;
	return true;
	}
	bool isDesirableToCommuteWithShift(const SDNode *N,
	CombineLevel Level) const override;

	/// If a physical register, this returns the register that receives the
	/// exception address on entry to an EH pad.
	unsigned
	getExceptionPointerRegister(const Constant *PersonalityFn) const override;

	/// If a physical register, this returns the register that receives the
	/// exception typeid on entry to a landing pad.
	unsigned
	getExceptionSelectorRegister(const Constant *PersonalityFn) const override;

	private:
	// Calling-convention analysis helpers used by the lowering routines below.
	void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	bool IsRet) const;
	void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	bool IsRet, CallLoweringInfo *CLI) const;
	// Lower incoming arguments, copy physregs into vregs
	SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
	bool IsVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &DL, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &InVals) const override;
	bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
	bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	LLVMContext &Context) const override;
	SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
	SelectionDAG &DAG) const override;
	SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const override;
	// Always true, regardless of Imm and Ty.
	bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const override {
	return true;
	}

	template <class NodeTy>
	SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;

	SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
	bool UseGOT) const;
	SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;

	bool shouldConsiderGEPOffsetSplit() const override { return true; }
	// Per-operation custom lowering helpers.
	SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
	SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;

	// Decides whether a call that requested tail-call treatment can actually
	// be lowered as one; LowerCall consults it before emitting a tail call.
	bool isEligibleForTailCallOptimization(
	CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
	const SmallVector<CCValAssign, 16> &ArgLocs) const;

	// IR-level atomic expansion hooks: choose the expansion kind and emit the
	// masked intrinsic calls.
	TargetLowering::AtomicExpansionKind
	shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
	virtual Value *emitMaskedAtomicRMWIntrinsic(
	IRBuilder<> &Builder, AtomicRMWInst AI, Value AlignedAddr, Value *Incr,
	Value Mask, Value ShiftAmt, AtomicOrdering Ord) const override;
	TargetLowering::AtomicExpansionKind
	shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
	virtual Value *
	emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI,
	Value AlignedAddr, Value CmpVal,
	Value NewVal, Value Mask,
	AtomicOrdering Ord) const override;
	};
	}

	#endif
	Index: vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/Sparc/SparcISelLowering.cpp (revision 351303)
	@@ -1,3417 +1,3417 @@
	//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the interfaces that Sparc uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "SparcISelLowering.h"
	#include "MCTargetDesc/SparcMCExpr.h"
	#include "SparcMachineFunctionInfo.h"
	#include "SparcRegisterInfo.h"
	#include "SparcTargetMachine.h"
	#include "SparcTargetObjectFile.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	using namespace llvm;


	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
	MVT &LocVT, CCValAssign::LocInfo &LocInfo,
	ISD::ArgFlagsTy &ArgFlags, CCState &State)
	{
	assert (ArgFlags.isSRet());

	// Assign SRet argument.
	State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
	0,
	LocVT, LocInfo));
	return true;
	}

	/// Custom handler that assigns a value to two locations.
	///
	/// Up to two registers are taken from %i0-%i5. When no register is left for
	/// the first piece, a single 8-byte, 4-aligned stack slot is recorded for the
	/// whole value. When only the first piece gets a register, the second piece
	/// is given a 4-byte, 4-aligned stack slot. Always returns true.
	static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
	                                     MVT &LocVT, CCValAssign::LocInfo &LocInfo,
	                                     ISD::ArgFlagsTy &ArgFlags, CCState &State) {
	  static const MCPhysReg RegList[] = {
	    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
	  };

	  // First piece: without a free register the whole value lives on the stack.
	  const unsigned FirstReg = State.AllocateReg(RegList);
	  if (!FirstReg) {
	    const unsigned WholeOffset = State.AllocateStack(8, 4);
	    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, WholeOffset,
	                                           LocVT, LocInfo));
	    return true;
	  }
	  State.addLoc(
	      CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, LocVT, LocInfo));

	  // Second piece: a register if one remains, otherwise a 4-byte stack slot.
	  const unsigned SecondReg = State.AllocateReg(RegList);
	  if (SecondReg) {
	    State.addLoc(
	        CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, LocVT, LocInfo));
	  } else {
	    const unsigned HalfOffset = State.AllocateStack(4, 4);
	    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, HalfOffset,
	                                           LocVT, LocInfo));
	  }
	  return true;
	}

	/// Custom handler that assigns a return value to two registers.
	///
	/// Both pieces must receive a register from %i0-%i5; there is no stack
	/// fallback. Returns false as soon as a register cannot be allocated (any
	/// location already recorded for the first piece is left in place), and true
	/// once both pieces have been assigned.
	static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
	                                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
	                                         ISD::ArgFlagsTy &ArgFlags, CCState &State) {
	  static const MCPhysReg RegList[] = {
	    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
	  };

	  // Allocate one register per piece, first piece first.
	  for (unsigned Piece = 0; Piece != 2; ++Piece) {
	    const unsigned Reg = State.AllocateReg(RegList);
	    if (!Reg)
	      return false;
	    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
	  }

	  return true;
	}

	// Allocate a full-sized argument for the 64-bit ABI.
	//
	// Accepts f32, f128 and any 64-bit location type. A stack slot is always
	// reserved first (16 bytes, 16-aligned for f128; 8 bytes, 8-aligned for
	// everything else) and its offset then selects a register when it is low
	// enough:
	//   i64  with Offset < 6*8   -> SP::I0 + Offset/8
	//   f64  with Offset < 16*8  -> SP::D0 + Offset/8
	//   f32  with Offset < 16*8  -> SP::F1 + Offset/4
	//   f128 with Offset < 16*8  -> SP::Q0 + Offset/16
	// Otherwise the value is assigned to the stack slot itself. Always returns
	// true.
	static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
	                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
	                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
	  assert((LocVT == MVT::f32 || LocVT == MVT::f128
	          || LocVT.getSizeInBits() == 64) &&
	         "Can't handle non-64 bits locations");

	  // Stack space is allocated for all arguments starting from [%fp+BIAS+128].
	  unsigned size = (LocVT == MVT::f128) ? 16 : 8;
	  unsigned alignment = (LocVT == MVT::f128) ? 16 : 8;
	  unsigned Offset = State.AllocateStack(size, alignment);
	  unsigned Reg = 0;

	  if (LocVT == MVT::i64 && Offset < 6*8)
	    // Promote integers to %i0-%i5.
	    Reg = SP::I0 + Offset/8;
	  else if (LocVT == MVT::f64 && Offset < 16*8)
	    // Promote doubles to %d0-%d30. (Which LLVM calls D0-D15).
	    Reg = SP::D0 + Offset/8;
	  else if (LocVT == MVT::f32 && Offset < 16*8)
	    // Promote floats to %f1, %f3, ...
	    Reg = SP::F1 + Offset/4;
	  else if (LocVT == MVT::f128 && Offset < 16*8)
	    // Promote long doubles to %q0-%q28. (Which LLVM calls Q0-Q7).
	    Reg = SP::Q0 + Offset/16;

	  // Promote to register when possible, otherwise use the stack slot.
	  if (Reg) {
	    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
	    return true;
	  }

	  // This argument goes on the stack in an 8-byte slot.
	  // When passing floats, LocVT is smaller than 8 bytes. Adjust the offset to
	  // the right-aligned float. The first 4 bytes of the stack slot are undefined.
	  if (LocVT == MVT::f32)
	    Offset += 4;

	  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
	  return true;
	}

	// Allocate a half-sized argument for the 64-bit ABI.
	//
	// This is used when passing { float, int } structs by value in registers.
	//
	// A 4-byte, 4-aligned stack slot is reserved for every value; its offset then
	// decides whether the value is assigned to a register instead:
	//   f32 with offset < 16*8 -> SP::F0 + offset/4
	//   i32 with offset < 6*8  -> SP::I0 + offset/8, with the location widened to
	//                             i64/AExt and marked custom when the offset is
	//                             a multiple of 8
	// Anything else is assigned to the stack slot. Always returns true.
	static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
	                            MVT &LocVT, CCValAssign::LocInfo &LocInfo,
	                            ISD::ArgFlagsTy &ArgFlags, CCState &State) {
	  assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
	  const unsigned SlotOffset = State.AllocateStack(4, 4);

	  if (LocVT == MVT::f32 && SlotOffset < 16*8) {
	    // Promote floats to %f0-%f31.
	    const unsigned FPReg = SP::F0 + SlotOffset/4;
	    State.addLoc(CCValAssign::getReg(ValNo, ValVT, FPReg, LocVT, LocInfo));
	  } else if (LocVT == MVT::i32 && SlotOffset < 6*8) {
	    // Promote integers to %i0-%i5, using half the register.
	    const unsigned IntReg = SP::I0 + SlotOffset/8;
	    LocVT = MVT::i64;
	    LocInfo = CCValAssign::AExt;

	    // The Custom bit marks an i32 that goes in the high bits of a register.
	    const bool InHighHalf = (SlotOffset % 8) == 0;
	    State.addLoc(
	        InHighHalf
	            ? CCValAssign::getCustomReg(ValNo, ValVT, IntReg, LocVT, LocInfo)
	            : CCValAssign::getReg(ValNo, ValVT, IntReg, LocVT, LocInfo));
	  } else {
	    // No register applies: keep the value in its stack slot.
	    State.addLoc(
	        CCValAssign::getMem(ValNo, ValVT, SlotOffset, LocVT, LocInfo));
	  }
	  return true;
	}

	#include "SparcGenCallingConv.inc"

	// The calling conventions in SparcCallingConv.td are described in terms of the
	// callee's register window. This function translates registers to the
	// corresponding caller window %o register.
	//
	// Registers outside the SP::I0..SP::I7 range are returned unchanged.
	static unsigned toCallerWindow(unsigned Reg) {
	  static_assert(SP::I0 + 7 == SP::I7 && SP::O0 + 7 == SP::O7,
	                "Unexpected enum");
	  // Not an %i register: nothing to translate.
	  if (Reg < SP::I0 || Reg > SP::I7)
	    return Reg;
	  // Same index within the group, but in the %o bank.
	  return Reg - SP::I0 + SP::O0;
	}

	/// Lower a return by forwarding to the 64-bit or 32-bit implementation,
	/// depending on what the subtarget reports.
	SDValue
	SparcTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                 bool IsVarArg,
	                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                 const SmallVectorImpl<SDValue> &OutVals,
	                                 const SDLoc &DL, SelectionDAG &DAG) const {
	  return Subtarget->is64Bit()
	             ? LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG)
	             : LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
	}

	// Lower a return for the 32-bit ABI.
	//
	// Return locations are computed with RetCC_Sparc32. Each value is copied into
	// its assigned register, with the copies linked through Flag, and the result
	// is an SPISD::RET_FLAG node whose operands are, in order: the chain, the
	// return address offset, the registers that were written, and the final Flag
	// value when one exists.
	SDValue
	SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
	bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();

	// CCValAssign - represent the assignment of the return value to locations.
	SmallVector<CCValAssign, 16> RVLocs;

	// CCState - Info about the registers and stack slot.
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());

	// Analyze return values.
	CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);

	SDValue Flag;
	SmallVector<SDValue, 4> RetOps(1, Chain);
	// Make room for the return address offset.
	RetOps.push_back(SDValue());

	// Copy the result values into the output registers.
	// i walks RVLocs while realRVLocIdx walks OutVals; a custom (v2i32) value
	// consumes two RVLocs entries but only one OutVals entry.
	for (unsigned i = 0, realRVLocIdx = 0;
	i != RVLocs.size();
	++i, ++realRVLocIdx) {
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");

	SDValue Arg = OutVals[realRVLocIdx];

	if (VA.needsCustom()) {
	assert(VA.getLocVT() == MVT::v2i32);
	// Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would
	// happen by default if this wasn't a legal type)

	SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
	Arg,
	DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
	SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
	Arg,
	DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout())));

	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	// NOTE(review): VA is a reference bound to the current RVLocs element, so
	// this assignment copies the next location over that element rather than
	// rebinding VA. The code below then reads the copied location through VA.
	VA = RVLocs[++i]; // skip ahead to next loc
	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1,
	Flag);
	} else
	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);

	// Guarantee that all emitted copies are stuck together with flags.
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	}

	unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
	// If the function returns a struct, copy the SRetReturnReg to I0
	if (MF.getFunction().hasStructRetAttr()) {
	SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
	unsigned Reg = SFI->getSRetReturnReg();
	if (!Reg)
	llvm_unreachable("sret virtual register not created in the entry block");
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, PtrVT);
	Chain = DAG.getCopyToReg(Chain, DL, SP::I0, Val, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(SP::I0, PtrVT));
	RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
	}

	// Fill in the two operands that were reserved at the front of RetOps.
	RetOps[0] = Chain; // Update chain.
	RetOps[1] = DAG.getConstant(RetAddrOffset, DL, MVT::i32);

	// Add the flag if we have it.
	if (Flag.getNode())
	RetOps.push_back(Flag);

	return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	// Lower return values for the 64-bit ABI.
	// Return values are passed exactly the same way as function arguments.
	//
	// Return locations are computed with RetCC_Sparc64. Each value is extended as
	// its location info requests, copied into its assigned register, and the
	// result is an SPISD::RET_FLAG node whose operands are: the chain, the
	// constant return address offset 8, the registers written, and the final Flag
	// value when one exists.
	SDValue
	SparcTargetLowering::LowerReturn_64(SDValue Chain, CallingConv::ID CallConv,
	bool IsVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	// CCValAssign - represent the assignment of the return value to locations.
	SmallVector<CCValAssign, 16> RVLocs;

	// CCState - Info about the registers and stack slot.
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());

	// Analyze return values.
	CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);

	SDValue Flag;
	SmallVector<SDValue, 4> RetOps(1, Chain);

	// The second operand on the return instruction is the return address offset.
	// The return address is always %i7+8 with the 64-bit ABI.
	RetOps.push_back(DAG.getConstant(8, DL, MVT::i32));

	// Copy the result values into the output registers.
	for (unsigned i = 0; i != RVLocs.size(); ++i) {
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");
	SDValue OutVal = OutVals[i];

	// Integer return values must be sign or zero extended by the callee.
	switch (VA.getLocInfo()) {
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
	break;
	case CCValAssign::ZExt:
	OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
	break;
	case CCValAssign::AExt:
	OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
	break;
	default:
	llvm_unreachable("Unknown loc info!");
	}

	// The custom bit on an i32 return value indicates that it should be passed
	// in the high bits of the register.
	if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
	// Move the value into bits 63:32 of the i64.
	OutVal = DAG.getNode(ISD::SHL, DL, MVT::i64, OutVal,
	DAG.getConstant(32, DL, MVT::i32));

	// The next value may go in the low bits of the same register.
	// Handle both at once.
	if (i+1 < RVLocs.size() && RVLocs[i+1].getLocReg() == VA.getLocReg()) {
	// Zero-extend the low half so the OR does not disturb the high bits.
	SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, OutVals[i+1]);
	OutVal = DAG.getNode(ISD::OR, DL, MVT::i64, OutVal, NV);
	// Skip the next value, it's already done.
	++i;
	}
	}

	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);

	// Guarantee that all emitted copies are stuck together with flags.
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	}

	RetOps[0] = Chain; // Update chain.

	// Add the flag if we have it.
	if (Flag.getNode())
	RetOps.push_back(Flag);

	return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	/// Lower incoming formal arguments by forwarding to the 64-bit or 32-bit
	/// implementation, depending on what the subtarget reports.
	SDValue SparcTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  return Subtarget->is64Bit()
	             ? LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
	                                       DL, DAG, InVals)
	             : LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins,
	                                       DL, DAG, InVals);
	}

	/// LowerFormalArguments_32 - V8 uses a very simple ABI, where all values are
	/// passed in either one or two GPRs, including FP values. TODO: we should
	/// pass FP values in FP registers for fastcc functions.
	///
	/// Argument locations are computed with CC_Sparc32. Each loop iteration
	/// pushes one value onto \p InVals: register arguments are read through
	/// live-in virtual registers, memory arguments are loaded from fixed frame
	/// objects at offset 92 plus the assigned location offset, and an sret
	/// argument is loaded from a fixed frame object at offset 64. A custom
	/// location (f64 or v2i32) is assembled from two 32-bit halves.
	///
	/// When the function has the struct-return attribute, InVals[0] is copied
	/// into the SRetReturnReg virtual register. For varargs functions the
	/// unallocated registers among %i0-%i5 are stored to fixed frame objects and
	/// the offset of the first one is recorded for va_start.
	///
	/// \returns the updated chain.
	SDValue SparcTargetLowering::LowerFormalArguments_32(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineRegisterInfo &RegInfo = MF.getRegInfo();
	  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);

	  // Added to every location offset to form the frame offset of a memory
	  // argument.
	  const unsigned StackOffset = 92;
	  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();

	  // i walks ArgLocs while InIdx walks Ins; a custom register location consumes
	  // two ArgLocs entries for a single Ins entry.
	  unsigned InIdx = 0;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
	    CCValAssign &VA = ArgLocs[i];

	    if (Ins[InIdx].Flags.isSRet()) {
	      if (InIdx != 0)
	        report_fatal_error("sparc only supports sret on the first parameter");
	      // Get SRet from [%fp+64].
	      int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, 64, true);
	      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
	      SDValue Arg =
	          DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
	      InVals.push_back(Arg);
	      continue;
	    }

	    if (VA.isRegLoc()) {
	      if (VA.needsCustom()) {
	        assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);

	        // The first half always arrives in a register.
	        unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
	        MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
	        SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);

	        assert(i+1 < e);
	        CCValAssign &NextVA = ArgLocs[++i];

	        // The second half is either in memory or in another register.
	        SDValue LoVal;
	        if (NextVA.isMemLoc()) {
	          int FrameIdx = MF.getFrameInfo().
	            CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
	          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
	          LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
	        } else {
	          unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
	                                        &SP::IntRegsRegClass);
	          LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
	        }

	        if (IsLittleEndian)
	          std::swap(LoVal, HiVal);

	        SDValue WholeValue =
	            DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
	        WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue);
	        InVals.push_back(WholeValue);
	        continue;
	      }
	      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
	      MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
	      SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
	      if (VA.getLocVT() == MVT::f32)
	        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
	      else if (VA.getLocVT() != MVT::i32) {
	        Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
	                          DAG.getValueType(VA.getLocVT()));
	        Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
	      }
	      InVals.push_back(Arg);
	      continue;
	    }

	    assert(VA.isMemLoc());

	    unsigned Offset = VA.getLocMemOffset()+StackOffset;
	    auto PtrVT = getPointerTy(DAG.getDataLayout());

	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::v2i32);
	      // If it is double-word aligned, just load.
	      if (Offset % 8 == 0) {
	        int FI = MF.getFrameInfo().CreateFixedObject(8,
	                                                     Offset,
	                                                     true);
	        SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
	        SDValue Load =
	            DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
	        InVals.push_back(Load);
	        continue;
	      }

	      // Otherwise load the two 32-bit halves separately and recombine them.
	      int FI = MF.getFrameInfo().CreateFixedObject(4,
	                                                   Offset,
	                                                   true);
	      SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
	      SDValue HiVal =
	          DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
	      int FI2 = MF.getFrameInfo().CreateFixedObject(4,
	                                                    Offset+4,
	                                                    true);
	      SDValue FIPtr2 = DAG.getFrameIndex(FI2, PtrVT);

	      SDValue LoVal =
	          DAG.getLoad(MVT::i32, dl, Chain, FIPtr2, MachinePointerInfo());

	      if (IsLittleEndian)
	        std::swap(LoVal, HiVal);

	      SDValue WholeValue =
	          DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
	      WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue);
	      InVals.push_back(WholeValue);
	      continue;
	    }

	    int FI = MF.getFrameInfo().CreateFixedObject(4,
	                                                 Offset,
	                                                 true);
	    SDValue FIPtr = DAG.getFrameIndex(FI, PtrVT);
	    SDValue Load;
	    if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
	      Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo());
	    } else if (VA.getValVT() == MVT::f128) {
	      report_fatal_error("SPARCv8 does not handle f128 in calls; "
	                         "pass indirectly");
	    } else {
	      // We shouldn't see any other value types here.
	      llvm_unreachable("Unexpected ValVT encountered in frame lowering.");
	    }
	    InVals.push_back(Load);
	  }

	  if (MF.getFunction().hasStructRetAttr()) {
	    // Copy the SRet Argument to SRetReturnReg.
	    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
	    unsigned Reg = SFI->getSRetReturnReg();
	    if (!Reg) {
	      Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
	      SFI->setSRetReturnReg(Reg);
	    }
	    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	  }

	  // Store remaining ArgRegs to the stack if this is a varargs function.
	  if (isVarArg) {
	    static const MCPhysReg ArgRegs[] = {
	      SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
	    };
	    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs);
	    // Iterate over the registers that no fixed argument claimed.
	    const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
	    unsigned ArgOffset = CCInfo.getNextStackOffset();
	    if (NumAllocated == 6)
	      ArgOffset += StackOffset;
	    else {
	      assert(!ArgOffset);
	      ArgOffset = 68+4*NumAllocated;
	    }

	    // Remember the vararg offset for the va_start implementation.
	    FuncInfo->setVarArgsFrameOffset(ArgOffset);

	    std::vector<SDValue> OutChains;

	    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
	      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
	      MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
	      SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);

	      int FrameIdx = MF.getFrameInfo().CreateFixedObject(4, ArgOffset,
	                                                         true);
	      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);

	      OutChains.push_back(
	          DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, MachinePointerInfo()));
	      ArgOffset += 4;
	    }

	    // Merge the register stores with the incoming chain.
	    if (!OutChains.empty()) {
	      OutChains.push_back(Chain);
	      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	    }
	  }

	  return Chain;
	}

	// Lower formal arguments for the 64 bit ABI.
	//
	// Argument locations are computed with CC_Sparc64 and one value is pushed
	// onto InVals per location. Register arguments are read through live-in
	// virtual registers; memory arguments are loaded from fixed frame objects at
	// the location offset plus 128. For varargs functions the frame offset of the
	// first variable argument is recorded and the remaining registers among
	// %i0-%i5 are stored to the frame. Returns the updated chain.
	SDValue SparcTargetLowering::LowerFormalArguments_64(
	SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();

	// Analyze arguments according to CC_Sparc64.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());
	CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);

	// The argument array begins at %fp+BIAS+128, after the register save area.
	const unsigned ArgArea = 128;

	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	if (VA.isRegLoc()) {
	// This argument is passed in a register.
	// All integer register arguments are promoted by the caller to i64.

	// Create a virtual register for the promoted live-in value.
	unsigned VReg = MF.addLiveIn(VA.getLocReg(),
	getRegClassFor(VA.getLocVT()));
	SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());

	// Get the high bits for i32 struct elements.
	if (VA.getValVT() == MVT::i32 && VA.needsCustom())
	Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
	DAG.getConstant(32, DL, MVT::i32));

	// The caller promoted the argument, so insert an Assert?ext SDNode so we
	// won't promote the value again in this function.
	switch (VA.getLocInfo()) {
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
	DAG.getValueType(VA.getValVT()));
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
	DAG.getValueType(VA.getValVT()));
	break;
	default:
	break;
	}

	// Truncate the register down to the argument type.
	if (VA.isExtInLoc())
	Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

	InVals.push_back(Arg);
	continue;
	}

	// The registers are exhausted. This argument was passed on the stack.
	assert(VA.isMemLoc());
	// The CC_Sparc64_Full/Half functions compute stack offsets relative to the
	// beginning of the arguments area at %fp+BIAS+128.
	unsigned Offset = VA.getLocMemOffset() + ArgArea;
	unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
	// Adjust offset for extended arguments, SPARC is big-endian.
	// The caller will have written the full slot with extended bytes, but we
	// prefer our own extending loads.
	if (VA.isExtInLoc())
	Offset += 8 - ValSize;
	// Load exactly ValSize bytes from the (possibly adjusted) offset.
	int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
	InVals.push_back(
	DAG.getLoad(VA.getValVT(), DL, Chain,
	DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
	MachinePointerInfo::getFixedStack(MF, FI)));
	}

	// Nothing more to do for functions without variable arguments.
	if (!IsVarArg)
	return Chain;

	// This function takes variable arguments, some of which may have been passed
	// in registers %i0-%i5. Variable floating point arguments are never passed
	// in floating point registers. They go on %i0-%i5 or on the stack like
	// integer arguments.
	//
	// The va_start intrinsic needs to know the offset to the first variable
	// argument.
	unsigned ArgOffset = CCInfo.getNextStackOffset();
	SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
	// Skip the 128 bytes of register save area.
	FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgArea +
	Subtarget->getStackPointerBias());

	// Save the variable arguments that were passed in registers.
	// The caller is required to reserve stack space for 6 arguments regardless
	// of how many arguments were actually passed.
	SmallVector<SDValue, 8> OutChains;
	// Each 8-byte slot below 6*8 corresponds to register SP::I0 + ArgOffset/8.
	for (; ArgOffset < 6*8; ArgOffset += 8) {
	unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
	SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
	int FI = MF.getFrameInfo().CreateFixedObject(8, ArgOffset + ArgArea, true);
	auto PtrVT = getPointerTy(MF.getDataLayout());
	OutChains.push_back(
	DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT),
	MachinePointerInfo::getFixedStack(MF, FI)));
	}

	// Each store above was built on the incoming Chain; join them into a single
	// chain value when any were emitted.
	if (!OutChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);

	return Chain;
	}

	/// Lower an outgoing call by forwarding to the 64-bit or 32-bit
	/// implementation, depending on what the subtarget reports.
	SDValue
	SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	                               SmallVectorImpl<SDValue> &InVals) const {
	  return Subtarget->is64Bit() ? LowerCall_64(CLI, InVals)
	                              : LowerCall_32(CLI, InVals);
	}

	/// Report whether a call carries the ReturnsTwice attribute.
	///
	/// When a call site is available it is asked directly. Otherwise the callee
	/// function is looked up, either from a global-address node or by symbol name
	/// in the module of the current function, and its function attribute is
	/// checked. Returns false when no callee function can be found.
	static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
	                                ImmutableCallSite CS) {
	  if (CS)
	    return CS.hasFnAttr(Attribute::ReturnsTwice);

	  const Function *Target = nullptr;
	  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    // Direct reference to a global; it may or may not be a function.
	    Target = dyn_cast<Function>(GA->getGlobal());
	  } else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	    // External symbol: resolve the name within the enclosing module.
	    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	    Target = Mod->getFunction(ES->getSymbol());
	  }

	  return Target && Target->hasFnAttribute(Attribute::ReturnsTwice);
	}

	// Lower a call for the 32-bit ABI.
	//
	// Steps, in order:
	//  1. Assign argument locations with CC_Sparc32 and round the outgoing
	//     argument area up to a multiple of 8 bytes.
	//  2. Copy every non-empty byval argument into a fresh stack object.
	//  3. Between CALLSEQ_START and CALLSEQ_END, store memory arguments relative
	//     to SP::O6 (offset 92 plus the location offset; the sret pointer goes to
	//     offset 64) and collect register arguments in RegsToPass.
	//  4. Emit the register copies glued together, then the SPISD::CALL node
	//     with a register mask, plus the sret size operand when applicable.
	//  5. Copy the results, located with RetCC_Sparc32, out of their registers
	//     into InVals.
	//
	// CLI.IsTailCall is always cleared. Returns the chain after the last result
	// copy.
	SDValue
	SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
	                                  SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDLoc &dl = CLI.DL;
	  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	  SDValue Chain = CLI.Chain;
	  SDValue Callee = CLI.Callee;
	  bool &isTailCall = CLI.IsTailCall;
	  CallingConv::ID CallConv = CLI.CallConv;
	  bool isVarArg = CLI.IsVarArg;

	  // Sparc target does not yet support tail call optimization.
	  isTailCall = false;

	  // Analyze operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);

	  // Get the size of the outgoing arguments stack space requirement.
	  unsigned ArgsSize = CCInfo.getNextStackOffset();

	  // Keep stack frames 8-byte aligned.
	  ArgsSize = (ArgsSize+7) & ~7;

	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();

	  // Create local copies for byval args.
	  // ByValArgs gets exactly one entry per byval argument, in order; a
	  // zero-sized byval argument is represented by a null SDValue.
	  SmallVector<SDValue, 8> ByValArgs;
	  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
	    ISD::ArgFlagsTy Flags = Outs[i].Flags;
	    if (!Flags.isByVal())
	      continue;

	    SDValue Arg = OutVals[i];
	    unsigned Size = Flags.getByValSize();
	    unsigned Align = Flags.getByValAlign();

	    if (Size > 0U) {
	      int FI = MFI.CreateStackObject(Size, Align, false);
	      SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	      SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);

	      Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
	                            false,        // isVolatile,
	                            (Size <= 32), // AlwaysInline if size <= 32,
	                            false,        // isTailCall
	                            MachinePointerInfo(), MachinePointerInfo());
	      ByValArgs.push_back(FIPtr);
	    }
	    else {
	      SDValue nullVal;
	      ByValArgs.push_back(nullVal);
	    }
	  }

	  Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);

	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	  SmallVector<SDValue, 8> MemOpChains;

	  // Added to every location offset to form the %sp-relative store offset.
	  const unsigned StackOffset = 92;
	  bool hasStructRetAttr = false;
	  unsigned SRetArgSize = 0;
	  // Walk the register/memloc assignments, inserting copies/loads.
	  // i walks ArgLocs, realArgIdx walks Outs/OutVals, and byvalArgIdx walks
	  // ByValArgs; a custom register location consumes two ArgLocs entries.
	  for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
	       i != e;
	       ++i, ++realArgIdx) {
	    CCValAssign &VA = ArgLocs[i];
	    SDValue Arg = OutVals[realArgIdx];

	    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

	    // Use local copy if it is a byval arg.
	    if (Flags.isByVal()) {
	      Arg = ByValArgs[byvalArgIdx++];
	      // A null entry marks a zero-sized byval argument: nothing to pass.
	      if (!Arg) {
	        continue;
	      }
	    }

	    // Promote the value if needed.
	    switch (VA.getLocInfo()) {
	    default: llvm_unreachable("Unknown loc info!");
	    case CCValAssign::Full: break;
	    case CCValAssign::SExt:
	      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::ZExt:
	      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::AExt:
	      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::BCvt:
	      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
	      break;
	    }

	    if (Flags.isSRet()) {
	      assert(VA.needsCustom());
	      // store SRet argument in %sp+64
	      SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
	      SDValue PtrOff = DAG.getIntPtrConstant(64, dl);
	      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	      MemOpChains.push_back(
	          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
	      hasStructRetAttr = true;
	      // sret only allowed on first argument
	      assert(Outs[realArgIdx].OrigArgIndex == 0);
	      // Remember the allocation size of the pointee; it becomes an operand of
	      // the call node below.
	      PointerType *Ty = cast<PointerType>(CLI.getArgs()[0].Ty);
	      Type *ElementTy = Ty->getElementType();
	      SRetArgSize = DAG.getDataLayout().getTypeAllocSize(ElementTy);
	      continue;
	    }

	    if (VA.needsCustom()) {
	      assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32);

	      if (VA.isMemLoc()) {
	        unsigned Offset = VA.getLocMemOffset() + StackOffset;
	        // if it is double-word aligned, just store.
	        if (Offset % 8 == 0) {
	          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
	          SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
	          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	          MemOpChains.push_back(
	              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
	          continue;
	        }
	      }

	      if (VA.getLocVT() == MVT::f64) {
	        // Move from the float value from float registers into the
	        // integer registers.
	        if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg))
	          Arg = bitcastConstantFPToInt(C, dl, DAG);
	        else
	          Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg);
	      }

	      // Split the value into its two 32-bit elements.
	      SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  Arg,
	                                  DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout())));
	      SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  Arg,
	                                  DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout())));

	      if (VA.isRegLoc()) {
	        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0));
	        assert(i+1 != e);
	        CCValAssign &NextVA = ArgLocs[++i];
	        if (NextVA.isRegLoc()) {
	          RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1));
	        } else {
	          // Store the second part in stack.
	          unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
	          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
	          SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
	          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	          MemOpChains.push_back(
	              DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
	        }
	      } else {
	        unsigned Offset = VA.getLocMemOffset() + StackOffset;
	        // Store the first part.
	        SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
	        SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
	        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	        MemOpChains.push_back(
	            DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo()));
	        // Store the second part.
	        PtrOff = DAG.getIntPtrConstant(Offset + 4, dl);
	        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	        MemOpChains.push_back(
	            DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo()));
	      }
	      continue;
	    }

	    // Arguments that can be passed on register must be kept at
	    // RegsToPass vector
	    if (VA.isRegLoc()) {
	      if (VA.getLocVT() != MVT::f32) {
	        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	        continue;
	      }
	      // f32 values are passed as their i32 bit pattern.
	      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	      continue;
	    }

	    assert(VA.isMemLoc());

	    // Create a store off the stack pointer for this argument.
	    SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
	    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + StackOffset,
	                                           dl);
	    PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
	    MemOpChains.push_back(
	        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
	  }


	  // Emit all stores, make sure they occur before any copies into physregs.
	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	  // Build a sequence of copy-to-reg nodes chained together with token
	  // chain and flag operands which copy the outgoing args into registers.
	  // The InFlag is necessary since all emitted instructions must be
	  // stuck together.
	  SDValue InFlag;
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	    unsigned Reg = toCallerWindow(RegsToPass[i].first);
	    Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // Queried before Callee is rewritten into a target node below.
	  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);

	  // If the callee is a GlobalAddress node (quite common, every direct call is)
	  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
	  // Likewise ExternalSymbol -> TargetExternalSymbol.
	  unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
	    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32, 0, TF);
	  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
	    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32, TF);

	  // Returns a chain & a flag for retval copy to use
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);
	  if (hasStructRetAttr)
	    Ops.push_back(DAG.getTargetConstant(SRetArgSize, dl, MVT::i32));
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	    Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
	                                  RegsToPass[i].second.getValueType()));

	  // Add a register mask operand representing the call-preserved registers.
	  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *Mask =
	      ((hasReturnsTwice)
	           ? TRI->getRTCallPreservedMask(CallConv)
	           : TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv));
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  if (InFlag.getNode())
	    Ops.push_back(InFlag);

	  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
	  InFlag = Chain.getValue(1);

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	  InFlag = Chain.getValue(1);

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());

	  RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned i = 0; i != RVLocs.size(); ++i) {
	    if (RVLocs[i].getLocVT() == MVT::v2i32) {
	      // A v2i32 result occupies two consecutive locations; rebuild the vector
	      // from the two i32 register copies.
	      SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2i32);
	      SDValue Lo = DAG.getCopyFromReg(
	          Chain, dl, toCallerWindow(RVLocs[i++].getLocReg()), MVT::i32, InFlag);
	      Chain = Lo.getValue(1);
	      InFlag = Lo.getValue(2);
	      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Lo,
	                        DAG.getConstant(0, dl, MVT::i32));
	      SDValue Hi = DAG.getCopyFromReg(
	          Chain, dl, toCallerWindow(RVLocs[i].getLocReg()), MVT::i32, InFlag);
	      Chain = Hi.getValue(1);
	      InFlag = Hi.getValue(2);
	      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2i32, Vec, Hi,
	                        DAG.getConstant(1, dl, MVT::i32));
	      InVals.push_back(Vec);
	    } else {
	      // Chain is result 1 of the copy node; results 0 and 2 of the same node
	      // are the copied value and the flag.
	      Chain =
	          DAG.getCopyFromReg(Chain, dl, toCallerWindow(RVLocs[i].getLocReg()),
	                             RVLocs[i].getValVT(), InFlag)
	              .getValue(1);
	      InFlag = Chain.getValue(2);
	      InVals.push_back(Chain.getValue(0));
	    }
	  }

	  return Chain;
	}

	// Translate the textual name of a named-register global variable into the
	// matching SP:: register enumerator. Only the integer registers i0-i7,
	// o0-o7, l0-l7 and g0-g7 are recognised; any other name ends in
	// report_fatal_error(). VT and DAG are not consulted.
	//
	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	unsigned SparcTargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                                SelectionDAG &DAG) const {
	  const unsigned Match =
	      StringSwitch<unsigned>(RegName)
	          // Global registers.
	          .Case("g0", SP::G0)
	          .Case("g1", SP::G1)
	          .Case("g2", SP::G2)
	          .Case("g3", SP::G3)
	          .Case("g4", SP::G4)
	          .Case("g5", SP::G5)
	          .Case("g6", SP::G6)
	          .Case("g7", SP::G7)
	          // Out registers.
	          .Case("o0", SP::O0)
	          .Case("o1", SP::O1)
	          .Case("o2", SP::O2)
	          .Case("o3", SP::O3)
	          .Case("o4", SP::O4)
	          .Case("o5", SP::O5)
	          .Case("o6", SP::O6)
	          .Case("o7", SP::O7)
	          // Local registers.
	          .Case("l0", SP::L0)
	          .Case("l1", SP::L1)
	          .Case("l2", SP::L2)
	          .Case("l3", SP::L3)
	          .Case("l4", SP::L4)
	          .Case("l5", SP::L5)
	          .Case("l6", SP::L6)
	          .Case("l7", SP::L7)
	          // In registers.
	          .Case("i0", SP::I0)
	          .Case("i1", SP::I1)
	          .Case("i2", SP::I2)
	          .Case("i3", SP::I3)
	          .Case("i4", SP::I4)
	          .Case("i5", SP::I5)
	          .Case("i6", SP::I6)
	          .Case("i7", SP::I7)
	          .Default(0);

	  // Zero is the Default() sentinel: nothing in the table matched.
	  if (!Match)
	    report_fatal_error("Invalid register name global variable");

	  return Match;
	}

	// Fixup floating point arguments in the ... part of a varargs call.
	//
	// The SPARC v9 ABI requires that floating point arguments are treated the same
	// as integers when calling a varargs function. This does not apply to the
	// fixed arguments that are part of the function's prototype.
	//
	// This function post-processes a CCValAssign array created by
	// AnalyzeCallOperands(). Every non-fixed f64/f128 argument that was assigned
	// to a register is rewritten in place: it is moved to an integer register
	// (with a bitconvert) while its offset falls inside the first six 8-byte
	// slots, and to a memory location at that offset otherwise.
	//
	// ArgLocs - argument locations to patch in place.
	// Outs    - outgoing argument descriptors; Outs[n].IsFixed marks arguments
	//           that belong to the prototype and must be left alone.
	static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
	                                   ArrayRef<ISD::OutputArg> Outs) {
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    const CCValAssign &VA = ArgLocs[i];
	    MVT ValTy = VA.getLocVT();
	    // FIXME: What about f32 arguments? C promotes them to f64 when calling
	    // varargs functions.
	    if (!VA.isRegLoc() || (ValTy != MVT::f64 && ValTy != MVT::f128))
	      continue;
	    // The fixed arguments to a varargs function still go in FP registers.
	    if (Outs[VA.getValNo()].IsFixed)
	      continue;

	    // This floating point argument should be reassigned.
	    CCValAssign NewVA;

	    // Determine the offset into the argument array. The register number is
	    // taken relative to the first register of its class (D0 for f64, Q0 for
	    // f128) and scaled by the size of one argument of that type.
	    unsigned firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
	    unsigned argSize  = (ValTy == MVT::f64) ? 8 : 16;
	    unsigned Offset = argSize * (VA.getLocReg() - firstReg);
	    assert(Offset < 16*8 && "Offset out of range, bad register enum?");

	    if (Offset < 6*8) {
	      // This argument should go in %i0-%i5.
	      unsigned IReg = SP::I0 + Offset/8;
	      if (ValTy == MVT::f64)
	        // Full register, just bitconvert into i64.
	        NewVA = CCValAssign::getReg(VA.getValNo(), VA.getValVT(),
	                                    IReg, MVT::i64, CCValAssign::BCvt);
	      else {
	        assert(ValTy == MVT::f128 && "Unexpected type!");
	        // Full register, just bitconvert into i128 -- We will lower this into
	        // two i64s in LowerCall_64.
	        NewVA = CCValAssign::getCustomReg(VA.getValNo(), VA.getValVT(),
	                                          IReg, MVT::i128, CCValAssign::BCvt);
	      }
	    } else {
	      // This needs to go to memory, we're out of integer registers.
	      NewVA = CCValAssign::getMem(VA.getValNo(), VA.getValVT(),
	                                  Offset, VA.getLocVT(), VA.getLocInfo());
	    }
	    // VA aliases ArgLocs[i]; it is not read again after this assignment.
	    ArgLocs[i] = NewVA;
	  }
	}

	// Lower a call for the 64-bit ABI.
	//
	// Nodes are emitted in this order: CALLSEQ_START, the stores and CopyToReg
	// nodes that place the outgoing arguments, the SPISD::CALL node itself,
	// CALLSEQ_END, and finally the CopyFromReg nodes that fetch the results.
	//
	// CLI    - description of the call. As side effects, CLI.IsTailCall is
	//          cleared and the inreg flag may be set on CLI.Ins[0].
	// InVals - receives one SDValue per value returned by the call.
	// Returns the chain following the last node emitted here.
	SDValue
	SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
	                                  SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDLoc DL = CLI.DL;
	  SDValue Chain = CLI.Chain;
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  // Sparc target does not yet support tail call optimization.
	  CLI.IsTailCall = false;

	  // Analyze operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);

	  // Get the size of the outgoing arguments stack space requirement.
	  // The stack offset computed by CC_Sparc64 includes all arguments.
	  // Called functions expect 6 argument words to exist in the stack frame, used
	  // or not.
	  unsigned ArgsSize = std::max(6*8u, CCInfo.getNextStackOffset());

	  // Keep stack frames 16-byte aligned.
	  ArgsSize = alignTo(ArgsSize, 16);

	  // Varargs calls require special treatment.
	  if (CLI.IsVarArg)
	    fixupVariableFloatArgs(ArgLocs, CLI.Outs);

	  // Adjust the stack pointer to make room for the arguments.
	  // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
	  // with more than 6 arguments.
	  Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);

	  // Collect the set of registers to pass to the function and their values.
	  // This will be emitted as a sequence of CopyToReg nodes glued to the call
	  // instruction.
	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	  // Collect chains from all the memory operations that copy arguments to the
	  // stack. They must follow the stack pointer adjustment above and precede the
	  // call instruction itself.
	  SmallVector<SDValue, 8> MemOpChains;

	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    const CCValAssign &VA = ArgLocs[i];
	    SDValue Arg = CLI.OutVals[i];

	    // Promote the value if needed.
	    switch (VA.getLocInfo()) {
	    default:
	      llvm_unreachable("Unknown location info!");
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::SExt:
	      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::ZExt:
	      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::AExt:
	      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::BCvt:
	      // fixupVariableFloatArgs() may create bitcasts from f128 to i128. But
	      // SPARC does not support i128 natively. Lower it into two i64, see below.
	      if (!VA.needsCustom() || VA.getValVT() != MVT::f128
	          || VA.getLocVT() != MVT::i128)
	        Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	      break;
	    }

	    if (VA.isRegLoc()) {
	      if (VA.needsCustom() && VA.getValVT() == MVT::f128
	          && VA.getLocVT() == MVT::i128) {
	        // Store and reload into the integer register reg and reg+1.
	        unsigned Offset = 8 * (VA.getLocReg() - SP::I0);
	        unsigned StackOffset = Offset + Subtarget->getStackPointerBias() + 128;
	        SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
	        SDValue HiPtrOff = DAG.getIntPtrConstant(StackOffset, DL);
	        HiPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, HiPtrOff);
	        SDValue LoPtrOff = DAG.getIntPtrConstant(StackOffset + 8, DL);
	        LoPtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, LoPtrOff);

	        // Store to %sp+BIAS+128+Offset
	        SDValue Store =
	            DAG.getStore(Chain, DL, Arg, HiPtrOff, MachinePointerInfo());
	        // Load into Reg and Reg+1
	        SDValue Hi64 =
	            DAG.getLoad(MVT::i64, DL, Store, HiPtrOff, MachinePointerInfo());
	        SDValue Lo64 =
	            DAG.getLoad(MVT::i64, DL, Store, LoPtrOff, MachinePointerInfo());
	        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()),
	                                            Hi64));
	        RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()+1),
	                                            Lo64));
	        continue;
	      }

	      // The custom bit on an i32 value indicates that it should be
	      // passed in the high bits of the register.
	      if (VA.getValVT() == MVT::i32 && VA.needsCustom()) {
	        Arg = DAG.getNode(ISD::SHL, DL, MVT::i64, Arg,
	                          DAG.getConstant(32, DL, MVT::i32));

	        // The next value may go in the low bits of the same register.
	        // Handle both at once.
	        if (i+1 < ArgLocs.size() && ArgLocs[i+1].isRegLoc() &&
	            ArgLocs[i+1].getLocReg() == VA.getLocReg()) {
	          SDValue NV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64,
	                                   CLI.OutVals[i+1]);
	          Arg = DAG.getNode(ISD::OR, DL, MVT::i64, Arg, NV);
	          // Skip the next value, it's already done.
	          ++i;
	        }
	      }
	      RegsToPass.push_back(std::make_pair(toCallerWindow(VA.getLocReg()), Arg));
	      continue;
	    }

	    assert(VA.isMemLoc());

	    // Create a store off the stack pointer for this argument.
	    SDValue StackPtr = DAG.getRegister(SP::O6, PtrVT);
	    // The argument area starts at %fp+BIAS+128 in the callee frame,
	    // %sp+BIAS+128 in ours.
	    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() +
	                                           Subtarget->getStackPointerBias() +
	                                           128, DL);
	    PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
	    MemOpChains.push_back(
	        DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
	  }

	  // Emit all stores, make sure they occur before the call.
	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	  // Build a sequence of CopyToReg nodes glued together with token chain and
	  // glue operands which copy the outgoing args into registers. The InGlue is
	  // necessary since all emitted instructions must be stuck together in order
	  // to pass the live physical registers.
	  SDValue InGlue;
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	    Chain = DAG.getCopyToReg(Chain, DL,
	                             RegsToPass[i].first, RegsToPass[i].second, InGlue);
	    InGlue = Chain.getValue(1);
	  }

	  // If the callee is a GlobalAddress node (quite common, every direct call is)
	  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
	  // Likewise ExternalSymbol -> TargetExternalSymbol.
	  SDValue Callee = CLI.Callee;
	  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
	  unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
	    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
	  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
	    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, TF);

	  // Build the operands for the call instruction itself.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);
	  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	                                  RegsToPass[i].second.getValueType()));

	  // Add a register mask operand representing the call-preserved registers.
	  const SparcRegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *Mask =
	      ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
	                         : TRI->getCallPreservedMask(DAG.getMachineFunction(),
	                                                     CLI.CallConv));
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  // Make sure the CopyToReg nodes are glued to the call instruction which
	  // consumes the registers.
	  if (InGlue.getNode())
	    Ops.push_back(InGlue);

	  // Now the call itself.
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
	  InGlue = Chain.getValue(1);

	  // Revert the stack pointer immediately after the call.
	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
	                             DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
	  InGlue = Chain.getValue(1);

	  // Now extract the return values. This is more or less the same as
	  // LowerFormalArguments_64.

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());

	  // Set inreg flag manually for codegen generated library calls that
	  // return float.
	  if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CS)
	    CLI.Ins[0].Flags.setInReg();

	  RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned i = 0; i != RVLocs.size(); ++i) {
	    CCValAssign &VA = RVLocs[i];
	    unsigned Reg = toCallerWindow(VA.getLocReg());

	    // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
	    // reside in the same register in the high and low bits. Reuse the
	    // CopyFromReg previous node to avoid duplicate copies.
	    SDValue RV;
	    if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
	      if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
	        RV = Chain.getValue(0);

	    // But usually we'll create a new CopyFromReg for a different register.
	    if (!RV.getNode()) {
	      RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
	      Chain = RV.getValue(1);
	      InGlue = Chain.getValue(2);
	    }

	    // Get the high bits for i32 struct elements.
	    if (VA.getValVT() == MVT::i32 && VA.needsCustom())
	      RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
	                       DAG.getConstant(32, DL, MVT::i32));

	    // The callee promoted the return value, so insert an Assert?ext SDNode so
	    // we won't promote the value again in this function.
	    switch (VA.getLocInfo()) {
	    case CCValAssign::SExt:
	      RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
	                       DAG.getValueType(VA.getValVT()));
	      break;
	    case CCValAssign::ZExt:
	      RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
	                       DAG.getValueType(VA.getValVT()));
	      break;
	    default:
	      break;
	    }

	    // Truncate the register down to the return value type.
	    if (VA.isExtInLoc())
	      RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);

	    InVals.push_back(RV);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// TargetLowering Implementation
	//===----------------------------------------------------------------------===//

	// Pick the IR-level expansion for an atomicrmw instruction. Only an Xchg
	// whose type is exactly 32 bits wide is reported as needing no expansion;
	// every other operation or width asks for the CmpXChg expansion.
	TargetLowering::AtomicExpansionKind
	SparcTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const bool IsExchange = AI->getOperation() == AtomicRMWInst::Xchg;
	  if (!IsExchange || AI->getType()->getPrimitiveSizeInBits() != 32)
	    return AtomicExpansionKind::CmpXChg;

	  return AtomicExpansionKind::None; // Uses xchg instruction
	}

	/// IntCondCCodeToICC - Map a DAG integer condition code onto the matching
	/// SPARC ICC condition. Only the ten integer comparisons listed below are
	/// accepted; anything else is treated as unreachable.
	static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:
	    return SPCC::ICC_E;
	  case ISD::SETNE:
	    return SPCC::ICC_NE;

	  // Signed ordering.
	  case ISD::SETLT:
	    return SPCC::ICC_L;
	  case ISD::SETLE:
	    return SPCC::ICC_LE;
	  case ISD::SETGT:
	    return SPCC::ICC_G;
	  case ISD::SETGE:
	    return SPCC::ICC_GE;

	  // Unsigned ordering.
	  case ISD::SETULT:
	    return SPCC::ICC_CS;
	  case ISD::SETULE:
	    return SPCC::ICC_LEU;
	  case ISD::SETUGT:
	    return SPCC::ICC_GU;
	  case ISD::SETUGE:
	    return SPCC::ICC_CC;

	  default:
	    llvm_unreachable("Unknown integer condition code!");
	  }
	}

	/// FPCondCCodeToFCC - Map a DAG floating point condition code onto the
	/// matching SPARC FCC condition. Each SETxx code shares its result with the
	/// SETOxx code of the same relation, except SETNE which is paired with
	/// SETUNE. Codes not listed below are treated as unreachable.
	static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
	  switch (CC) {
	  // Codes that come in pairs sharing one FCC condition.
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    return SPCC::FCC_E;
	  case ISD::SETUNE:
	  case ISD::SETNE:
	    return SPCC::FCC_NE;
	  case ISD::SETOLT:
	  case ISD::SETLT:
	    return SPCC::FCC_L;
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    return SPCC::FCC_G;
	  case ISD::SETOLE:
	  case ISD::SETLE:
	    return SPCC::FCC_LE;
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    return SPCC::FCC_GE;

	  // Codes with a dedicated FCC condition.
	  case ISD::SETULT:
	    return SPCC::FCC_UL;
	  case ISD::SETULE:
	    return SPCC::FCC_ULE;
	  case ISD::SETUGT:
	    return SPCC::FCC_UG;
	  case ISD::SETUGE:
	    return SPCC::FCC_UGE;
	  case ISD::SETUEQ:
	    return SPCC::FCC_UE;
	  case ISD::SETONE:
	    return SPCC::FCC_LG;
	  case ISD::SETUO:
	    return SPCC::FCC_U;
	  case ISD::SETO:
	    return SPCC::FCC_O;

	  default:
	    llvm_unreachable("Unknown fp condition code!");
	  }
	}

	// Construct the SPARC lowering object for the given target machine and
	// subtarget.
	//
	// The body is one long configuration sequence: it registers the register
	// classes that are available on the subtarget, records for each
	// (operation, value type) pair whether it is Legal, Custom, Promote or
	// Expand, overrides runtime library call names (soft mul/div and f128
	// helpers), and finally calls computeRegisterProperties().
	//
	// Statement order matters in places: a blanket default is sometimes set
	// first and then selectively overridden (see the v2i32 handling below), so
	// statements must not be reordered casually.
	SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
	const SparcSubtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	// Integer type with as many bits as TM.getPointerSize(0) bytes; used below
	// for the address-like nodes (GlobalAddress, ConstantPool, ...).
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));

	// Instructions which use registers as conditionals examine all the
	// bits (as does the pseudo SELECT_CC expansion). I don't think it
	// matters much whether it's ZeroOrOneBooleanContent, or
	// ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
	// former.
	setBooleanContents(ZeroOrOneBooleanContent);
	setBooleanVectorContents(ZeroOrOneBooleanContent);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
	// The FP register classes are only registered when soft-float is off.
	if (!Subtarget->useSoftFloat()) {
	addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
	addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
	addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
	}
	if (Subtarget->is64Bit()) {
	addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
	} else {
	// On 32bit sparc, we define a double-register 32bit register
	// class, as well. This is modeled in LLVM as a 2-vector of i32.
	addRegisterClass(MVT::v2i32, &SP::IntPairRegClass);

	// ...but almost all operations must be expanded, so set that as
	// the default.
	for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
	setOperationAction(Op, MVT::v2i32, Expand);
	}
	// Truncating/extending stores/loads are also not supported.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand);

	setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand);

	setTruncStoreAction(VT, MVT::v2i32, Expand);
	setTruncStoreAction(MVT::v2i32, VT, Expand);
	}
	// However, load and store are legal.
	// (These calls come after the blanket Expand loop above on purpose.)
	setOperationAction(ISD::LOAD, MVT::v2i32, Legal);
	setOperationAction(ISD::STORE, MVT::v2i32, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal);

	// And we need to promote i64 loads/stores into vector load/store
	setOperationAction(ISD::LOAD, MVT::i64, Custom);
	setOperationAction(ISD::STORE, MVT::i64, Custom);

	// Sadly, this doesn't work:
	// AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
	// AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
	}

	// Turn FP extload into load/fpextend
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
	}

	// Sparc doesn't have i1 sign extending load
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// Turn FP truncstore into trunc + store.
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);

	// Custom legalize GlobalAddress nodes into LO/HI parts.
	setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
	setOperationAction(ISD::ConstantPool, PtrVT, Custom);
	setOperationAction(ISD::BlockAddress, PtrVT, Custom);

	// Sparc doesn't have sext_inreg, replace them with shl/sra
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);

	// Sparc has no REM or DIVREM operations.
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);

	// ... nor does SparcV9.
	if (Subtarget->is64Bit()) {
	setOperationAction(ISD::UREM, MVT::i64, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	}

	// Custom expand fp<->sint
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

	// Custom Expand fp<->uint
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

	// Bitcasts producing f32 or i32 are expanded.
	setOperationAction(ISD::BITCAST, MVT::f32, Expand);
	setOperationAction(ISD::BITCAST, MVT::i32, Expand);

	// Sparc has no select or setcc: expand to SELECT_CC.
	setOperationAction(ISD::SELECT, MVT::i32, Expand);
	setOperationAction(ISD::SELECT, MVT::f32, Expand);
	setOperationAction(ISD::SELECT, MVT::f64, Expand);
	setOperationAction(ISD::SELECT, MVT::f128, Expand);

	setOperationAction(ISD::SETCC, MVT::i32, Expand);
	setOperationAction(ISD::SETCC, MVT::f32, Expand);
	setOperationAction(ISD::SETCC, MVT::f64, Expand);
	setOperationAction(ISD::SETCC, MVT::f128, Expand);

	// Sparc doesn't have BRCOND either, it has BR_CC.
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);
	setOperationAction(ISD::BRIND, MVT::Other, Expand);
	setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f128, Custom);

	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);

	// Add/subtract with carry-out (ADDC/SUBC) and carry-in (ADDE/SUBE) are
	// custom lowered.
	setOperationAction(ISD::ADDC, MVT::i32, Custom);
	setOperationAction(ISD::ADDE, MVT::i32, Custom);
	setOperationAction(ISD::SUBC, MVT::i32, Custom);
	setOperationAction(ISD::SUBE, MVT::i32, Custom);

	// i64 counterparts of the i32 settings above, plus the i64 bit-manipulation
	// operations; only applied in 64-bit mode.
	if (Subtarget->is64Bit()) {
	setOperationAction(ISD::ADDC, MVT::i64, Custom);
	setOperationAction(ISD::ADDE, MVT::i64, Custom);
	setOperationAction(ISD::SUBC, MVT::i64, Custom);
	setOperationAction(ISD::SUBE, MVT::i64, Custom);
	setOperationAction(ISD::BITCAST, MVT::f64, Expand);
	setOperationAction(ISD::BITCAST, MVT::i64, Expand);
	setOperationAction(ISD::SELECT, MVT::i64, Expand);
	setOperationAction(ISD::SETCC, MVT::i64, Expand);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);

	setOperationAction(ISD::CTPOP, MVT::i64,
	Subtarget->usePopc() ? Legal : Expand);
	setOperationAction(ISD::CTTZ , MVT::i64, Expand);
	setOperationAction(ISD::CTLZ , MVT::i64, Expand);
	setOperationAction(ISD::BSWAP, MVT::i64, Expand);
	setOperationAction(ISD::ROTL , MVT::i64, Expand);
	setOperationAction(ISD::ROTR , MVT::i64, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
	}

	// ATOMICs.
	// Atomics are supported on SparcV9. 32-bit atomics are also
	// supported by some Leon SparcV8 variants. Otherwise, atomics
	// are unsupported.
	if (Subtarget->isV9())
	setMaxAtomicSizeInBitsSupported(64);
	else if (Subtarget->hasLeonCasa())
	setMaxAtomicSizeInBitsSupported(32);
	else
	setMaxAtomicSizeInBitsSupported(0);

	setMinCmpXchgSizeInBits(32);

	setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Legal);

	setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Legal);

	// Custom Lower Atomic LOAD/STORE
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);

	if (Subtarget->is64Bit()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Legal);
	setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Legal);
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
	setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Custom);
	}

	if (!Subtarget->is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	}

	if (!Subtarget->isV9()) {
	// SparcV8 does not have FNEGD and FABSD.
	setOperationAction(ISD::FNEG, MVT::f64, Custom);
	setOperationAction(ISD::FABS, MVT::f64, Custom);
	}

	// Operations expanded unconditionally for the listed FP and i32 types.
	setOperationAction(ISD::FSIN , MVT::f128, Expand);
	setOperationAction(ISD::FCOS , MVT::f128, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
	setOperationAction(ISD::FREM , MVT::f128, Expand);
	setOperationAction(ISD::FMA , MVT::f128, Expand);
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FREM , MVT::f64, Expand);
	setOperationAction(ISD::FMA , MVT::f64, Expand);
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	setOperationAction(ISD::FREM , MVT::f32, Expand);
	setOperationAction(ISD::FMA , MVT::f32, Expand);
	setOperationAction(ISD::CTTZ , MVT::i32, Expand);
	setOperationAction(ISD::CTLZ , MVT::i32, Expand);
	setOperationAction(ISD::ROTL , MVT::i32, Expand);
	setOperationAction(ISD::ROTR , MVT::i32, Expand);
	setOperationAction(ISD::BSWAP, MVT::i32, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
	setOperationAction(ISD::FPOW , MVT::f128, Expand);
	setOperationAction(ISD::FPOW , MVT::f64, Expand);
	setOperationAction(ISD::FPOW , MVT::f32, Expand);

	setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
	setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
	setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);

	// Expands to [SU]MUL_LOHI.
	setOperationAction(ISD::MULHU, MVT::i32, Expand);
	setOperationAction(ISD::MULHS, MVT::i32, Expand);
	setOperationAction(ISD::MUL, MVT::i32, Expand);

	// With software multiply/divide, the i32 operations are expanded and the
	// libcall names are pointed at the .umul/.div/.udiv/.rem/.urem routines.
	if (Subtarget->useSoftMulDiv()) {
	// .umul works for both signed and unsigned
	setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
	setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
	setLibcallName(RTLIB::MUL_I32, ".umul");

	setOperationAction(ISD::SDIV, MVT::i32, Expand);
	setLibcallName(RTLIB::SDIV_I32, ".div");

	setOperationAction(ISD::UDIV, MVT::i32, Expand);
	setLibcallName(RTLIB::UDIV_I32, ".udiv");

	setLibcallName(RTLIB::SREM_I32, ".rem");
	setLibcallName(RTLIB::UREM_I32, ".urem");
	}

	if (Subtarget->is64Bit()) {
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::MULHU, MVT::i64, Expand);
	setOperationAction(ISD::MULHS, MVT::i64, Expand);

	setOperationAction(ISD::UMULO, MVT::i64, Custom);
	setOperationAction(ISD::SMULO, MVT::i64, Custom);

	setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
	}

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	// VAARG needs to be lowered to not do unaligned accesses for doubles.
	setOperationAction(ISD::VAARG , MVT::Other, Custom);

	setOperationAction(ISD::TRAP , MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP , MVT::Other, Legal);

	// Use the default implementation.
	setOperationAction(ISD::VACOPY , MVT::Other, Expand);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);

	setStackPointerRegisterToSaveRestore(SP::O6);

	setOperationAction(ISD::CTPOP, MVT::i32,
	Subtarget->usePopc() ? Legal : Expand);

	// f128 load/store is only Legal when the subtarget is V9 and has hardware
	// quad support; otherwise it is custom lowered.
	if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
	setOperationAction(ISD::LOAD, MVT::f128, Legal);
	setOperationAction(ISD::STORE, MVT::f128, Legal);
	} else {
	setOperationAction(ISD::LOAD, MVT::f128, Custom);
	setOperationAction(ISD::STORE, MVT::f128, Custom);
	}

	// With hardware quad support the f128 arithmetic operations are Legal;
	// without it they are Custom and the f128 runtime library names are set up.
	if (Subtarget->hasHardQuad()) {
	setOperationAction(ISD::FADD, MVT::f128, Legal);
	setOperationAction(ISD::FSUB, MVT::f128, Legal);
	setOperationAction(ISD::FMUL, MVT::f128, Legal);
	setOperationAction(ISD::FDIV, MVT::f128, Legal);
	setOperationAction(ISD::FSQRT, MVT::f128, Legal);
	setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
	if (Subtarget->isV9()) {
	setOperationAction(ISD::FNEG, MVT::f128, Legal);
	setOperationAction(ISD::FABS, MVT::f128, Legal);
	} else {
	setOperationAction(ISD::FNEG, MVT::f128, Custom);
	setOperationAction(ISD::FABS, MVT::f128, Custom);
	}

	// In 32-bit mode the f128 <-> i64 conversions use the _Q_ routines.
	if (!Subtarget->is64Bit()) {
	setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
	setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
	setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
	setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
	}

	} else {
	// Custom legalize f128 operations.

	setOperationAction(ISD::FADD, MVT::f128, Custom);
	setOperationAction(ISD::FSUB, MVT::f128, Custom);
	setOperationAction(ISD::FMUL, MVT::f128, Custom);
	setOperationAction(ISD::FDIV, MVT::f128, Custom);
	setOperationAction(ISD::FSQRT, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Custom);
	setOperationAction(ISD::FABS, MVT::f128, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);

	// Setup Runtime library names.
	// 64-bit mode uses the _Qp_ names, 32-bit mode the _Q_ names; with
	// soft-float neither set is installed.
	if (Subtarget->is64Bit() && !Subtarget->useSoftFloat()) {
	setLibcallName(RTLIB::ADD_F128, "_Qp_add");
	setLibcallName(RTLIB::SUB_F128, "_Qp_sub");
	setLibcallName(RTLIB::MUL_F128, "_Qp_mul");
	setLibcallName(RTLIB::DIV_F128, "_Qp_div");
	setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
	setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
	setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
	setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
	setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
	setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
	setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
	setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
	setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
	setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
	setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
	setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
	setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
	} else if (!Subtarget->useSoftFloat()) {
	setLibcallName(RTLIB::ADD_F128, "_Q_add");
	setLibcallName(RTLIB::SUB_F128, "_Q_sub");
	setLibcallName(RTLIB::MUL_F128, "_Q_mul");
	setLibcallName(RTLIB::DIV_F128, "_Q_div");
	setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
	setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
	setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
	setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
	setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
	setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
	setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
	setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
	setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
	setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
	setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
	setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
	setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
	}
	}

	if (Subtarget->fixAllFDIVSQRT()) {
	// Promote FDIVS and FSQRTS to FDIVD and FSQRTD instructions instead as
	// the former instructions generate errata on LEON processors.
	setOperationAction(ISD::FDIV, MVT::f32, Promote);
	setOperationAction(ISD::FSQRT, MVT::f32, Promote);
	}

	// Subtargets flagged as having no FMULS get f32 multiplies promoted.
	if (Subtarget->hasNoFMULS()) {
	setOperationAction(ISD::FMUL, MVT::f32, Promote);
	}

	// Custom combine bitcast between f64 and v2i32
	if (!Subtarget->is64Bit())
	setTargetDAGCombine(ISD::BITCAST);

	if (Subtarget->hasLeonCycleCounter())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	setMinFunctionAlignment(2);

	// Must run last so that it sees every register class added above.
	computeRegisterProperties(Subtarget->getRegisterInfo());
	}

	/// Report whether software floating point is in use. The answer is forwarded
	/// unchanged from the subtarget this lowering object refers to.
	bool SparcTargetLowering::useSoftFloat() const {
	  const bool IsSoftFloat = Subtarget->useSoftFloat();
	  return IsSoftFloat;
	}

	const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((SPISD::NodeType)Opcode) {
	case SPISD::FIRST_NUMBER: break;
	case SPISD::CMPICC: return "SPISD::CMPICC";
	case SPISD::CMPFCC: return "SPISD::CMPFCC";
	case SPISD::BRICC: return "SPISD::BRICC";
	case SPISD::BRXCC: return "SPISD::BRXCC";
	case SPISD::BRFCC: return "SPISD::BRFCC";
	case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
	case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
	case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
	case SPISD::Hi: return "SPISD::Hi";
	case SPISD::Lo: return "SPISD::Lo";
	case SPISD::FTOI: return "SPISD::FTOI";
	case SPISD::ITOF: return "SPISD::ITOF";
	case SPISD::FTOX: return "SPISD::FTOX";
	case SPISD::XTOF: return "SPISD::XTOF";
	case SPISD::CALL: return "SPISD::CALL";
	case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
	case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
	case SPISD::FLUSHW: return "SPISD::FLUSHW";
	case SPISD::TLS_ADD: return "SPISD::TLS_ADD";
	case SPISD::TLS_LD: return "SPISD::TLS_LD";
	case SPISD::TLS_CALL: return "SPISD::TLS_CALL";
	}
	return nullptr;
	}

	/// Result type of a SETCC on operands of type VT: vectors yield the same
	/// vector shape with integer elements, everything else yields i32.
	EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
	                                            EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();
	  return MVT::i32;
	}

	/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
	/// be zero. Op is expected to be a target specific node. Used by DAG
	/// combiner.
	void SparcTargetLowering::computeKnownBitsForTargetNode
	(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	KnownBits Known2;
	Known.resetAll();

	switch (Op.getOpcode()) {
	default: break;
	case SPISD::SELECT_ICC:
	case SPISD::SELECT_XCC:
	case SPISD::SELECT_FCC:
	Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
	Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	}

	// Look at LHS/RHS/CC and see if they are a lowered setcc instruction, i.e.
	//   (setne (SELECT_[IXF]CC 1, 0, cond, (CMP[IF]CC a, b)), 0).
	// If so, replace LHS/RHS with the operands of the compare (a, b) and store
	// the select's condition-code operand in SPCC. Otherwise every argument is
	// left untouched.
	static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
	                             ISD::CondCode CC, unsigned &SPCC) {
	  // The opcode tests must stay ahead of the getOperand(3) accesses so that
	  // operand 3 is only inspected on one of the select nodes.
	  if (isNullConstant(RHS) &&
	      CC == ISD::SETNE &&
	      (((LHS.getOpcode() == SPISD::SELECT_ICC ||
	         LHS.getOpcode() == SPISD::SELECT_XCC) &&
	        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
	       (LHS.getOpcode() == SPISD::SELECT_FCC &&
	        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
	      isOneConstant(LHS.getOperand(0)) &&
	      isNullConstant(LHS.getOperand(1))) {
	    SDValue CMPCC = LHS.getOperand(3);
	    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
	    LHS = CMPCC.getOperand(0);
	    RHS = CMPCC.getOperand(1);
	  }
	}

	// Rebuild an address node (global address, constant pool entry, block
	// address or external symbol) as its Target* counterpart carrying the target
	// flags TF. Reaching the end with any other node kind is a programming error.
	SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
	                                             SelectionDAG &DAG) const {
	  if (const auto *Global = dyn_cast<GlobalAddressSDNode>(Op)) {
	    return DAG.getTargetGlobalAddress(Global->getGlobal(), SDLoc(Global),
	                                      Global->getValueType(0),
	                                      Global->getOffset(), TF);
	  }

	  if (const auto *Pool = dyn_cast<ConstantPoolSDNode>(Op)) {
	    return DAG.getTargetConstantPool(Pool->getConstVal(),
	                                     Pool->getValueType(0),
	                                     Pool->getAlignment(),
	                                     Pool->getOffset(), TF);
	  }

	  if (const auto *Block = dyn_cast<BlockAddressSDNode>(Op)) {
	    // Block addresses are always emitted with a zero offset.
	    return DAG.getTargetBlockAddress(Block->getBlockAddress(),
	                                     Op.getValueType(), 0, TF);
	  }

	  if (const auto *Ext = dyn_cast<ExternalSymbolSDNode>(Op)) {
	    return DAG.getTargetExternalSymbol(Ext->getSymbol(),
	                                       Ext->getValueType(0), TF);
	  }

	  llvm_unreachable("Unhandled address SDNode");
	}

	// Materialise Op as the sum of a SPISD::Hi node (flagged with HiTF) and a
	// SPISD::Lo node (flagged with LoTF); the returned value is that ADD node.
	SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
	                                          unsigned HiTF, unsigned LoTF,
	                                          SelectionDAG &DAG) const {
	  const SDLoc Loc(Op);
	  const EVT Ty = Op.getValueType();

	  SDValue HiAddr = withTargetFlags(Op, HiTF, DAG);
	  SDValue HiPart = DAG.getNode(SPISD::Hi, Loc, Ty, HiAddr);

	  SDValue LoAddr = withTargetFlags(Op, LoTF, DAG);
	  SDValue LoPart = DAG.getNode(SPISD::Lo, Loc, Ty, LoAddr);

	  return DAG.getNode(ISD::ADD, Loc, Ty, HiPart, LoPart);
	}

	// Build the SDNodes that produce the address denoted by Op (any node kind
	// accepted by withTargetFlags). In position-independent mode the address is
	// loaded from the GOT; otherwise it is assembled from relocated pieces
	// according to the absolute code model.
	SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT VT = getPointerTy(DAG.getDataLayout());

	  // Handle PIC mode first. SPARC needs a got load for every variable!
	  if (isPositionIndependent()) {
	    MachineFunction &MF = DAG.getMachineFunction();
	    const Module *Mod = MF.getFunction().getParent();

	    SDValue GotSlot;
	    if (Mod->getPICLevel() == PICLevel::SmallPIC) {
	      // This is the pic13 code model, the GOT is known to be smaller than
	      // 8KiB.
	      SDValue Flagged = withTargetFlags(Op, SparcMCExpr::VK_Sparc_GOT13, DAG);
	      GotSlot = DAG.getNode(SPISD::Lo, DL, Op.getValueType(), Flagged);
	    } else {
	      // This is the pic32 code model, the GOT is known to be smaller than
	      // 4GB.
	      GotSlot = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_GOT22,
	                             SparcMCExpr::VK_Sparc_GOT10, DAG);
	    }

	    SDValue GotBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
	    SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, GotBase, GotSlot);

	    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
	    // function has calls.
	    MF.getFrameInfo().setHasCalls(true);

	    return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
	                       MachinePointerInfo::getGOT(MF));
	  }

	  // This is one of the absolute code models.
	  const CodeModel::Model CM = getTargetMachine().getCodeModel();

	  if (CM == CodeModel::Small) {
	    // abs32.
	    return makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
	                        SparcMCExpr::VK_Sparc_LO, DAG);
	  }

	  if (CM == CodeModel::Medium) {
	    // abs44: ((H44 + M44) << 12) + L44.
	    SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_H44,
	                                 SparcMCExpr::VK_Sparc_M44, DAG);
	    SDValue Twelve = DAG.getConstant(12, DL, MVT::i32);
	    Upper = DAG.getNode(ISD::SHL, DL, VT, Upper, Twelve);

	    SDValue Lower = withTargetFlags(Op, SparcMCExpr::VK_Sparc_L44, DAG);
	    Lower = DAG.getNode(SPISD::Lo, DL, VT, Lower);
	    return DAG.getNode(ISD::ADD, DL, VT, Upper, Lower);
	  }

	  if (CM == CodeModel::Large) {
	    // abs64: ((HH + HM) << 32) + (HI + LO).
	    SDValue Upper = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HH,
	                                 SparcMCExpr::VK_Sparc_HM, DAG);
	    SDValue ThirtyTwo = DAG.getConstant(32, DL, MVT::i32);
	    Upper = DAG.getNode(ISD::SHL, DL, VT, Upper, ThirtyTwo);

	    SDValue Lower = makeHiLoPair(Op, SparcMCExpr::VK_Sparc_HI,
	                                 SparcMCExpr::VK_Sparc_LO, DAG);
	    return DAG.getNode(ISD::ADD, DL, VT, Upper, Lower);
	  }

	  llvm_unreachable("Unsupported absolute code model");
	}

	/// Lower a global address node; all the work is done by makeAddress.
	SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Address = makeAddress(Op, DAG);
	  return Address;
	}

	/// Lower a constant pool node; all the work is done by makeAddress.
	SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDValue Address = makeAddress(Op, DAG);
	  return Address;
	}

	/// Lower a block address node; all the work is done by makeAddress.
	SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDValue Address = makeAddress(Op, DAG);
	  return Address;
	}

	/// Lower the address of a thread-local global.
	///
	/// Emulated TLS is delegated to LowerToTLSEmulatedModel. Otherwise the TLS
	/// model reported by the target machine selects the sequence:
	///   - GeneralDynamic / LocalDynamic: pass GLOBAL_BASE_REG + hi/lo in O0 to
	///     __tls_get_addr through a TLS_CALL node; LocalDynamic additionally adds
	///     the variable's LDO offset (hix22 xor lox10) to the returned value.
	///   - InitialExec: TLS_LD the offset from GLOBAL_BASE_REG + hi/lo and
	///     TLS_ADD it to register G7.
	///   - LocalExec: add the LE offset (hix22 xor lox10) to register G7.
	SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
	                                                   SelectionDAG &DAG) const {

	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  if (DAG.getTarget().useEmulatedTLS())
	    return LowerToTLSEmulatedModel(GA, DAG);

	  SDLoc DL(GA);
	  const GlobalValue *GV = GA->getGlobal();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  TLSModel::Model model = getTargetMachine().getTLSModel(GV);

	  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
	    // Both dynamic models share the call sequence and differ only in the
	    // relocation kinds attached to each piece.
	    const bool IsGD = (model == TLSModel::GeneralDynamic);
	    unsigned HiTF = IsGD ? SparcMCExpr::VK_Sparc_TLS_GD_HI22
	                         : SparcMCExpr::VK_Sparc_TLS_LDM_HI22;
	    unsigned LoTF = IsGD ? SparcMCExpr::VK_Sparc_TLS_GD_LO10
	                         : SparcMCExpr::VK_Sparc_TLS_LDM_LO10;
	    unsigned addTF = IsGD ? SparcMCExpr::VK_Sparc_TLS_GD_ADD
	                          : SparcMCExpr::VK_Sparc_TLS_LDM_ADD;
	    unsigned callTF = IsGD ? SparcMCExpr::VK_Sparc_TLS_GD_CALL
	                           : SparcMCExpr::VK_Sparc_TLS_LDM_CALL;

	    SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
	    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
	    SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
	                                   withTargetFlags(Op, addTF, DAG));

	    SDValue Chain = DAG.getEntryNode();
	    SDValue InFlag;

	    // Call __tls_get_addr with the argument in O0; the result comes back
	    // in O0 as well.
	    Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
	    Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
	    InFlag = Chain.getValue(1);
	    SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
	    SDValue Symbol = withTargetFlags(Op, callTF, DAG);

	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
	        DAG.getMachineFunction(), CallingConv::C);
	    assert(Mask && "Missing call preserved mask for calling convention");
	    SDValue Ops[] = {Chain,
	                     Callee,
	                     Symbol,
	                     DAG.getRegister(SP::O0, PtrVT),
	                     DAG.getRegisterMask(Mask),
	                     InFlag};
	    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
	    InFlag = Chain.getValue(1);
	    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, DL, true),
	                               DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	    InFlag = Chain.getValue(1);
	    SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);

	    if (model != TLSModel::LocalDynamic)
	      return Ret;

	    // LocalDynamic: add the variable's LDO offset to the returned value.
	    SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
	                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_HIX22, DAG));
	    SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
	                 withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_LOX10, DAG));
	    HiLo = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
	    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
	                   withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LDO_ADD, DAG));
	  }

	  if (model == TLSModel::InitialExec) {
	    // The load relocation depends on the pointer width.
	    unsigned ldTF = ((PtrVT == MVT::i64) ? SparcMCExpr::VK_Sparc_TLS_IE_LDX
	                                         : SparcMCExpr::VK_Sparc_TLS_IE_LD);

	    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);

	    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
	    // function has calls.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    MFI.setHasCalls(true);

	    SDValue TGA = makeHiLoPair(Op,
	                               SparcMCExpr::VK_Sparc_TLS_IE_HI22,
	                               SparcMCExpr::VK_Sparc_TLS_IE_LO10, DAG);
	    SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
	    SDValue Offset = DAG.getNode(SPISD::TLS_LD,
	                                 DL, PtrVT, Ptr,
	                                 withTargetFlags(Op, ldTF, DAG));
	    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
	                       DAG.getRegister(SP::G7, PtrVT), Offset,
	                       withTargetFlags(Op,
	                                       SparcMCExpr::VK_Sparc_TLS_IE_ADD, DAG));
	  }

	  // LocalExec: G7 + (hix22 xor lox10).
	  assert(model == TLSModel::LocalExec);
	  SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
	                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_HIX22, DAG));
	  SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
	                  withTargetFlags(Op, SparcMCExpr::VK_Sparc_TLS_LE_LOX10, DAG));
	  SDValue Offset = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);

	  return DAG.getNode(ISD::ADD, DL, PtrVT,
	                     DAG.getRegister(SP::G7, PtrVT), Offset);
	}

	/// Append Arg to the argument list Args of an f128 support-library call.
	///
	/// An fp128 argument is stored into a fresh 16-byte, 8-byte-aligned stack
	/// object and a pointer to that object is passed instead; any other
	/// argument is passed as is.
	///
	/// \returns the chain, updated with the store when one was emitted.
	SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
	                                                  ArgListTy &Args, SDValue Arg,
	                                                  const SDLoc &DL,
	                                                  SelectionDAG &DAG) const {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  EVT ArgVT = Arg.getValueType();
	  // ArgTy is dereferenced with '->' below, so it must be a pointer to the IR
	  // type, obtained from the dereferenced LLVMContext.
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  ArgListEntry Entry;
	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;

	  if (ArgTy->isFP128Ty()) {
	    // Create a stack object and pass the pointer to the library function.
	    int FI = MFI.CreateStackObject(16, 8, false);
	    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	    Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
	                         /* Alignment = */ 8);

	    Entry.Node = FIPtr;
	    Entry.Ty = PointerType::getUnqual(ArgTy);
	  }
	  Args.push_back(Entry);
	  return Chain;
	}

	/// Lower Op to a call of the support routine LibFuncName, passing the first
	/// numArgs operands of Op as arguments.
	///
	/// When Op produces an fp128 value the routine is declared as returning
	/// void: a 16-byte stack object is passed as the leading argument (marked
	/// sret on 32-bit subtargets) and the result is loaded back from it after
	/// the call. Otherwise the call's own return value is returned.
	SDValue
	SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
	                                 const char *LibFuncName,
	                                 unsigned numArgs) const {

	  ArgListTy Args;

	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue Callee = DAG.getExternalSymbol(LibFuncName, PtrVT);
	  // RetTy is the IR type of Op's value; RetTyABI is the type the call itself
	  // is declared to return.
	  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
	  Type *RetTyABI = RetTy;
	  SDValue Chain = DAG.getEntryNode();
	  SDValue RetPtr;

	  if (RetTy->isFP128Ty()) {
	    // Create a Stack Object to receive the return value of type f128.
	    ArgListEntry Entry;
	    int RetFI = MFI.CreateStackObject(16, 8, false);
	    RetPtr = DAG.getFrameIndex(RetFI, PtrVT);
	    Entry.Node = RetPtr;
	    Entry.Ty = PointerType::getUnqual(RetTy);
	    if (!Subtarget->is64Bit())
	      Entry.IsSRet = true;
	    Entry.IsReturned = false;
	    Args.push_back(Entry);
	    RetTyABI = Type::getVoidTy(*DAG.getContext());
	  }

	  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
	  for (unsigned i = 0, e = numArgs; i != e; ++i) {
	    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
	  }
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
	    .setCallee(CallingConv::C, RetTyABI, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);

	  // chain is in second result.
	  if (RetTyABI == RetTy)
	    return CallInfo.first;

	  assert (RetTy->isFP128Ty() && "Unexpected return type!");

	  Chain = CallInfo.second;

	  // Load RetPtr to get the return value.
	  return DAG.getLoad(Op.getValueType(), SDLoc(Op), Chain, RetPtr,
	                     MachinePointerInfo(), /* Alignment = */ 8);
	}

	/// Lower an f128 comparison of LHS and RHS to a support-library call followed
	/// by an integer compare of the call's i32 result.
	///
	/// On entry SPCC holds the floating-point condition to test. On return it
	/// holds the integer condition that must be applied to the returned CMPICC
	/// glue value. The "_Qp_" routine names are used on 64-bit subtargets and
	/// the "_Q_" names otherwise.
	SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
	                                              unsigned &SPCC, const SDLoc &DL,
	                                              SelectionDAG &DAG) const {

	  const char *LibCall = nullptr;
	  bool is64Bit = Subtarget->is64Bit();
	  switch(SPCC) {
	  default: llvm_unreachable("Unhandled conditional code!");
	  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
	  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
	  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
	  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
	  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
	  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
	  case SPCC::FCC_UL :
	  case SPCC::FCC_ULE:
	  case SPCC::FCC_UG :
	  case SPCC::FCC_UGE:
	  case SPCC::FCC_U  :
	  case SPCC::FCC_O  :
	  case SPCC::FCC_LG :
	  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
	  }

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Callee = DAG.getExternalSymbol(LibCall, PtrVT);
	  // Every comparison routine is called as returning a 32-bit integer.
	  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
	  ArgListTy Args;
	  SDValue Chain = DAG.getEntryNode();
	  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
	  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(DL).setChain(Chain)
	    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);

	  // result is in first, and chain is in second result.
	  SDValue Result = CallInfo.first;
	  const EVT ResVT = Result.getValueType();

	  // Replace Result with (Result & Bits).
	  auto maskResult = [&](uint64_t Bits) {
	    SDValue Mask = DAG.getTargetConstant(Bits, DL, ResVT);
	    Result = DAG.getNode(ISD::AND, DL, ResVT, Result, Mask);
	  };
	  // Publish the integer condition ICC through SPCC and compare Result with
	  // the immediate Imm.
	  auto compareWith = [&](uint64_t Imm, unsigned ICC) -> SDValue {
	    SDValue Immediate = DAG.getTargetConstant(Imm, DL, ResVT);
	    SPCC = ICC;
	    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, Immediate);
	  };

	  switch(SPCC) {
	  default:
	    // Dedicated predicate routine: the condition holds iff result != 0.
	    return compareWith(0, SPCC::ICC_NE);
	  case SPCC::FCC_UL:
	    maskResult(1);
	    return compareWith(0, SPCC::ICC_NE);
	  case SPCC::FCC_ULE:
	    return compareWith(2, SPCC::ICC_NE);
	  case SPCC::FCC_UG:
	    return compareWith(1, SPCC::ICC_G);
	  case SPCC::FCC_UGE:
	    return compareWith(1, SPCC::ICC_NE);
	  case SPCC::FCC_U:
	    return compareWith(3, SPCC::ICC_E);
	  case SPCC::FCC_O:
	    return compareWith(3, SPCC::ICC_NE);
	  case SPCC::FCC_LG:
	    maskResult(3);
	    return compareWith(0, SPCC::ICC_NE);
	  case SPCC::FCC_UE:
	    maskResult(3);
	    return compareWith(0, SPCC::ICC_E);
	  }
	}

	/// Lower an extension of an f64 or f32 operand to f128 into the matching
	/// FPEXT_*_F128 library call. Any other operand type is a programming error.
	static SDValue
	LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
	                   const SparcTargetLowering &TLI) {
	  const EVT SrcVT = Op.getOperand(0).getValueType();

	  if (SrcVT == MVT::f64) {
	    const char *Routine = TLI.getLibcallName(RTLIB::FPEXT_F64_F128);
	    return TLI.LowerF128Op(Op, DAG, Routine, 1);
	  }

	  if (SrcVT == MVT::f32) {
	    const char *Routine = TLI.getLibcallName(RTLIB::FPEXT_F32_F128);
	    return TLI.LowerF128Op(Op, DAG, Routine, 1);
	  }

	  llvm_unreachable("fpextend with non-float operand!");
	  return SDValue();
	}

	/// Lower a rounding of an f128 operand to f64 or f32 into the matching
	/// FPROUND_F128_* library call. Roundings whose operand is not f128 are
	/// returned unchanged.
	static SDValue
	LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
	                  const SparcTargetLowering &TLI) {
	  // FP_ROUND on f64 and f32 are legal.
	  const bool SourceIsQuad = Op.getOperand(0).getValueType() == MVT::f128;
	  if (!SourceIsQuad)
	    return Op;

	  const EVT DstVT = Op.getValueType();

	  if (DstVT == MVT::f64) {
	    const char *Routine = TLI.getLibcallName(RTLIB::FPROUND_F128_F64);
	    return TLI.LowerF128Op(Op, DAG, Routine, 1);
	  }

	  if (DstVT == MVT::f32) {
	    const char *Routine = TLI.getLibcallName(RTLIB::FPROUND_F128_F32);
	    return TLI.LowerF128Op(Op, DAG, Routine, 1);
	  }

	  llvm_unreachable("fpround to non-float!");
	  return SDValue();
	}

	/// Lower a floating-point to signed-integer conversion producing i32 or i64.
	///
	/// f128 sources become a library call unless hardware quad support is
	/// available and the result type is legal. Otherwise an illegal result type
	/// yields an empty SDValue, and a legal one is converted inside an FP
	/// register (FTOI / FTOX) and bitcast to the integer type.
	static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
	                               const SparcTargetLowering &TLI,
	                               bool hasHardQuad) {
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  assert(VT == MVT::i32 || VT == MVT::i64);

	  // Expand f128 operations to fp128 abi calls.
	  if (Op.getOperand(0).getValueType() == MVT::f128
	      && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
	    const char *libName = TLI.getLibcallName(VT == MVT::i32
	                                             ? RTLIB::FPTOSINT_F128_I32
	                                             : RTLIB::FPTOSINT_F128_I64);
	    return TLI.LowerF128Op(Op, DAG, libName, 1);
	  }

	  // Expand if the resulting type is illegal.
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  // Otherwise, Convert the fp value to integer in an FP register.
	  if (VT == MVT::i32)
	    Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
	  else
	    Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));

	  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
	}

	/// Lower a signed-integer (i32 or i64) to floating-point conversion.
	///
	/// f128 results become a library call unless hardware quad support is
	/// available and the operand type is legal. Otherwise an illegal operand
	/// type yields an empty SDValue, and a legal one is bitcast into an FP
	/// register and converted there (ITOF / XTOF).
	static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
	                               const SparcTargetLowering &TLI,
	                               bool hasHardQuad) {
	  SDLoc dl(Op);
	  EVT OpVT = Op.getOperand(0).getValueType();
	  assert(OpVT == MVT::i32 || (OpVT == MVT::i64));

	  // FP type of the same width as the integer operand, used for the bitcast.
	  EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;

	  // Expand f128 operations to fp128 ABI calls.
	  if (Op.getValueType() == MVT::f128
	      && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
	    const char *libName = TLI.getLibcallName(OpVT == MVT::i32
	                                             ? RTLIB::SINTTOFP_I32_F128
	                                             : RTLIB::SINTTOFP_I64_F128);
	    return TLI.LowerF128Op(Op, DAG, libName, 1);
	  }

	  // Expand if the operand type is illegal.
	  if (!TLI.isTypeLegal(OpVT))
	    return SDValue();

	  // Otherwise, Convert the int value to FP in an FP register.
	  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
	  unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
	  return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
	}

	/// Lower a floating-point to unsigned-integer conversion.
	///
	/// Only f128 sources are handled, via the FPTOUINT_F128_* library call.
	/// An empty SDValue is returned when the source is not f128, or when
	/// hardware quad support is available and the result type is legal.
	static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
	                               const SparcTargetLowering &TLI,
	                               bool hasHardQuad) {
	  EVT VT = Op.getValueType();

	  // Expand if it does not involve f128 or the target has support for
	  // quad floating point instructions and the resulting type is legal.
	  if (Op.getOperand(0).getValueType() != MVT::f128 ||
	      (hasHardQuad && TLI.isTypeLegal(VT)))
	    return SDValue();

	  assert(VT == MVT::i32 || VT == MVT::i64);

	  return TLI.LowerF128Op(Op, DAG,
	                         TLI.getLibcallName(VT == MVT::i32
	                                            ? RTLIB::FPTOUINT_F128_I32
	                                            : RTLIB::FPTOUINT_F128_I64),
	                         1);
	}

	/// Lower an unsigned-integer (i32 or i64) to floating-point conversion.
	///
	/// Only f128 results are handled, via the UINTTOFP_*_F128 library call.
	/// An empty SDValue is returned when the result is not f128, or when
	/// hardware quad support is available and the operand type is legal.
	static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
	                               const SparcTargetLowering &TLI,
	                               bool hasHardQuad) {
	  EVT OpVT = Op.getOperand(0).getValueType();
	  assert(OpVT == MVT::i32 || OpVT == MVT::i64);

	  // Expand if it does not involve f128 or the target has support for
	  // quad floating point instructions and the operand type is legal.
	  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
	    return SDValue();

	  return TLI.LowerF128Op(Op, DAG,
	                         TLI.getLibcallName(OpVT == MVT::i32
	                                            ? RTLIB::UINTTOFP_I32_F128
	                                            : RTLIB::UINTTOFP_I64_F128),
	                         1);
	}

	/// Lower a BR_CC node (chain, condition, lhs, rhs, destination) to a SPARC
	/// conditional branch fed by a glue-producing compare.
	///
	/// Integer compares branch with BRICC (i32) or BRXCC (other widths). f128
	/// compares without hardware quad support are routed through
	/// LowerF128Compare and branch with BRICC. Any other floating-point compare
	/// uses CMPFCC with BRFCC.
	static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
	                          const SparcTargetLowering &TLI,
	                          bool hasHardQuad) {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Dest = Op.getOperand(4);

	  // ~0U marks "no SPARC condition code chosen yet".
	  unsigned SPCC = ~0U;
	  unsigned BranchOpc;

	  // If this is a br_cc of a "setcc", and if the setcc got lowered into
	  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
	  LookThroughSetCC(LHS, RHS, CC, SPCC);

	  // Get the condition flag. The value type is read only now because the
	  // look-through above may have replaced LHS.
	  const EVT CmpVT = LHS.getValueType();
	  SDValue Flags;
	  if (CmpVT.isInteger()) {
	    Flags = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
	    if (SPCC == ~0U)
	      SPCC = IntCondCCodeToICC(CC);
	    // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
	    BranchOpc = (CmpVT == MVT::i32) ? SPISD::BRICC : SPISD::BRXCC;
	  } else if (!hasHardQuad && CmpVT == MVT::f128) {
	    if (SPCC == ~0U)
	      SPCC = FPCondCCodeToFCC(CC);
	    // The helper rewrites SPCC to the integer condition to branch on.
	    Flags = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
	    BranchOpc = SPISD::BRICC;
	  } else {
	    Flags = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
	    if (SPCC == ~0U)
	      SPCC = FPCondCCodeToFCC(CC);
	    BranchOpc = SPISD::BRFCC;
	  }

	  SDValue CondCode = DAG.getConstant(SPCC, dl, MVT::i32);
	  return DAG.getNode(BranchOpc, dl, MVT::Other, Chain, Dest, CondCode, Flags);
	}

	/// Lower a SELECT_CC node (lhs, rhs, true value, false value, condition) to
	/// a SPARC select fed by a glue-producing compare.
	///
	/// Integer compares select with SELECT_ICC (i32) or SELECT_XCC (other
	/// widths). f128 compares without hardware quad support are routed through
	/// LowerF128Compare and select with SELECT_ICC. Any other floating-point
	/// compare uses CMPFCC with SELECT_FCC.
	static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
	                              const SparcTargetLowering &TLI,
	                              bool hasHardQuad) {
	  SDLoc dl(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue TrueVal = Op.getOperand(2);
	  SDValue FalseVal = Op.getOperand(3);
	  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

	  // ~0U marks "no SPARC condition code chosen yet".
	  unsigned SPCC = ~0U;
	  unsigned SelectOpc;

	  // If this is a select_cc of a "setcc", and if the setcc got lowered into
	  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
	  LookThroughSetCC(LHS, RHS, CC, SPCC);

	  // The value type is read only now because the look-through above may have
	  // replaced LHS.
	  const EVT CmpVT = LHS.getValueType();
	  SDValue Flags;
	  if (CmpVT.isInteger()) {
	    Flags = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
	    SelectOpc = (CmpVT == MVT::i32) ? SPISD::SELECT_ICC : SPISD::SELECT_XCC;
	    if (SPCC == ~0U)
	      SPCC = IntCondCCodeToICC(CC);
	  } else if (!hasHardQuad && CmpVT == MVT::f128) {
	    if (SPCC == ~0U)
	      SPCC = FPCondCCodeToFCC(CC);
	    // The helper rewrites SPCC to the integer condition to select on.
	    Flags = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
	    SelectOpc = SPISD::SELECT_ICC;
	  } else {
	    Flags = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
	    SelectOpc = SPISD::SELECT_FCC;
	    if (SPCC == ~0U)
	      SPCC = FPCondCCodeToFCC(CC);
	  }

	  SDValue CondCode = DAG.getConstant(SPCC, dl, MVT::i32);
	  return DAG.getNode(SelectOpc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
	                     CondCode, Flags);
	}

	/// Lower VASTART: store (I6 + the function's VarArgsFrameOffset) into the
	/// memory location given by operand 1, chained on operand 0.
	static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
	                            const SparcTargetLowering &TLI) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  SparcMachineFunctionInfo *SparcFI = MF.getInfo<SparcMachineFunctionInfo>();
	  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());

	  // Need frame address to find the address of VarArgsFrameIndex.
	  MF.getFrameInfo().setFrameAddressIsTaken(true);

	  // vastart just stores the address of the VarArgsFrameIndex slot into the
	  // memory location argument.
	  SDLoc DL(Op);
	  SDValue FramePtr = DAG.getRegister(SP::I6, PtrVT);
	  SDValue Displacement =
	      DAG.getIntPtrConstant(SparcFI->getVarArgsFrameOffset(), DL);
	  SDValue FirstVarArg =
	      DAG.getNode(ISD::ADD, DL, PtrVT, FramePtr, Displacement);

	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  return DAG.getStore(Op.getOperand(0), DL, FirstVarArg, Op.getOperand(1),
	                      MachinePointerInfo(SrcVal));
	}

	/// Lower VAARG: load the current va_list pointer, store it back advanced by
	/// the size of the requested type, and load the argument from the old
	/// pointer value.
	static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  const EVT ArgVT = N->getValueType(0);
	  SDValue Chain = N->getOperand(0);
	  SDValue ListSlot = N->getOperand(1);
	  const EVT PtrVT = ListSlot.getValueType();
	  const Value *SrcVal = cast<SrcValueSDNode>(N->getOperand(2))->getValue();

	  SDValue Current =
	      DAG.getLoad(PtrVT, DL, Chain, ListSlot, MachinePointerInfo(SrcVal));

	  // Increment the pointer, VAList, to the next vaarg.
	  SDValue Step = DAG.getIntPtrConstant(ArgVT.getSizeInBits()/8, DL);
	  SDValue Advanced = DAG.getNode(ISD::ADD, DL, PtrVT, Current, Step);

	  // Store the incremented VAList to the legalized pointer.
	  Chain = DAG.getStore(Current.getValue(1), DL, Advanced, ListSlot,
	                       MachinePointerInfo(SrcVal));

	  // Load the actual argument out of the pointer VAList.
	  // We can't count on greater alignment than the word size.
	  return DAG.getLoad(ArgVT, DL, Chain, Current, MachinePointerInfo(),
	                     std::min(PtrVT.getSizeInBits(), ArgVT.getSizeInBits()) / 8);
	}

	/// Lower DYNAMIC_STACKALLOC (chain, size, alignment): lower the stack
	/// pointer O6 by the requested size and return a pointer located above the
	/// register spill area at the bottom of the stack, merged with the output
	/// chain. Alignments above the stack alignment are a fatal error.
	static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
	                                       const SparcSubtarget *Subtarget) {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);  // Legalize the chain.
	  SDValue Size  = Op.getOperand(1);  // Legalize the size.
	  const unsigned RequestedAlign =
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  const unsigned StackAlign =
	      Subtarget->getFrameLowering()->getStackAlignment();
	  const EVT VT = Size->getValueType(0);

	  // TODO: implement over-aligned alloca. (Note: this also implies
	  // supporting overaligned function frames + dynamic allocations at all,
	  // which currently isn't supported.)
	  if (RequestedAlign > StackAlign) {
	    const MachineFunction &MF = DAG.getMachineFunction();
	    report_fatal_error("Function \"" + Twine(MF.getName()) + "\": "
	                       "over-aligned dynamic alloca not supported.");
	  }

	  // The resultant pointer needs to be above the register spill area
	  // at the bottom of the stack.
	  unsigned SpillAreaBytes = 128;
	  if (!Subtarget->is64Bit()) {
	    // On Sparc32, the size of the spill area is 92. Unfortunately,
	    // that's only 4-byte aligned, not 8-byte aligned (the stack
	    // pointer is 8-byte aligned). So, if the user asked for an 8-byte
	    // aligned dynamic allocation, we actually need to add 96 to the
	    // bottom of the stack, instead of 92, to ensure 8-byte alignment.
	    //
	    // That also means adding 4 to the size of the allocation --
	    // before applying the 8-byte rounding. Unfortunately, the value we
	    // get here has already had rounding applied. So, we need to add 8,
	    // instead, wasting a bit more memory.
	    //
	    // Further, this only actually needs to be done if the required
	    // alignment is > 4, but we've lost that info by this point, too,
	    // so we always apply it.
	    //
	    // (An alternative approach would be to always reserve 96 bytes
	    // instead of the required 92, but then we'd waste 4 extra bytes
	    // in every frame, not just those with dynamic stack allocations.)
	    //
	    // TODO: modify code in SelectionDAGBuilder to make this less sad.
	    Size = DAG.getNode(ISD::ADD, dl, VT, Size, DAG.getConstant(8, dl, VT));
	    SpillAreaBytes = 96;
	  }

	  const unsigned StackReg = SP::O6;
	  SDValue OldSP = DAG.getCopyFromReg(Chain, dl, StackReg, VT);
	  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, OldSP, Size);       // Value
	  Chain = DAG.getCopyToReg(OldSP.getValue(1), dl, StackReg, NewSP); // Output chain

	  // The returned pointer also skips the subtarget's stack pointer bias.
	  SpillAreaBytes += Subtarget->getStackPointerBias();

	  SDValue Skip = DAG.getConstant(SpillAreaBytes, dl, VT);
	  SDValue Allocated = DAG.getNode(ISD::ADD, dl, VT, NewSP, Skip);
	  SDValue Results[2] = { Allocated, Chain };
	  return DAG.getMergeValues(Results, dl);
	}


	/// Create a FLUSHW node chained on the DAG entry node and return its chain
	/// result. Op only supplies the debug location.
	static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
	  const SDLoc Loc(Op);
	  return DAG.getNode(SPISD::FLUSHW, Loc, MVT::Other, DAG.getEntryNode());
	}

	/// Build the DAG computing the frame address `depth` frames above the
	/// current function.
	///
	/// Starting from register I6, the saved frame pointer is loaded `depth`
	/// times, from offset 56 on 32-bit subtargets and stackBias + 112 on 64-bit
	/// ones. On 64-bit subtargets the stack bias is added to the final value. A
	/// FLUSHW is emitted first when depth is non-zero or AlwaysFlush is set.
	/// The function is marked as having its frame address taken.
	static SDValue getFRAMEADDR(uint64_t depth, SDValue Op, SelectionDAG &DAG,
	                            const SparcSubtarget *Subtarget,
	                            bool AlwaysFlush = false) {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  MFI.setFrameAddressIsTaken(true);

	  EVT VT = Op.getValueType();
	  SDLoc dl(Op);
	  unsigned FrameReg = SP::I6;
	  unsigned stackBias = Subtarget->getStackPointerBias();

	  SDValue FrameAddr;
	  SDValue Chain;

	  // flush first to make sure the windowed registers' values are in stack
	  Chain = (depth || AlwaysFlush) ? getFLUSHW(Op, DAG) : DAG.getEntryNode();

	  FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);

	  // Offset of the saved frame pointer relative to a frame address.
	  unsigned Offset = (Subtarget->is64Bit()) ? (stackBias + 112) : 56;

	  while (depth--) {
	    SDValue Ptr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
	                              DAG.getIntPtrConstant(Offset, dl));
	    FrameAddr = DAG.getLoad(VT, dl, Chain, Ptr, MachinePointerInfo());
	  }
	  if (Subtarget->is64Bit())
	    FrameAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr,
	                            DAG.getIntPtrConstant(stackBias, dl));
	  return FrameAddr;
	}


	/// Lower FRAMEADDR: operand 0 is the constant frame depth, which is handed
	/// to getFRAMEADDR together with the node itself.
	static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
	                              const SparcSubtarget *Subtarget) {
	  const uint64_t FrameDepth = Op.getConstantOperandVal(0);
	  return getFRAMEADDR(FrameDepth, Op, DAG, Subtarget);
	}

	/// Lower RETURNADDR for the constant depth in operand 0.
	///
	/// Depth 0 reads register I7 as a function live-in. Deeper levels take the
	/// frame address of level depth - 1 (always flushing the register windows)
	/// and load the return address stored at offset 60 (32-bit) or 120 (64-bit)
	/// from it. An empty SDValue is returned when
	/// verifyReturnAddressArgumentIsConstant reports true.
	static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
	                               const SparcTargetLowering &TLI,
	                               const SparcSubtarget *Subtarget) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  const SDLoc Loc(Op);
	  const EVT VT = Op.getValueType();
	  const uint64_t Depth = Op.getConstantOperandVal(0);

	  if (Depth != 0) {
	    // Need frame address to find return address of the caller.
	    SDValue CallerFrame = getFRAMEADDR(Depth - 1, Op, DAG, Subtarget, true);

	    const unsigned SlotOffset = Subtarget->is64Bit() ? 120 : 60;
	    SDValue Slot = DAG.getNode(ISD::ADD, Loc, VT, CallerFrame,
	                               DAG.getIntPtrConstant(SlotOffset, Loc));
	    return DAG.getLoad(VT, Loc, DAG.getEntryNode(), Slot,
	                       MachinePointerInfo());
	  }

	  // Depth 0: the return address is live-in in I7.
	  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	  unsigned LiveInReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
	  return DAG.getCopyFromReg(DAG.getEntryNode(), Loc, LiveInReg, VT);
	}

	/// Lower an FNEG or FABS on the f64 value SrcReg64 by applying `opcode` to
	/// the f32 half holding the sign bit and copying the other half unchanged
	/// into a fresh f64 register.
	static SDValue LowerF64Op(SDValue SrcReg64, const SDLoc &dl, SelectionDAG &DAG,
	                          unsigned opcode) {
	  assert(SrcReg64.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
	  assert(opcode == ISD::FNEG || opcode == ISD::FABS);

	  // Lower fneg/fabs on f64 to fneg/fabs on f32.
	  // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
	  // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.

	  // Note: in little-endian, the registers holding the floating-point value
	  // are in the opposite order, so the subreg with the sign bit is the
	  // highest-numbered (odd), rather than the lowest-numbered (even).

	  SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
	                                            SrcReg64);
	  SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
	                                            SrcReg64);

	  if (DAG.getDataLayout().isLittleEndian())
	    Lo32 = DAG.getNode(opcode, dl, MVT::f32, Lo32);
	  else
	    Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);

	  // Reassemble the two halves into a new f64 register.
	  SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
	                                                dl, MVT::f64), 0);
	  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
	                                       DstReg64, Hi32);
	  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
	                                       DstReg64, Lo32);
	  return DstReg64;
	}

	// Lower a f128 load into two f64 loads: one at the base pointer, inserted
	// as sub_even64, and one 8 bytes further, inserted as sub_odd64. The result
	// merges the assembled f128 value with a TokenFactor of both load chains.
	static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
	{
	  SDLoc dl(Op);
	  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op.getNode());
	  assert(Load && Load->getOffset().isUndef()
	         && "Unexpected node type");

	  // Neither half may claim more than 8-byte alignment.
	  const unsigned NodeAlign = Load->getAlignment();
	  const unsigned HalfAlign = NodeAlign > 8 ? 8 : NodeAlign;

	  SDValue BasePtr = Load->getBasePtr();
	  const EVT AddrVT = BasePtr.getValueType();

	  SDValue UpperHalf = DAG.getLoad(MVT::f64, dl, Load->getChain(), BasePtr,
	                                  Load->getPointerInfo(), HalfAlign);

	  SDValue Eight = DAG.getConstant(8, dl, AddrVT);
	  SDValue SecondPtr = DAG.getNode(ISD::ADD, dl, AddrVT, BasePtr, Eight);
	  // NOTE(review): the second load reuses the unshifted pointer info of the
	  // original node even though it reads from base + 8 -- confirm intended.
	  SDValue LowerHalf = DAG.getLoad(MVT::f64, dl, Load->getChain(), SecondPtr,
	                                  Load->getPointerInfo(), HalfAlign);

	  SDValue EvenIdx = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
	  SDValue OddIdx  = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);

	  // Start from an undefined f128 register and insert both halves.
	  SDNode *Quad = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f128);
	  Quad = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, MVT::f128,
	                            SDValue(Quad, 0), UpperHalf, EvenIdx);
	  Quad = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, MVT::f128,
	                            SDValue(Quad, 0), LowerHalf, OddIdx);

	  SDValue LoadChains[2] = { SDValue(UpperHalf.getNode(), 1),
	                            SDValue(LowerHalf.getNode(), 1) };
	  SDValue JoinedChain =
	      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
	  SDValue Results[2] = { SDValue(Quad, 0), JoinedChain };
	  return DAG.getMergeValues(Results, dl);
	}

	// Custom lowering hook for LOAD nodes: a load whose memory type is f128 is
	// split by LowerF128Load; any other load is returned unchanged.
	static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG)
	{
	  const bool IsQuadLoad =
	      cast<LoadSDNode>(Op.getNode())->getMemoryVT() == MVT::f128;
	  return IsQuadLoad ? LowerF128Load(Op, DAG) : Op;
	}

	// Split a f128 store into two f64 stores.
	//
	// The sub_even64 half of the stored value goes to the original base pointer
	// and the sub_odd64 half to base + 8. Both stores use the original chain and
	// the result is a TokenFactor of the two store chains.
	static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode());
	  assert(Store && Store->getOffset().isUndef()
	         && "Unexpected node type");

	  SDValue EvenIdx = DAG.getTargetConstant(SP::sub_even64, dl, MVT::i32);
	  SDValue OddIdx = DAG.getTargetConstant(SP::sub_odd64, dl, MVT::i32);

	  // Pull the two f64 halves out of the f128 value being stored.
	  SDValue HiHalf(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
	                                    MVT::f64, Store->getValue(), EvenIdx),
	                 0);
	  SDValue LoHalf(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
	                                    MVT::f64, Store->getValue(), OddIdx),
	                 0);

	  // Cap the alignment used for each half at 8.
	  const unsigned OrigAlign = Store->getAlignment();
	  const unsigned HalfAlign = OrigAlign > 8 ? 8 : OrigAlign;

	  SDValue HiStore =
	      DAG.getStore(Store->getChain(), dl, HiHalf, Store->getBasePtr(),
	                   MachinePointerInfo(), HalfAlign);

	  EVT PtrVT = Store->getBasePtr().getValueType();
	  SDValue LoAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Store->getBasePtr(),
	                               DAG.getConstant(8, dl, PtrVT));
	  SDValue LoStore = DAG.getStore(Store->getChain(), dl, LoHalf, LoAddr,
	                                 MachinePointerInfo(), HalfAlign);

	  SDValue Chains[2] = { HiStore, LoStore };
	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	// Custom lowering hook for STORE nodes.
	//  - f128 stores are split by LowerF128Store.
	//  - i64 stores are rewritten as a bitcast to v2i32 followed by a v2i32
	//    store that keeps the original pointer info, alignment, flags and AA
	//    info.
	//  - Every other memory type yields an empty SDValue.
	static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG)
	{
	  SDLoc dl(Op);
	  StoreSDNode *Store = cast<StoreSDNode>(Op.getNode());
	  const EVT MemVT = Store->getMemoryVT();

	  if (MemVT == MVT::f128)
	    return LowerF128Store(Op, DAG);

	  if (MemVT != MVT::i64)
	    return SDValue();

	  SDValue AsV2I32 =
	      DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Store->getValue());
	  return DAG.getStore(Store->getChain(), dl, AsV2I32, Store->getBasePtr(),
	                      Store->getPointerInfo(), Store->getAlignment(),
	                      Store->getMemOperand()->getFlags(),
	                      Store->getAAInfo());
	}

	// Custom-lower FNEG/FABS.
	//  - f64 operations are forwarded to LowerF64Op.
	//  - Types other than f64/f128 are returned unchanged.
	//  - f128 operations are performed on one f64 half of the register (chosen
	//    by endianness) and the f128 result is rebuilt from the two halves.
	// When isV9 is false, the f64 half is itself lowered through LowerF64Op
	// instead of emitting an f64 FNEG/FABS node.
	static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
	assert((Op.getOpcode() == ISD::FNEG \|\| Op.getOpcode() == ISD::FABS)
	&& "invalid opcode");

	SDLoc dl(Op);

	if (Op.getValueType() == MVT::f64)
	return LowerF64Op(Op.getOperand(0), dl, DAG, Op.getOpcode());
	if (Op.getValueType() != MVT::f128)
	return Op;

	// Lower fabs/fneg on f128 to fabs/fneg on f64
	// fabs/fneg f128 => fabs/fneg f64:sub_even64, fmov f64:sub_odd64
	// (As with LowerF64Op, on little-endian, we need to negate the odd
	// subreg)

	SDValue SrcReg128 = Op.getOperand(0);
	SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
	SrcReg128);
	SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
	SrcReg128);

	// Only the half selected by endianness is modified; the other half is
	// copied through untouched.
	if (DAG.getDataLayout().isLittleEndian()) {
	if (isV9)
	Lo64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Lo64);
	else
	Lo64 = LowerF64Op(Lo64, dl, DAG, Op.getOpcode());
	} else {
	if (isV9)
	Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
	else
	Hi64 = LowerF64Op(Hi64, dl, DAG, Op.getOpcode());
	}

	// Rebuild the f128 from an IMPLICIT_DEF by inserting both halves.
	SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
	dl, MVT::f128), 0);
	DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
	DstReg128, Hi64);
	DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
	DstReg128, Lo64);
	return DstReg128;
	}

	// Custom-lower an i64 ADDC/ADDE/SUBC/SUBE into a pair of i32 operations.
	//
	// Both operands are split into 32-bit halves. The low halves are combined
	// with the original opcode (passing along operand 2 for ADDE/SUBE), the high
	// halves with the carry-consuming opcode (ADDE or SUBE) fed by the glue of
	// the low operation. The two i32 results are zero-extended and recombined
	// into an i64. Returns the merged pair {i64 result, glue of the high op}.
	// Non-i64 operations are returned unchanged.
	static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
	  if (Op.getValueType() != MVT::i64)
	    return Op;

	  SDLoc dl(Op);

	  // Splits an i64 value into its low and high 32-bit halves.
	  auto SplitI64 = [&](SDValue V, SDValue &LoHalf, SDValue &HiHalf) {
	    LoHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, V);
	    SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, V,
	                                  DAG.getConstant(32, dl, MVT::i64));
	    HiHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Shifted);
	  };

	  SDValue Src1Lo, Src1Hi, Src2Lo, Src2Hi;
	  SplitI64(Op.getOperand(0), Src1Lo, Src1Hi);
	  SplitI64(Op.getOperand(1), Src2Lo, Src2Hi);

	  // Pick the opcode for the high half and note whether the incoming node
	  // carries a third operand that must be forwarded to the low half.
	  const unsigned Opc = Op.getOpcode();
	  unsigned HighOpc = Opc;
	  bool ForwardThirdOperand = false;
	  switch (Opc) {
	  case ISD::ADDC:
	    HighOpc = ISD::ADDE;
	    break;
	  case ISD::SUBC:
	    HighOpc = ISD::SUBE;
	    break;
	  case ISD::ADDE:
	  case ISD::SUBE:
	    ForwardThirdOperand = true;
	    break;
	  default:
	    llvm_unreachable("Invalid opcode");
	  }

	  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
	  SDValue Lo32 =
	      ForwardThirdOperand
	          ? DAG.getNode(Opc, dl, VTs, Src1Lo, Src2Lo, Op.getOperand(2))
	          : DAG.getNode(Opc, dl, VTs, Src1Lo, Src2Lo);
	  SDValue Hi32 =
	      DAG.getNode(HighOpc, dl, VTs, Src1Hi, Src2Hi, Lo32.getValue(1));
	  SDValue Carry = Hi32.getValue(1);

	  // Recombine: (zext(Hi32) << 32) | zext(Lo32).
	  SDValue Lo64 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo32);
	  SDValue Hi64 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi32);
	  Hi64 = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi64,
	                     DAG.getConstant(32, dl, MVT::i64));

	  SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Hi64, Lo64);
	  SDValue Results[2] = { Combined, Carry };
	  return DAG.getMergeValues(Results, dl);
	}

	// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
	// in LegalizeDAG.cpp except the order of arguments to the library function.
	//
	// Only i64 operations are handled; anything else is returned unchanged.
	// Each operand is widened to 128 bits as a {high, low} pair of i64 values
	// and multiplied through the RTLIB::MUL_I128 libcall. The low 64 bits of the
	// product are the multiplication result; overflow is detected from the high
	// 64 bits:
	//  - signed:   overflow iff the high half differs from the sign-extension
	//              of the low half;
	//  - unsigned: overflow iff the high half is non-zero.
	// Returns the merged pair {i64 product, i32 overflow flag}.
	static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
	                                const SparcTargetLowering &TLI)
	{
	  unsigned opcode = Op.getOpcode();
	  assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");

	  bool isSigned = (opcode == ISD::SMULO);
	  EVT VT = MVT::i64;
	  EVT WideVT = MVT::i128;
	  SDLoc dl(Op);
	  SDValue LHS = Op.getOperand(0);

	  if (LHS.getValueType() != VT)
	    return Op;

	  SDValue ShiftAmt = DAG.getConstant(63, dl, VT);

	  SDValue RHS = Op.getOperand(1);

	  // Widen the operands according to their signedness. Sign-extending the
	  // operands of an unsigned multiply would compute the high half of the
	  // *signed* product, so e.g. 0xFFFFFFFFFFFFFFFF * 1 would be reported as
	  // overflowing. Unsigned operands must be zero-extended instead.
	  SDValue HiLHS, HiRHS;
	  if (isSigned) {
	    HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
	    HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
	  } else {
	    HiLHS = DAG.getConstant(0, dl, VT);
	    HiRHS = DAG.getConstant(0, dl, MVT::i64);
	  }
	  SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };

	  SDValue MulResult = TLI.makeLibCall(DAG,
	                                      RTLIB::MUL_I128, WideVT,
	                                      Args, isSigned, dl).first;
	  SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
	                                   MulResult, DAG.getIntPtrConstant(0, dl));
	  SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT,
	                                MulResult, DAG.getIntPtrConstant(1, dl));
	  if (isSigned) {
	    SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
	    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
	  } else {
	    TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT),
	                           ISD::SETNE);
	  }
	  // MulResult is a node with an illegal type. Because such things are not
	  // generally permitted during this phase of legalization, ensure that
	  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
	  // been folded.
	  assert(MulResult->use_empty() && "Illegally typed node still in use!");

	  SDValue Ops[2] = { BottomHalf, TopHalf } ;
	  return DAG.getMergeValues(Ops, dl);
	}

	// Custom lowering hook for ATOMIC_LOAD / ATOMIC_STORE.
	// Monotonic load/stores are legal and returned as-is. Anything with a
	// stronger ordering yields an empty SDValue so that it is expanded with a
	// fence.
	static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
	  const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();
	  return isStrongerThanMonotonic(Ordering) ? SDValue() : Op;
	}

	// Custom-lower chainless intrinsics. Operand 0 holds the intrinsic ID.
	// Only Intrinsic::thread_pointer is handled: it becomes a read of register
	// G7 in the target's pointer type. Every other intrinsic yields an empty
	// SDValue.
	SDValue SparcTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const unsigned IntrinsicID =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);

	  // Don't custom lower most intrinsics.
	  if (IntrinsicID != Intrinsic::thread_pointer)
	    return SDValue();

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getRegister(SP::G7, PtrVT);
	}

	// Central dispatch for custom lowering: forwards Op to the Lower* helper
	// that matches its opcode and returns that helper's result. Reaching this
	// function with an opcode that has no case below is a programming error
	// (llvm_unreachable).
	SDValue SparcTargetLowering::
	LowerOperation(SDValue Op, SelectionDAG &DAG) const {

	// Subtarget properties forwarded to the helpers that need them.
	bool hasHardQuad = Subtarget->hasHardQuad();
	bool isV9 = Subtarget->isV9();

	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");

	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG, *this,
	Subtarget);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG,
	Subtarget);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG, *this,
	hasHardQuad);
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG, *this,
	hasHardQuad);
	case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG, *this,
	hasHardQuad);
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG, *this,
	hasHardQuad);
	case ISD::BR_CC: return LowerBR_CC(Op, DAG, *this,
	hasHardQuad);
	case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, *this,
	hasHardQuad);
	case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
	Subtarget);

	case ISD::LOAD: return LowerLOAD(Op, DAG);
	case ISD::STORE: return LowerSTORE(Op, DAG);
	// The f128 arithmetic operations below are routed to LowerF128Op together
	// with the libcall name and the number of arguments (2 for binary
	// operations, 1 for FSQRT).
	case ISD::FADD: return LowerF128Op(Op, DAG,
	getLibcallName(RTLIB::ADD_F128), 2);
	case ISD::FSUB: return LowerF128Op(Op, DAG,
	getLibcallName(RTLIB::SUB_F128), 2);
	case ISD::FMUL: return LowerF128Op(Op, DAG,
	getLibcallName(RTLIB::MUL_F128), 2);
	case ISD::FDIV: return LowerF128Op(Op, DAG,
	getLibcallName(RTLIB::DIV_F128), 2);
	case ISD::FSQRT: return LowerF128Op(Op, DAG,
	getLibcallName(RTLIB::SQRT_F128),1);
	case ISD::FABS:
	case ISD::FNEG: return LowerFNEGorFABS(Op, DAG, isV9);
	case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this);
	case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this);
	case ISD::ADDC:
	case ISD::ADDE:
	case ISD::SUBC:
	case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
	case ISD::UMULO:
	case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this);
	case ISD::ATOMIC_LOAD:
	case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	}
	}

	// Materialize the bit pattern of the floating-point constant C as a v2i32
	// build-vector of two i32 constants. The bits are split into the low 32 bits
	// and the next 32 bits; the vector is {high, low} on big-endian layouts and
	// {low, high} on little-endian layouts.
	SDValue SparcTargetLowering::bitcastConstantFPToInt(ConstantFPSDNode *C,
	                                                    const SDLoc &DL,
	                                                    SelectionDAG &DAG) const {
	  const APInt Bits = C->getValueAPF().bitcastToAPInt();
	  SDValue LowWord = DAG.getConstant(Bits.zextOrTrunc(32), DL, MVT::i32);
	  SDValue HighWord =
	      DAG.getConstant(Bits.lshr(32).zextOrTrunc(32), DL, MVT::i32);

	  if (DAG.getDataLayout().isLittleEndian())
	    return DAG.getBuildVector(MVT::v2i32, DL, {LowWord, HighWord});
	  return DAG.getBuildVector(MVT::v2i32, DL, {HighWord, LowWord});
	}

	// DAG combine for BITCAST: when an f64 floating-point constant is bitcast to
	// v2i32, replace it with the equivalent integer build-vector produced by
	// bitcastConstantFPToInt. Returns an empty SDValue when the pattern does not
	// match.
	SDValue SparcTargetLowering::PerformBITCASTCombine(SDNode *N,
	                                                   DAGCombinerInfo &DCI) const {
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);

	  auto *FPConst = dyn_cast<ConstantFPSDNode>(Src);
	  if (!FPConst)
	    return SDValue();
	  if (N->getSimpleValueType(0) != MVT::v2i32)
	    return SDValue();
	  if (Src.getSimpleValueType() != MVT::f64)
	    return SDValue();

	  return bitcastConstantFPToInt(FPConst, dl, DCI.DAG);
	}

	// Target DAG-combine entry point. BITCAST nodes are handed to
	// PerformBITCASTCombine; every other opcode yields an empty SDValue.
	SDValue SparcTargetLowering::PerformDAGCombine(SDNode *N,
	                                               DAGCombinerInfo &DCI) const {
	  if (N->getOpcode() == ISD::BITCAST)
	    return PerformBITCASTCombine(N, DCI);
	  return SDValue();
	}

	// Expand a SELECT_CC pseudo instruction. The *_ICC variants are expanded
	// with the SP::BCOND branch opcode and the *_FCC variants with SP::FBCOND;
	// the expansion itself is done by expandSelectCC. Any other opcode is a
	// programming error.
	MachineBasicBlock *
	SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                                 MachineBasicBlock *BB) const {
	  unsigned BranchOpcode;
	  switch (MI.getOpcode()) {
	  case SP::SELECT_CC_Int_ICC:
	  case SP::SELECT_CC_FP_ICC:
	  case SP::SELECT_CC_DFP_ICC:
	  case SP::SELECT_CC_QFP_ICC:
	    BranchOpcode = SP::BCOND;
	    break;
	  case SP::SELECT_CC_Int_FCC:
	  case SP::SELECT_CC_FP_FCC:
	  case SP::SELECT_CC_DFP_FCC:
	  case SP::SELECT_CC_QFP_FCC:
	    BranchOpcode = SP::FBCOND;
	    break;
	  default:
	    llvm_unreachable("Unknown SELECT_CC!");
	  }
	  return expandSelectCC(MI, BB, BranchOpcode);
	}

	// Expand the SELECT_CC pseudo MI located in BB into explicit control flow.
	// Operands read from MI: 0 = destination register, 1 = value used when
	// arriving from ThisMBB, 2 = value used when arriving from IfFalseMBB,
	// 3 = condition code immediate. BROpcode is the conditional branch opcode to
	// emit. MI is erased and the newly created SinkMBB is returned.
	MachineBasicBlock *
	SparcTargetLowering::expandSelectCC(MachineInstr &MI, MachineBasicBlock *BB,
	unsigned BROpcode) const {
	const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
	DebugLoc dl = MI.getDebugLoc();
	unsigned CC = (SPCC::CondCodes)MI.getOperand(3).getImm();

	// To "insert" a SELECT_CC instruction, we actually have to insert the
	// triangle control-flow pattern. The incoming instruction knows the
	// destination vreg to set, the condition code register to branch on, the
	// true/false values to select between, and the condition code for the branch.
	//
	// We produce the following control flow:
	// ThisMBB
	// \| \
	// \| IfFalseMBB
	// \| /
	// SinkMBB
	const BasicBlock *LLVM_BB = BB->getBasicBlock();
	// Insertion point: immediately after BB in the function's block list.
	MachineFunction::iterator It = ++BB->getIterator();

	MachineBasicBlock *ThisMBB = BB;
	MachineFunction *F = BB->getParent();
	MachineBasicBlock *IfFalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
	F->insert(It, IfFalseMBB);
	F->insert(It, SinkMBB);

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(MI)), ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Set the new successors for ThisMBB.
	ThisMBB->addSuccessor(IfFalseMBB);
	ThisMBB->addSuccessor(SinkMBB);

	// Conditional branch from ThisMBB straight to SinkMBB on condition CC.
	BuildMI(ThisMBB, dl, TII.get(BROpcode))
	.addMBB(SinkMBB)
	.addImm(CC);

	// IfFalseMBB just falls through to SinkMBB.
	IfFalseMBB->addSuccessor(SinkMBB);

	// %Result = phi [ %TrueValue, ThisMBB ], [ %FalseValue, IfFalseMBB ]
	BuildMI(*SinkMBB, SinkMBB->begin(), dl, TII.get(SP::PHI),
	MI.getOperand(0).getReg())
	.addReg(MI.getOperand(1).getReg())
	.addMBB(ThisMBB)
	.addReg(MI.getOperand(2).getReg())
	.addMBB(IfFalseMBB);

	MI.eraseFromParent(); // The pseudo instruction is gone now.
	return SinkMBB;
	}

	//===----------------------------------------------------------------------===//
	// Sparc Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// getConstraintType - Given a constraint letter, return the type of
	/// constraint it is for this target.
	///
	/// Single-letter constraints 'r', 'f' and 'e' are register-class
	/// constraints and 'I' is the SIMM13 immediate constraint. Everything else
	/// (including multi-letter constraints) is deferred to
	/// TargetLowering::getConstraintType.
	SparcTargetLowering::ConstraintType
	SparcTargetLowering::getConstraintType(StringRef Constraint) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	default: break;
	case 'r':
	case 'f':
	case 'e':
	return C_RegisterClass;
	case 'I': // SIMM13
	- return C_Other;
	+ return C_Immediate;
	}
	}

	return TargetLowering::getConstraintType(Constraint);
	}

	// Rate how well the operand described by `info` matches `constraint`.
	//  - Without an operand value no real match is possible, but the constraint
	//    is still allowed at the lowest weight (CW_Default).
	//  - 'I' (SIMM13) is CW_Constant when the operand is a ConstantInt that fits
	//    in a signed 13-bit immediate, and CW_Invalid otherwise.
	//  - Every other constraint letter is rated by the TargetLowering base
	//    implementation.
	TargetLowering::ConstraintWeight SparcTargetLowering::
	getSingleConstraintMatchWeight(AsmOperandInfo &info,
	                               const char *constraint) const {
	  if (!info.CallOperandVal)
	    return CW_Default;

	  if (*constraint != 'I')
	    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);

	  // SIMM13
	  if (ConstantInt *Imm = dyn_cast<ConstantInt>(info.CallOperandVal))
	    if (isInt<13>(Imm->getSExtValue()))
	      return CW_Constant;

	  return CW_Invalid;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Only single-letter constraints are supported. For 'I', a constant operand
	/// that fits in a signed 13-bit immediate is appended to Ops as a target
	/// constant, and a constant that does not fit is rejected (nothing is
	/// added). All remaining cases are delegated to the TargetLowering base
	/// implementation.
	void SparcTargetLowering::
	LowerAsmOperandForConstraint(SDValue Op,
	                             std::string &Constraint,
	                             std::vector<SDValue> &Ops,
	                             SelectionDAG &DAG) const {
	  // Only support length 1 constraints for now.
	  if (Constraint.length() > 1)
	    return;

	  if (Constraint[0] == 'I') {
	    if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op)) {
	      // Out-of-range immediate: invalid operand, add nothing.
	      if (!isInt<13>(Imm->getSExtValue()))
	        return;

	      Ops.push_back(DAG.getTargetConstant(Imm->getSExtValue(), SDLoc(Op),
	                                          Op.getValueType()));
	      return;
	    }
	  }

	  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	// Map an inline-asm register constraint to a (register, register class)
	// pair.
	//  - Single-letter 'r', 'f' and 'e' select a register class based on VT
	//    (and, for 'r', on whether the subtarget is 64-bit); a null register
	//    class is returned for 'f'/'e' with an unsupported VT.
	//  - Braced names of the form {r<N>} (N <= 31) and {f<N>} (N <= 63) are
	//    rewritten to the register names the base implementation is then asked
	//    to resolve.
	//  - Anything else is passed to the TargetLowering base implementation
	//    unchanged.
	std::pair<unsigned, const TargetRegisterClass *>
	SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'r':
	if (VT == MVT::v2i32)
	return std::make_pair(0U, &SP::IntPairRegClass);
	else if (Subtarget->is64Bit())
	return std::make_pair(0U, &SP::I64RegsRegClass);
	else
	return std::make_pair(0U, &SP::IntRegsRegClass);
	case 'f':
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	return std::make_pair(0U, &SP::FPRegsRegClass);
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	return std::make_pair(0U, &SP::LowDFPRegsRegClass);
	else if (VT == MVT::f128)
	return std::make_pair(0U, &SP::LowQFPRegsRegClass);
	// This will generate an error message
	return std::make_pair(0U, nullptr);
	case 'e':
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	return std::make_pair(0U, &SP::FPRegsRegClass);
	else if (VT == MVT::f64 \|\| VT == MVT::i64 )
	return std::make_pair(0U, &SP::DFPRegsRegClass);
	else if (VT == MVT::f128)
	return std::make_pair(0U, &SP::QFPRegsRegClass);
	// This will generate an error message
	return std::make_pair(0U, nullptr);
	}
	} else if (!Constraint.empty() && Constraint.size() <= 5
	&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
	// constraint = '{r<d>}'
	// Remove the braces from around the name.
	StringRef name(Constraint.data()+1, Constraint.size()-2);
	// Handle register aliases:
	// r0-r7 -> g0-g7
	// r8-r15 -> o0-o7
	// r16-r23 -> l0-l7
	// r24-r31 -> i0-i7
	uint64_t intVal = 0;
	if (name.substr(0, 1).equals("r")
	&& !name.substr(1).getAsInteger(10, intVal) && intVal <= 31) {
	// intVal/8 picks the register group letter, intVal%8 the index in it.
	const char regTypes[] = { 'g', 'o', 'l', 'i' };
	char regType = regTypes[intVal/8];
	char regIdx = '0' + (intVal % 8);
	char tmp[] = { '{', regType, regIdx, '}', 0 };
	std::string newConstraint = std::string(tmp);
	return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
	VT);
	}
	if (name.substr(0, 1).equals("f") &&
	!name.substr(1).getAsInteger(10, intVal) && intVal <= 63) {
	std::string newConstraint;

	// Rename the register according to VT: f<N> stays f<N> for f32 (or an
	// unspecified VT), becomes d<N/2> for f64 when N is even, and q<N/4> for
	// f128 when N is a multiple of 4. Other combinations are rejected.
	if (VT == MVT::f32 \|\| VT == MVT::Other) {
	newConstraint = "{f" + utostr(intVal) + "}";
	} else if (VT == MVT::f64 && (intVal % 2 == 0)) {
	newConstraint = "{d" + utostr(intVal / 2) + "}";
	} else if (VT == MVT::f128 && (intVal % 4 == 0)) {
	newConstraint = "{q" + utostr(intVal / 4) + "}";
	} else {
	return std::make_pair(0U, nullptr);
	}
	return TargetLowering::getRegForInlineAsmConstraint(TRI, newConstraint,
	VT);
	}
	}

	return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	// Reports whether an offset may be folded into the global address node GA.
	// Always returns false; GA is not inspected.
	bool
	SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	// The Sparc target isn't yet aware of offsets.
	return false;
	}

	// Custom type-legalization hook: computes replacement values for the
	// results of N and appends them to Results. Returning without appending
	// anything leaves the node to the default handling. Opcodes without a case
	// below are a programming error (llvm_unreachable).
	void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue>& Results,
	SelectionDAG &DAG) const {

	SDLoc dl(N);

	RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;

	switch (N->getOpcode()) {
	default:
	llvm_unreachable("Do not know how to custom type legalize this operation!");

	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT:
	// Custom lower only f128 -> i64 conversions (both conditions must hold);
	// these become a libcall via LowerF128Op.
	if (N->getOperand(0).getValueType() != MVT::f128
	\|\| N->getValueType(0) != MVT::i64)
	return;
	libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
	? RTLIB::FPTOSINT_F128_I64
	: RTLIB::FPTOUINT_F128_I64);

	Results.push_back(LowerF128Op(SDValue(N, 0),
	DAG,
	getLibcallName(libCall),
	1));
	return;
	case ISD::READCYCLECOUNTER: {
	assert(Subtarget->hasLeonCycleCounter());
	// Build the i64 result from two i32 register reads: the low half from
	// ASR23 and the high half from G0.
	SDValue Lo = DAG.getCopyFromReg(N->getOperand(0), dl, SP::ASR23, MVT::i32);
	SDValue Hi = DAG.getCopyFromReg(Lo, dl, SP::G0, MVT::i32);
	SDValue Ops[] = { Lo, Hi };
	SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops);
	Results.push_back(Pair);
	// Second result: the node's incoming operand 0 is passed through.
	Results.push_back(N->getOperand(0));
	return;
	}
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	// Custom lower only i64 -> f128 conversions (both conditions must hold);
	// these become a libcall via LowerF128Op.
	if (N->getValueType(0) != MVT::f128
	\|\| N->getOperand(0).getValueType() != MVT::i64)
	return;

	libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
	? RTLIB::SINTTOFP_I64_F128
	: RTLIB::UINTTOFP_I64_F128);

	Results.push_back(LowerF128Op(SDValue(N, 0),
	DAG,
	getLibcallName(libCall),
	1));
	return;
	case ISD::LOAD: {
	LoadSDNode *Ld = cast<LoadSDNode>(N);
	// Custom handling only for i64: turn i64 load into a v2i32 load,
	// and a bitcast.
	if (Ld->getValueType(0) != MVT::i64 \|\| Ld->getMemoryVT() != MVT::i64)
	return;

	SDLoc dl(N);
	SDValue LoadRes = DAG.getExtLoad(
	Ld->getExtensionType(), dl, MVT::v2i32, Ld->getChain(),
	Ld->getBasePtr(), Ld->getPointerInfo(), MVT::v2i32, Ld->getAlignment(),
	Ld->getMemOperand()->getFlags(), Ld->getAAInfo());

	// Results: the loaded value bitcast back to i64, then the load's chain.
	SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes);
	Results.push_back(Res);
	Results.push_back(LoadRes.getValue(1));
	return;
	}
	}
	}

	// LOAD_STACK_GUARD lowering is always enabled when targeting Linux; on any
	// other target the TargetLowering default decides.
	bool SparcTargetLowering::useLoadStackGuardNode() const {
	  if (Subtarget->isTargetLinux())
	    return true;
	  return TargetLowering::useLoadStackGuardNode();
	}

	// When targeting Linux no stack-protector declarations are inserted into M
	// (this disables the global variable loading); every other target uses the
	// TargetLowering default.
	void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
	  if (Subtarget->isTargetLinux())
	    return;
	  TargetLowering::insertSSPDeclarations(M);
	}
	Index: vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/SystemZ/SystemZISelLowering.cpp (revision 351303)
	@@ -1,7768 +1,7768 @@
	//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the SystemZTargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "SystemZISelLowering.h"
	#include "SystemZCallingConv.h"
	#include "SystemZConstantPoolValue.h"
	#include "SystemZMachineFunctionInfo.h"
	#include "SystemZTargetMachine.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/KnownBits.h"
	#include <cctype>

	using namespace llvm;

	#define DEBUG_TYPE "systemz-lower"

	namespace {
	// Describes a comparison between two operands. Only the operands are
	// supplied at construction; every other field starts out as zero.
	struct Comparison {
	  Comparison(SDValue Op0In, SDValue Op1In) : Op0(Op0In), Op1(Op1In) {}

	  // Left- and right-hand operands of the comparison.
	  SDValue Op0, Op1;

	  // Opcode to use when comparing Op0 with Op1.
	  unsigned Opcode = 0;

	  // SystemZICMP value; meaningful for integer comparisons only.
	  unsigned ICmpType = 0;

	  // Mask of the CC values that Opcode is able to produce.
	  unsigned CCValid = 0;

	  // Mask of the CC values under which the original condition holds.
	  unsigned CCMask = 0;
	};
	} // end anonymous namespace

	// Returns true for MVT::i32 and false for MVT::i64. Any other type is
	// unsupported and treated as a programming error.
	static bool is32Bit(EVT VT) {
	  const auto Ty = VT.getSimpleVT().SimpleTy;
	  if (Ty == MVT::i32)
	    return true;
	  if (Ty == MVT::i64)
	    return false;
	  llvm_unreachable("Unsupported type");
	}

	// Return a version of MachineOperand that can be safely used before the
	// final use.
	//
	// Op is taken by value, so the caller's operand is left untouched. For a
	// register operand the kill flag is cleared on the copy; other operand kinds
	// are returned unchanged.
	static MachineOperand earlyUseOperand(MachineOperand Op) {
	if (Op.isReg())
	Op.setIsKill(false);
	return Op;
	}

	SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
	const SystemZSubtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));

	// Set up the register classes.
	if (Subtarget.hasHighWord())
	addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
	else
	addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
	addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
	if (Subtarget.hasVector()) {
	addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
	addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
	} else {
	addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
	addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
	}
	if (Subtarget.hasVectorEnhancements1())
	addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
	else
	addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);

	if (Subtarget.hasVector()) {
	addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
	addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
	addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
	addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
	addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
	addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
	}

	// Compute derived properties from the register classes
	computeRegisterProperties(Subtarget.getRegisterInfo());

	// Set up special registers.
	setStackPointerRegisterToSaveRestore(SystemZ::R15D);

	// TODO: It may be better to default to latency-oriented scheduling, however
	// LLVM's current latency-oriented scheduler can't handle physreg definitions
	// such as SystemZ has with CC, so set this to the register-pressure
	// scheduler, because it can.
	setSchedulingPreference(Sched::RegPressure);

	setBooleanContents(ZeroOrOneBooleanContent);
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// Instructions are strings of 2-byte aligned 2-byte values.
	setMinFunctionAlignment(2);
	// For performance reasons we prefer 16-byte alignment.
	setPrefFunctionAlignment(4);

	// Handle operations that are handled in a similar way for all types.
	for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
	I <= MVT::LAST_FP_VALUETYPE;
	++I) {
	MVT VT = MVT::SimpleValueType(I);
	if (isTypeLegal(VT)) {
	// Lower SET_CC into an IPM-based sequence.
	setOperationAction(ISD::SETCC, VT, Custom);

	// Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
	setOperationAction(ISD::SELECT, VT, Expand);

	// Lower SELECT_CC and BR_CC into separate comparisons and branches.
	setOperationAction(ISD::SELECT_CC, VT, Custom);
	setOperationAction(ISD::BR_CC, VT, Custom);
	}
	}

	// Expand jump table branches as address arithmetic followed by an
	// indirect jump.
	setOperationAction(ISD::BR_JT, MVT::Other, Expand);

	// Expand BRCOND into a BR_CC (see above).
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);

	// Handle integer types.
	for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
	I <= MVT::LAST_INTEGER_VALUETYPE;
	++I) {
	MVT VT = MVT::SimpleValueType(I);
	if (isTypeLegal(VT)) {
	// Expand individual DIV and REMs into DIVREMs.
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Custom);
	setOperationAction(ISD::UDIVREM, VT, Custom);

	// Support addition/subtraction with overflow.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);

	// Support addition/subtraction with carry.
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);

	// Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
	// stores, putting a serialization instruction after the stores.
	setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);

	// Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
	// available, or if the operand is constant.
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);

	// Use POPCNT on z196 and above.
	if (Subtarget.hasPopulationCount())
	setOperationAction(ISD::CTPOP, VT, Custom);
	else
	setOperationAction(ISD::CTPOP, VT, Expand);

	// No special instructions for these.
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);

	// Use MUL_LOHI where possible instead of MULH.
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Custom);
	setOperationAction(ISD::UMUL_LOHI, VT, Custom);

	// Only z196 and above have native support for conversions to unsigned.
	// On z10, promoting to i64 doesn't generate an inexact condition for
	// values that are outside the i32 range but in the i64 range, so use
	// the default expansion.
	if (!Subtarget.hasFPExtension())
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	}
	}

	// Type legalization will convert 8- and 16-bit atomic operations into
	// forms that operate on i32s (but still keeping the original memory VT).
	// Lower them into full i32 operations.
	setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);

	// Even though i128 is not a legal type, we still need to custom lower
	// the atomic operations in order to exploit SystemZ instructions.
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
	setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);

	// We can use the CC result of compare-and-swap to implement
	// the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Custom);
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);

	setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

	// Traps are legal, as we will convert them to "j .+2".
	setOperationAction(ISD::TRAP, MVT::Other, Legal);

	// z10 has instructions for signed but not unsigned FP conversion.
	// Handle unsigned 32-bit types as signed 64-bit types.
	if (!Subtarget.hasFPExtension()) {
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
	}

	// We have native support for a 64-bit CTLZ, via FLOGR.
	setOperationAction(ISD::CTLZ, MVT::i32, Promote);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
	setOperationAction(ISD::CTLZ, MVT::i64, Legal);

	// On arch13 we have native support for a 64-bit CTPOP.
	if (Subtarget.hasMiscellaneousExtensions3()) {
	setOperationAction(ISD::CTPOP, MVT::i32, Promote);
	setOperationAction(ISD::CTPOP, MVT::i64, Legal);
	}

	// Give LowerOperation the chance to replace 64-bit ORs with subregs.
	setOperationAction(ISD::OR, MVT::i64, Custom);

	// FIXME: Can we support these natively?
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
	setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);

	// We have native instructions for i8, i16 and i32 extensions, but not i1.
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
	for (MVT VT : MVT::integer_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
	}

	// Handle the various types of symbolic address.
	setOperationAction(ISD::ConstantPool, PtrVT, Custom);
	setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
	setOperationAction(ISD::BlockAddress, PtrVT, Custom);
	setOperationAction(ISD::JumpTable, PtrVT, Custom);

	// We need to handle dynamic allocations specially because of the
	// 160-byte area at the bottom of the stack.
	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
	setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);

	// Use custom expanders so that we can force the function to use
	// a frame pointer.
	setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);

	// Handle prefetches with PFD or PFDRL.
	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	for (MVT VT : MVT::vector_valuetypes()) {
	// Assume by default that all vector operations need to be expanded.
	for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
	if (getOperationAction(Opcode, VT) == Legal)
	setOperationAction(Opcode, VT, Expand);

	// Likewise all truncating stores and extending loads.
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}

	if (isTypeLegal(VT)) {
	// These operations are legal for anything that can be stored in a
	// vector register, even if there is no native support for the format
	// as such. In particular, we can do these for v4f32 even though there
	// are no specific instructions for that format.
	setOperationAction(ISD::LOAD, VT, Legal);
	setOperationAction(ISD::STORE, VT, Legal);
	setOperationAction(ISD::VSELECT, VT, Legal);
	setOperationAction(ISD::BITCAST, VT, Legal);
	setOperationAction(ISD::UNDEF, VT, Legal);

	// Likewise, except that we need to replace the nodes with something
	// more specific.
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}
	}

	// Handle integer vector types.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	if (isTypeLegal(VT)) {
	// These operations have direct equivalents.
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
	setOperationAction(ISD::ADD, VT, Legal);
	setOperationAction(ISD::SUB, VT, Legal);
	if (VT != MVT::v2i64)
	setOperationAction(ISD::MUL, VT, Legal);
	setOperationAction(ISD::AND, VT, Legal);
	setOperationAction(ISD::OR, VT, Legal);
	setOperationAction(ISD::XOR, VT, Legal);
	if (Subtarget.hasVectorEnhancements1())
	setOperationAction(ISD::CTPOP, VT, Legal);
	else
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Legal);
	setOperationAction(ISD::CTLZ, VT, Legal);

	// Convert a GPR scalar to a vector by inserting it into element 0.
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);

	// Use a series of unpacks for extensions.
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);

	// Detect shifts by a scalar amount and convert them into
	// V*_BY_SCALAR.
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::SRL, VT, Custom);

	// At present ROTL isn't matched by DAGCombiner. ROTR should be
	// converted into ROTL.
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);

	// Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
	// and inverting the result as necessary.
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	}

	if (Subtarget.hasVector()) {
	// There should be no need to check for float types other than v2f64
	// since <2 x f32> isn't a legal type.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
	}

	if (Subtarget.hasVectorEnhancements2()) {
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
	}

	// Handle floating-point types.
	for (unsigned I = MVT::FIRST_FP_VALUETYPE;
	I <= MVT::LAST_FP_VALUETYPE;
	++I) {
	MVT VT = MVT::SimpleValueType(I);
	if (isTypeLegal(VT)) {
	// We can use FI for FRINT.
	setOperationAction(ISD::FRINT, VT, Legal);

	// We can use the extended form of FI for other rounding operations.
	if (Subtarget.hasFPExtension()) {
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FROUND, VT, Legal);
	}

	// No special instructions for these.
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);

	// Handle constrained floating-point operations.
	setOperationAction(ISD::STRICT_FADD, VT, Legal);
	setOperationAction(ISD::STRICT_FSUB, VT, Legal);
	setOperationAction(ISD::STRICT_FMUL, VT, Legal);
	setOperationAction(ISD::STRICT_FDIV, VT, Legal);
	setOperationAction(ISD::STRICT_FMA, VT, Legal);
	setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
	setOperationAction(ISD::STRICT_FRINT, VT, Legal);
	setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
	setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
	if (Subtarget.hasFPExtension()) {
	setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
	setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
	setOperationAction(ISD::STRICT_FROUND, VT, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
	}
	}
	}

	// Handle floating-point vector types.
	if (Subtarget.hasVector()) {
	// Scalar-to-vector conversion is just a subreg.
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

	// Some insertions and extractions can be done directly but others
	// need to go via integers.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

	// These operations have direct equivalents.
	setOperationAction(ISD::FADD, MVT::v2f64, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
	setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
	setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
	setOperationAction(ISD::FMA, MVT::v2f64, Legal);
	setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
	setOperationAction(ISD::FABS, MVT::v2f64, Legal);
	setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
	setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
	setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
	setOperationAction(ISD::FROUND, MVT::v2f64, Legal);

	// Handle constrained floating-point operations.
	setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
	setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
	}

	// The vector enhancements facility 1 has instructions for these.
	if (Subtarget.hasVectorEnhancements1()) {
	setOperationAction(ISD::FADD, MVT::v4f32, Legal);
	setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
	setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
	setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
	setOperationAction(ISD::FMA, MVT::v4f32, Legal);
	setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
	setOperationAction(ISD::FABS, MVT::v4f32, Legal);
	setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
	setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
	setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
	setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
	setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

	setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);

	setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
	setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);

	setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);

	setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);

	setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
	setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
	setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);

	// Handle constrained floating-point operations.
	setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
	setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f128,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal);
	setOperationAction(ISD::STRICT_FMINNUM, VT, Legal);
	}
	}

	// We have fused multiply-addition for f32 and f64 but not f128.
	setOperationAction(ISD::FMA, MVT::f32, Legal);
	setOperationAction(ISD::FMA, MVT::f64, Legal);
	if (Subtarget.hasVectorEnhancements1())
	setOperationAction(ISD::FMA, MVT::f128, Legal);
	else
	setOperationAction(ISD::FMA, MVT::f128, Expand);

	// We don't have a copysign instruction on vector registers.
	if (Subtarget.hasVectorEnhancements1())
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);

	// Needed so that we don't try to implement f128 constant loads using
	// a load-and-extend of a f80 constant (in cases where the constant
	// would fit in an f80).
	for (MVT VT : MVT::fp_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);

	// We don't have extending load instruction on vector registers.
	if (Subtarget.hasVectorEnhancements1()) {
	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
	}

	// Floating-point truncation and stores need to be done separately.
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);

	// We have 64-bit FPR<->GPR moves, but need special handling for
	// 32-bit forms.
	if (!Subtarget.hasVector()) {
	setOperationAction(ISD::BITCAST, MVT::i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::f32, Custom);
	}

	// VASTART and VACOPY need to deal with the SystemZ-specific varargs
	// structure, but VAEND is a no-op.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VACOPY, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	// Codes for which we want to perform some z-specific combinations.
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::FP_ROUND);
	setTargetDAGCombine(ISD::FP_EXTEND);
	setTargetDAGCombine(ISD::BSWAP);
	setTargetDAGCombine(ISD::SDIV);
	setTargetDAGCombine(ISD::UDIV);
	setTargetDAGCombine(ISD::SREM);
	setTargetDAGCombine(ISD::UREM);

	// Handle intrinsics.
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	// We want to use MVC in preference to even a single load/store pair.
	MaxStoresPerMemcpy = 0;
	MaxStoresPerMemcpyOptSize = 0;

	// The main memset sequence is a byte store followed by an MVC.
	// Two STC or MV..I stores win over that, but the kind of fused stores
	// generated by target-independent code don't when the byte value is
	// variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better
	// than "STC;MVC". Handle the choice in target-specific code instead.
	MaxStoresPerMemset = 0;
	MaxStoresPerMemsetOptSize = 0;
	}

	// Pick the result type of a comparison on operands of type VT. Vector
	// operands give VT with its element type changed to an integer type;
	// everything else gives i32. The DataLayout and LLVMContext are not used.
	EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
	                                              LLVMContext &, EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();
	  return MVT::i32;
	}

	// Report whether a fused multiply-add should be preferred for VT. The
	// decision is made on the scalar type of VT: f32 and f64 always qualify,
	// f128 qualifies only when the subtarget has vector enhancements 1, and
	// every other (or non-simple) type does not.
	bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  const EVT ScalarVT = VT.getScalarType();
	  if (!ScalarVT.isSimple())
	    return false;

	  const auto SimpleTy = ScalarVT.getSimpleVT().SimpleTy;
	  if (SimpleTy == MVT::f32 || SimpleTy == MVT::f64)
	    return true;
	  if (SimpleTy == MVT::f128)
	    return Subtarget.hasVectorEnhancements1();
	  return false;
	}

	// Return true if the constant can be generated with a vector instruction,
	// such as VGM, VGMB or VREPI. On success, Opcode, OpVals and VecVT are
	// filled in with the chosen node opcode, its immediate operands and the
	// vector type to build it in. Returns false if the subtarget lacks the
	// vector facility (or, for an FP128 constant, vector enhancements 1), or
	// if none of the encodings below fits.
	bool SystemZVectorConstantInfo::isVectorConstantLegal(
	    const SystemZSubtarget &Subtarget) {
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  if (!Subtarget.hasVector() ||
	      (isFP128 && !Subtarget.hasVectorEnhancements1()))
	    return false;

	  // Try using VECTOR GENERATE BYTE MASK. This is the architecturally-
	  // preferred way of creating all-zero and all-one vectors so give it
	  // priority over other methods below.
	  unsigned Mask = 0;
	  unsigned I = 0;
	  for (; I < SystemZ::VectorBytes; ++I) {
	    // Every byte must be 0x00 or 0xff; bit I of Mask records the 0xff ones.
	    uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue();
	    if (Byte == 0xff)
	      Mask |= 1ULL << I;
	    else if (Byte != 0)
	      break;
	  }
	  // The loop ran to completion only if no other byte value was seen.
	  if (I == SystemZ::VectorBytes) {
	    Opcode = SystemZISD::BYTE_MASK;
	    OpVals.push_back(Mask);
	    VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16);
	    return true;
	  }

	  // The remaining encodings work on a 64-bit (or narrower) splat value.
	  if (SplatBitSize > 64)
	    return false;

	  auto tryValue = [&](uint64_t Value) -> bool {
	    // Try VECTOR REPLICATE IMMEDIATE
	    int64_t SignedValue = SignExtend64(Value, SplatBitSize);
	    if (isInt<16>(SignedValue)) {
	      OpVals.push_back(static_cast<unsigned>(SignedValue));
	      Opcode = SystemZISD::REPLICATE;
	      VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
	                               SystemZ::VectorBits / SplatBitSize);
	      return true;
	    }
	    // Try VECTOR GENERATE MASK
	    unsigned Start, End;
	    if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) {
	      // isRxSBGMask returns the bit numbers for a full 64-bit value, with 0
	      // denoting 1 << 63 and 63 denoting 1. Convert them to bit numbers for
	      // an SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1).
	      OpVals.push_back(Start - (64 - SplatBitSize));
	      OpVals.push_back(End - (64 - SplatBitSize));
	      Opcode = SystemZISD::ROTATE_MASK;
	      VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
	                               SystemZ::VectorBits / SplatBitSize);
	      return true;
	    }
	    return false;
	  };

	  // First try assuming that any undefined bits above the highest set bit
	  // and below the lowest set bit are 1s. This increases the likelihood of
	  // being able to use a sign-extended element value in VECTOR REPLICATE
	  // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
	  uint64_t SplatBitsZ = SplatBits.getZExtValue();
	  uint64_t SplatUndefZ = SplatUndef.getZExtValue();
	  uint64_t Lower =
	      (SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
	  uint64_t Upper =
	      (SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
	  if (tryValue(SplatBitsZ | Upper | Lower))
	    return true;

	  // Now try assuming that any undefined bits between the first and
	  // last defined set bits are set. This increases the chances of
	  // using a non-wraparound mask.
	  uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
	  return tryValue(SplatBitsZ | Middle);
	}

	// Describe a floating-point immediate as a vector constant. IntBits gets
	// the immediate's bit pattern widened to 128 bits by zextOrSelf, isFP128
	// records whether the value uses IEEE quad semantics, and SplatBits /
	// SplatBitSize describe the narrowest element (never below 8 bits) that
	// reproduces the immediate's own bit pattern when repeated. No bits are
	// treated as undefined.
	SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
	  IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
	  isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());

	  // Find the smallest splat by halving while both halves are identical.
	  SplatBits = FPImm.bitcastToAPInt();
	  unsigned Width = SplatBits.getBitWidth();
	  while (Width > 8) {
	    unsigned HalfSize = Width / 2;
	    APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
	    APInt LowValue = SplatBits.trunc(HalfSize);

	    // If the two halves do not match, or a half would be narrower than a
	    // byte, stop here.
	    if (HighValue != LowValue || 8 > HalfSize)
	      break;

	    SplatBits = HighValue;
	    Width = HalfSize;
	  }
	  SplatUndef = 0;
	  SplatBitSize = Width;
	}

	// Describe a constant BUILD_VECTOR node. Two isConstantSplat queries are
	// made: the first, with a 128-bit minimum, fills IntBits; the second, with
	// an 8-bit minimum, fills SplatBits. Both write SplatUndef and
	// SplatBitSize, so the values left in those members come from the second
	// query. The return values of the queries are ignored.
	SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
	  assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR");

	  const unsigned FullVectorBits = 128;
	  const unsigned MinSplatBits = 8;
	  // Final isConstantSplat argument (isBigEndian in the LLVM API).
	  const bool IsBigEndian = true;
	  bool HasAnyUndefs;

	  // Get IntBits by finding the 128 bit splat.
	  BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs,
	                       FullVectorBits, IsBigEndian);

	  // Get SplatBits by finding the 8 bit or greater splat.
	  BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
	                       MinSplatBits, IsBigEndian);
	}

	// Return true if the floating-point immediate Imm can be materialized
	// directly: either it is a (positive or negative) zero, or the vector
	// constant logic in SystemZVectorConstantInfo accepts it for this
	// subtarget. VT and ForCodeSize are not consulted.
	bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	                                         bool ForCodeSize) const {
	  // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
	  const bool IsAnyZero = Imm.isZero() || Imm.isNegZero();
	  if (IsAnyZero)
	    return true;

	  return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
	}

	// Return true if Imm can be used as a comparison immediate, i.e. it fits
	// in either a signed or an unsigned 32-bit field.
	bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  // We can use CGFI or CLGFI.
	  return isInt<32>(Imm) || isUInt<32>(Imm);
	}

	// Return true if Imm can be used as an addition immediate, i.e. either
	// Imm or its negation fits in an unsigned 32-bit field.
	bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  // We can use ALGFI or SLGFI.
	  // Negate in unsigned arithmetic: negating INT64_MIN as a signed value
	  // would overflow, whereas the unsigned result is well defined and has
	  // the same bit pattern for every other input.
	  const uint64_t NegatedImm = 0 - static_cast<uint64_t>(Imm);
	  return isUInt<32>(Imm) || isUInt<32>(NegatedImm);
	}

	// Misaligned accesses are always allowed, whatever the type, alignment,
	// address space or flags. When the caller supplies Fast, it is set to true
	// as well.
	bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
	    EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
	  // Unaligned accesses should never be slower than the expanded version.
	  // We check specifically for aligned accesses in the few cases where
	  // they are required.
	  if (Fast != nullptr)
	    *Fast = true;

	  return true;
	}

	// Describes which addressing features a memory access may use.
	struct AddressingMode {
	  // Whether a long displacement is supported.
	  bool LongDisplacement;

	  // Whether an index register is supported.
	  bool IndexReg;

	  // Both properties must be given explicitly; there is no default state.
	  AddressingMode(bool AllowLongDisp, bool AllowIndexReg)
	      : LongDisplacement(AllowLongDisp), IndexReg(AllowIndexReg) {}
	};

	// Return the desired addressing mode for a Load which has only one use (in
	// the same block) which is a Store. HasVector says whether the subtarget
	// has vector support; Ty is the type being loaded and stored.
	static AddressingMode getLoadStoreAddrMode(bool HasVector,
	                                           Type *Ty) {
	  // With vector support a Load->Store combination may be combined to either
	  // an MVC or vector operations and it seems to work best to allow the
	  // vector addressing mode.
	  if (HasVector)
	    return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);

	  // Otherwise only the MVC case (an 8-bit integer) is special: it gets
	  // neither a long displacement nor an index register.
	  bool MVC = Ty->isIntegerTy(8);
	  return AddressingMode(!MVC/*LongDispl*/, !MVC/*IdxReg*/);
	}

	// Return the addressing mode which seems most desirable given an LLVM
	// Instruction pointer. I must not be null. HasVector says whether the
	// subtarget has vector support. The default, when no special case applies,
	// is to allow both a long displacement and an index register.
	static AddressingMode
	supportedAddressingMode(Instruction *I, bool HasVector) {
	  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
	    switch (II->getIntrinsicID()) {
	    default: break;
	    case Intrinsic::memset:
	    case Intrinsic::memmove:
	    case Intrinsic::memcpy:
	      return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
	    }
	  }

	  if (isa<LoadInst>(I) && I->hasOneUse()) {
	    // The user is dereferenced unconditionally below, so use cast<>
	    // rather than dyn_cast<>, which could hand back a null pointer.
	    auto *SingleUser = cast<Instruction>(*I->user_begin());
	    if (SingleUser->getParent() == I->getParent()) {
	      if (isa<ICmpInst>(SingleUser)) {
	        if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
	          if (C->getBitWidth() <= 64 &&
	              (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
	            // Comparison of memory with 16 bit signed / unsigned immediate
	            return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
	      } else if (isa<StoreInst>(SingleUser))
	        // Load->Store
	        return getLoadStoreAddrMode(HasVector, I->getType());
	    }
	  } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
	    if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
	      if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
	        // Load->Store
	        return getLoadStoreAddrMode(HasVector, LoadI->getType());
	  }

	  if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {

	    // * Use LDE instead of LE/LEY for z13 to avoid partial register
	    //   dependencies (LDE only supports small offsets).
	    // * Utilize the vector registers to hold floating point
	    //   values (vector load / store instructions only support small
	    //   offsets).

	    Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
	                         I->getOperand(0)->getType());
	    bool IsFPAccess = MemAccessTy->isFloatingPointTy();
	    bool IsVectorAccess = MemAccessTy->isVectorTy();

	    // A store of an extracted vector element will be combined into a VSTE type
	    // instruction.
	    if (!IsVectorAccess && isa<StoreInst>(I)) {
	      Value *DataOp = I->getOperand(0);
	      if (isa<ExtractElementInst>(DataOp))
	        IsVectorAccess = true;
	    }

	    // A load which gets inserted into a vector element will be combined into a
	    // VLE type instruction.
	    if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
	      User *LoadUser = *I->user_begin();
	      if (isa<InsertElementInst>(LoadUser))
	        IsVectorAccess = true;
	    }

	    if (IsFPAccess || IsVectorAccess)
	      return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
	  }

	  return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
	}

	bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type Ty, unsigned AS, Instruction I) const {
	// Punt on globals for now, although they can be used in limited
	// RELATIVE LONG cases.
	if (AM.BaseGV)
	return false;

	// Require a 20-bit signed offset.
	if (!isInt<20>(AM.BaseOffs))
	return false;

	AddressingMode SupportedAM(true, true);
	if (I != nullptr)
	SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());

	if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
	return false;

	if (!SupportedAM.IndexReg)
	// No indexing allowed.
	return AM.Scale == 0;
	else
	// Indexing is OK but no scale factor can be applied.
	return AM.Scale == 0 \|\| AM.Scale == 1;
	}

	bool SystemZTargetLowering::isTruncateFree(Type FromType, Type ToType) const {
	if (!FromType->isIntegerTy() \|\| !ToType->isIntegerTy())
	return false;
	unsigned FromBits = FromType->getPrimitiveSizeInBits();
	unsigned ToBits = ToType->getPrimitiveSizeInBits();
	return FromBits > ToBits;
	}

	// Return true if truncating FromVT to ToVT is free: both must be integer
	// value types and FromVT must be strictly wider than ToVT.
	bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
	  const bool BothIntegers = FromVT.isInteger() && ToVT.isInteger();
	  if (!BothIntegers)
	    return false;

	  unsigned FromBits = FromVT.getSizeInBits();
	  unsigned ToBits = ToVT.getSizeInBits();
	  return FromBits > ToBits;
	}

	//===----------------------------------------------------------------------===//
	// Inline asm support
	//===----------------------------------------------------------------------===//

	// Classify an inline-asm constraint string. Single-letter constraints are
	// sorted into three groups below: register-class letters, memory letters
	// and immediate-constant letters. Any other letter, and any constraint
	// that is not exactly one character long, is passed on to the generic
	// TargetLowering::getConstraintType().
	TargetLowering::ConstraintType
	SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	// Letters that name a class of registers.
	case 'a': // Address register
	case 'd': // Data register (equivalent to 'r')
	case 'f': // Floating-point register
	case 'h': // High-part register
	case 'r': // General-purpose register
	case 'v': // Vector register
	return C_RegisterClass;

	// Letters that name a memory operand form.
	case 'Q': // Memory with base and unsigned 12-bit displacement
	case 'R': // Likewise, plus an index
	case 'S': // Memory with base and signed 20-bit displacement
	case 'T': // Likewise, plus an index
	case 'm': // Equivalent to 'T'.
	return C_Memory;

	// Letters that name an immediate constant.
	case 'I': // Unsigned 8-bit constant
	case 'J': // Unsigned 12-bit constant
	case 'K': // Signed 16-bit constant
	case 'L': // Signed 20-bit displacement (on all targets we support)
	case 'M': // 0x7fffffff
	- return C_Other;
	+ return C_Immediate;

	default:
	break;
	}
	}
	return TargetLowering::getConstraintType(Constraint);
	}

	// Score how well the operand described by info matches the single
	// constraint letter at *constraint. Returns CW_Default when there is no
	// operand value, CW_Register or CW_Constant when the operand suits a
	// register or immediate letter handled here, and CW_Invalid when it does
	// not. Letters not handled here are scored by the generic TargetLowering
	// implementation.
	TargetLowering::ConstraintWeight SystemZTargetLowering::
	getSingleConstraintMatchWeight(AsmOperandInfo &info,
	                               const char *constraint) const {
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;

	  Type *type = CallOperandVal->getType();
	  // Non-null only when the operand is a constant integer; consulted by
	  // the immediate letters below.
	  const auto *C = dyn_cast<ConstantInt>(CallOperandVal);

	  ConstraintWeight weight = CW_Invalid;
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    break;

	  case 'a': // Address register
	  case 'd': // Data register (equivalent to 'r')
	  case 'h': // High-part register
	  case 'r': // General-purpose register
	    if (type->isIntegerTy())
	      weight = CW_Register;
	    break;

	  case 'f': // Floating-point register
	    if (type->isFloatingPointTy())
	      weight = CW_Register;
	    break;

	  case 'v': // Vector register
	    if ((type->isVectorTy() || type->isFloatingPointTy()) &&
	        Subtarget.hasVector())
	      weight = CW_Register;
	    break;

	  case 'I': // Unsigned 8-bit constant
	    if (C && isUInt<8>(C->getZExtValue()))
	      weight = CW_Constant;
	    break;

	  case 'J': // Unsigned 12-bit constant
	    if (C && isUInt<12>(C->getZExtValue()))
	      weight = CW_Constant;
	    break;

	  case 'K': // Signed 16-bit constant
	    if (C && isInt<16>(C->getSExtValue()))
	      weight = CW_Constant;
	    break;

	  case 'L': // Signed 20-bit displacement (on all targets we support)
	    if (C && isInt<20>(C->getSExtValue()))
	      weight = CW_Constant;
	    break;

	  case 'M': // 0x7fffffff
	    if (C && C->getZExtValue() == 0x7fffffff)
	      weight = CW_Constant;
	    break;
	  }
	  return weight;
	}

	// Parse a "{tNNN}" register constraint for which the register type "t"
	// has already been verified. RC is the class associated with "t", Map
	// maps 0-based register numbers to LLVM register numbers and Size is the
	// number of entries in Map. Returns (Map[NNN], RC) when NNN parses as a
	// decimal number below Size with a nonzero Map entry, and (0, nullptr)
	// otherwise.
	static std::pair<unsigned, const TargetRegisterClass *>
	parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
	                    const unsigned *Map, unsigned Size) {
	  assert(*(Constraint.end()-1) == '}' && "Missing '}'");

	  // The register number starts right after "{t".
	  if (!isdigit(Constraint[2]))
	    return std::make_pair(0U, nullptr);

	  // getAsInteger() returns true on failure.
	  unsigned RegNo;
	  const bool ParseFailed =
	      Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, RegNo);
	  if (ParseFailed || RegNo >= Size || !Map[RegNo])
	    return std::make_pair(0U, nullptr);

	  return std::make_pair(Map[RegNo], RC);
	}

	// Map an inline-asm register constraint to a register class and, for
	// explicit "{...}" names, a specific register. Single letters yield
	// (0, class) with the class chosen from VT; "{rN}", "{fN}" and "{vN}" are
	// resolved through parseRegisterNumber(). Everything else is left to the
	// generic TargetLowering implementation.
	std::pair<unsigned, const TargetRegisterClass *>
	SystemZTargetLowering::getRegForInlineAsmConstraint(
	    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	  if (Constraint.size() == 1) {
	    // GCC Constraint Letters
	    const char Letter = Constraint[0];

	    // Data register ('d') is equivalent to general-purpose register ('r').
	    if (Letter == 'd' || Letter == 'r') {
	      if (VT == MVT::i64)
	        return std::make_pair(0U, &SystemZ::GR64BitRegClass);
	      if (VT == MVT::i128)
	        return std::make_pair(0U, &SystemZ::GR128BitRegClass);
	      return std::make_pair(0U, &SystemZ::GR32BitRegClass);
	    }

	    // Address register
	    if (Letter == 'a') {
	      if (VT == MVT::i64)
	        return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
	      if (VT == MVT::i128)
	        return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
	      return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
	    }

	    // High-part register (an LLVM extension)
	    if (Letter == 'h')
	      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);

	    // Floating-point register
	    if (Letter == 'f') {
	      if (VT == MVT::f64)
	        return std::make_pair(0U, &SystemZ::FP64BitRegClass);
	      if (VT == MVT::f128)
	        return std::make_pair(0U, &SystemZ::FP128BitRegClass);
	      return std::make_pair(0U, &SystemZ::FP32BitRegClass);
	    }

	    // Vector register; without vector support fall through to the
	    // generic handling at the end.
	    if (Letter == 'v' && Subtarget.hasVector()) {
	      if (VT == MVT::f32)
	        return std::make_pair(0U, &SystemZ::VR32BitRegClass);
	      if (VT == MVT::f64)
	        return std::make_pair(0U, &SystemZ::VR64BitRegClass);
	      return std::make_pair(0U, &SystemZ::VR128BitRegClass);
	    }
	  }

	  if (Constraint.size() > 0 && Constraint[0] == '{') {
	    // We need to override the default register parsing for GPRs and FPRs
	    // because the interpretation depends on VT. The internal names of
	    // the registers are also different from the external names
	    // (F0D and F0S instead of F0, etc.).
	    switch (Constraint[1]) {
	    case 'r':
	      if (VT == MVT::i32)
	        return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
	                                   SystemZMC::GR32Regs, 16);
	      if (VT == MVT::i128)
	        return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
	                                   SystemZMC::GR128Regs, 16);
	      return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
	                                 SystemZMC::GR64Regs, 16);

	    case 'f':
	      if (VT == MVT::f32)
	        return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
	                                   SystemZMC::FP32Regs, 16);
	      if (VT == MVT::f128)
	        return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
	                                   SystemZMC::FP128Regs, 16);
	      return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
	                                 SystemZMC::FP64Regs, 16);

	    case 'v':
	      if (VT == MVT::f32)
	        return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
	                                   SystemZMC::VR32Regs, 32);
	      if (VT == MVT::f64)
	        return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
	                                   SystemZMC::VR64Regs, 32);
	      return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
	                                 SystemZMC::VR128Regs, 32);

	    default:
	      break;
	    }
	  }
	  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	// Lower the inline-asm operand Op for the given constraint. For the
	// immediate letters I, J, K, L and M, a target constant is appended to Ops
	// when Op is a constant node whose value is in range; if it is not,
	// nothing is appended and the function still returns without consulting
	// the generic code. All other constraints go to the generic TargetLowering
	// implementation.
	void SystemZTargetLowering::
	LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
	                             std::vector<SDValue> &Ops,
	                             SelectionDAG &DAG) const {
	  // Appends Val to Ops as a target constant of Op's value type.
	  auto PushTargetConstant = [&](uint64_t Val) {
	    Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), Op.getValueType()));
	  };

	  // Only support length 1 constraints for now.
	  if (Constraint.length() == 1) {
	    switch (Constraint[0]) {
	    case 'I': { // Unsigned 8-bit constant
	      auto *C = dyn_cast<ConstantSDNode>(Op);
	      if (C && isUInt<8>(C->getZExtValue()))
	        PushTargetConstant(C->getZExtValue());
	      return;
	    }

	    case 'J': { // Unsigned 12-bit constant
	      auto *C = dyn_cast<ConstantSDNode>(Op);
	      if (C && isUInt<12>(C->getZExtValue()))
	        PushTargetConstant(C->getZExtValue());
	      return;
	    }

	    case 'K': { // Signed 16-bit constant
	      auto *C = dyn_cast<ConstantSDNode>(Op);
	      if (C && isInt<16>(C->getSExtValue()))
	        PushTargetConstant(C->getSExtValue());
	      return;
	    }

	    case 'L': { // Signed 20-bit displacement (on all targets we support)
	      auto *C = dyn_cast<ConstantSDNode>(Op);
	      if (C && isInt<20>(C->getSExtValue()))
	        PushTargetConstant(C->getSExtValue());
	      return;
	    }

	    case 'M': { // 0x7fffffff
	      auto *C = dyn_cast<ConstantSDNode>(Op);
	      if (C && C->getZExtValue() == 0x7fffffff)
	        PushTargetConstant(C->getZExtValue());
	      return;
	    }
	    }
	  }
	  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	//===----------------------------------------------------------------------===//
	// Calling conventions
	//===----------------------------------------------------------------------===//

	#include "SystemZGenCallingConv.inc"

	// Return the zero-terminated list of scratch registers (R0D, R1D, R14D).
	// The calling-convention argument is ignored; the same list is returned
	// for every convention.
	const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
	    CallingConv::ID) const {
	  static const MCPhysReg Scratch[] = {
	    SystemZ::R0D,
	    SystemZ::R1D,
	    SystemZ::R14D,
	    0 // terminator
	  };
	  return &Scratch[0];
	}

	// A truncation from FromType to ToType is allowed for a tail call exactly
	// when isTruncateFree() reports it as free.
	bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
	                                                     Type *ToType) const {
	  const bool TruncIsFree = isTruncateFree(FromType, ToType);
	  return TruncIsFree;
	}

	// A call may be emitted as a tail call iff the IR call instruction itself
	// is marked as a tail call.
	bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  const bool MarkedTail = CI->isTailCall();
	  return MarkedTail;
	}

	// 128-bit single-element vector types are not supported yet.  If such a
	// type shows up as a function argument or return type (the original type
	// ArgVT is a vector but the assigned type VT is not), report a fatal error
	// rather than emit code that violates the ABI.
	static void VerifyVectorType(MVT VT, EVT ArgVT) {
	  const bool LostVectorType = ArgVT.isVector() && !VT.isVector();
	  if (!LostVectorType)
	    return;
	  report_fatal_error("Unsupported vector argument or return type");
	}

	static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
	for (unsigned i = 0; i < Ins.size(); ++i)
	VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
	}

	static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
	for (unsigned i = 0; i < Outs.size(); ++i)
	VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
	}

	// Value arrived in the location described by VA, so it has type
	// VA.getLocVT().  Return the equivalent value of type VA.getValVT().
	// Chain is the chain onto which any loads would be chained; none of the
	// conversions below creates a load.
	static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
	                                   CCValAssign &VA, SDValue Chain,
	                                   SDValue Value) {
	  const CCValAssign::LocInfo Info = VA.getLocInfo();

	  // If the argument was promoted from a smaller type, insert an assertion
	  // that records how the upper bits were filled.
	  switch (Info) {
	  case CCValAssign::SExt:
	    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
	                        DAG.getValueType(VA.getValVT()));
	    break;
	  case CCValAssign::ZExt:
	    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
	                        DAG.getValueType(VA.getValVT()));
	    break;
	  default:
	    break;
	  }

	  // Extended in its location: truncate back to the value type.
	  if (VA.isExtInLoc())
	    return DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);

	  if (Info == CCValAssign::BCvt) {
	    // A short vector argument loaded from the stack: widen the i64 to a
	    // full v2i64 (second element undefined) and then bitcast.
	    assert(VA.getLocVT() == MVT::i64);
	    assert(VA.getValVT().isVector());
	    SDValue Wide =
	        DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
	    return DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Wide);
	  }

	  assert(Info == CCValAssign::Full && "Unsupported getLocInfo");
	  return Value;
	}

	// Value has type VA.getValVT() and must be copied into the location
	// described by VA.  Return a copy of Value converted to VA.getLocVT().
	// Indirect values are the caller's responsibility and are not handled here.
	static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
	                                   CCValAssign &VA, SDValue Value) {
	  const CCValAssign::LocInfo Info = VA.getLocInfo();
	  switch (Info) {
	  case CCValAssign::Full:
	    // Already the right type.
	    return Value;
	  case CCValAssign::SExt:
	    return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
	  case CCValAssign::ZExt:
	    return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
	  case CCValAssign::AExt:
	    return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
	  case CCValAssign::BCvt: {
	    // A short vector argument to be stored to the stack: view it as v2i64
	    // and take element 0.
	    assert(VA.getLocVT() == MVT::i64);
	    assert(VA.getValVT().isVector());
	    SDValue AsV2I64 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), AsV2I64,
	                       DAG.getConstant(0, DL, MVT::i32));
	  }
	  default:
	    llvm_unreachable("Unhandled getLocInfo()");
	  }
	}

	// Lower the incoming formal arguments described by Ins.  Each argument is
	// assigned a location with CC_SystemZ, read from its register or fixed
	// stack slot, converted to its value type and appended to InVals.  For
	// variadic functions the va_start bookkeeping is recorded in
	// SystemZMachineFunctionInfo and the unnamed argument FPRs are stored to
	// their save slots.  Returns the resulting chain.
	SDValue SystemZTargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  SystemZMachineFunctionInfo *FuncInfo =
	      MF.getInfo<SystemZMachineFunctionInfo>();
	  auto *TFL =
	      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Detect unsupported vector argument types.
	  if (Subtarget.hasVector())
	    VerifyVectorTypes(Ins);

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
	  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);

	  // Count the named arguments passed in GPRs and FPRs; va_start needs them.
	  unsigned NumFixedGPRs = 0;
	  unsigned NumFixedFPRs = 0;
	  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
	    SDValue ArgValue;
	    CCValAssign &VA = ArgLocs[I];
	    EVT LocVT = VA.getLocVT();
	    if (VA.isRegLoc()) {
	      // Arguments passed in registers
	      const TargetRegisterClass *RC;
	      switch (LocVT.getSimpleVT().SimpleTy) {
	      default:
	        // Integers smaller than i64 should be promoted to i64.
	        llvm_unreachable("Unexpected argument type");
	      case MVT::i32:
	        NumFixedGPRs += 1;
	        RC = &SystemZ::GR32BitRegClass;
	        break;
	      case MVT::i64:
	        NumFixedGPRs += 1;
	        RC = &SystemZ::GR64BitRegClass;
	        break;
	      case MVT::f32:
	        NumFixedFPRs += 1;
	        RC = &SystemZ::FP32BitRegClass;
	        break;
	      case MVT::f64:
	        NumFixedFPRs += 1;
	        RC = &SystemZ::FP64BitRegClass;
	        break;
	      case MVT::v16i8:
	      case MVT::v8i16:
	      case MVT::v4i32:
	      case MVT::v2i64:
	      case MVT::v4f32:
	      case MVT::v2f64:
	        // Vector arguments do not count towards either total.
	        RC = &SystemZ::VR128BitRegClass;
	        break;
	      }

	      // Make the physical argument register live-in and copy it out.
	      unsigned VReg = MRI.createVirtualRegister(RC);
	      MRI.addLiveIn(VA.getLocReg(), VReg);
	      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
	    } else {
	      assert(VA.isMemLoc() && "Argument not register or memory");

	      // Create the frame index object for this incoming parameter.
	      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
	                                     VA.getLocMemOffset(), true);

	      // Create the SelectionDAG nodes corresponding to a load
	      // from this parameter.  Unpromoted ints and floats are
	      // passed as right-justified 8-byte values.
	      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
	        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
	                          DAG.getIntPtrConstant(4, DL));
	      ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
	                             MachinePointerInfo::getFixedStack(MF, FI));
	    }

	    // Convert the value of the argument register into the value that's
	    // being passed.
	    if (VA.getLocInfo() == CCValAssign::Indirect) {
	      // ArgValue is the address of the real argument; load through it.
	      InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
	                                   MachinePointerInfo()));
	      // If the original argument was split (e.g. i128), we need
	      // to load all parts of it here (using the same address).
	      unsigned ArgIndex = Ins[I].OrigArgIndex;
	      assert (Ins[I].PartOffset == 0);
	      while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
	        CCValAssign &PartVA = ArgLocs[I + 1];
	        unsigned PartOffset = Ins[I + 1].PartOffset;
	        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
	                                      DAG.getIntPtrConstant(PartOffset, DL));
	        InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
	                                     MachinePointerInfo()));
	        // The part has been consumed; skip it in the outer loop.
	        ++I;
	      }
	    } else
	      InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
	  }

	  if (IsVarArg) {
	    // Save the number of non-varargs registers for later use by va_start, etc.
	    FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
	    FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);

	    // Likewise the address (in the form of a frame index) of where the
	    // first stack vararg would be.  The 1-byte size here is arbitrary.
	    int64_t StackSize = CCInfo.getNextStackOffset();
	    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));

	    // ...and a similar frame index for the caller-allocated save area
	    // that will be used to store the incoming registers.
	    int64_t RegSaveOffset = TFL->getOffsetOfLocalArea();
	    unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
	    FuncInfo->setRegSaveFrameIndex(RegSaveIndex);

	    // Store the FPR varargs in the reserved frame slots.  (We store the
	    // GPRs as part of the prologue.)
	    if (NumFixedFPRs < SystemZ::NumArgFPRs) {
	      SDValue MemOps[SystemZ::NumArgFPRs];
	      for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
	        unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
	        int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
	        // PtrVT was computed once at function entry; reuse it here.
	        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	        unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
	                                     &SystemZ::FP64BitRegClass);
	        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
	        MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
	                                 MachinePointerInfo::getFixedStack(MF, FI));
	      }
	      // Join the stores, which are independent of one another.  Only the
	      // entries from NumFixedFPRs onwards were filled in above.
	      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
	                          makeArrayRef(&MemOps[NumFixedFPRs],
	                                       SystemZ::NumArgFPRs-NumFixedFPRs));
	    }
	  }

	  return Chain;
	}

	// Return true if the call whose operands were assigned the locations in
	// ArgLocs (with matching flags in Outs) can be emitted as a sibling call.
	// ArgCCInfo is accepted for interface reasons but is not consulted.
	static bool canUseSiblingCall(const CCState &ArgCCInfo,
	                              SmallVectorImpl<CCValAssign> &ArgLocs,
	                              SmallVectorImpl<ISD::OutputArg> &Outs) {
	  // Punt if there are any indirect or stack arguments, or if the call
	  // needs the callee-saved argument register R6, or if the call uses
	  // the callee-saved register arguments SwiftSelf and SwiftError.
	  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
	    CCValAssign &VA = ArgLocs[I];
	    if (VA.getLocInfo() == CCValAssign::Indirect)
	      return false;
	    if (!VA.isRegLoc())
	      return false;
	    // Any sub-register view of R6 counts.
	    unsigned Reg = VA.getLocReg();
	    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
	      return false;
	    if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
	      return false;
	  }
	  return true;
	}

	// Lower the call described by CLI.  Outgoing operands are assigned
	// locations with CC_SystemZ and copied to registers or stack slots, the
	// CALL (or SIBCALL for a sibling call) node is emitted, and the values
	// returned in registers (assigned with RetCC_SystemZ) are appended to
	// InVals.  CLI.IsTailCall is cleared when a sibling call is not possible.
	// Returns the resulting chain; for a sibling call the SIBCALL node itself
	// is returned.
	SDValue
	SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
	                                 SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDLoc &DL = CLI.DL;
	  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	  SDValue Chain = CLI.Chain;
	  SDValue Callee = CLI.Callee;
	  bool &IsTailCall = CLI.IsTailCall;
	  CallingConv::ID CallConv = CLI.CallConv;
	  bool IsVarArg = CLI.IsVarArg;
	  MachineFunction &MF = DAG.getMachineFunction();
	  EVT PtrVT = getPointerTy(MF.getDataLayout());

	  // Detect unsupported vector argument and return types.
	  if (Subtarget.hasVector()) {
	    VerifyVectorTypes(Outs);
	    VerifyVectorTypes(Ins);
	  }

	  // Analyze the operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
	  ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);

	  // We don't support GuaranteedTailCallOpt, only automatically-detected
	  // sibling calls.
	  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
	    IsTailCall = false;

	  // Get a count of how many bytes are to be pushed on the stack.
	  unsigned NumBytes = ArgCCInfo.getNextStackOffset();

	  // Mark the start of the call.
	  if (!IsTailCall)
	    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

	  // Copy argument values to their designated locations.
	  SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
	  SmallVector<SDValue, 8> MemOpChains;
	  SDValue StackPtr;
	  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
	    CCValAssign &VA = ArgLocs[I];
	    SDValue ArgValue = OutVals[I];

	    if (VA.getLocInfo() == CCValAssign::Indirect) {
	      // Store the argument in a stack slot and pass its address.
	      SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
	      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	      MemOpChains.push_back(
	          DAG.getStore(Chain, DL, ArgValue, SpillSlot,
	                       MachinePointerInfo::getFixedStack(MF, FI)));
	      // If the original argument was split (e.g. i128), we need
	      // to store all parts of it here (and pass just one address).
	      unsigned ArgIndex = Outs[I].OrigArgIndex;
	      assert (Outs[I].PartOffset == 0);
	      while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
	        SDValue PartValue = OutVals[I + 1];
	        unsigned PartOffset = Outs[I + 1].PartOffset;
	        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
	                                      DAG.getIntPtrConstant(PartOffset, DL));
	        MemOpChains.push_back(
	            DAG.getStore(Chain, DL, PartValue, Address,
	                         MachinePointerInfo::getFixedStack(MF, FI)));
	        // The part has been consumed; skip it in the outer loop.
	        ++I;
	      }
	      ArgValue = SpillSlot;
	    } else
	      ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);

	    if (VA.isRegLoc())
	      // Queue up the argument copies and emit them at the end.
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
	    else {
	      assert(VA.isMemLoc() && "Argument not register or memory");

	      // Work out the address of the stack slot.  Unpromoted ints and
	      // floats are passed as right-justified 8-byte values.
	      if (!StackPtr.getNode())
	        StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
	      unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
	      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
	        Offset += 4;
	      SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
	                                    DAG.getIntPtrConstant(Offset, DL));

	      // Emit the store.
	      MemOpChains.push_back(
	          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
	    }
	  }

	  // Join the stores, which are independent of one another.
	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	  // Accept direct calls by converting symbolic call addresses to the
	  // associated Target* opcodes.  Force %r1 to be used for indirect
	  // tail calls.
	  SDValue Glue;
	  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
	    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
	  } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
	    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
	  } else if (IsTailCall) {
	    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
	    Glue = Chain.getValue(1);
	    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
	  }

	  // Build a sequence of copy-to-reg nodes, chained and glued together.
	  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
	    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
	                             RegsToPass[I].second, Glue);
	    Glue = Chain.getValue(1);
	  }

	  // The first call operand is the chain and the second is the target address.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);

	  // Add argument registers to the end of the list so that they are
	  // known live into the call.
	  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
	    Ops.push_back(DAG.getRegister(RegsToPass[I].first,
	                                  RegsToPass[I].second.getValueType()));

	  // Add a register mask operand representing the call-preserved registers.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  // Glue the call to the argument copies, if any.
	  if (Glue.getNode())
	    Ops.push_back(Glue);

	  // Emit the call.  A sibling call ends here: no CALLSEQ_END is emitted
	  // and no return values are copied out.
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  if (IsTailCall)
	    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
	  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
	  Glue = Chain.getValue(1);

	  // Mark the end of the call, which is glued to the call itself.
	  Chain = DAG.getCALLSEQ_END(Chain,
	                             DAG.getConstant(NumBytes, DL, PtrVT, true),
	                             DAG.getConstant(0, DL, PtrVT, true),
	                             Glue, DL);
	  Glue = Chain.getValue(1);

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RetLocs;
	  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
	  RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
	    CCValAssign &VA = RetLocs[I];

	    // Copy the value out, gluing the copy to the end of the call sequence.
	    SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
	                                          VA.getLocVT(), Glue);
	    Chain = RetValue.getValue(1);
	    Glue = RetValue.getValue(2);

	    // Convert the value of the return register into the value that's
	    // being returned.
	    InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
	  }

	  return Chain;
	}

	// Return true if the return values described by Outs can be returned
	// according to RetCC_SystemZ.  Any i128 return value is rejected up front.
	bool SystemZTargetLowering::
	CanLowerReturn(CallingConv::ID CallConv,
	               MachineFunction &MF, bool isVarArg,
	               const SmallVectorImpl<ISD::OutputArg> &Outs,
	               LLVMContext &Context) const {
	  // Detect unsupported vector return types.
	  if (Subtarget.hasVector()) {
	    VerifyVectorTypes(Outs);
	  }

	  // Special case that we cannot easily detect in RetCC_SystemZ since
	  // i128 is not a legal type.
	  for (const ISD::OutputArg &RetPart : Outs) {
	    if (RetPart.ArgVT == MVT::i128)
	      return false;
	  }

	  // Otherwise defer to the return calling convention.
	  SmallVector<CCValAssign, 16> Locs;
	  CCState State(CallConv, isVarArg, MF, Locs, Context);
	  return State.CheckReturn(Outs, RetCC_SystemZ);
	}

	// Lower a function return.  Every value in OutVals is converted to the
	// type of the register assigned by RetCC_SystemZ and copied into it, and a
	// RET_FLAG node is built whose operands are the chain, the return
	// registers and, when copies were emitted, the glue.
	SDValue
	SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                   bool IsVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   const SmallVectorImpl<SDValue> &OutVals,
	                                   const SDLoc &DL, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();

	  // Detect unsupported vector return types.
	  if (Subtarget.hasVector())
	    VerifyVectorTypes(Outs);

	  // Assign locations to each returned value.
	  SmallVector<CCValAssign, 16> RetLocs;
	  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
	  RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);

	  // Quick exit for void returns
	  const unsigned NumRets = RetLocs.size();
	  if (NumRets == 0)
	    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);

	  // Operand 0 is a placeholder for the chain; it is overwritten once all
	  // of the register copies have been chained together.
	  SmallVector<SDValue, 4> RetOps;
	  RetOps.push_back(Chain);

	  // Copy the result values into the output registers.
	  SDValue Glue;
	  for (unsigned Idx = 0; Idx != NumRets; ++Idx) {
	    CCValAssign &Loc = RetLocs[Idx];

	    // Make the return register live on exit.
	    assert(Loc.isRegLoc() && "Can only return in registers!");

	    // Promote the value as required.
	    SDValue Promoted = convertValVTToLocVT(DAG, DL, Loc, OutVals[Idx]);

	    // Chain and glue the copies together.
	    const unsigned RetReg = Loc.getLocReg();
	    Chain = DAG.getCopyToReg(Chain, DL, RetReg, Promoted, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(RetReg, Loc.getLocVT()));
	  }

	  // Update chain and glue.
	  RetOps[0] = Chain;
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	// Return true if Op is an intrinsic node with chain that returns the CC
	// value as its only (other) result.  The intrinsic ID is read from operand
	// 1.  On success Opcode receives the associated SystemZISD opcode and
	// CCValid the mask of valid CC values; on failure neither is modified.
	static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
	                                      unsigned &CCValid) {
	  const unsigned IntrinsicId =
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	  switch (IntrinsicId) {
	  case Intrinsic::s390_tend:
	    Opcode = SystemZISD::TEND;
	    CCValid = SystemZ::CCMASK_TEND;
	    return true;

	  // Both transaction-begin flavours share the same CC mask, set below.
	  case Intrinsic::s390_tbegin:
	    Opcode = SystemZISD::TBEGIN;
	    break;

	  case Intrinsic::s390_tbegin_nofloat:
	    Opcode = SystemZISD::TBEGIN_NOFLOAT;
	    break;

	  default:
	    return false;
	  }

	  CCValid = SystemZ::CCMASK_TBEGIN;
	  return true;
	}

	// Return true if Op is an intrinsic node without chain that returns the
	// CC value as its final result.  The intrinsic ID is read from operand 0.
	// On success Opcode receives the associated SystemZISD opcode and CCValid
	// the mask of valid CC values; on failure neither is modified.
	static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
	  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  switch (Id) {
	  case Intrinsic::s390_vpkshs:
	  case Intrinsic::s390_vpksfs:
	  case Intrinsic::s390_vpksgs:
	    Opcode = SystemZISD::PACKS_CC;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vpklshs:
	  case Intrinsic::s390_vpklsfs:
	  case Intrinsic::s390_vpklsgs:
	    Opcode = SystemZISD::PACKLS_CC;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vceqbs:
	  case Intrinsic::s390_vceqhs:
	  case Intrinsic::s390_vceqfs:
	  case Intrinsic::s390_vceqgs:
	    Opcode = SystemZISD::VICMPES;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vchbs:
	  case Intrinsic::s390_vchhs:
	  case Intrinsic::s390_vchfs:
	  case Intrinsic::s390_vchgs:
	    Opcode = SystemZISD::VICMPHS;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vchlbs:
	  case Intrinsic::s390_vchlhs:
	  case Intrinsic::s390_vchlfs:
	  case Intrinsic::s390_vchlgs:
	    Opcode = SystemZISD::VICMPHLS;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vtm:
	    Opcode = SystemZISD::VTM;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vfaebs:
	  case Intrinsic::s390_vfaehs:
	  case Intrinsic::s390_vfaefs:
	    Opcode = SystemZISD::VFAE_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfaezbs:
	  case Intrinsic::s390_vfaezhs:
	  case Intrinsic::s390_vfaezfs:
	    Opcode = SystemZISD::VFAEZ_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfeebs:
	  case Intrinsic::s390_vfeehs:
	  case Intrinsic::s390_vfeefs:
	    Opcode = SystemZISD::VFEE_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfeezbs:
	  case Intrinsic::s390_vfeezhs:
	  case Intrinsic::s390_vfeezfs:
	    Opcode = SystemZISD::VFEEZ_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfenebs:
	  case Intrinsic::s390_vfenehs:
	  case Intrinsic::s390_vfenefs:
	    Opcode = SystemZISD::VFENE_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfenezbs:
	  case Intrinsic::s390_vfenezhs:
	  case Intrinsic::s390_vfenezfs:
	    Opcode = SystemZISD::VFENEZ_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vistrbs:
	  case Intrinsic::s390_vistrhs:
	  case Intrinsic::s390_vistrfs:
	    Opcode = SystemZISD::VISTR_CC;
	    // Only CC values 0 and 3 are marked valid for this group.
	    CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3;
	    return true;

	  case Intrinsic::s390_vstrcbs:
	  case Intrinsic::s390_vstrchs:
	  case Intrinsic::s390_vstrcfs:
	    Opcode = SystemZISD::VSTRC_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vstrczbs:
	  case Intrinsic::s390_vstrczhs:
	  case Intrinsic::s390_vstrczfs:
	    Opcode = SystemZISD::VSTRCZ_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vstrsb:
	  case Intrinsic::s390_vstrsh:
	  case Intrinsic::s390_vstrsf:
	    Opcode = SystemZISD::VSTRS_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vstrszb:
	  case Intrinsic::s390_vstrszh:
	  case Intrinsic::s390_vstrszf:
	    Opcode = SystemZISD::VSTRSZ_CC;
	    CCValid = SystemZ::CCMASK_ANY;
	    return true;

	  case Intrinsic::s390_vfcedbs:
	  case Intrinsic::s390_vfcesbs:
	    Opcode = SystemZISD::VFCMPES;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vfchdbs:
	  case Intrinsic::s390_vfchsbs:
	    Opcode = SystemZISD::VFCMPHS;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vfchedbs:
	  case Intrinsic::s390_vfchesbs:
	    Opcode = SystemZISD::VFCMPHES;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_vftcidb:
	  case Intrinsic::s390_vftcisb:
	    Opcode = SystemZISD::VFTCI;
	    CCValid = SystemZ::CCMASK_VCMP;
	    return true;

	  case Intrinsic::s390_tdc:
	    Opcode = SystemZISD::TDC;
	    CCValid = SystemZ::CCMASK_TDC;
	    return true;

	  default:
	    return false;
	  }
	}

	// Emit an intrinsic with chain and an explicit CC register result.  The
	// new node has opcode Opcode, result types (i32, Other) and all of Op's
	// operands except the intrinsic ID (operand 1).  Users of Op's chain
	// result are redirected to the new node's chain.  Returns the new node.
	static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
	                                           unsigned Opcode) {
	  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");

	  // Copy all operands except the intrinsic ID: the chain first, then
	  // everything that follows the ID.
	  const unsigned OperandCount = Op.getNumOperands();
	  SmallVector<SDValue, 6> NewOps;
	  NewOps.reserve(OperandCount - 1);
	  NewOps.push_back(Op.getOperand(0));
	  for (unsigned Idx = 2; Idx < OperandCount; ++Idx)
	    NewOps.push_back(Op.getOperand(Idx));

	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  SDNode *NewNode = DAG.getNode(Opcode, SDLoc(Op), ResultTys, NewOps).getNode();

	  // Result 1 is the chain on both the old and the new node.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 1),
	                                SDValue(NewNode, 1));
	  return NewNode;
	}

	// Emit an intrinsic with an explicit CC register result.  The new node has
	// opcode Opcode, the same result types as Op, and all of Op's operands
	// except the intrinsic ID (operand 0).  Returns the new node.
	static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
	                                   unsigned Opcode) {
	  const unsigned OperandCount = Op.getNumOperands();

	  // Copy all operands except the intrinsic ID.
	  SmallVector<SDValue, 6> NewOps;
	  NewOps.reserve(OperandCount - 1);
	  unsigned Idx = 1;
	  while (Idx < OperandCount) {
	    NewOps.push_back(Op.getOperand(Idx));
	    ++Idx;
	  }

	  return DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), NewOps).getNode();
	}

	// CC is a comparison that will be implemented using an integer or
	// floating-point comparison.  Return the condition code mask for
	// a branch on true.  In the integer case, CCMASK_CMP_UO is set for
	// unsigned comparisons and clear for signed ones.  In the floating-point
	// case, CCMASK_CMP_UO has its normal mask meaning (unordered).
	static unsigned CCMaskForCondCode(ISD::CondCode CC) {
	// CONV(X) expands to three case labels: SETX and SETOX map to
	// CCMASK_CMP_X, and SETUX maps to CCMASK_CMP_UO | CCMASK_CMP_X.
	#define CONV(X) \
	  case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
	  case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
	  case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X

	  switch (CC) {
	  default:
	    llvm_unreachable("Invalid integer condition!");

	  CONV(EQ);
	  CONV(NE);
	  CONV(GT);
	  CONV(GE);
	  CONV(LT);
	  CONV(LE);

	  case ISD::SETO:  return SystemZ::CCMASK_CMP_O;
	  case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
	  }
	#undef CONV
	}

	// If C can be converted to a comparison against zero, adjust the operands
	// as necessary.  The rewrites are "> -1" to ">= 0", "<= -1" to "< 0",
	// "< 1" to "<= 0" and ">= 1" to "> 0"; each one toggles the EQ bit of
	// the mask and replaces the constant operand with zero.  Unsigned-only
	// comparisons and non-constant second operands are left alone.
	static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
	  if (C.ICmpType == SystemZICMP::UnsignedOnly)
	    return;

	  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
	  if (!ConstOp1)
	    return;

	  int64_t Value = ConstOp1->getSExtValue();
	  if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
	      (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
	      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
	      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
	    // Flip whether equality is included and compare against zero instead.
	    C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
	    C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
	  }
	}

	// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
	// adjust the operands as necessary.  On success C.Op0 is an i32 extending
	// load of the matching extension type, C.Op1 is an i32 constant, and
	// C.ICmpType / C.CCMask may have been updated.  C is left untouched when
	// the comparison does not qualify.
	static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
	                             Comparison &C) {
	  // For us to make any changes, it must be a comparison between a
	  // single-use load and a constant.
	  if (!C.Op0.hasOneUse() ||
	      C.Op0.getOpcode() != ISD::LOAD ||
	      C.Op1.getOpcode() != ISD::Constant)
	    return;

	  // We must have an 8- or 16-bit load.
	  auto *Load = cast<LoadSDNode>(C.Op0);
	  unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
	  if (NumBits != 8 && NumBits != 16)
	    return;

	  // The load must be an extending one and the constant must be within the
	  // range of the unextended value.
	  auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
	  uint64_t Value = ConstOp1->getZExtValue();
	  // All-ones mask covering the NumBits loaded bits.
	  uint64_t Mask = (uint64_t(1) << NumBits) - 1;
	  if (Load->getExtensionType() == ISD::SEXTLOAD) {
	    // Make sure that ConstOp1 is in range of C.Op0.  Biasing by
	    // 2^(NumBits-1) maps the signed range onto [0, Mask].
	    int64_t SignedValue = ConstOp1->getSExtValue();
	    if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
	      return;
	    if (C.ICmpType != SystemZICMP::SignedOnly) {
	      // Unsigned comparison between two sign-extended values is equivalent
	      // to unsigned comparison between two zero-extended values.
	      Value &= Mask;
	    } else if (NumBits == 8) {
	      // Try to treat the comparison as unsigned, so that we can use CLI.
	      // Adjust CCMask and Value as necessary.
	      if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
	        // Test whether the high bit of the byte is set.
	        Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
	      else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
	        // Test whether the high bit of the byte is clear.
	        Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
	      else
	        // No instruction exists for this combination.
	        return;
	      C.ICmpType = SystemZICMP::UnsignedOnly;
	    }
	  } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
	    if (Value > Mask)
	      return;
	    // If the constant is in range, we can use any comparison.
	    C.ICmpType = SystemZICMP::Any;
	  } else
	    return;

	  // Make sure that the first operand is an i32 of the right extension type.
	  ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
	                              ISD::SEXTLOAD :
	                              ISD::ZEXTLOAD);
	  if (C.Op0.getValueType() != MVT::i32 ||
	      Load->getExtensionType() != ExtType) {
	    C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
	                           Load->getBasePtr(), Load->getPointerInfo(),
	                           Load->getMemoryVT(), Load->getAlignment(),
	                           Load->getMemOperand()->getFlags());
	    // Update the chain uses.
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
	  }

	  // Make sure that the second operand is an i32 with the right value.
	  if (C.Op1.getValueType() != MVT::i32 ||
	      Value != ConstOp1->getZExtValue())
	    C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
	}

	// Return true if Op is either an unextended load, or a load suitable
	// for integer register-memory comparisons of type ICmpType.  Anything
	// that is not a load, and any load of an i8, yields false.
	static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
	  auto *LoadNode = dyn_cast<LoadSDNode>(Op.getNode());
	  if (!LoadNode)
	    return false;

	  // There are no instructions to compare a register with a memory byte.
	  if (LoadNode->getMemoryVT() == MVT::i8)
	    return false;

	  // Otherwise the answer depends on how the load extends its value.
	  const ISD::LoadExtType Ext = LoadNode->getExtensionType();
	  if (Ext == ISD::NON_EXTLOAD)
	    return true;
	  if (Ext == ISD::SEXTLOAD)
	    return ICmpType != SystemZICMP::UnsignedOnly;
	  if (Ext == ISD::ZEXTLOAD)
	    return ICmpType != SystemZICMP::SignedOnly;
	  return false;
	}

	// Return true if it is better to swap the operands of C.
	static bool shouldSwapCmpOperands(const Comparison &C) {
	  // Leave f128 comparisons alone, since they have no memory forms.
	  if (C.Op0.getValueType() == MVT::f128)
	    return false;

	  // Always keep a floating-point constant second, since comparisons with
	  // zero can use LOAD TEST and comparisons with other constants make a
	  // natural memory operand.
	  if (isa<ConstantFPSDNode>(C.Op1))
	    return false;

	  // Never swap comparisons with zero since there are many ways to optimize
	  // those later.
	  auto *RHSConst = dyn_cast<ConstantSDNode>(C.Op1);
	  if (RHSConst && RHSConst->getZExtValue() == 0)
	    return false;

	  // Also keep natural memory operands second if the loaded value is
	  // only used here.  Several comparisons have memory forms.
	  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
	    return false;

	  // Whether the comparison may be performed as signed / as unsigned.
	  const bool MaySigned = C.ICmpType != SystemZICMP::UnsignedOnly;
	  const bool MayUnsigned = C.ICmpType != SystemZICMP::SignedOnly;

	  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
	  // In that case we generally prefer the memory to be second.
	  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
	    // The only exceptions are when the second operand is a constant and
	    // we can use things like CHHSI.
	    if (!RHSConst)
	      return true;
	    // The unsigned memory-immediate instructions can handle 16-bit
	    // unsigned integers.
	    if (MayUnsigned && isUInt<16>(RHSConst->getZExtValue()))
	      return false;
	    // The signed memory-immediate instructions can handle 16-bit
	    // signed integers.
	    if (MaySigned && isInt<16>(RHSConst->getSExtValue()))
	      return false;
	    return true;
	  }

	  // Try to promote the use of CGFR and CLGFR.
	  switch (C.Op0.getOpcode()) {
	  case ISD::SIGN_EXTEND:
	    return MaySigned;
	  case ISD::ZERO_EXTEND:
	    return MayUnsigned;
	  case ISD::AND:
	    // An AND with 0xffffffff is treated like a zero extension.
	    return MayUnsigned &&
	           C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
	           cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() ==
	               0xffffffff;
	  default:
	    return false;
	  }
	}

	// Return a version of comparison CC mask CCMask in which the LT and GT
	// actions are swapped. The EQ and UO (unordered) bits are passed through
	// unchanged.
	static unsigned reverseCCMask(unsigned CCMask) {
	  // Bits that do not depend on operand order.
	  unsigned Reversed =
	      CCMask & (SystemZ::CCMASK_CMP_EQ | SystemZ::CCMASK_CMP_UO);
	  // Exchange the "greater than" and "less than" bits.
	  if (CCMask & SystemZ::CCMASK_CMP_GT)
	    Reversed |= SystemZ::CCMASK_CMP_LT;
	  if (CCMask & SystemZ::CCMASK_CMP_LT)
	    Reversed |= SystemZ::CCMASK_CMP_GT;
	  return Reversed;
	}

	// Check whether C tests for equality between X and Y and whether X - Y
	// or Y - X is also computed. In that case it's better to compare the
	// result of the subtraction against zero.
	//
	// On a match, C.Op0 becomes the existing SUB node and C.Op1 becomes a zero
	// constant of the SUB's value type; otherwise C is left untouched.
	static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
	                                 Comparison &C) {
	  // Only pure equality / inequality tests qualify.
	  if (C.CCMask != SystemZ::CCMASK_CMP_EQ &&
	      C.CCMask != SystemZ::CCMASK_CMP_NE)
	    return;

	  // Scan the users of the first operand for a SUB of the two operands,
	  // in either order.
	  for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
	    SDNode *N = *I;
	    if (N->getOpcode() != ISD::SUB)
	      continue;
	    const bool IsOp0MinusOp1 =
	        N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1;
	    const bool IsOp1MinusOp0 =
	        N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0;
	    if (IsOp0MinusOp1 || IsOp1MinusOp0) {
	      C.Op0 = SDValue(N, 0);
	      C.Op1 = DAG.getConstant(0, DL, N->getValueType(0));
	      return;
	    }
	  }
	}

	// Check whether C compares a floating-point value with zero and if that
	// floating-point value is also negated. In this case we can use the
	// negation to set CC, so avoiding separate LOAD AND TEST and
	// LOAD (NEGATIVE/COMPLEMENT) instructions.
	//
	// On a match, C.Op0 is replaced by the FNEG node and the LT/GT sense of
	// C.CCMask is reversed to compensate for the sign change.
	static void adjustForFNeg(Comparison &C) {
	  auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
	  if (!C1 || !C1->isZero())
	    return;

	  // Use the first FNEG found among the users of the compared value.
	  for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
	    SDNode *N = *I;
	    if (N->getOpcode() == ISD::FNEG) {
	      C.Op0 = SDValue(N, 0);
	      C.CCMask = reverseCCMask(C.CCMask);
	      return;
	    }
	  }
	}

	// Check whether C compares (shl X, 32) with 0 and whether X is
	// also sign-extended. In that case it is better to test the result
	// of the sign extension using LTGFR.
	//
	// This case is important because InstCombine transforms a comparison
	// with (sext (trunc X)) into a comparison with (shl X, 32).
	//
	// On a match, C.Op0 is replaced by the SIGN_EXTEND_INREG node; C.Op1 is
	// left as the zero constant it already is.
	static void adjustForLTGFR(Comparison &C) {
	  // Check for a comparison between a 64-bit SHL and constant 0.
	  if (C.Op0.getOpcode() != ISD::SHL ||
	      C.Op0.getValueType() != MVT::i64 ||
	      C.Op1.getOpcode() != ISD::Constant ||
	      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
	    return;

	  // The shift amount must be the constant 32.
	  auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
	  if (!C1 || C1->getZExtValue() != 32)
	    return;

	  // See whether X has any SIGN_EXTEND_INREG uses that extend from i32.
	  SDValue ShlOp0 = C.Op0.getOperand(0);
	  for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
	    SDNode *N = *I;
	    if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
	        cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
	      C.Op0 = SDValue(N, 0);
	      return;
	    }
	  }
	}

	// If C compares the truncation of an extending load, try to compare
	// the untruncated value instead. This exposes more opportunities to
	// reuse CC.
	//
	// Only comparisons against constant zero are handled. On a match, C.Op0
	// becomes the load itself and C.Op1 a zero constant of the load's type.
	static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
	                               Comparison &C) {
	  // Require (truncate (load ...)) compared with constant 0.
	  if (C.Op0.getOpcode() != ISD::TRUNCATE ||
	      C.Op0.getOperand(0).getOpcode() != ISD::LOAD ||
	      C.Op1.getOpcode() != ISD::Constant ||
	      cast<ConstantSDNode>(C.Op1)->getZExtValue() != 0)
	    return;

	  // The loaded memory value must fit within the truncated width, so the
	  // truncation discards only extension bits.
	  auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
	  if (L->getMemoryVT().getStoreSizeInBits() > C.Op0.getValueSizeInBits())
	    return;

	  // The extension kind has to be compatible with the comparison kind.
	  unsigned Type = L->getExtensionType();
	  const bool ZExtOK =
	      Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly;
	  const bool SExtOK =
	      Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly;
	  if (ZExtOK || SExtOK) {
	    C.Op0 = C.Op0.getOperand(0);
	    C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
	  }
	}

	// Test whether shift node N shifts by a constant that is smaller than the
	// bit width of N's value. On success the amount is written to ShiftVal and
	// true is returned; otherwise ShiftVal is left untouched.
	static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
	  if (auto *AmountNode = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
	    uint64_t Amount = AmountNode->getZExtValue();
	    if (Amount < N.getValueSizeInBits()) {
	      ShiftVal = Amount;
	      return true;
	    }
	  }
	  return false;
	}

	// Check whether an AND with Mask is suitable for a TEST UNDER MASK
	// instruction and whether the CC value is descriptive enough to handle
	// a comparison of kind ICmpType between the AND result and CmpVal.
	// CCMask says which comparison result is being tested and BitSize is
	// the number of bits in the operands. If TEST UNDER MASK can be used,
	// return the corresponding CC mask, otherwise return 0.
	//
	// Notes:
	// - BitSize is accepted for the callers' benefit but is not referenced
	//   anywhere in the body below.
	// - The checks are tried strictly in order and the first one that matches
	//   decides the result, so their relative order matters.
	// - Below, "R" denotes the value of (X & Mask). R is either 0 or at least
	//   Low, and R is at most Mask.
	static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
	uint64_t Mask, uint64_t CmpVal,
	unsigned ICmpType) {
	assert(Mask != 0 && "ANDs with zero should have been removed by now");

	// Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
	if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
	!SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
	return 0;

	// Work out the masks for the lowest and highest bits.
	// High is a single-bit value at the position of Mask's most significant
	// set bit; Low is the same for the least significant set bit.
	unsigned HighShift = 63 - countLeadingZeros(Mask);
	uint64_t High = uint64_t(1) << HighShift;
	uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);

	// Signed ordered comparisons are effectively unsigned if the sign
	// bit is dropped.
	// Here every comparison kind other than SignedOnly is treated as unsigned.
	bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);

	// Check for equality comparisons with 0, or the equivalent.
	if (CmpVal == 0) {
	if (CCMask == SystemZ::CCMASK_CMP_EQ)
	return SystemZ::CCMASK_TM_ALL_0;
	if (CCMask == SystemZ::CCMASK_CMP_NE)
	return SystemZ::CCMASK_TM_SOME_1;
	}
	// With 0 < CmpVal <= Low, "R < CmpVal" holds exactly when R == 0.
	if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
	if (CCMask == SystemZ::CCMASK_CMP_LT)
	return SystemZ::CCMASK_TM_ALL_0;
	if (CCMask == SystemZ::CCMASK_CMP_GE)
	return SystemZ::CCMASK_TM_SOME_1;
	}
	// With CmpVal < Low, "R <= CmpVal" holds exactly when R == 0.
	if (EffectivelyUnsigned && CmpVal < Low) {
	if (CCMask == SystemZ::CCMASK_CMP_LE)
	return SystemZ::CCMASK_TM_ALL_0;
	if (CCMask == SystemZ::CCMASK_CMP_GT)
	return SystemZ::CCMASK_TM_SOME_1;
	}

	// Check for equality comparisons with the mask, or the equivalent.
	if (CmpVal == Mask) {
	if (CCMask == SystemZ::CCMASK_CMP_EQ)
	return SystemZ::CCMASK_TM_ALL_1;
	if (CCMask == SystemZ::CCMASK_CMP_NE)
	return SystemZ::CCMASK_TM_SOME_0;
	}
	// The largest R below Mask is Mask - Low, so with
	// Mask - Low <= CmpVal < Mask, "R > CmpVal" holds exactly when R == Mask.
	if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
	if (CCMask == SystemZ::CCMASK_CMP_GT)
	return SystemZ::CCMASK_TM_ALL_1;
	if (CCMask == SystemZ::CCMASK_CMP_LE)
	return SystemZ::CCMASK_TM_SOME_0;
	}
	// Likewise "R >= CmpVal" holds exactly when R == Mask for
	// Mask - Low < CmpVal <= Mask.
	if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
	if (CCMask == SystemZ::CCMASK_CMP_GE)
	return SystemZ::CCMASK_TM_ALL_1;
	if (CCMask == SystemZ::CCMASK_CMP_LT)
	return SystemZ::CCMASK_TM_SOME_0;
	}

	// Check for ordered comparisons with the top bit.
	// Values of R with the top mask bit clear are at most Mask - High and
	// values with it set are at least High, so a CmpVal between those bounds
	// makes the comparison depend only on the top selected bit.
	if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
	if (CCMask == SystemZ::CCMASK_CMP_LE)
	return SystemZ::CCMASK_TM_MSB_0;
	if (CCMask == SystemZ::CCMASK_CMP_GT)
	return SystemZ::CCMASK_TM_MSB_1;
	}
	if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
	if (CCMask == SystemZ::CCMASK_CMP_LT)
	return SystemZ::CCMASK_TM_MSB_0;
	if (CCMask == SystemZ::CCMASK_CMP_GE)
	return SystemZ::CCMASK_TM_MSB_1;
	}

	// If there are just two bits, we can do equality checks for Low and High
	// as well.
	// The NE results are formed by complementing the matching EQ mask within
	// CCMASK_ANY.
	if (Mask == Low + High) {
	if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
	return SystemZ::CCMASK_TM_MIXED_MSB_0;
	if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
	return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
	if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
	return SystemZ::CCMASK_TM_MIXED_MSB_1;
	if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
	return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
	}

	// Looks like we've exhausted our options.
	return 0;
	}

	// See whether C can be implemented as a TEST UNDER MASK instruction.
	// Update the arguments with the TM version if so.
	//
	// Two shapes are recognised: (and X, constant-mask) compared with a
	// constant, and an unsigned ordered i64 comparison with a constant whose
	// low zero bits allow it to be expressed as a test of the high bits.
	// A shift feeding the AND is folded into the mask when possible.
	// C is left unchanged if no TM form applies.
	static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
	                                   Comparison &C) {
	  // Check that we have a comparison with a constant.
	  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
	  if (!ConstOp1)
	    return;
	  uint64_t CmpVal = ConstOp1->getZExtValue();

	  // Check whether the nonconstant input is an AND with a constant mask.
	  Comparison NewC(C);
	  uint64_t MaskVal;
	  ConstantSDNode *Mask = nullptr;
	  if (C.Op0.getOpcode() == ISD::AND) {
	    NewC.Op0 = C.Op0.getOperand(0);
	    NewC.Op1 = C.Op0.getOperand(1);
	    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
	    if (!Mask)
	      return;
	    MaskVal = Mask->getZExtValue();
	  } else {
	    // There is no instruction to compare with a 64-bit immediate
	    // so use TMHH instead if possible. We need an unsigned ordered
	    // comparison with an i64 immediate.
	    if (NewC.Op0.getValueType() != MVT::i64 ||
	        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
	        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
	        NewC.ICmpType == SystemZICMP::SignedOnly)
	      return;
	    // Convert LE and GT comparisons into LT and GE.
	    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
	        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
	      // CmpVal + 1 would wrap around, so give up.
	      if (CmpVal == uint64_t(-1))
	        return;
	      CmpVal += 1;
	      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
	    }
	    // If the low N bits of Op1 are zero then the low N bits of Op0 can
	    // be masked off without changing the result.
	    // (CmpVal & -CmpVal) isolates the lowest set bit; negating it yields
	    // a mask of that bit and everything above it.
	    MaskVal = -(CmpVal & -CmpVal);
	    NewC.ICmpType = SystemZICMP::UnsignedOnly;
	  }
	  if (!MaskVal)
	    return;

	  // Check whether the combination of mask, comparison value and comparison
	  // type are suitable.
	  unsigned BitSize = NewC.Op0.getValueSizeInBits();
	  unsigned NewCCMask, ShiftVal;
	  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
	      NewC.Op0.getOpcode() == ISD::SHL &&
	      isSimpleShift(NewC.Op0, ShiftVal) &&
	      (MaskVal >> ShiftVal != 0) &&
	      ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
	      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
	                                        MaskVal >> ShiftVal,
	                                        CmpVal >> ShiftVal,
	                                        SystemZICMP::Any))) {
	    // Test the unshifted value with the mask moved right instead.
	    NewC.Op0 = NewC.Op0.getOperand(0);
	    MaskVal >>= ShiftVal;
	  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
	             NewC.Op0.getOpcode() == ISD::SRL &&
	             isSimpleShift(NewC.Op0, ShiftVal) &&
	             (MaskVal << ShiftVal != 0) &&
	             ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
	             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
	                                               MaskVal << ShiftVal,
	                                               CmpVal << ShiftVal,
	                                               SystemZICMP::UnsignedOnly))) {
	    // Test the unshifted value with the mask moved left instead.
	    NewC.Op0 = NewC.Op0.getOperand(0);
	    MaskVal <<= ShiftVal;
	  } else {
	    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
	                                     NewC.ICmpType);
	    if (!NewCCMask)
	      return;
	  }

	  // Go ahead and make the change.
	  C.Opcode = SystemZISD::TM;
	  C.Op0 = NewC.Op0;
	  // Reuse the existing mask node when its value is still the one we need.
	  if (Mask && Mask->getZExtValue() == MaskVal)
	    C.Op1 = SDValue(Mask, 0);
	  else
	    C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
	  C.CCValid = SystemZ::CCMASK_TM;
	  C.CCMask = NewCCMask;
	}

	// Strip a redundant AND from the first comparison operand. The AND is
	// redundant when every bit it would clear is already known to be zero in
	// its input. This sometimes happens due to the generic BRCOND expansion.
	static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
	                                  Comparison &C) {
	  if (C.Op0.getOpcode() == ISD::AND) {
	    if (auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1))) {
	      SDValue Masked = C.Op0.getOperand(0);
	      KnownBits Known = DAG.computeKnownBits(Masked);
	      // Bits that might be set in the input but would be cleared by the mask.
	      uint64_t Cleared = (~Known.Zero).getZExtValue() & ~Mask->getZExtValue();
	      if (Cleared == 0)
	        C.Op0 = Masked;
	    }
	  }
	}

	// Return a Comparison that tests the condition-code result of intrinsic
	// node Call against constant integer CC using comparison code Cond.
	// Opcode is the opcode of the SystemZISD operation for the intrinsic
	// and CCValid is the set of possible condition-code results.
	//
	// The returned Comparison has an empty second operand. In the CC mask,
	// bit 3 corresponds to CC==0 and bit 0 to CC==3; the final mask is
	// restricted to CCValid.
	static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
	                                  SDValue Call, unsigned CCValid, uint64_t CC,
	                                  ISD::CondCode Cond) {
	  Comparison C(Call, SDValue());
	  C.Opcode = Opcode;
	  C.CCValid = CCValid;
	  switch (Cond) {
	  case ISD::SETEQ:
	    // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
	    C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
	    break;
	  case ISD::SETNE:
	    // ...and the inverse of that.
	    C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
	    break;
	  case ISD::SETLT:
	  case ISD::SETULT:
	    // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
	    // always true for CC>3.
	    C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
	    break;
	  case ISD::SETGE:
	  case ISD::SETUGE:
	    // ...and the inverse of that.
	    C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
	    break;
	  case ISD::SETLE:
	  case ISD::SETULE:
	    // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
	    // always true for CC>3.
	    C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
	    break;
	  case ISD::SETGT:
	  case ISD::SETUGT:
	    // ...and the inverse of that.
	    C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
	    break;
	  default:
	    llvm_unreachable("Unexpected integer comparison type");
	  }
	  // Drop any bits for condition codes the intrinsic can never produce.
	  C.CCMask &= CCValid;
	  return C;
	}

	// Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
	//
	// Comparisons of a CC-producing intrinsic result against a constant are
	// turned into a direct test of that intrinsic's CC. Everything else becomes
	// an FCMP or ICMP that is then refined by the adjust* helpers, possibly
	// operand-swapped, and finally considered for TEST UNDER MASK.
	static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
	                         ISD::CondCode Cond, const SDLoc &DL) {
	  if (CmpOp1.getOpcode() == ISD::Constant) {
	    uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
	    unsigned Opcode, CCValid;
	    // Chained intrinsic: the compared value must be result 0 and have
	    // exactly one use.
	    if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
	        CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
	        isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
	      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
	    // Unchained intrinsic: the compared value must be the last result.
	    if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
	        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
	        isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
	      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
	  }
	  Comparison C(CmpOp0, CmpOp1);
	  C.CCMask = CCMaskForCondCode(Cond);
	  if (C.Op0.getValueType().isFloatingPoint()) {
	    C.CCValid = SystemZ::CCMASK_FCMP;
	    C.Opcode = SystemZISD::FCMP;
	    adjustForFNeg(C);
	  } else {
	    C.CCValid = SystemZ::CCMASK_ICMP;
	    C.Opcode = SystemZISD::ICMP;
	    // Choose the type of comparison. Equality and inequality tests can
	    // use either signed or unsigned comparisons. The choice also doesn't
	    // matter if both sign bits are known to be clear. In those cases we
	    // want to give the main isel code the freedom to choose whichever
	    // form fits best.
	    if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
	        C.CCMask == SystemZ::CCMASK_CMP_NE ||
	        (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
	      C.ICmpType = SystemZICMP::Any;
	    else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
	      C.ICmpType = SystemZICMP::UnsignedOnly;
	    else
	      C.ICmpType = SystemZICMP::SignedOnly;
	    // The UO bit has served its purpose of selecting the comparison type;
	    // clear it from the integer mask.
	    C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
	    adjustForRedundantAnd(DAG, DL, C);
	    adjustZeroCmp(DAG, DL, C);
	    adjustSubwordCmp(DAG, DL, C);
	    adjustForSubtraction(DAG, DL, C);
	    adjustForLTGFR(C);
	    adjustICmpTruncate(DAG, DL, C);
	  }

	  // Swapping the operands requires swapping the LT and GT senses too.
	  if (shouldSwapCmpOperands(C)) {
	    std::swap(C.Op0, C.Op1);
	    C.CCMask = reverseCCMask(C.CCMask);
	  }

	  adjustForTestUnderMask(DAG, DL, C);
	  return C;
	}

	// Build the DAG node that produces the CC value for comparison C and
	// return it.
	static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
	  // A comparison without a second operand is expected to wrap an intrinsic
	  // that sets CC itself.
	  if (!C.Op1.getNode()) {
	    const unsigned Kind = C.Op0.getOpcode();
	    if (Kind == ISD::INTRINSIC_W_CHAIN) {
	      SDNode *Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
	      return SDValue(Node, 0);
	    }
	    if (Kind == ISD::INTRINSIC_WO_CHAIN) {
	      SDNode *Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
	      // The last result of the node is used as the CC value.
	      return SDValue(Node, Node->getNumValues() - 1);
	    }
	    llvm_unreachable("Invalid comparison operands");
	  }

	  switch (C.Opcode) {
	  case SystemZISD::ICMP:
	    // ICMP carries the comparison type as a third operand.
	    return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
	                       DAG.getConstant(C.ICmpType, DL, MVT::i32));
	  case SystemZISD::TM: {
	    // The flag is set when exactly one of the two MIXED_MSB bits is tested.
	    const bool TestsMixedMSB0 = bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0);
	    const bool TestsMixedMSB1 = bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1);
	    bool RegisterOnly = (TestsMixedMSB0 != TestsMixedMSB1);
	    return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
	                       DAG.getConstant(RegisterOnly, DL, MVT::i32));
	  }
	  default:
	    return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
	  }
	}

	// Expand a 32-bit *MUL_LOHI by widening both inputs to i64 with the
	// extension opcode Extend, multiplying, and splitting the 64-bit product.
	// Hi receives bits 63..32 of the product and Lo bits 31..0, both as i32.
	static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
	                            SDValue Op0, SDValue Op1, SDValue &Hi,
	                            SDValue &Lo) {
	  SDValue Wide0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
	  SDValue Wide1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
	  SDValue Product = DAG.getNode(ISD::MUL, DL, MVT::i64, Wide0, Wide1);
	  SDValue Upper = DAG.getNode(ISD::SRL, DL, MVT::i64, Product,
	                              DAG.getConstant(32, DL, MVT::i64));
	  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Upper);
	  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Product);
	}

	// Emit binary operation Opcode on the VT operands Op0 and Op1, where the
	// operation yields an untyped GR128 pair, and extract its two halves.
	// Even receives the even-register half and Odd the odd-register half,
	// each of type VT.
	static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
	                             unsigned Opcode, SDValue Op0, SDValue Op1,
	                             SDValue &Even, SDValue &Odd) {
	  SDValue Pair = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
	  // The subregister indices depend on whether the halves are 32-bit.
	  const bool Narrow = is32Bit(VT);
	  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Narrow), DL, VT, Pair);
	  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Narrow), DL, VT, Pair);
	}

	// Materialise a CC test as an i32: the result is 1 when the CC value
	// produced by CCReg is in CCMask and 0 otherwise. CC is known to lie
	// within CCValid, so other values can be ignored.
	static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
	                         unsigned CCValid, unsigned CCMask) {
	  SDValue One = DAG.getConstant(1, DL, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
	  SDValue Valid = DAG.getConstant(CCValid, DL, MVT::i32);
	  SDValue Tested = DAG.getConstant(CCMask, DL, MVT::i32);
	  // Operand order: true value, false value, valid mask, tested mask, CC.
	  SDValue Ops[] = { One, Zero, Valid, Tested, CCReg };
	  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
	}

	// Map condition code CC to the SystemZISD vector comparison opcode that
	// implements it directly, or return 0 if there is none. IsFP selects
	// between the floating-point and integer opcode families.
	static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
	  switch (CC) {
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    if (IsFP)
	      return SystemZISD::VFCMPE;
	    return SystemZISD::VICMPE;

	  case ISD::SETOGE:
	  case ISD::SETGE:
	    // Only the floating-point family has a direct "high or equal" form.
	    if (IsFP)
	      return SystemZISD::VFCMPHE;
	    return 0;

	  case ISD::SETOGT:
	  case ISD::SETGT:
	    if (IsFP)
	      return SystemZISD::VFCMPH;
	    return SystemZISD::VICMPH;

	  case ISD::SETUGT:
	    // Unsigned greater-than is handled for integers only.
	    if (IsFP)
	      return 0;
	    return SystemZISD::VICMPHL;

	  default:
	    return 0;
	  }
	}

	// Find a SystemZISD vector comparison opcode for CC, or failing that for
	// the inverse of CC. Returns 0 if neither can be done directly. When an
	// opcode is returned, Invert says whether it computes the inverse of CC;
	// when 0 is returned, Invert is not modified. IsFP is true for a
	// floating-point rather than integer comparison.
	static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
	                                            bool &Invert) {
	  unsigned Opcode = getVectorComparison(CC, IsFP);
	  if (Opcode) {
	    Invert = false;
	    return Opcode;
	  }

	  // No direct form; retry with the logically inverted condition.
	  Opcode = getVectorComparison(ISD::getSetCCInverse(CC, !IsFP), IsFP);
	  if (Opcode)
	    Invert = true;
	  return Opcode;
	}

	// Extend elements Start and Start+1 of the v4f32 value Op and return them
	// as a v2f64.
	static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
	                                  SDValue Op) {
	  // Move the two wanted elements into lanes 0 and 2; lanes 1 and 3 are
	  // left undefined.
	  int Lanes[] = { Start, -1, Start + 1, -1 };
	  SDValue Shuffled = DAG.getVectorShuffle(MVT::v4f32, DL, Op,
	                                          DAG.getUNDEF(MVT::v4f32), Lanes);
	  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Shuffled);
	}

	// Emit vector comparison Opcode on CmpOp0 and CmpOp1 and return a result
	// of type VT.
	SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
	                                            const SDLoc &DL, EVT VT,
	                                            SDValue CmpOp0,
	                                            SDValue CmpOp1) const {
	  const bool NeedsSplit = CmpOp0.getValueType() == MVT::v4f32 &&
	                          !Subtarget.hasVectorEnhancements1();
	  if (!NeedsSplit)
	    return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);

	  // There is no hardware support for v4f32 (unless we have the vector
	  // enhancements facility 1), so extend each input into two v2f64 halves,
	  // compare the halves, and pack the two v2i64 results back together.
	  SDValue Hi0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
	  SDValue Lo0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
	  SDValue Hi1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
	  SDValue Lo1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1);
	  SDValue HiCmp = DAG.getNode(Opcode, DL, MVT::v2i64, Hi0, Hi1);
	  SDValue LoCmp = DAG.getNode(Opcode, DL, MVT::v2i64, Lo0, Lo1);
	  return DAG.getNode(SystemZISD::PACK, DL, VT, HiCmp, LoCmp);
	}

	// Lower a vector comparison of type CC between CmpOp0 and CmpOp1 into an
	// integer mask of type VT. Conditions without a direct instruction are
	// built from two comparisons, from the inverse condition followed by a
	// bitwise NOT, or from the comparison with swapped operands.
	SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
	                                                const SDLoc &DL, EVT VT,
	                                                ISD::CondCode CC,
	                                                SDValue CmpOp0,
	                                                SDValue CmpOp1) const {
	  const bool IsFP = CmpOp0.getValueType().isFloatingPoint();
	  bool Invert = false;
	  SDValue Cmp;
	  switch (CC) {
	  // Ordered test: (or (ogt y x) (oge x y)). Unordered is its inverse.
	  case ISD::SETUO:
	  case ISD::SETO: {
	    Invert = (CC == ISD::SETUO);
	    assert(IsFP && "Unexpected integer comparison");
	    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
	    SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1);
	    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
	    break;
	  }

	  // <> test: (or (ogt y x) (ogt x y)). SETUEQ is its inverse.
	  case ISD::SETUEQ:
	  case ISD::SETONE: {
	    Invert = (CC == ISD::SETUEQ);
	    assert(IsFP && "Unexpected integer comparison");
	    SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
	    SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1);
	    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
	    break;
	  }

	  // Otherwise a single comparison is enough. It doesn't really
	  // matter whether we try the inversion or the swap first, since
	  // there are no cases where both work.
	  default: {
	    unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert);
	    if (Opcode) {
	      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1);
	      break;
	    }
	    // Retry with the operands exchanged.
	    CC = ISD::getSetCCSwappedOperands(CC);
	    Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert);
	    if (!Opcode)
	      llvm_unreachable("Unhandled comparison");
	    Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0);
	    break;
	  }
	  }

	  if (!Invert)
	    return Cmp;

	  // Flip every bit of the mask by XORing with an all-ones splat.
	  SDValue AllOnes =
	      DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
	  return DAG.getNode(ISD::XOR, DL, VT, Cmp, AllOnes);
	}

	// Lower a SETCC node. Vector results are handed to lowerVectorSETCC;
	// scalar results are produced by emitting the comparison and then
	// converting the tested CC into a 0/1 value.
	SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  if (!VT.isVector()) {
	    Comparison C(getCmp(DAG, LHS, RHS, CC, DL));
	    SDValue CCReg = emitCmp(DAG, DL, C);
	    return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
	  }
	  return lowerVectorSETCC(DAG, DL, VT, CC, LHS, RHS);
	}

	// Lower a BR_CC node into a comparison followed by a BR_CCMASK that
	// branches to the destination when the CC value is in the tested mask.
	// Operand layout of Op: chain, condition code, LHS, RHS, destination.
	SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Target = Op.getOperand(4);
	  SDLoc DL(Op);

	  Comparison C(getCmp(DAG, LHS, RHS, CC, DL));
	  SDValue CCReg = emitCmp(DAG, DL, C);
	  SDValue Valid = DAG.getConstant(C.CCValid, DL, MVT::i32);
	  SDValue Tested = DAG.getConstant(C.CCMask, DL, MVT::i32);
	  return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), Chain,
	                     Valid, Tested, Target, CCReg);
	}

	// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
	// allowing Pos and Neg to be wider than CmpOp.
	//
	// Concretely, Neg must be (sub 0, Pos), and Pos must be either CmpOp itself
	// or (sign_extend CmpOp).
	static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
	  // Neg has to be a subtraction of Pos from constant zero.
	  if (Neg.getOpcode() != ISD::SUB ||
	      Neg.getOperand(0).getOpcode() != ISD::Constant ||
	      cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() != 0 ||
	      Neg.getOperand(1) != Pos)
	    return false;

	  // Pos is the compared value, possibly after sign extension.
	  return Pos == CmpOp ||
	         (Pos.getOpcode() == ISD::SIGN_EXTEND && Pos.getOperand(0) == CmpOp);
	}

	// Build the absolute value of Op, or its negation when IsNegative is set.
	static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
	                           bool IsNegative) {
	  SDValue Abs = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
	  if (!IsNegative)
	    return Abs;
	  // The negative absolute is formed as 0 - |Op|.
	  return DAG.getNode(ISD::SUB, DL, Abs.getValueType(),
	                     DAG.getConstant(0, DL, Abs.getValueType()), Abs);
	}

	// Lower a SELECT_CC node. Selections that compute an absolute or negative
	// absolute value are turned into IABS-based sequences; everything else
	// becomes a comparison feeding a SELECT_CCMASK.
	// Operand layout of Op: LHS, RHS, true value, false value, condition code.
	SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue TrueOp = Op.getOperand(2);
	  SDValue FalseOp = Op.getOperand(3);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  SDLoc DL(Op);

	  Comparison C(getCmp(DAG, LHS, RHS, CC, DL));

	  // Check for absolute and negative-absolute selections, including those
	  // where the comparison value is sign-extended (for LPGFR and LNGFR).
	  // This check supplements the one in DAGCombiner.
	  const bool IsOrderedICmp = C.Opcode == SystemZISD::ICMP &&
	                             C.CCMask != SystemZ::CCMASK_CMP_EQ &&
	                             C.CCMask != SystemZ::CCMASK_CMP_NE;
	  if (IsOrderedICmp && C.Op1.getOpcode() == ISD::Constant &&
	      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
	    // TrueOp is the value and FalseOp its negation: selecting the value
	    // when it is less than zero yields the negative absolute.
	    if (isAbsolute(C.Op0, TrueOp, FalseOp))
	      return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
	    // The mirrored case, with the roles of the two values exchanged.
	    if (isAbsolute(C.Op0, FalseOp, TrueOp))
	      return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
	  }

	  SDValue CCReg = emitCmp(DAG, DL, C);
	  SDValue Valid = DAG.getConstant(C.CCValid, DL, MVT::i32);
	  SDValue Tested = DAG.getConstant(C.CCMask, DL, MVT::i32);
	  SDValue Ops[] = {TrueOp, FalseOp, Valid, Tested, CCReg};

	  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
	}

	// Lower the address of a global plus offset. Symbols for which
	// isPC32DBLSymbol holds are addressed PC-relatively, with as much of the
	// offset as possible folded into the address; all others are loaded from
	// the GOT. Any offset that could not be folded is added explicitly.
	SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
	                                                  SelectionDAG &DAG) const {
	  SDLoc DL(Node);
	  const GlobalValue *GV = Node->getGlobal();
	  int64_t Offset = Node->getOffset();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  CodeModel::Model CM = DAG.getTarget().getCodeModel();

	  // Residual tracks the part of the offset that still has to be applied.
	  int64_t Residual = Offset;
	  SDValue Result;
	  if (Subtarget.isPC32DBLSymbol(GV, CM)) {
	    // Assign anchors at 1<<12 byte boundaries.
	    const uint64_t Anchor = Offset & ~uint64_t(0xfff);
	    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
	    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);

	    // The offset can be folded into the address if it is aligned to a
	    // halfword.
	    Residual -= Anchor;
	    const bool HalfwordAligned = (Residual & 1) == 0;
	    if (Residual != 0 && HalfwordAligned) {
	      SDValue Full =
	          DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Residual);
	      Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
	      Residual = 0;
	    }
	  } else {
	    // Load the symbol's address from its GOT slot.
	    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
	    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
	    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  if (Residual == 0)
	    return Result;

	  // If there was a non-zero offset that we didn't fold, create an explicit
	  // addition for it.
	  return DAG.getNode(ISD::ADD, DL, PtrVT, Result,
	                     DAG.getConstant(Residual, DL, PtrVT));
	}

	// Emit a call-like node Opcode that invokes __tls_get_offset for the TLS
	// symbol of Node, passing GOTOffset, and return the value it leaves in
	// %r2.
	SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
	                                                 SelectionDAG &DAG,
	                                                 unsigned Opcode,
	                                                 SDValue GOTOffset) const {
	  SDLoc DL(Node);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
	  // The two register copies are glued together, and to the call below.
	  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
	  SDValue CopyGOT =
	      DAG.getCopyToReg(DAG.getEntryNode(), DL, SystemZ::R12D, GOT, SDValue());
	  SDValue CopyOffset = DAG.getCopyToReg(CopyGOT, DL, SystemZ::R2D, GOTOffset,
	                                        CopyGOT.getValue(1));

	  // The first call operand is the chain and the second is the TLS symbol.
	  SmallVector<SDValue, 8> Ops;
	  Ops.push_back(CopyOffset);
	  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
	                                           Node->getValueType(0),
	                                           0, 0));

	  // Add argument registers to the end of the list so that they are
	  // known live into the call.
	  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
	  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));

	  // Add a register mask operand representing the call-preserved registers.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *Mask =
	      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  // Glue the call to the argument copies.
	  Ops.push_back(CopyOffset.getValue(1));

	  // Emit the call; it produces a chain and a glue result.
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue Call = DAG.getNode(Opcode, DL, NodeTys, Ops);

	  // Copy the return value from %r2.
	  return DAG.getCopyFromReg(Call, DL, SystemZ::R2D, PtrVT, Call.getValue(1));
	}

	// Assemble the thread pointer as a pointer-sized value from the two
	// 32-bit access registers: A0 supplies the high half, A1 the low half.
	SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
	                                                  SelectionDAG &DAG) const {
	  SDValue Entry = DAG.getEntryNode();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // The high part of the thread pointer is in access register 0. Its
	  // upper bits after extension do not matter because of the shift below.
	  SDValue High = DAG.getCopyFromReg(Entry, DL, SystemZ::A0, MVT::i32);
	  High = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, High);

	  // The low part of the thread pointer is in access register 1. It must
	  // be zero-extended so that it does not disturb the high half when ORed.
	  SDValue Low = DAG.getCopyFromReg(Entry, DL, SystemZ::A1, MVT::i32);
	  Low = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, Low);

	  // Merge them into a single 64-bit address: (High << 32) | Low.
	  SDValue HighShifted = DAG.getNode(ISD::SHL, DL, PtrVT, High,
	                                    DAG.getConstant(32, DL, PtrVT));
	  return DAG.getNode(ISD::OR, DL, PtrVT, HighShifted, Low);
	}

	// Lower the address of a thread-local global. Unless emulated TLS is in
	// use, the result is the thread pointer plus an offset whose computation
	// depends on the TLS model chosen for the symbol.
	SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
	                                                     SelectionDAG &DAG) const {
	  if (DAG.getTarget().useEmulatedTLS())
	    return LowerToTLSEmulatedModel(Node, DAG);
	  SDLoc DL(Node);
	  const GlobalValue *GV = Node->getGlobal();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);

	  SDValue TP = lowerThreadPointer(DL, DAG);

	  // Place CPV in the constant pool (alignment 8) and load it from there.
	  auto LoadPoolEntry = [&](SystemZConstantPoolValue *CPV) -> SDValue {
	    SDValue Addr = DAG.getConstantPool(CPV, PtrVT, 8);
	    return DAG.getLoad(
	        PtrVT, DL, DAG.getEntryNode(), Addr,
	        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
	  };

	  // Get the offset of GA from the thread pointer, based on the TLS model.
	  SDValue Offset;
	  switch (model) {
	  case TLSModel::GeneralDynamic: {
	    // Load the GOT offset of the tls_index (module ID / per-symbol offset).
	    SDValue GOTOffset =
	        LoadPoolEntry(SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD));

	    // Call __tls_get_offset to retrieve the offset.
	    Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, GOTOffset);
	    break;
	  }

	  case TLSModel::LocalDynamic: {
	    // Load the GOT offset of the module ID.
	    SDValue GOTOffset =
	        LoadPoolEntry(SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM));

	    // Call __tls_get_offset to retrieve the module base offset.
	    SDValue ModuleBase =
	        lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, GOTOffset);

	    // Note: The SystemZLDCleanupPass will remove redundant computations
	    // of the module base offset. Count total number of local-dynamic
	    // accesses to trigger execution of that pass.
	    SystemZMachineFunctionInfo *MFI =
	        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
	    MFI->incNumLocalDynamicTLSAccesses();

	    // Add the per-symbol offset.
	    SDValue DTPOffset =
	        LoadPoolEntry(SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF));
	    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, ModuleBase, DTPOffset);
	    break;
	  }

	  case TLSModel::InitialExec: {
	    // Load the offset from the GOT.
	    SDValue Slot = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
	                                              SystemZII::MO_INDNTPOFF);
	    Slot = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Slot);
	    Offset =
	        DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Slot,
	                    MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	    break;
	  }

	  case TLSModel::LocalExec: {
	    // Force the offset into the constant pool and load it from there.
	    Offset =
	        LoadPoolEntry(SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF));
	    break;
	  }
	  }

	  // Add the base and offset together.
	  return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
	}

	// Lower a BlockAddress node: build the target block address (keeping the
	// node's offset) and wrap it in a SystemZISD::PCREL_WRAPPER.
	SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
	                                                 SelectionDAG &DAG) const {
	  const EVT AddrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetBlockAddress(Node->getBlockAddress(), AddrVT,
	                                             Node->getOffset());
	  return DAG.getNode(SystemZISD::PCREL_WRAPPER, SDLoc(Node), AddrVT, Target);
	}

	// Lower a JumpTable node to a PCREL_WRAPPER around the target jump table
	// with the same index.
	SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
	                                              SelectionDAG &DAG) const {
	  const EVT AddrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Table = DAG.getTargetJumpTable(JT->getIndex(), AddrVT);

	  // The wrapper lets LARL load the address of the table.
	  return DAG.getNode(SystemZISD::PCREL_WRAPPER, SDLoc(JT), AddrVT, Table);
	}

	// Lower a ConstantPool node to a PCREL_WRAPPER around the matching target
	// constant-pool entry.
	SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
	                                                 SelectionDAG &DAG) const {
	  const EVT AddrVT = getPointerTy(DAG.getDataLayout());

	  // Only the ordinary-constant form forwards the node's offset.
	  SDValue Entry =
	      CP->isMachineConstantPoolEntry()
	          ? DAG.getTargetConstantPool(CP->getMachineCPVal(), AddrVT,
	                                      CP->getAlignment())
	          : DAG.getTargetConstantPool(CP->getConstVal(), AddrVT,
	                                      CP->getAlignment(), CP->getOffset());

	  // Use LARL to load the address of the constant pool entry.
	  return DAG.getNode(SystemZISD::PCREL_WRAPPER, SDLoc(CP), AddrVT, Entry);
	}

	// Lower FRAMEADDR. The frame address is defined here as the address of the
	// back chain slot, so the result is a frame index for that slot; the fixed
	// object is created the first time it is needed. Only depth 0 is supported:
	// any other depth is reported as a fatal error.
	SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  MFI.setFrameAddressIsTaken(true);

	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // If the back chain frame index has not been allocated yet, do so.
	  // A saved index of 0 is treated as "not allocated".
	  SystemZMachineFunctionInfo *FI = MF.getInfo<SystemZMachineFunctionInfo>();
	  int BackChainIdx = FI->getFramePointerSaveIndex();
	  if (!BackChainIdx) {
	    // By definition, the frame address is the address of the back chain.
	    BackChainIdx = MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
	    FI->setFramePointerSaveIndex(BackChainIdx);
	  }
	  SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);

	  // FIXME The frontend should detect this case.
	  if (Depth > 0) {
	    report_fatal_error("Unsupported stack frame traversal count");
	  }

	  return BackChain;
	}

	// Lower RETURNADDR for depth 0 by reading R14D, which is registered as a
	// live-in of the function. Any other depth is a fatal error.
	SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  MachineFunction &Func = DAG.getMachineFunction();
	  Func.getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  // FIXME The frontend should detect this case.
	  const unsigned FrameDepth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  if (FrameDepth != 0)
	    report_fatal_error("Unsupported stack frame traversal count");

	  // R14D holds the return address; mark it an implicit live-in and copy it.
	  const unsigned ReturnReg =
	      Func.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
	  const EVT AddrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), ReturnReg, AddrVT);
	}

	// Lower a BITCAST. A bitcast of a normal load is re-issued as a load of the
	// result type. Otherwise only i32 <-> f32 is handled, by going through the
	// high 32 bits (subreg_h32) of a 64-bit value of the other register kind.
	SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();
	  const EVT DstVT = Op.getValueType();

	  // DAGCombiner normally converts loads directly, but bitcasts created
	  // during lowering (and then lowered themselves) need the same handling.
	  auto *Load = dyn_cast<LoadSDNode>(Src);
	  if (Load && ISD::isNormalLoad(Load)) {
	    SDValue Reloaded = DAG.getLoad(DstVT, DL, Load->getChain(),
	                                   Load->getBasePtr(), Load->getMemOperand());
	    // Redirect users of the old load's chain to the new load's chain.
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), Reloaded.getValue(1));
	    return Reloaded;
	  }

	  if (SrcVT == MVT::i32 && DstVT == MVT::f32) {
	    // Place the 32-bit integer in the high half of an i64.
	    SDValue Wide;
	    if (!Subtarget.hasHighWord()) {
	      Wide = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Src);
	      Wide = DAG.getNode(ISD::SHL, DL, MVT::i64, Wide,
	                         DAG.getConstant(32, DL, MVT::i64));
	    } else {
	      SDNode *Undef =
	          DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64);
	      Wide = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, MVT::i64,
	                                       SDValue(Undef, 0), Src);
	    }
	    SDValue AsFloat = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Wide);
	    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL, MVT::f32,
	                                      AsFloat);
	  }

	  if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
	    // Place the f32 in the high half of an f64 and reinterpret it as i64.
	    SDNode *Undef =
	        DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
	    SDValue Wide = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, MVT::f64,
	                                             SDValue(Undef, 0), Src);
	    SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Wide);
	    if (!Subtarget.hasHighWord()) {
	      SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, AsInt,
	                                    DAG.getConstant(32, DL, MVT::i64));
	      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shifted);
	    }
	    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL, MVT::i32,
	                                      AsInt);
	  }

	  llvm_unreachable("Unexpected bitcast combination");
	}

	// Lower VASTART by storing four pointer-sized fields at consecutive 8-byte
	// offsets from the address operand: the first vararg GPR and FPR numbers,
	// then the varargs and register-save frame indices. The stores are joined
	// with a TokenFactor.
	SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SystemZMachineFunctionInfo *Info =
	      DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
	  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue Chain = Op.getOperand(0);
	  SDValue ListAddr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

	  // The initial values of each field, in storage order.
	  const unsigned NumFields = 4;
	  const unsigned FieldSize = 8;
	  SDValue Fields[NumFields] = {
	      DAG.getConstant(Info->getVarArgsFirstGPR(), DL, PtrVT),
	      DAG.getConstant(Info->getVarArgsFirstFPR(), DL, PtrVT),
	      DAG.getFrameIndex(Info->getVarArgsFrameIndex(), PtrVT),
	      DAG.getFrameIndex(Info->getRegSaveFrameIndex(), PtrVT)};

	  // Store each field into its respective slot.
	  SDValue Stores[NumFields];
	  for (unsigned I = 0; I != NumFields; ++I) {
	    const unsigned Offset = I * FieldSize;
	    SDValue Slot = ListAddr;
	    // The first field lives at the base address itself.
	    if (I != 0)
	      Slot = DAG.getNode(ISD::ADD, DL, PtrVT, Slot,
	                         DAG.getIntPtrConstant(Offset, DL));
	    Stores[I] = DAG.getStore(Chain, DL, Fields[I], Slot,
	                             MachinePointerInfo(SV, Offset));
	  }
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
	}

	// Lower VACOPY as a 32-byte memcpy from the source va_list to the
	// destination va_list, with 8-byte alignment.
	SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL),
	                       /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
	                       /*isTailCall*/false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	// Lower DYNAMIC_STACKALLOC. Operands are (chain, size, alignment); the
	// results are the address of the allocated area and the output chain.
	// The stack pointer is lowered by the requested size plus any extra space
	// needed for over-alignment, the result address is offset by an
	// ADJDYNALLOC placeholder, and it is realigned when the required alignment
	// exceeds the stack alignment. With the "backchain" function attribute the
	// word at the old stack pointer is copied to the new stack pointer.
	SDValue SystemZTargetLowering::
	lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
	  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
	  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");

	  SDValue Chain = Op.getOperand(0);
	  SDValue Size = Op.getOperand(1);
	  SDValue Align = Op.getOperand(2);
	  SDLoc DL(Op);

	  // If user has set the no alignment function attribute, ignore
	  // alloca alignments. This code relies on the alignment operand being a
	  // ConstantSDNode, so use cast<> (which asserts on a mismatch) instead of
	  // dereferencing an unchecked dyn_cast<> result.
	  uint64_t AlignVal =
	      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);

	  uint64_t StackAlign = TFI->getStackAlignment();
	  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
	  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;

	  unsigned SPReg = getStackPointerRegisterToSaveRestore();
	  SDValue NeededSpace = Size;

	  // Get a reference to the stack pointer.
	  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);

	  // If we need a backchain, save it now.
	  SDValue Backchain;
	  if (StoreBackchain)
	    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());

	  // Add extra space for alignment if needed.
	  if (ExtraAlignSpace)
	    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
	                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));

	  // Get the new stack pointer value.
	  SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);

	  // Copy the new stack pointer back.
	  Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);

	  // The allocated data lives above the 160 bytes allocated for the standard
	  // frame, plus any outgoing stack arguments. We don't know how much that
	  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
	  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
	  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);

	  // Dynamically realign if needed: round the address up to a multiple of
	  // RequiredAlign using the extra space reserved above.
	  if (RequiredAlign > StackAlign) {
	    Result =
	        DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
	                    DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
	    Result =
	        DAG.getNode(ISD::AND, DL, MVT::i64, Result,
	                    DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
	  }

	  if (StoreBackchain)
	    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());

	  SDValue Ops[2] = { Result, Chain };
	  return DAG.getMergeValues(Ops, DL);
	}

	// Lower GET_DYNAMIC_AREA_OFFSET to an i64 ADJDYNALLOC node.
	SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
	    SDValue Op, SelectionDAG &DAG) const {
	  return DAG.getNode(SystemZISD::ADJDYNALLOC, SDLoc(Op), MVT::i64);
	}

	// Lower SMUL_LOHI. The merged result lists the low half first and the high
	// half second.
	SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Lo, Hi;

	  if (is32Bit(VT)) {
	    // Just do a normal 64-bit multiplication and extract the results.
	    // We define this so that it can be used for constant division.
	    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
	                    Op.getOperand(1), Hi, Lo);
	  } else if (Subtarget.hasMiscellaneousExtensions2()) {
	    // SystemZISD::SMUL_LOHI returns the low result in the odd register and
	    // the high result in the even register, i.e. in the reverse of the
	    // low-half-first order that ISD::SMUL_LOHI is defined to use.
	    lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, Op.getOperand(0),
	                     Op.getOperand(1), Hi, Lo);
	  } else {
	    // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
	    //
	    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
	    //
	    // Since the upper halves are either all zeros or all ones this is
	    //
	    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
	    //
	    // and the two cheap correction terms can be grouped:
	    //
	    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
	    SDValue SignShift = DAG.getConstant(63, DL, MVT::i64);
	    SDValue LHS = Op.getOperand(0);
	    SDValue RHS = Op.getOperand(1);
	    SDValue LHSSign = DAG.getNode(ISD::SRA, DL, VT, LHS, SignShift);
	    SDValue RHSSign = DAG.getNode(ISD::SRA, DL, VT, RHS, SignShift);

	    // As above, UMUL_LOHI yields (even = high, odd = low).
	    lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI, LHS, RHS, Hi, Lo);

	    SDValue CorrLHS = DAG.getNode(ISD::AND, DL, VT, LHS, RHSSign);
	    SDValue CorrRHS = DAG.getNode(ISD::AND, DL, VT, LHSSign, RHS);
	    SDValue Correction = DAG.getNode(ISD::ADD, DL, VT, CorrLHS, CorrRHS);
	    Hi = DAG.getNode(ISD::SUB, DL, VT, Hi, Correction);
	  }

	  SDValue Halves[2] = {Lo, Hi};
	  return DAG.getMergeValues(Halves, DL);
	}

	// Lower UMUL_LOHI. The merged result lists the low half first and the high
	// half second.
	SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Lo, Hi;

	  if (is32Bit(VT)) {
	    // Just do a normal 64-bit multiplication and extract the results.
	    // We define this so that it can be used for constant division.
	    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
	                    Op.getOperand(1), Hi, Lo);
	  } else {
	    // SystemZISD::UMUL_LOHI returns the low result in the odd register and
	    // the high result in the even register, i.e. in the reverse of the
	    // low-half-first order that ISD::UMUL_LOHI is defined to use.
	    lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI, Op.getOperand(0),
	                     Op.getOperand(1), Hi, Lo);
	  }

	  SDValue Halves[2] = {Lo, Hi};
	  return DAG.getMergeValues(Halves, DL);
	}

	// Lower SDIVREM. The merged result is (quotient, remainder).
	SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDValue Dividend = Op.getOperand(0);
	  SDValue Divisor = Op.getOperand(1);
	  const EVT VT = Op.getValueType();
	  SDLoc DL(Op);

	  // We use DSGF for 32-bit division. This means the first operand must
	  // always be 64-bit, and the second operand should be 32-bit whenever
	  // that is possible, to improve performance.
	  if (is32Bit(VT)) {
	    Dividend = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Dividend);
	  } else if (DAG.ComputeNumSignBits(Divisor) > 32) {
	    Divisor = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Divisor);
	  }

	  // DSG(F) returns the remainder in the even register and the
	  // quotient in the odd register.
	  SDValue Quotient, Remainder;
	  lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Dividend, Divisor,
	                   Remainder, Quotient);

	  SDValue Results[2] = {Quotient, Remainder};
	  return DAG.getMergeValues(Results, DL);
	}

	// Lower UDIVREM. The merged result is (quotient, remainder).
	SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();
	  SDLoc DL(Op);

	  // DL(G) returns the remainder in the even register and the
	  // quotient in the odd register.
	  SDValue Quotient, Remainder;
	  lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM, Op.getOperand(0),
	                   Op.getOperand(1), Remainder, Quotient);

	  SDValue Results[2] = {Quotient, Remainder};
	  return DAG.getMergeValues(Results, DL);
	}

	// Lower a 64-bit OR whose operands occupy disjoint halves (one has its
	// upper 32 bits known zero, the other its lower 32 bits known zero) into an
	// insertion of the truncated low operand into subreg_l32 of the high
	// operand. Returns Op unchanged when the pattern does not apply or when a
	// constant operand is better handled by another instruction.
	SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");

	  // Get the known-zero masks for each operand.
	  SDValue Ops[] = {Op.getOperand(0), Op.getOperand(1)};
	  KnownBits Known[2] = {DAG.computeKnownBits(Ops[0]),
	                        DAG.computeKnownBits(Ops[1])};

	  // See if the upper 32 bits of one operand and the lower 32 bits of the
	  // other are known zero. They are the low and high operands respectively.
	  uint64_t Masks[] = { Known[0].Zero.getZExtValue(),
	                       Known[1].Zero.getZExtValue() };
	  unsigned High, Low;
	  if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff) {
	    High = 1;
	    Low = 0;
	  } else if ((Masks[1] >> 32) == 0xffffffff &&
	             uint32_t(Masks[0]) == 0xffffffff) {
	    High = 0;
	    Low = 1;
	  } else {
	    return Op;
	  }

	  SDValue LowOp = Ops[Low];
	  SDValue HighOp = Ops[High];

	  // If the high part is a constant, we're better off using IILH.
	  if (HighOp.getOpcode() == ISD::Constant)
	    return Op;

	  // If the low part is a constant that is outside the range of LHI,
	  // then we're better off using IILF.
	  if (LowOp.getOpcode() == ISD::Constant) {
	    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
	    if (!isInt<16>(Value))
	      return Op;
	  }

	  // Check whether the high part is an AND that doesn't change the
	  // high 32 bits and just masks out low bits. We can skip it if so.
	  if (HighOp.getOpcode() == ISD::AND &&
	      HighOp.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue HighOp0 = HighOp.getOperand(0);
	    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
	    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
	      HighOp = HighOp0;
	  }

	  // Take advantage of the fact that all GR32 operations only change the
	  // low 32 bits by truncating Low to an i32 and inserting it directly
	  // using a subreg. The interesting cases are those where the truncation
	  // can be folded.
	  SDLoc DL(Op);
	  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
	  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
	                                   MVT::i64, HighOp, Low32);
	}

	// Lower SADDO/SSUBO/UADDO/USUBO nodes to the matching SystemZISD node, which
	// yields the arithmetic result plus an i32 CC value; the overflow flag is
	// then materialised from that CC value.
	SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
	                                          SelectionDAG &DAG) const {
	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  unsigned TargetOpc = 0;
	  unsigned ValidMask = 0;
	  unsigned OverflowMask = 0;
	  const unsigned Opc = Op.getOpcode();
	  switch (Opc) {
	  case ISD::SADDO:
	  case ISD::SSUBO:
	    // Both signed forms test the arithmetic-overflow CC value.
	    TargetOpc = (Opc == ISD::SADDO) ? SystemZISD::SADDO : SystemZISD::SSUBO;
	    ValidMask = SystemZ::CCMASK_ARITH;
	    OverflowMask = SystemZ::CCMASK_ARITH_OVERFLOW;
	    break;
	  case ISD::UADDO:
	    TargetOpc = SystemZISD::UADDO;
	    ValidMask = SystemZ::CCMASK_LOGICAL;
	    OverflowMask = SystemZ::CCMASK_LOGICAL_CARRY;
	    break;
	  case ISD::USUBO:
	    TargetOpc = SystemZISD::USUBO;
	    ValidMask = SystemZ::CCMASK_LOGICAL;
	    OverflowMask = SystemZ::CCMASK_LOGICAL_BORROW;
	    break;
	  default:
	    llvm_unreachable("Unknown instruction!");
	  }

	  SDVTList ResultTys = DAG.getVTList(N->getValueType(0), MVT::i32);
	  SDValue Arith =
	      DAG.getNode(TargetOpc, DL, ResultTys, N->getOperand(0), N->getOperand(1));

	  // Turn the CC result into the overflow flag, narrowing to i1 if the
	  // original node's second result has that type.
	  SDValue Overflow =
	      emitSETCC(DAG, DL, Arith.getValue(1), ValidMask, OverflowMask);
	  if (N->getValueType(1) == MVT::i1)
	    Overflow = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Overflow);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, Overflow);
	}

	// Return true if following operand 2 through a run of ADDCARRY nodes,
	// starting at Carry, ends at a UADDO node.
	static bool isAddCarryChain(SDValue Carry) {
	  SDValue Link = Carry;
	  for (; Link.getOpcode() == ISD::ADDCARRY; Link = Link.getOperand(2)) {
	    // Keep walking towards the start of the chain.
	  }
	  return Link.getOpcode() == ISD::UADDO;
	}

	// Return true if following operand 2 through a run of SUBCARRY nodes,
	// starting at Carry, ends at a USUBO node.
	static bool isSubBorrowChain(SDValue Carry) {
	  SDValue Link = Carry;
	  for (; Link.getOpcode() == ISD::SUBCARRY; Link = Link.getOperand(2)) {
	    // Keep walking towards the start of the chain.
	  }
	  return Link.getOpcode() == ISD::USUBO;
	}

	// Lower ADDCARRY/SUBCARRY nodes. Returns an empty SDValue (leaving the node
	// to the legalizer) when the value type is not legal yet or when the
	// incoming carry is not rooted in a UADDO/USUBO chain.
	SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDNode *N = Op.getNode();
	  const MVT VT = N->getSimpleValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDValue CarryIn = Op.getOperand(2);
	  SDLoc DL(N);

	  unsigned TargetOpc = 0;
	  unsigned ValidMask = 0;
	  unsigned CarryMask = 0;
	  const unsigned Opc = Op.getOpcode();
	  if (Opc == ISD::ADDCARRY) {
	    if (!isAddCarryChain(CarryIn))
	      return SDValue();
	    TargetOpc = SystemZISD::ADDCARRY;
	    ValidMask = SystemZ::CCMASK_LOGICAL;
	    CarryMask = SystemZ::CCMASK_LOGICAL_CARRY;
	  } else if (Opc == ISD::SUBCARRY) {
	    if (!isSubBorrowChain(CarryIn))
	      return SDValue();
	    TargetOpc = SystemZISD::SUBCARRY;
	    ValidMask = SystemZ::CCMASK_LOGICAL;
	    CarryMask = SystemZ::CCMASK_LOGICAL_BORROW;
	  } else {
	    llvm_unreachable("Unknown instruction!");
	  }

	  // Set the condition code from the carry flag.
	  CarryIn = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, CarryIn,
	                        DAG.getConstant(ValidMask, DL, MVT::i32),
	                        DAG.getConstant(CarryMask, DL, MVT::i32));

	  SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);
	  SDValue Arith = DAG.getNode(TargetOpc, DL, ResultTys, N->getOperand(0),
	                              N->getOperand(1), CarryIn);

	  // Turn the CC result into the carry-out flag, narrowing to i1 if the
	  // original node's second result has that type.
	  SDValue CarryOut =
	      emitSETCC(DAG, DL, Arith.getValue(1), ValidMask, CarryMask);
	  if (N->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, CarryOut);
	}

	// Lower CTPOP. Vector types are handled with a v16i8 POPCNT followed by a
	// per-element-width combination step; scalar types use a 64-bit POPCNT and
	// then add the per-byte counts together with shifts and adds, skipping
	// high parts of the operand that are known to be zero.
	SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
	SelectionDAG &DAG) const {
	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	// From here on Op refers to the value being counted, not the CTPOP node.
	Op = Op.getOperand(0);

	// Handle vector types via VPOPCT.
	if (VT.isVector()) {
	Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
	Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
	switch (VT.getScalarSizeInBits()) {
	case 8:
	// The v16i8 POPCNT result is used as-is.
	break;
	case 16: {
	// Add each element to a copy of itself shifted left by 8, then shift
	// the sum right by 8.
	Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
	SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
	SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
	Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
	Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
	break;
	}
	case 32: {
	// One VSUM against an all-zero v16i8 vector produces the result type.
	SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
	DAG.getConstant(0, DL, MVT::i32));
	Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
	break;
	}
	case 64: {
	// Two VSUM steps: first to v4i32, then to the result type.
	SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
	DAG.getConstant(0, DL, MVT::i32));
	Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
	Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
	break;
	}
	default:
	llvm_unreachable("Unexpected type");
	}
	return Op;
	}

	// Get the known-zero mask for the operand.
	KnownBits Known = DAG.computeKnownBits(Op);
	unsigned NumSignificantBits = (~Known.Zero).getActiveBits();
	// Every bit is known zero, so the population count is the constant 0.
	if (NumSignificantBits == 0)
	return DAG.getConstant(0, DL, VT);

	// Skip known-zero high parts of the operand.
	// BitSize is NumSignificantBits rounded up to a power of two, capped at
	// the width of VT.
	int64_t OrigBitSize = VT.getSizeInBits();
	int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
	BitSize = std::min(BitSize, OrigBitSize);

	// The POPCNT instruction counts the number of bits in each byte.
	Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
	Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
	Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);

	// Add up per-byte counts in a binary tree. All bits of Op at
	// position larger than BitSize remain zero throughout.
	for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
	SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
	// When only part of the value is in use, mask the shifted copy back to
	// the low BitSize bits.
	if (BitSize != OrigBitSize)
	Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
	DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
	Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
	}

	// Extract overall result from high byte.
	if (BitSize > 8)
	Op = DAG.getNode(ISD::SRL, DL, VT, Op,
	DAG.getConstant(BitSize - 8, DL, VT));

	return Op;
	}

	// Lower ATOMIC_FENCE. Operand 1 is the atomic ordering and operand 2 the
	// synchronization scope. A Serialize machine node is emitted only for a
	// sequentially-consistent, system-scope fence; every other fence becomes a
	// MEMBARRIER node.
	SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  const auto Ordering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  const auto Scope = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // The only fence that needs an instruction is a sequentially-consistent
	  // cross-thread fence.
	  const bool NeedsSerialize =
	      Ordering == AtomicOrdering::SequentiallyConsistent &&
	      Scope == SyncScope::System;
	  if (!NeedsSerialize) {
	    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	    return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other,
	                       Op.getOperand(0));
	  }

	  SDNode *Serialize = DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
	                                         Op.getOperand(0));
	  return SDValue(Serialize, 0);
	}

	// Op is an atomic load. Lower it into an ordinary extending load that
	// reuses the atomic node's chain, address, memory type and memory operand.
	SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  auto *Load = cast<AtomicSDNode>(Op.getNode());
	  SDLoc DL(Op);
	  return DAG.getExtLoad(ISD::EXTLOAD, DL, Op.getValueType(), Load->getChain(),
	                        Load->getBasePtr(), Load->getMemoryVT(),
	                        Load->getMemOperand());
	}

	// Op is an atomic store. Lower it into an ordinary truncating store that
	// reuses the atomic node's memory operand, followed by a Serialize node
	// when the ordering is sequentially consistent.
	SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  auto *Store = cast<AtomicSDNode>(Op.getNode());
	  SDLoc DL(Op);
	  SDValue StoreChain = DAG.getTruncStore(
	      Store->getChain(), DL, Store->getVal(), Store->getBasePtr(),
	      Store->getMemoryVT(), Store->getMemOperand());

	  if (Store->getOrdering() != AtomicOrdering::SequentiallyConsistent)
	    return StoreChain;

	  // We have to enforce sequential consistency by performing a
	  // serialization operation after the store.
	  SDNode *Serialize =
	      DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, StoreChain);
	  return SDValue(Serialize, 0);
	}

	// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first
	// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
	//
	// 32-bit operations are returned unchanged. For narrower operations the
	// containing aligned word is operated on: the node built here receives the
	// aligned address, the pre-shifted source operand, the rotate amounts that
	// move the field to and from the top of a GR32, and the field width. The
	// result is rotated so the field ends up in the low bits. Returns the
	// merged (value, chain) pair.
	SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
	                                                   SelectionDAG &DAG,
	                                                   unsigned Opcode) const {
	  auto *Node = cast<AtomicSDNode>(Op.getNode());

	  // 32-bit operations need no code outside the main loop.
	  EVT NarrowVT = Node->getMemoryVT();
	  EVT WideVT = MVT::i32;
	  if (NarrowVT == WideVT)
	    return Op;

	  int64_t BitSize = NarrowVT.getSizeInBits();
	  SDValue ChainIn = Node->getChain();
	  SDValue Addr = Node->getBasePtr();
	  SDValue Src2 = Node->getVal();
	  MachineMemOperand *MMO = Node->getMemOperand();
	  SDLoc DL(Node);
	  EVT PtrVT = Addr.getValueType();

	  // Convert atomic subtracts of constants into additions.
	  if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
	    if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
	      Opcode = SystemZISD::ATOMIC_LOADW_ADD;
	      Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
	    }

	  // Get the address of the containing word.
	  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
	                                    DAG.getConstant(-4, DL, PtrVT));

	  // Get the number of bits that the word must be rotated left in order
	  // to bring the field to the top bits of a GR32.
	  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
	                                 DAG.getConstant(3, DL, PtrVT));
	  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);

	  // Get the complementing shift amount, for rotating a field in the top
	  // bits back to its proper position.
	  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
	                                    DAG.getConstant(0, DL, WideVT), BitShift);

	  // Extend the source operand to 32 bits and prepare it for the inner loop.
	  // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
	  // operations require the source to be shifted in advance. (This shift
	  // can be folded if the source is constant.) For AND and NAND, the lower
	  // bits must be set, while for other opcodes they should be left clear.
	  if (Opcode != SystemZISD::ATOMIC_SWAPW)
	    Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
	                       DAG.getConstant(32 - BitSize, DL, WideVT));
	  if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
	      Opcode == SystemZISD::ATOMIC_LOADW_NAND)
	    Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
	                       DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));

	  // Construct the ATOMIC_LOADW_* node.
	  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
	  SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
	                    DAG.getConstant(BitSize, DL, WideVT) };
	  SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
	                                             NarrowVT, MMO);

	  // Rotate the result of the final CS so that the field is in the lower
	  // bits of a GR32, then truncate it.
	  SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
	                                    DAG.getConstant(BitSize, DL, WideVT));
	  SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);

	  SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
	  return DAG.getMergeValues(RetOps, DL);
	}

	// Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations
	// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
	// operations into additions.
	//
	// For full-width operations the node is rewritten as an ATOMIC_LOAD_ADD of
	// the negated operand when that is possible; otherwise Op is returned
	// unchanged.
	SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  auto *Node = cast<AtomicSDNode>(Op.getNode());
	  EVT MemVT = Node->getMemoryVT();
	  if (MemVT == MVT::i32 || MemVT == MVT::i64) {
	    // A full-width operation.
	    assert(Op.getValueType() == MemVT && "Mismatched VTs");
	    SDValue Src2 = Node->getVal();
	    SDValue NegSrc2;
	    SDLoc DL(Src2);

	    if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
	      // Use an addition if the operand is constant and either LAA(G) is
	      // available or the negative value is in the range of A(G)FHI.
	      int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
	      if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
	        NegSrc2 = DAG.getConstant(Value, DL, MemVT);
	    } else if (Subtarget.hasInterlockedAccess1())
	      // Use LAA(G) if available.
	      NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
	                            Src2);

	    // NegSrc2 is only set when the addition form was chosen above.
	    if (NegSrc2.getNode())
	      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
	                           Node->getChain(), Node->getBasePtr(), NegSrc2,
	                           Node->getMemOperand());

	    // Use the node as-is.
	    return Op;
	  }

	  return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
	}

	// Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
	//
	// In both paths the three results of Op (value, success flag, chain) are
	// replaced in place via ReplaceAllUsesOfValueWith and an empty SDValue is
	// returned, rather than returning a replacement node.
	SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
	SelectionDAG &DAG) const {
	auto *Node = cast<AtomicSDNode>(Op.getNode());
	// Operands: 0 = chain, 1 = address, 2 = compare value, 3 = swap value.
	SDValue ChainIn = Node->getOperand(0);
	SDValue Addr = Node->getOperand(1);
	SDValue CmpVal = Node->getOperand(2);
	SDValue SwapVal = Node->getOperand(3);
	MachineMemOperand *MMO = Node->getMemOperand();
	SDLoc DL(Node);

	// We have native support for 32-bit and 64-bit compare and swap, but we
	// still need to expand extracting the "success" result from the CC.
	EVT NarrowVT = Node->getMemoryVT();
	// i64 stays i64; every other memory type is widened to i32.
	EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
	if (NarrowVT == WideVT) {
	// Result types: loaded value, i32 CC, chain.
	SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
	SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
	SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
	DL, Tys, Ops, NarrowVT, MMO);
	// Success is derived from the CC result using the CS "equal" mask.
	SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
	SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);

	DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
	return SDValue();
	}

	// Convert 8-bit and 16-bit compare and swap to a loop, implemented
	// via a fullword ATOMIC_CMP_SWAPW operation.
	int64_t BitSize = NarrowVT.getSizeInBits();
	EVT PtrVT = Addr.getValueType();

	// Get the address of the containing word.
	SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
	DAG.getConstant(-4, DL, PtrVT));

	// Get the number of bits that the word must be rotated left in order
	// to bring the field to the top bits of a GR32.
	SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
	DAG.getConstant(3, DL, PtrVT));
	BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);

	// Get the complementing shift amount, for rotating a field in the top
	// bits back to its proper position.
	SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
	DAG.getConstant(0, DL, WideVT), BitShift);

	// Construct the ATOMIC_CMP_SWAPW node.
	SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
	SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
	NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
	SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
	VTList, Ops, NarrowVT, MMO);
	// Unlike the native path, success here is tested with the integer-compare
	// CC masks.
	SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
	SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);

	DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
	return SDValue();
	}

	// Return MOVolatile for atomic loads, stores, read-modify-write and
	// compare-exchange instructions, and MONone for everything else.
	MachineMemOperand::Flags
	SystemZTargetLowering::getMMOFlags(const Instruction &I) const {
	  // Because of how we convert atomic_load and atomic_store to normal loads and
	  // stores in the DAG, we need to ensure that the MMOs are marked volatile
	  // since DAGCombine hasn't been updated to account for atomic, but non
	  // volatile loads. (See D57601)
	  bool IsAtomicAccess = false;
	  if (auto *Store = dyn_cast<StoreInst>(&I))
	    IsAtomicAccess = Store->isAtomic();
	  else if (auto *Load = dyn_cast<LoadInst>(&I))
	    IsAtomicAccess = Load->isAtomic();
	  else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
	    IsAtomicAccess = RMW->isAtomic();
	  else if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I))
	    IsAtomicAccess = CmpXchg->isAtomic();

	  return IsAtomicAccess ? MachineMemOperand::MOVolatile
	                        : MachineMemOperand::MONone;
	}

	// Lower STACKSAVE: record that the function manipulates the stack pointer
	// and read the current value of R15D.
	SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SystemZMachineFunctionInfo *Info =
	      DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
	  Info->setManipulatesSP(true);

	  SDValue Chain = Op.getOperand(0);
	  return DAG.getCopyFromReg(Chain, SDLoc(Op), SystemZ::R15D,
	                            Op.getValueType());
	}

	// Lower STACKRESTORE: record that the function manipulates the stack
	// pointer and write the new value to R15D. With the "backchain" function
	// attribute, the word at the old stack pointer is loaded first and stored
	// at the new stack pointer afterwards.
	SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFunction &Func = DAG.getMachineFunction();
	  Func.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);

	  SDValue Chain = Op.getOperand(0);
	  SDValue RestoredSP = Op.getOperand(1);
	  SDLoc DL(Op);

	  // Without a backchain, restoring is just the register write.
	  if (!Func.getFunction().hasFnAttribute("backchain"))
	    return DAG.getCopyToReg(Chain, DL, SystemZ::R15D, RestoredSP);

	  SDValue CurrentSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
	  SDValue SavedLink =
	      DAG.getLoad(MVT::i64, DL, Chain, CurrentSP, MachinePointerInfo());

	  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, RestoredSP);
	  return DAG.getStore(Chain, DL, SavedLink, RestoredSP, MachinePointerInfo());
	}

	// Lower PREFETCH. Operand 4 says whether this is a data prefetch; if it is
	// not, only the chain is kept. Otherwise a SystemZISD::PREFETCH node is
	// built with a PFD_WRITE or PFD_READ code chosen from operand 2.
	SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Just preserve the chain for non-data prefetches.
	  if (!cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue())
	    return Op.getOperand(0);

	  SDLoc DL(Op);
	  const bool ForWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  unsigned Code = SystemZ::PFD_READ;
	  if (ForWrite)
	    Code = SystemZ::PFD_WRITE;

	  auto *MemNode = cast<MemIntrinsicSDNode>(Op.getNode());
	  // New operand list: chain, prefetch code, address.
	  SDValue Ops[] = {Op.getOperand(0), DAG.getConstant(Code, DL, MVT::i32),
	                   Op.getOperand(1)};
	  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
	                                 MemNode->getVTList(), Ops,
	                                 MemNode->getMemoryVT(),
	                                 MemNode->getMemOperand());
	}

	// Turn the condition code held in CCReg into an i32 value: an IPM node
	// materializes it and a logical right shift by SystemZ::IPM_CC moves it
	// down to the low bits.
	static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
	  SDLoc DL(CCReg);
	  SDValue Inserted = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
	  SDValue ShiftAmount = DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32);
	  return DAG.getNode(ISD::SRL, DL, MVT::i32, Inserted, ShiftAmount);
	}

	// Lower a chained intrinsic. Only intrinsics that produce a CC result are
	// handled: the replacement node is emitted and all uses of the original CC
	// result (value 0) are redirected to the converted CC value. An empty
	// SDValue is returned in every case.
	SDValue
	SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  unsigned Opcode, CCValid;
	  if (!isIntrinsicWithCCAndChain(Op, Opcode, CCValid))
	    return SDValue();

	  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
	  SDNode *Lowered = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
	  SDValue CCValue = getCCResult(DAG, SDValue(Lowered, 0));
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CCValue);
	  return SDValue();
	}

	// Lower a chainless intrinsic. Intrinsics with a CC result are re-emitted
	// with the CC converted to an i32; a handful of vector intrinsics are mapped
	// directly onto SystemZISD nodes. An empty SDValue is returned for anything
	// else.
	SDValue
	SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  unsigned Opcode, CCValid;
	  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
	    SDNode *Lowered = emitIntrinsicWithCC(DAG, Op, Opcode);
	    // A single result means the intrinsic yields only the CC.
	    if (Op->getNumValues() == 1)
	      return getCCResult(DAG, SDValue(Lowered, 0));
	    assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
	    // Otherwise value 0 is the data result and value 1 the CC.
	    SDValue Data(Lowered, 0);
	    SDValue CC = getCCResult(DAG, SDValue(Lowered, 1));
	    return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Data,
	                       CC);
	  }

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  // The unpack intrinsics all take one operand and differ only in the node
	  // opcode, so the switch just selects the opcode for them.
	  unsigned UnpackOpcode;
	  switch (Id) {
	  case Intrinsic::thread_pointer:
	    return lowerThreadPointer(DL, DAG);

	  case Intrinsic::s390_vpdi:
	    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, VT, Op.getOperand(1),
	                       Op.getOperand(2), Op.getOperand(3));

	  case Intrinsic::s390_vperm:
	    return DAG.getNode(SystemZISD::PERMUTE, DL, VT, Op.getOperand(1),
	                       Op.getOperand(2), Op.getOperand(3));

	  case Intrinsic::s390_vsumb:
	  case Intrinsic::s390_vsumh:
	  case Intrinsic::s390_vsumgh:
	  case Intrinsic::s390_vsumgf:
	  case Intrinsic::s390_vsumqf:
	  case Intrinsic::s390_vsumqg:
	    return DAG.getNode(SystemZISD::VSUM, DL, VT, Op.getOperand(1),
	                       Op.getOperand(2));

	  case Intrinsic::s390_vuphb:
	  case Intrinsic::s390_vuphh:
	  case Intrinsic::s390_vuphf:
	    UnpackOpcode = SystemZISD::UNPACK_HIGH;
	    break;

	  case Intrinsic::s390_vuplhb:
	  case Intrinsic::s390_vuplhh:
	  case Intrinsic::s390_vuplhf:
	    UnpackOpcode = SystemZISD::UNPACKL_HIGH;
	    break;

	  case Intrinsic::s390_vuplb:
	  case Intrinsic::s390_vuplhw:
	  case Intrinsic::s390_vuplf:
	    UnpackOpcode = SystemZISD::UNPACK_LOW;
	    break;

	  case Intrinsic::s390_vupllb:
	  case Intrinsic::s390_vupllh:
	  case Intrinsic::s390_vupllf:
	    UnpackOpcode = SystemZISD::UNPACKL_LOW;
	    break;

	  default:
	    // Not an intrinsic that is lowered here.
	    return SDValue();
	  }

	  return DAG.getNode(UnpackOpcode, DL, VT, Op.getOperand(1));
	}

	namespace {
	// Says that SystemZISD operation Opcode can be used to perform the equivalent
	// of a VPERM with permute vector Bytes. If Opcode takes three operands,
	// Operand is the constant third operand, otherwise it is the number of
	// bytes in each element of the result.
	//
	// Instances are aggregate-initialized in PermuteForms, so the member order
	// below (Opcode, Operand, Bytes) must not change.
	struct Permute {
	// The SystemZISD opcode that implements the permute.
	unsigned Opcode;
	// Third operand or result element size in bytes, depending on Opcode.
	unsigned Operand;
	// One byte selector per result byte. The matchers treat selector / VectorBytes
	// as the operand number and the low bits as the byte within that operand.
	unsigned char Bytes[SystemZ::VectorBytes];
	};
	}

	// Table of two-operand permutes that can replace a general VPERM. Each entry
	// gives the SystemZISD opcode, its Operand value (see Permute) and the
	// equivalent byte selectors. matchPermute and matchDoublePermute scan the
	// table front to back and return the first entry that fits, so the order of
	// the entries determines which form is preferred.
	static const Permute PermuteForms[] = {
	// VMRHG
	{ SystemZISD::MERGE_HIGH, 8,
	{ 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
	// VMRHF
	{ SystemZISD::MERGE_HIGH, 4,
	{ 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
	// VMRHH
	{ SystemZISD::MERGE_HIGH, 2,
	{ 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
	// VMRHB
	{ SystemZISD::MERGE_HIGH, 1,
	{ 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
	// VMRLG
	{ SystemZISD::MERGE_LOW, 8,
	{ 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
	// VMRLF
	{ SystemZISD::MERGE_LOW, 4,
	{ 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
	// VMRLH
	{ SystemZISD::MERGE_LOW, 2,
	{ 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
	// VMRLB
	{ SystemZISD::MERGE_LOW, 1,
	{ 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
	// VPKG
	{ SystemZISD::PACK, 4,
	{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
	// VPKF
	{ SystemZISD::PACK, 2,
	{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
	// VPKH
	{ SystemZISD::PACK, 1,
	{ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
	// VPDI V1, V2, 4 (low half of V1, high half of V2)
	{ SystemZISD::PERMUTE_DWORDS, 4,
	{ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
	// VPDI V1, V2, 1 (high half of V1, low half of V2)
	{ SystemZISD::PERMUTE_DWORDS, 1,
	{ 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
	};

	// Finalize the operand assignment after a shuffle has been matched against
	// a two-operand pattern. OpNos[N] is the shuffle operand that must feed
	// pattern operand N, or -1 when pattern operand N is unconstrained.
	//
	// Returns false, leaving OpNo0 and OpNo1 untouched, when neither pattern
	// operand is constrained. Otherwise an unconstrained pattern operand reuses
	// the shuffle operand chosen for the other one, OpNo0/OpNo1 receive the
	// final choices and true is returned.
	static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
	  const bool Free0 = OpNos[0] < 0;
	  const bool Free1 = OpNos[1] < 0;
	  if (Free0 && Free1)
	    return false;

	  OpNo0 = Free0 ? OpNos[1] : OpNos[0];
	  OpNo1 = Free1 ? OpNos[0] : OpNos[1];
	  return true;
	}

	// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
	// Return true if P can implement it. On success OpNo0 is the VPERM operand
	// to use as operand 0 of P and OpNo1 the one to use as operand 1 of P.
	//
	// The operands may end up swapped (OpNo0 == 1, OpNo1 == 0) when that is what
	// makes P fit. If Bytes refers to a single operand but P fits once that
	// operand is duplicated, OpNo0 and OpNo1 are equal.
	static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
	                         unsigned &OpNo0, unsigned &OpNo1) {
	  int OpNos[] = { -1, -1 };
	  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
	    int Selector = Bytes[I];
	    // Undefined bytes place no constraint on the match.
	    if (Selector < 0)
	      continue;

	    // The byte number within the operand (the low bits) must agree with
	    // the pattern; only the operand number (the high bits) may differ.
	    if ((Selector ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
	      return false;

	    int PatternOpNo = P.Bytes[I] / SystemZ::VectorBytes;
	    int ShuffleOpNo = unsigned(Selector) / SystemZ::VectorBytes;
	    // Reject the match if this pattern operand was already bound to the
	    // other shuffle operand by an earlier byte.
	    if (OpNos[PatternOpNo] == 1 - ShuffleOpNo)
	      return false;
	    OpNos[PatternOpNo] = ShuffleOpNo;
	  }
	  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
	}

	// Search PermuteForms, in table order, for the first form that can implement
	// the VPERM-like permute vector Bytes. Returns that form with OpNo0/OpNo1
	// set as for the single-form matchPermute, or null if no form fits.
	static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
	                                   unsigned &OpNo0, unsigned &OpNo1) {
	  for (const Permute &Form : PermuteForms) {
	    if (matchPermute(Bytes, Form, OpNo0, OpNo1))
	      return &Form;
	  }
	  return nullptr;
	}

	// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte;
	// the permute is an operand of an outer permute. See whether redistributing
	// the undefined bytes yields a shuffle that P can implement. If so, fill
	// Transform with a VPERM-like permute vector that, applied to the result of
	// P, reproduces the original permute in Bytes.
	static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
	                               const Permute &P,
	                               SmallVectorImpl<int> &Transform) {
	  // Position in P.Bytes; it only ever moves forward, so the defined bytes
	  // of Bytes must appear in P in the same relative order.
	  unsigned To = 0;
	  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
	    int Wanted = Bytes[From];
	    if (Wanted < 0) {
	      // Byte number From of the result is undefined.
	      Transform[From] = -1;
	      continue;
	    }
	    // Advance to the next byte of P that supplies the wanted selector.
	    for (; P.Bytes[To] != Wanted;) {
	      if (++To == SystemZ::VectorBytes)
	        return false;
	    }
	    Transform[From] = To;
	  }
	  return true;
	}

	// Search PermuteForms, in table order, for the first form accepted by the
	// single-form matchDoublePermute. Returns that form with Transform filled
	// in, or null if no form fits.
	static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
	                                         SmallVectorImpl<int> &Transform) {
	  for (const Permute &Form : PermuteForms) {
	    if (matchDoublePermute(Bytes, Form, Transform))
	      return &Form;
	  }
	  return nullptr;
	}

	// Expand the mask of ShuffleOp into a byte-level mask, as if the shuffle had
	// type vNi8, and store it in Bytes (-1 for undefined bytes). Handles
	// VECTOR_SHUFFLE nodes and SystemZISD::SPLAT nodes with a constant index;
	// returns false, leaving Bytes alone, for anything else.
	static bool getVPermMask(SDValue ShuffleOp,
	                         SmallVectorImpl<int> &Bytes) {
	  EVT VT = ShuffleOp.getValueType();
	  unsigned NumElements = VT.getVectorNumElements();
	  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
	  unsigned NumBytes = NumElements * BytesPerElement;

	  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
	    Bytes.resize(NumBytes, -1);
	    for (unsigned I = 0; I < NumElements; ++I) {
	      int Index = VSN->getMaskElt(I);
	      // Undefined elements keep the -1 fill.
	      if (Index < 0)
	        continue;
	      for (unsigned J = 0; J < BytesPerElement; ++J)
	        Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
	    }
	    return true;
	  }

	  bool IsConstantSplat = SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
	                         isa<ConstantSDNode>(ShuffleOp.getOperand(1));
	  if (!IsConstantSplat)
	    return false;

	  // Every result element is a copy of source element Index.
	  unsigned Index = ShuffleOp.getConstantOperandVal(1);
	  Bytes.resize(NumBytes, -1);
	  for (unsigned I = 0; I < NumElements; ++I)
	    for (unsigned J = 0; J < BytesPerElement; ++J)
	      Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
	  return true;
	}

	// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
	// Check whether result bytes [Start, Start + BytesPerElement) come from one
	// contiguous run of bytes of a single input. On success Base is the selector
	// of the first byte of that run; it stays -1 if every byte in the range is
	// undefined.
	static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
	                            unsigned BytesPerElement, int &Base) {
	  Base = -1;
	  for (unsigned Offset = 0; Offset < BytesPerElement; ++Offset) {
	    int Selector = Bytes[Start + Offset];
	    if (Selector < 0)
	      continue;

	    // Selector that byte 0 of the element would need for this byte to be
	    // part of a contiguous run.
	    unsigned Candidate = unsigned(Selector) - Offset;
	    if (Base >= 0) {
	      if (unsigned(Base) != Candidate)
	        return false;
	      continue;
	    }

	    Base = Candidate;
	    // Make sure the bytes would come from one input operand.
	    if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
	      return false;
	  }
	  return true;
	}

	// Bytes is a VPERM-like permute vector, except that -1 is used for
	// undefined bytes. Return true if it can be performed using VSLDI.
	// When returning true, set StartIndex to the shift amount and OpNo0
	// and OpNo1 to the VPERM operands that should be used as the first
	// and second shift operand respectively.
	//
	// StartIndex is written even when false is returned, so callers must only
	// rely on it after a successful match.
	static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
	                               unsigned &StartIndex, unsigned &OpNo0,
	                               unsigned &OpNo1) {
	  int OpNos[] = { -1, -1 };
	  int Shift = -1;
	  // Iterate over the whole vector using the shared constant rather than a
	  // literal 16, in line with the other permute matchers in this file.
	  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
	    int Index = Bytes[I];
	    if (Index < 0)
	      continue;

	    // Shift amount that would put selector Index at result byte I.
	    int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
	    int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
	    int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
	    // All defined bytes must agree on a single shift amount.
	    if (Shift < 0)
	      Shift = ExpectedShift;
	    else if (Shift != ExpectedShift)
	      return false;
	    // Make sure that the operand mappings are consistent with previous
	    // elements.
	    if (OpNos[ModelOpNo] == 1 - RealOpNo)
	      return false;
	    OpNos[ModelOpNo] = RealOpNo;
	  }
	  StartIndex = Shift;
	  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
	}

	// Build the node that applies P to Op0 and Op1, bitcasting both operands to
	// the vector type P works on. The result type is determined by P.
	static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
	                              const Permute &P, SDValue Op0, SDValue Op1) {
	  const bool IsVPDI = P.Opcode == SystemZISD::PERMUTE_DWORDS;
	  const bool IsPack = P.Opcode == SystemZISD::PACK;

	  // VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input elements of
	  // a PACK are twice as wide as the outputs. For everything else Operand
	  // is the element size itself.
	  unsigned InBytes = P.Operand;
	  if (IsVPDI)
	    InBytes = 8;
	  else if (IsPack)
	    InBytes = P.Operand * 2;

	  // Cast both operands to the appropriate type.
	  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
	                              SystemZ::VectorBytes / InBytes);
	  Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
	  Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);

	  if (IsVPDI) {
	    // For VPDI, Operand is the constant third operand.
	    SDValue Op2 = DAG.getConstant(P.Operand, DL, MVT::i32);
	    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
	  }

	  if (IsPack) {
	    // The packed result has elements of Operand bytes each.
	    MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
	                                 SystemZ::VectorBytes / P.Operand);
	    return DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
	  }

	  return DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
	}

	// Bytes is a VPERM-like permute vector in which -1 marks an undefined byte.
	// Implement it on Ops[0] and Ops[1] using VSLDI when possible and VPERM
	// otherwise. Ops[0] and Ops[1] are overwritten with their v16i8 bitcasts.
	static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
	                                     SDValue *Ops,
	                                     const SmallVectorImpl<int> &Bytes) {
	  Ops[0] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[0]);
	  Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[1]);

	  // First see whether VSLDI can be used.
	  unsigned StartIndex, OpNo0, OpNo1;
	  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
	    return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
	                       Ops[OpNo1], DAG.getConstant(StartIndex, DL, MVT::i32));

	  // Fall back on VPERM. Construct an SDNode for the permute vector, using
	  // undef for the bytes that do not matter.
	  SDValue IndexNodes[SystemZ::VectorBytes];
	  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
	    int Selector = Bytes[I];
	    IndexNodes[I] = Selector >= 0 ? DAG.getConstant(Selector, DL, MVT::i32)
	                                  : DAG.getUNDEF(MVT::i32);
	  }
	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
	  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1],
	                     Mask);
	}

	namespace {
	// Describes a general N-operand vector shuffle.
	//
	// The shuffle is assembled one result element at a time with add() and
	// addUndef(); getNode() then emits the DAG nodes for the finished shuffle.
	struct GeneralShuffle {
	// Start an empty shuffle whose result has type vt.
	GeneralShuffle(EVT vt) : VT(vt) {}
	// Append one undefined result element.
	void addUndef();
	// Append one result element taken from the given element of the given
	// operand. Returns false if the element cannot be represented.
	bool add(SDValue, unsigned);
	// Emit nodes for the completed shuffle and return the result.
	SDValue getNode(SelectionDAG &, const SDLoc &);

	// The operands of the shuffle.
	SmallVector<SDValue, SystemZ::VectorBytes> Ops;

	// Index I is -1 if byte I of the result is undefined. Otherwise the
	// result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
	// Bytes[I] / SystemZ::VectorBytes.
	SmallVector<int, SystemZ::VectorBytes> Bytes;

	// The type of the shuffle result.
	EVT VT;
	};
	}

	// Append one undefined element to the shuffle, i.e. one -1 entry in Bytes
	// for every byte of a result element.
	void GeneralShuffle::addUndef() {
	  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
	  Bytes.resize(Bytes.size() + BytesPerElement, -1);
	}

	// Append one element to the shuffle, taken from element Elem of Op.
	// A null Op stands for a vector input whose value will be calculated later;
	// there is at most one such input per shuffle and it always has the same
	// type as the result. Gives up and returns false if the source vector
	// elements of an EXTRACT_VECTOR_ELT are smaller than the destination
	// elements. Per LLVM they become implicitly extended, but this is rare and
	// not optimized.
	bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
	  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();

	  // The source vector can have wider elements than the result,
	  // either through an explicit TRUNCATE or because of type legalization.
	  // We want the least significant part.
	  EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
	  unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();

	  // Source elements narrower than the destination elements are not handled.
	  if (FromBytesPerElement < BytesPerElement)
	    return false;

	  // Offset of the wanted bytes within the source vector: the start of the
	  // element plus the bytes skipped to reach its least significant part.
	  unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
	                   (FromBytesPerElement - BytesPerElement));

	  // Look through things like shuffles and bitcasts.
	  while (Op.getNode()) {
	    if (Op.getOpcode() == ISD::BITCAST) {
	      Op = Op.getOperand(0);
	      continue;
	    }

	    if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
	      // See whether the bytes we need come from a contiguous part of one
	      // operand.
	      SmallVector<int, SystemZ::VectorBytes> OpBytes;
	      int NewByte;
	      if (!getVPermMask(Op, OpBytes) ||
	          !getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
	        break;
	      if (NewByte < 0) {
	        // The shuffle leaves all of the wanted bytes undefined.
	        addUndef();
	        return true;
	      }
	      Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
	      Byte = unsigned(NewByte) % SystemZ::VectorBytes;
	      continue;
	    }

	    if (Op.isUndef()) {
	      addUndef();
	      return true;
	    }

	    break;
	  }

	  // Make sure that the source of the extraction is in Ops.
	  unsigned OpNo = 0;
	  while (OpNo != Ops.size() && Ops[OpNo] != Op)
	    ++OpNo;
	  if (OpNo == Ops.size())
	    Ops.push_back(Op);

	  // Add the element to Bytes.
	  unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
	  for (unsigned I = 0; I < BytesPerElement; ++I)
	    Bytes.push_back(Base + I);

	  return true;
	}

	// Return SDNodes for the completed shuffle.
	//
	// Bytes must describe a full vector (SystemZ::VectorBytes entries). Both Ops
	// and Bytes are rewritten in place while the operands are combined, so the
	// object should not be reused afterwards. The returned value is bitcast to
	// the shuffle's result type VT.
	SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
	assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");

	// No operands at all means every element was undefined.
	if (Ops.size() == 0)
	return DAG.getUNDEF(VT);

	// Make sure that there are at least two shuffle operands.
	if (Ops.size() == 1)
	Ops.push_back(DAG.getUNDEF(MVT::v16i8));

	// Create a tree of shuffles, deferring root node until after the loop.
	// Try to redistribute the undefined elements of non-root nodes so that
	// the non-root shuffles match something like a pack or merge, then adjust
	// the parent node's permute vector to compensate for the new order.
	// Among other things, this copes with vectors like <2 x i16> that were
	// padded with undefined elements during type legalization.
	//
	// In the best case this redistribution will lead to the whole tree
	// using packs and merges. It should rarely be a loss in other cases.
	unsigned Stride = 1;
	for (; Stride * 2 < Ops.size(); Stride *= 2) {
	// Combine operands I and I + Stride; the combined value replaces Ops[I].
	for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
	SDValue SubOps[] = { Ops[I], Ops[I + Stride] };

	// Create a mask for just these two operands.
	SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
	for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
	// An undefined byte (-1) converts to an operand number that matches
	// neither I nor I + Stride, so it falls through to the -1 case below.
	unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
	unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
	if (OpNo == I)
	NewBytes[J] = Byte;
	else if (OpNo == I + Stride)
	NewBytes[J] = SystemZ::VectorBytes + Byte;
	else
	NewBytes[J] = -1;
	}
	// See if it would be better to reorganize NewBytes to avoid using VPERM.
	SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
	if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
	Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
	// Applying NewBytesMap to Ops[I] gets back to NewBytes.
	for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
	if (NewBytes[J] >= 0) {
	assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
	"Invalid double permute");
	Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
	} else
	assert(NewBytesMap[J] < 0 && "Invalid double permute");
	}
	} else {
	// Just use NewBytes on the operands.
	Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
	// The bytes taken from this pair now sit at their final positions
	// within the combined operand I.
	for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
	if (NewBytes[J] >= 0)
	Bytes[J] = I * SystemZ::VectorBytes + J;
	}
	}
	}

	// Now we just have 2 inputs. Put the second operand in Ops[1].
	if (Stride > 1) {
	Ops[1] = Ops[Stride];
	// Renumber selectors that referred to operand Stride so that they refer
	// to operand 1 instead.
	for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
	if (Bytes[I] >= int(SystemZ::VectorBytes))
	Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
	}

	// Look for an instruction that can do the permute without resorting
	// to VPERM.
	unsigned OpNo0, OpNo1;
	SDValue Op;
	if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
	Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
	else
	Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
	return DAG.getNode(ISD::BITCAST, DL, VT, Op);
	}

	// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion,
	// i.e. every operand after the first one is undefined.
	static bool isScalarToVector(SDValue Op) {
	  const unsigned NumOps = Op.getNumOperands();
	  unsigned I = 1;
	  while (I != NumOps && Op.getOperand(I).isUndef())
	    ++I;
	  return I == NumOps;
	}

	// Return a vector of type VT that contains Value in the first element.
	// The other elements don't matter.
	//
	// Constants are replicated into every element, undef yields an undef vector
	// and anything else becomes a SCALAR_TO_VECTOR node.
	static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
	                                   SDValue Value) {
	  // If we have a constant, replicate it to all elements and let the
	  // BUILD_VECTOR lowering take care of it.
	  if (Value.getOpcode() == ISD::Constant ||
	      Value.getOpcode() == ISD::ConstantFP) {
	    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
	    return DAG.getBuildVector(VT, DL, Ops);
	  }
	  if (Value.isUndef())
	    return DAG.getUNDEF(VT);
	  return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
	}

	// Return a vector of type VT in which Op0 is in element 0 and Op1 is in
	// element 1. Used for cases in which replication is cheap: when one input
	// is undefined the other is simply replicated.
	static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
	                                 SDValue Op0, SDValue Op1) {
	  const bool Undef0 = Op0.isUndef();
	  const bool Undef1 = Op1.isUndef();

	  if (Undef0 && Undef1)
	    return DAG.getUNDEF(VT);
	  if (Undef0)
	    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
	  if (Undef1)
	    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);

	  // Both inputs are defined: put each in element 0 of its own vector and
	  // merge the two vectors.
	  SDValue Vec0 = buildScalarToVector(DAG, DL, VT, Op0);
	  SDValue Vec1 = buildScalarToVector(DAG, DL, VT, Op1);
	  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT, Vec0, Vec1);
	}

	// Any-extend GPR scalars Op0 and Op1 to i64 and return a v2i64 vector
	// holding them. Two undefined inputs give an undefined vector.
	static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
	                          SDValue Op1) {
	  const bool Undef0 = Op0.isUndef();
	  const bool Undef1 = Op1.isUndef();
	  if (Undef0 && Undef1)
	    return DAG.getUNDEF(MVT::v2i64);

	  // If one of the two inputs is undefined then replicate the other one,
	  // in order to avoid using another register unnecessarily.
	  if (Undef0 || Undef1) {
	    SDValue Wide =
	        DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Undef0 ? Op1 : Op0);
	    return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Wide, Wide);
	  }

	  SDValue Wide0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
	  SDValue Wide1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
	  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Wide0, Wide1);
	}

	// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
	// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
	// the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR
	// would benefit from this representation and return it if so; otherwise
	// return an empty SDValue.
	static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
	                                     BuildVectorSDNode *BVN) {
	  EVT VT = BVN->getValueType(0);
	  unsigned NumElements = VT.getVectorNumElements();

	  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
	  // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still
	  // need a BUILD_VECTOR, add an additional placeholder operand for that
	  // BUILD_VECTOR and store its operands in ResidueOps.
	  GeneralShuffle GS(VT);
	  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
	  bool SawExtract = false;
	  for (unsigned I = 0; I < NumElements; ++I) {
	    SDValue Original = BVN->getOperand(I);
	    // Look through a truncation of the element.
	    SDValue Elt = Original;
	    if (Elt.getOpcode() == ISD::TRUNCATE)
	      Elt = Elt.getOperand(0);

	    // An extraction at a constant index becomes a shuffle element.
	    if (Elt.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	        Elt.getOperand(1).getOpcode() == ISD::Constant) {
	      unsigned Index = Elt.getConstantOperandVal(1);
	      if (!GS.add(Elt.getOperand(0), Index))
	        return SDValue();
	      SawExtract = true;
	      continue;
	    }

	    if (Elt.isUndef()) {
	      GS.addUndef();
	      continue;
	    }

	    // Anything else is taken from the placeholder BUILD_VECTOR operand.
	    if (!GS.add(SDValue(), ResidueOps.size()))
	      return SDValue();
	    ResidueOps.push_back(Original);
	  }

	  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
	  if (!SawExtract)
	    return SDValue();

	  // Create the BUILD_VECTOR for the remaining elements, if any, padding it
	  // with undefs and substituting it for the null placeholder in GS.Ops.
	  if (!ResidueOps.empty()) {
	    while (ResidueOps.size() < NumElements)
	      ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
	    for (SDValue &Placeholder : GS.Ops) {
	      if (Placeholder.getNode())
	        continue;
	      Placeholder = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
	      break;
	    }
	  }
	  return GS.getNode(DAG, SDLoc(BVN));
	}

	// Return true if Op is an unindexed LOAD, or a SystemZISD::LRV node when the
	// subtarget has the vector-enhancements-2 facility.
	bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
	  const unsigned Opc = Op.getOpcode();
	  if (Opc == ISD::LOAD)
	    return cast<LoadSDNode>(Op)->isUnindexed();
	  return Opc == SystemZISD::LRV && Subtarget.hasVectorEnhancements2();
	}

	// Combine GPR scalar values Elems into a vector of type VT.
	//
	// The strategies are tried in this order: replicate a single repeated value,
	// use dedicated sequences for v2i64 / v2f64 / v4f32, and otherwise start
	// from a constant vector, a replicated load or a VLVGP pair and insert the
	// remaining elements one at a time.
	SDValue
	SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
	                                   SmallVectorImpl<SDValue> &Elems) const {
	  // See whether there is a single replicated value.
	  SDValue Single;
	  unsigned int NumElements = Elems.size();
	  unsigned int Count = 0;
	  for (auto Elem : Elems) {
	    if (!Elem.isUndef()) {
	      if (!Single.getNode())
	        Single = Elem;
	      else if (Elem != Single) {
	        Single = SDValue();
	        break;
	      }
	      Count += 1;
	    }
	  }
	  // There are three cases here:
	  //
	  // - if the only defined element is a loaded one, the best sequence
	  //   is a replicating load.
	  //
	  // - otherwise, if the only defined element is an i64 value, we will
	  //   end up with the same VLVGP sequence regardless of whether we short-cut
	  //   for replication or fall through to the later code.
	  //
	  // - otherwise, if the only defined element is an i32 or smaller value,
	  //   we would need 2 instructions to replicate it: VLVGP followed by VREPx.
	  //   This is only a win if the single defined element is used more than once.
	  //   In other cases we're better off using a single VLVGx.
	  if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
	    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);

	  // If all elements are loads, use VLREP/VLEs (below).
	  bool AllLoads = true;
	  for (auto Elem : Elems)
	    if (!isVectorElementLoad(Elem)) {
	      AllLoads = false;
	      break;
	    }

	  // The best way of building a v2i64 from two i64s is to use VLVGP.
	  if (VT == MVT::v2i64 && !AllLoads)
	    return joinDwords(DAG, DL, Elems[0], Elems[1]);

	  // Use a 64-bit merge high to combine two doubles.
	  if (VT == MVT::v2f64 && !AllLoads)
	    return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);

	  // Build v4f32 values directly from the FPRs:
	  //
	  //   <Axxx>  <Bxxx>  <Cxxx>  <Dxxx>
	  //      V       V      VMRHF
	  //      <ABxx>         <CDxx>
	  //                V                 VMRHG
	  //              <ABCD>
	  if (VT == MVT::v4f32 && !AllLoads) {
	    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
	    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
	    // Avoid unnecessary undefs by reusing the other operand.
	    if (Op01.isUndef())
	      Op01 = Op23;
	    else if (Op23.isUndef())
	      Op23 = Op01;
	    // Merging identical replications is a no-op.
	    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
	      return Op01;
	    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
	    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
	    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
	                             DL, MVT::v2i64, Op01, Op23);
	    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
	  }

	  // Collect the constant terms. Done[I] records that element I is already
	  // provided by the starting vector and needs no separate insertion.
	  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
	  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);

	  unsigned NumConstants = 0;
	  for (unsigned I = 0; I < NumElements; ++I) {
	    SDValue Elem = Elems[I];
	    if (Elem.getOpcode() == ISD::Constant ||
	        Elem.getOpcode() == ISD::ConstantFP) {
	      NumConstants += 1;
	      Constants[I] = Elem;
	      Done[I] = true;
	    }
	  }
	  // If there was at least one constant, fill in the other elements of
	  // Constants with undefs to get a full vector constant and use that
	  // as the starting point.
	  SDValue Result;
	  SDValue ReplicatedVal;
	  if (NumConstants > 0) {
	    for (unsigned I = 0; I < NumElements; ++I)
	      if (!Constants[I].getNode())
	        Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
	    Result = DAG.getBuildVector(VT, DL, Constants);
	  } else {
	    // Otherwise try to use VLREP or VLVGP to start the sequence in order to
	    // avoid a false dependency on any previous contents of the vector
	    // register.

	    // Use a VLREP if at least one element is a load. Make sure to replicate
	    // the load with the most elements having its value.
	    std::map<const SDNode*, unsigned> UseCounts;
	    SDNode *LoadMaxUses = nullptr;
	    for (unsigned I = 0; I < NumElements; ++I)
	      if (isVectorElementLoad(Elems[I])) {
	        SDNode *Ld = Elems[I].getNode();
	        UseCounts[Ld]++;
	        if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
	          LoadMaxUses = Ld;
	      }
	    if (LoadMaxUses != nullptr) {
	      ReplicatedVal = SDValue(LoadMaxUses, 0);
	      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
	    } else {
	      // Try to use VLVGP. I1 and I2 are the last elements of the first and
	      // second half of the vector respectively.
	      unsigned I1 = NumElements / 2 - 1;
	      unsigned I2 = NumElements - 1;
	      bool Def1 = !Elems[I1].isUndef();
	      bool Def2 = !Elems[I2].isUndef();
	      if (Def1 || Def2) {
	        // If only one of the two is defined, use it for both halves.
	        SDValue Elem1 = Elems[Def1 ? I1 : I2];
	        SDValue Elem2 = Elems[Def2 ? I2 : I1];
	        Result = DAG.getNode(ISD::BITCAST, DL, VT,
	                             joinDwords(DAG, DL, Elem1, Elem2));
	        Done[I1] = true;
	        Done[I2] = true;
	      } else
	        Result = DAG.getUNDEF(VT);
	    }
	  }

	  // Use VLVGx to insert the other elements. Elements equal to the replicated
	  // load are already in place.
	  for (unsigned I = 0; I < NumElements; ++I)
	    if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
	      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
	                           DAG.getConstant(I, DL, MVT::i32));
	  return Result;
	}

	// Lower a BUILD_VECTOR. Legal vector constants are kept as they are and
	// other constants are left to be loaded from memory (empty SDValue).
	// Non-constant vectors are built with shuffles, a scalar-to-vector
	// conversion or, failing that, element by element from GPRs.
	SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  if (BVN->isConstant()) {
	    const bool IsLegal =
	        SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget);
	    // Fall back to loading it from memory if the constant is not legal.
	    return IsLegal ? Op : SDValue();
	  }

	  // See if we should use shuffles to construct the vector from other vectors.
	  if (SDValue Shuffled = tryBuildVectorShuffle(DAG, BVN))
	    return Shuffled;

	  // Detect SCALAR_TO_VECTOR conversions.
	  if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
	    return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));

	  // Otherwise use buildVector to build the vector up from GPRs.
	  const unsigned NumElements = Op.getNumOperands();
	  SmallVector<SDValue, SystemZ::VectorBytes> Elems;
	  for (unsigned I = 0; I != NumElements; ++I)
	    Elems.push_back(Op.getOperand(I));
	  return buildVector(DAG, DL, VT, Elems);
	}

	// Lower a VECTOR_SHUFFLE. Splats become REPLICATE (when the splatted scalar
	// is directly available) or SPLAT nodes; everything else is handed to
	// GeneralShuffle. Returns an empty SDValue if GeneralShuffle cannot
	// represent one of the elements.
	SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  unsigned NumElements = VT.getVectorNumElements();

	  if (VSN->isSplat()) {
	    SDValue Op0 = Op.getOperand(0);
	    unsigned Index = VSN->getSplatIndex();
	    assert(Index < VT.getVectorNumElements() &&
	           "Splat index should be defined and in first operand");
	    // See whether the value we're splatting is directly available as a scalar.
	    if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
	        Op0.getOpcode() == ISD::BUILD_VECTOR)
	      return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
	    // Otherwise keep it as a vector-to-vector operation.
	    return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op0,
	                       DAG.getConstant(Index, DL, MVT::i32));
	  }

	  // General case: describe the shuffle element by element. Mask values
	  // below NumElements select from operand 0, the rest from operand 1.
	  GeneralShuffle GS(VT);
	  for (unsigned I = 0; I < NumElements; ++I) {
	    int Elt = VSN->getMaskElt(I);
	    if (Elt < 0)
	      GS.addUndef();
	    else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements),
	                     unsigned(Elt) % NumElements))
	      return SDValue();
	  }
	  return GS.getNode(DAG, SDLoc(VSN));
	}

	// Lower SCALAR_TO_VECTOR as an insertion of the scalar into element 0 of an
	// undefined vector of the result type.
	SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue Scalar = Op.getOperand(0);
	  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, DAG.getUNDEF(VT), Scalar,
	                     DAG.getConstant(0, DL, MVT::i32));
	}

	// Handle insertions of floating-point values into a vector. The node is
	// either kept as it is or rewritten as an insertion into the equivalent
	// integer vector.
	SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Val = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);
	  EVT VT = Op.getValueType();
	  const unsigned NumElts = VT.getVectorNumElements();

	  // Insertions into constant indices of a v2f64 can be done using VPDI.
	  // However, if the inserted value is a bitcast or a constant then it's
	  // better to use GPRs, as below.
	  const unsigned ValOpc = Val.getOpcode();
	  const bool PreferGPR = ValOpc == ISD::BITCAST || ValOpc == ISD::ConstantFP;
	  if (VT == MVT::v2f64 && !PreferGPR && Idx.getOpcode() == ISD::Constant) {
	    uint64_t Index = cast<ConstantSDNode>(Idx)->getZExtValue();
	    // Only in-range indices are left alone.
	    if (Index <= NumElts - 1)
	      return Op;
	  }

	  // Otherwise bitcast to the equivalent integer form and insert via a GPR.
	  MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
	  MVT IntVecVT = MVT::getVectorVT(IntVT, NumElts);
	  SDValue IntVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Vec);
	  SDValue IntVal = DAG.getNode(ISD::BITCAST, DL, IntVT, Val);
	  SDValue Inserted =
	      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT, IntVec, IntVal, Idx);
	  return DAG.getNode(ISD::BITCAST, DL, VT, Inserted);
	}

	// Custom-lower EXTRACT_VECTOR_ELT for floating-point element types.  An
	// in-range constant index leaves the node untouched; anything else is
	// rewritten as an integer extraction followed by a bitcast to the result
	// type.
	SDValue
	SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);
	  EVT ResVT = Op.getValueType();
	  EVT VecVT = Vec.getValueType();
	  unsigned NumElts = VecVT.getVectorNumElements();

	  // Constant, in-range indices need no rewriting.
	  if (auto *ConstIdx = dyn_cast<ConstantSDNode>(Idx))
	    if (ConstIdx->getZExtValue() < NumElts)
	      return Op;

	  // GPR route: extract from the integer view of the vector, then cast the
	  // scalar back to the requested type.
	  MVT IntVT = MVT::getIntegerVT(ResVT.getSizeInBits());
	  MVT IntVecVT = MVT::getVectorVT(IntVT, NumElts);
	  SDValue IntVec = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Vec);
	  SDValue IntElt =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT, IntVec, Idx);
	  return DAG.getNode(ISD::BITCAST, DL, ResVT, IntElt);
	}

	// Lower an in-register vector extension as a chain of UnpackHigh nodes.
	// Each step doubles the element width, and the chain stops once the width
	// equals the element width of the result type.
	SDValue
	SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
	                                              unsigned UnpackHigh) const {
	  SDValue Widened = Op.getOperand(0);
	  unsigned TargetBits = Op.getValueType().getScalarSizeInBits();
	  unsigned CurBits = Widened.getValueType().getScalarSizeInBits();
	  do {
	    CurBits *= 2;
	    MVT StepEltVT = MVT::getIntegerVT(CurBits);
	    EVT StepVT = MVT::getVectorVT(StepEltVT, SystemZ::VectorBits / CurBits);
	    Widened = DAG.getNode(UnpackHigh, SDLoc(Widened), StepVT, Widened);
	  } while (CurBits != TargetBits);
	  return Widened;
	}

	// Custom-lower a vector shift.  When the shift-amount vector is a splat
	// whose scalar is readily available, emit the ByScalar opcode with an i32
	// amount; otherwise return Op unchanged.
	SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
	                                          unsigned ByScalar) const {
	  SDValue Value = Op.getOperand(0);
	  SDValue Amounts = Op.getOperand(1);
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  unsigned ElemBitSize = VT.getScalarSizeInBits();

	  // Build the *_BY_SCALAR node for an i32 shift amount.
	  auto EmitByScalar = [&](SDValue Amount) {
	    return DAG.getNode(ByScalar, DL, VT, Value, Amount);
	  };
	  // Since i32 is the smallest legal type, a scalar amount needs either a
	  // no-op or a truncation.
	  auto TruncateToI32 = [&](SDValue Scalar) {
	    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Scalar);
	  };

	  // Splat expressed as a BUILD_VECTOR.
	  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Amounts)) {
	    APInt SplatBits, SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    // Constant splat: ElemBitSize is the minimum element width, and splats
	    // that need wider elements are rejected.
	    bool IsConstSplat =
	        BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                             HasAnyUndefs, ElemBitSize, true);
	    if (IsConstSplat && SplatBitSize == ElemBitSize)
	      return EmitByScalar(
	          DAG.getConstant(SplatBits.getZExtValue() & 0xfff, DL, MVT::i32));

	    // Variable splat.
	    BitVector UndefElements;
	    if (SDValue Splat = BVN->getSplatValue(&UndefElements))
	      return EmitByScalar(TruncateToI32(Splat));
	  }

	  // Splat expressed as a SHUFFLE_VECTOR whose source exposes the scalar
	  // as a direct operand.
	  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Amounts)) {
	    if (VSN->isSplat()) {
	      SDValue Source = VSN->getOperand(0);
	      unsigned SplatIdx = VSN->getSplatIndex();
	      assert(SplatIdx < VT.getVectorNumElements() &&
	             "Splat index should be defined and in first operand");
	      unsigned SourceOpcode = Source.getOpcode();
	      bool ScalarAvailable =
	          SourceOpcode == ISD::BUILD_VECTOR ||
	          (SourceOpcode == ISD::SCALAR_TO_VECTOR && SplatIdx == 0);
	      if (ScalarAvailable)
	        return EmitByScalar(TruncateToI32(Source.getOperand(SplatIdx)));
	    }
	  }

	  // Nothing matched: the existing form is treated as legal.
	  return Op;
	}

	// Entry point for custom lowering: forward Op to the lowering routine that
	// matches its opcode.  Reaching the default case is a programming error.
	SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  // Frame, control flow and comparisons.
	  case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG);
	  case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG);
	  case ISD::BR_CC: return lowerBR_CC(Op, DAG);
	  case ISD::SELECT_CC: return lowerSELECT_CC(Op, DAG);
	  case ISD::SETCC: return lowerSETCC(Op, DAG);

	  // Addresses.
	  case ISD::GlobalAddress:
	    return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
	  case ISD::GlobalTLSAddress:
	    return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
	  case ISD::BlockAddress:
	    return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
	  case ISD::JumpTable:
	    return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
	  case ISD::ConstantPool:
	    return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);

	  // Casts, varargs and dynamic stack allocation.
	  case ISD::BITCAST: return lowerBITCAST(Op, DAG);
	  case ISD::VASTART: return lowerVASTART(Op, DAG);
	  case ISD::VACOPY: return lowerVACOPY(Op, DAG);
	  case ISD::DYNAMIC_STACKALLOC: return lowerDYNAMIC_STACKALLOC(Op, DAG);
	  case ISD::GET_DYNAMIC_AREA_OFFSET:
	    return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

	  // Integer arithmetic.
	  case ISD::SMUL_LOHI: return lowerSMUL_LOHI(Op, DAG);
	  case ISD::UMUL_LOHI: return lowerUMUL_LOHI(Op, DAG);
	  case ISD::SDIVREM: return lowerSDIVREM(Op, DAG);
	  case ISD::UDIVREM: return lowerUDIVREM(Op, DAG);
	  case ISD::SADDO:
	  case ISD::SSUBO:
	  case ISD::UADDO:
	  case ISD::USUBO:
	    return lowerXALUO(Op, DAG);
	  case ISD::ADDCARRY:
	  case ISD::SUBCARRY:
	    return lowerADDSUBCARRY(Op, DAG);
	  case ISD::OR: return lowerOR(Op, DAG);
	  case ISD::CTPOP: return lowerCTPOP(Op, DAG);

	  // Atomics.
	  case ISD::ATOMIC_FENCE: return lowerATOMIC_FENCE(Op, DAG);
	  case ISD::ATOMIC_SWAP:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
	  case ISD::ATOMIC_STORE: return lowerATOMIC_STORE(Op, DAG);
	  case ISD::ATOMIC_LOAD: return lowerATOMIC_LOAD(Op, DAG);
	  case ISD::ATOMIC_LOAD_ADD:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
	  case ISD::ATOMIC_LOAD_SUB: return lowerATOMIC_LOAD_SUB(Op, DAG);
	  case ISD::ATOMIC_LOAD_AND:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
	  case ISD::ATOMIC_LOAD_OR:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
	  case ISD::ATOMIC_LOAD_XOR:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
	  case ISD::ATOMIC_LOAD_NAND:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
	  case ISD::ATOMIC_LOAD_MIN:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
	  case ISD::ATOMIC_LOAD_MAX:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
	  case ISD::ATOMIC_LOAD_UMIN:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
	  case ISD::ATOMIC_LOAD_UMAX:
	    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
	  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	    return lowerATOMIC_CMP_SWAP(Op, DAG);

	  // Stack pointer handling, prefetch and intrinsics.
	  case ISD::STACKSAVE: return lowerSTACKSAVE(Op, DAG);
	  case ISD::STACKRESTORE: return lowerSTACKRESTORE(Op, DAG);
	  case ISD::PREFETCH: return lowerPREFETCH(Op, DAG);
	  case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG);
	  case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);

	  // Vector operations.
	  case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG);
	  case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG);
	  case ISD::SCALAR_TO_VECTOR: return lowerSCALAR_TO_VECTOR(Op, DAG);
	  case ISD::INSERT_VECTOR_ELT: return lowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::SIGN_EXTEND_VECTOR_INREG:
	    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
	  case ISD::ZERO_EXTEND_VECTOR_INREG:
	    return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
	  case ISD::SHL: return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
	  case ISD::SRL: return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
	  case ISD::SRA: return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);

	  default:
	    llvm_unreachable("Unexpected node to lower");
	  }
	}

	// Lower operations with invalid operand or result types (currently used
	// only for 128-bit integer types).

	// Split the value In into its two i64 halves with EXTRACT_ELEMENT (index 0
	// and index 1) and join them into an untyped PAIR128 machine node.
	static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
	  SDLoc DL(In);
	  SDValue LowHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
	                                DAG.getIntPtrConstant(0, DL));
	  SDValue HighHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
	                                 DAG.getIntPtrConstant(1, DL));
	  // PAIR128 takes the high half first.
	  SDNode *PairNode = DAG.getMachineNode(SystemZ::PAIR128, DL, MVT::Untyped,
	                                        HighHalf, LowHalf);
	  return SDValue(PairNode, 0);
	}

	// Read the subreg_h64 and subreg_l64 halves of In as i64 values and
	// combine them into an i128 with BUILD_PAIR.
	static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
	  SDLoc DL(In);
	  SDValue HighHalf =
	      DAG.getTargetExtractSubreg(SystemZ::subreg_h64, DL, MVT::i64, In);
	  SDValue LowHalf =
	      DAG.getTargetExtractSubreg(SystemZ::subreg_l64, DL, MVT::i64, In);
	  // BUILD_PAIR takes the low half first.
	  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, LowHalf, HighHalf);
	}

	// Lower the atomic load, store and compare-and-swap nodes handled here to
	// their SystemZISD *_128 forms, converting between i128 values and the
	// untyped GR128 pair with lowerI128ToGR128 / lowerGR128ToI128.  The
	// replacement values are appended to Results in result-number order.
	void
	SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
	                                             SmallVectorImpl<SDValue> &Results,
	                                             SelectionDAG &DAG) const {
	  SDLoc DL(N);
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD: {
	    SDVTList ResultTys = DAG.getVTList(MVT::Untyped, MVT::Other);
	    SDValue LoadOps[] = { N->getOperand(0), N->getOperand(1) };
	    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	    SDValue Load = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128, DL,
	                                           ResultTys, LoadOps, MVT::i128,
	                                           MMO);
	    // Loaded value first, then the chain.
	    Results.push_back(lowerGR128ToI128(DAG, Load));
	    Results.push_back(Load.getValue(1));
	    break;
	  }
	  case ISD::ATOMIC_STORE: {
	    SDVTList ResultTys = DAG.getVTList(MVT::Other);
	    SDValue StoreOps[] = { N->getOperand(0),
	                           lowerI128ToGR128(DAG, N->getOperand(2)),
	                           N->getOperand(1) };
	    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	    SDValue Chain = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128, DL,
	                                            ResultTys, StoreOps, MVT::i128,
	                                            MMO);
	    // Sequential consistency is enforced by a serialization operation
	    // placed after the store.
	    bool IsSeqCst = cast<AtomicSDNode>(N)->getOrdering() ==
	                    AtomicOrdering::SequentiallyConsistent;
	    if (IsSeqCst) {
	      SDNode *Serialize =
	          DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Chain);
	      Chain = SDValue(Serialize, 0);
	    }
	    Results.push_back(Chain);
	    break;
	  }
	  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	    SDVTList ResultTys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
	    SDValue SwapOps[] = { N->getOperand(0), N->getOperand(1),
	                          lowerI128ToGR128(DAG, N->getOperand(2)),
	                          lowerI128ToGR128(DAG, N->getOperand(3)) };
	    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	    SDValue Swap = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
	                                           DL, ResultTys, SwapOps, MVT::i128,
	                                           MMO);
	    // Derive the success flag from result 1 of the swap node and give it
	    // the type of N's second result.
	    SDValue Success = emitSETCC(DAG, DL, Swap.getValue(1), SystemZ::CCMASK_CS,
	                                SystemZ::CCMASK_CS_EQ);
	    Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
	    Results.push_back(lowerGR128ToI128(DAG, Swap));
	    Results.push_back(Success);
	    Results.push_back(Swap.getValue(2));
	    break;
	  }
	  default:
	    llvm_unreachable("Unexpected node to lower");
	  }
	}

	// Result replacement is handled exactly like operation lowering here, so
	// simply delegate to LowerOperationWrapper.
	void
	SystemZTargetLowering::ReplaceNodeResults(SDNode *N,
	                                          SmallVectorImpl<SDValue> &Results,
	                                          SelectionDAG &DAG) const {
	  LowerOperationWrapper(N, Results, DAG);
	}

	// Return the printable name ("SystemZISD::<NAME>") of the target-specific
	// node opcode Opcode.  Returns nullptr for SystemZISD::FIRST_NUMBER and for
	// any opcode that has no entry in the table below.
	const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
	// OPCODE(NAME) expands to a case label for SystemZISD::NAME that returns
	// the stringified, namespace-qualified opcode name.
	#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
	switch ((SystemZISD::NodeType)Opcode) {
	// FIRST_NUMBER has no name of its own; fall out to the nullptr return.
	case SystemZISD::FIRST_NUMBER: break;
	OPCODE(RET_FLAG);
	OPCODE(CALL);
	OPCODE(SIBCALL);
	OPCODE(TLS_GDCALL);
	OPCODE(TLS_LDCALL);
	OPCODE(PCREL_WRAPPER);
	OPCODE(PCREL_OFFSET);
	OPCODE(IABS);
	OPCODE(ICMP);
	OPCODE(FCMP);
	OPCODE(TM);
	OPCODE(BR_CCMASK);
	OPCODE(SELECT_CCMASK);
	OPCODE(ADJDYNALLOC);
	OPCODE(POPCNT);
	OPCODE(SMUL_LOHI);
	OPCODE(UMUL_LOHI);
	OPCODE(SDIVREM);
	OPCODE(UDIVREM);
	OPCODE(SADDO);
	OPCODE(SSUBO);
	OPCODE(UADDO);
	OPCODE(USUBO);
	OPCODE(ADDCARRY);
	OPCODE(SUBCARRY);
	OPCODE(GET_CCMASK);
	OPCODE(MVC);
	OPCODE(MVC_LOOP);
	OPCODE(NC);
	OPCODE(NC_LOOP);
	OPCODE(OC);
	OPCODE(OC_LOOP);
	OPCODE(XC);
	OPCODE(XC_LOOP);
	OPCODE(CLC);
	OPCODE(CLC_LOOP);
	OPCODE(STPCPY);
	OPCODE(STRCMP);
	OPCODE(SEARCH_STRING);
	OPCODE(IPM);
	OPCODE(MEMBARRIER);
	OPCODE(TBEGIN);
	OPCODE(TBEGIN_NOFLOAT);
	OPCODE(TEND);
	OPCODE(BYTE_MASK);
	OPCODE(ROTATE_MASK);
	OPCODE(REPLICATE);
	OPCODE(JOIN_DWORDS);
	OPCODE(SPLAT);
	OPCODE(MERGE_HIGH);
	OPCODE(MERGE_LOW);
	OPCODE(SHL_DOUBLE);
	OPCODE(PERMUTE_DWORDS);
	OPCODE(PERMUTE);
	OPCODE(PACK);
	OPCODE(PACKS_CC);
	OPCODE(PACKLS_CC);
	OPCODE(UNPACK_HIGH);
	OPCODE(UNPACKL_HIGH);
	OPCODE(UNPACK_LOW);
	OPCODE(UNPACKL_LOW);
	OPCODE(VSHL_BY_SCALAR);
	OPCODE(VSRL_BY_SCALAR);
	OPCODE(VSRA_BY_SCALAR);
	OPCODE(VSUM);
	OPCODE(VICMPE);
	OPCODE(VICMPH);
	OPCODE(VICMPHL);
	OPCODE(VICMPES);
	OPCODE(VICMPHS);
	OPCODE(VICMPHLS);
	OPCODE(VFCMPE);
	OPCODE(VFCMPH);
	OPCODE(VFCMPHE);
	OPCODE(VFCMPES);
	OPCODE(VFCMPHS);
	OPCODE(VFCMPHES);
	OPCODE(VFTCI);
	OPCODE(VEXTEND);
	OPCODE(VROUND);
	OPCODE(VTM);
	OPCODE(VFAE_CC);
	OPCODE(VFAEZ_CC);
	OPCODE(VFEE_CC);
	OPCODE(VFEEZ_CC);
	OPCODE(VFENE_CC);
	OPCODE(VFENEZ_CC);
	OPCODE(VISTR_CC);
	OPCODE(VSTRC_CC);
	OPCODE(VSTRCZ_CC);
	OPCODE(VSTRS_CC);
	OPCODE(VSTRSZ_CC);
	OPCODE(TDC);
	OPCODE(ATOMIC_SWAPW);
	OPCODE(ATOMIC_LOADW_ADD);
	OPCODE(ATOMIC_LOADW_SUB);
	OPCODE(ATOMIC_LOADW_AND);
	OPCODE(ATOMIC_LOADW_OR);
	OPCODE(ATOMIC_LOADW_XOR);
	OPCODE(ATOMIC_LOADW_NAND);
	OPCODE(ATOMIC_LOADW_MIN);
	OPCODE(ATOMIC_LOADW_MAX);
	OPCODE(ATOMIC_LOADW_UMIN);
	OPCODE(ATOMIC_LOADW_UMAX);
	OPCODE(ATOMIC_CMP_SWAPW);
	OPCODE(ATOMIC_CMP_SWAP);
	OPCODE(ATOMIC_LOAD_128);
	OPCODE(ATOMIC_STORE_128);
	OPCODE(ATOMIC_CMP_SWAP_128);
	OPCODE(LRV);
	OPCODE(STRV);
	OPCODE(VLER);
	OPCODE(VSTER);
	OPCODE(PREFETCH);
	}
	// Reached for FIRST_NUMBER and for opcodes without a table entry.
	return nullptr;
	#undef OPCODE
	}

	// Return true if the subtarget has vector support and VT is a simple
	// vector type whose elements are a whole number of bytes wide.
	bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
	  return Subtarget.hasVector() && VT.isVector() &&
	         VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
	}

	// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
	// producing a result of type ResVT. Op is a possibly bitcast version
	// of the input vector and Index is the index (based on type VecVT) that
	// should be extracted. Return the new extraction if a simplification
	// was possible or if Force is true; otherwise return an empty SDValue.
	//
	// The loop below repeatedly peels one producer off Op (bitcast, shuffle or
	// splat, BUILD_VECTOR, *_EXTEND_VECTOR_INREG), updating Op and Index to
	// describe the same bytes in the producer's input, until no further
	// producer can be looked through.
	SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
	EVT VecVT, SDValue Op,
	unsigned Index,
	DAGCombinerInfo &DCI,
	bool Force) const {
	SelectionDAG &DAG = DCI.DAG;

	// The number of bytes being extracted.
	unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();

	for (;;) {
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::BITCAST)
	// Look through bitcasts.
	Op = Op.getOperand(0);
	else if ((Opcode == ISD::VECTOR_SHUFFLE \|\| Opcode == SystemZISD::SPLAT) &&
	canTreatAsByteVector(Op.getValueType())) {
	// Get a VPERM-like permute mask and see whether the bytes covered
	// by the extracted element are a contiguous sequence from one
	// source operand.
	SmallVector<int, SystemZ::VectorBytes> Bytes;
	if (!getVPermMask(Op, Bytes))
	break;
	int First;
	if (!getShuffleInput(Bytes, Index * BytesPerElement,
	BytesPerElement, First))
	break;
	// A negative First means there is no defined source for the extracted
	// bytes, so the result is undefined.
	if (First < 0)
	return DAG.getUNDEF(ResVT);
	// Make sure the contiguous sequence starts at a multiple of the
	// original element size.
	unsigned Byte = unsigned(First) % Bytes.size();
	if (Byte % BytesPerElement != 0)
	break;
	// We can get the extracted value directly from an input.
	// First / Bytes.size() selects the source operand, and Byte is the
	// offset within it.
	Index = Byte / BytesPerElement;
	Op = Op.getOperand(unsigned(First) / Bytes.size());
	Force = true;
	} else if (Opcode == ISD::BUILD_VECTOR &&
	canTreatAsByteVector(Op.getValueType())) {
	// We can only optimize this case if the BUILD_VECTOR elements are
	// at least as wide as the extracted value.
	EVT OpVT = Op.getValueType();
	unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
	if (OpBytesPerElement < BytesPerElement)
	break;
	// Make sure that the least-significant bit of the extracted value
	// is the least significant bit of an input.
	unsigned End = (Index + 1) * BytesPerElement;
	if (End % OpBytesPerElement != 0)
	break;
	// We're extracting the low part of one operand of the BUILD_VECTOR.
	Op = Op.getOperand(End / OpBytesPerElement - 1);
	// Convert a non-integer operand to an integer of the same size so that
	// it can be truncated.
	if (!Op.getValueType().isInteger()) {
	EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
	Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
	DCI.AddToWorklist(Op.getNode());
	}
	// Truncate to an integer as wide as ResVT, then bitcast if ResVT is not
	// that integer type.
	EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
	Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	if (VT != ResVT) {
	DCI.AddToWorklist(Op.getNode());
	Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
	}
	return Op;
	} else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG \|\|
	Opcode == ISD::ZERO_EXTEND_VECTOR_INREG \|\|
	Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
	canTreatAsByteVector(Op.getValueType()) &&
	canTreatAsByteVector(Op.getOperand(0).getValueType())) {
	// Make sure that only the unextended bits are significant.
	EVT ExtVT = Op.getValueType();
	EVT OpVT = Op.getOperand(0).getValueType();
	unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
	unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
	unsigned Byte = Index * BytesPerElement;
	// SubByte is the offset of the extracted bytes within their extended
	// element; the unextended data occupies the last OpBytesPerElement
	// bytes of that element, starting at MinSubByte.
	unsigned SubByte = Byte % ExtBytesPerElement;
	unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
	if (SubByte < MinSubByte \|\|
	SubByte + BytesPerElement > ExtBytesPerElement)
	break;
	// Get the byte offset of the unextended element
	Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
	// ...then add the byte offset relative to that element.
	Byte += SubByte - MinSubByte;
	if (Byte % BytesPerElement != 0)
	break;
	Op = Op.getOperand(0);
	Index = Byte / BytesPerElement;
	Force = true;
	} else
	break;
	}
	// Emit a (possibly bitcast) extraction if one was requested by the caller
	// or if the loop above managed to look through a producer.
	if (Force) {
	if (Op.getValueType() != VecVT) {
	Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
	DCI.AddToWorklist(Op.getNode());
	}
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
	DAG.getConstant(Index, DL, MVT::i32));
	}
	return SDValue();
	}

	// Optimize vector operations in scalar value Op, given that Op is going to
	// be truncated to TruncVT.  Returns an empty SDValue when nothing applies.
	SDValue SystemZTargetLowering::combineTruncateExtract(
	    const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
	  // The pattern of interest is (trunc (extract_vector_elt X, Y)), which we
	  // try to rewrite as (extract_vector_elt (bitcast X), Y'), where
	  // (bitcast X) has elements of type TruncVT.
	  if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      TruncVT.getSizeInBits() % 8 != 0)
	    return SDValue();

	  SDValue Vec = Op.getOperand(0);
	  EVT VecVT = Vec.getValueType();
	  if (!canTreatAsByteVector(VecVT))
	    return SDValue();

	  auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!IndexN)
	    return SDValue();

	  unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
	  unsigned TruncBytes = TruncVT.getStoreSize();
	  if (BytesPerElement % TruncBytes != 0)
	    return SDValue();

	  // Compute Y'.  Every original element is split into Scale pieces of equal
	  // size, and truncation wants the last (least-significant) piece of
	  // element IndexN: take the first piece of the following element and step
	  // back by one.
	  unsigned Scale = BytesPerElement / TruncBytes;
	  unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;

	  // The bitcast of X is left to combineExtract, which might be able to
	  // optimize the extraction further.
	  MVT PieceVT = MVT::getIntegerVT(TruncBytes * 8);
	  VecVT = MVT::getVectorVT(PieceVT, VecVT.getStoreSize() / TruncBytes);
	  EVT ResVT = TruncVT;
	  if (TruncBytes < 4)
	    ResVT = MVT::i32;
	  return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
	}

	// Fold (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2'), where
	// C1' and C2' are the zero-extended constants in the wider result type.
	SDValue SystemZTargetLowering::combineZERO_EXTEND(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Select = N->getOperand(0);
	  EVT WideVT = N->getValueType(0);
	  if (Select.getOpcode() != SystemZISD::SELECT_CCMASK)
	    return SDValue();

	  auto *TrueC = dyn_cast<ConstantSDNode>(Select.getOperand(0));
	  auto *FalseC = dyn_cast<ConstantSDNode>(Select.getOperand(1));
	  if (!TrueC || !FalseC)
	    return SDValue();

	  SDLoc DL(Select);
	  SDValue WideOps[] = { DAG.getConstant(TrueC->getZExtValue(), DL, WideVT),
	                        DAG.getConstant(FalseC->getZExtValue(), DL, WideVT),
	                        Select.getOperand(2), Select.getOperand(3),
	                        Select.getOperand(4) };
	  SDValue WideSelect =
	      DAG.getNode(SystemZISD::SELECT_CCMASK, DL, WideVT, WideOps);

	  // Other users of the narrow select are redirected to a truncation of the
	  // wide one.
	  if (!Select.hasOneUse()) {
	    SDValue Narrowed =
	        DAG.getNode(ISD::TRUNCATE, DL, Select.getValueType(), WideSelect);
	    DCI.CombineTo(Select.getNode(), Narrowed);
	  }
	  return WideSelect;
	}

	// Fold (sext_in_reg (setcc LHS, RHS, COND), i1) and
	// (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1) into
	// (select_cc LHS, RHS, -1, 0, COND).
	SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT FromVT = cast<VTSDNode>(N->getOperand(1))->getVT();

	  // Step over a single-use any_extend sitting between us and the setcc.
	  if (Src.hasOneUse() && Src.getOpcode() == ISD::ANY_EXTEND)
	    Src = Src.getOperand(0);

	  bool IsSingleUseSetCC = Src.hasOneUse() && Src.getOpcode() == ISD::SETCC;
	  if (FromVT != MVT::i1 || !IsSingleUseSetCC)
	    return SDValue();

	  SDLoc DL(Src);
	  SDValue SelectOps[] = { Src.getOperand(0), Src.getOperand(1),
	                          DAG.getConstant(-1, DL, VT),
	                          DAG.getConstant(0, DL, VT),
	                          Src.getOperand(2) };
	  return DAG.getNode(ISD::SELECT_CC, DL, VT, SelectOps);
	}

	// Rewrite (sext (ashr (shl X, C1), C2)) as
	// (ashr (shl (anyext X), C1'), C2'), because a wider shift costs no more
	// than a narrower one.  C1' and C2' are the original amounts increased by
	// the number of bits added by the extension.
	SDValue SystemZTargetLowering::combineSIGN_EXTEND(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Sra = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  if (!Sra.hasOneUse() || Sra.getOpcode() != ISD::SRA)
	    return SDValue();

	  auto *SraAmt = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
	  SDValue Shl = Sra.getOperand(0);
	  if (!SraAmt || !Shl.hasOneUse() || Shl.getOpcode() != ISD::SHL)
	    return SDValue();

	  auto *ShlAmt = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
	  if (!ShlAmt)
	    return SDValue();

	  unsigned Extra = (VT.getSizeInBits() - Sra.getValueSizeInBits());
	  unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
	  unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
	  EVT ShiftVT = Sra.getOperand(1).getValueType();

	  SDLoc ShlDL(Shl);
	  SDValue Widened =
	      DAG.getNode(ISD::ANY_EXTEND, ShlDL, VT, Shl.getOperand(0));
	  SDValue NewShlAmtC = DAG.getConstant(NewShlAmt, ShlDL, ShiftVT);
	  SDValue NewShl = DAG.getNode(ISD::SHL, ShlDL, VT, Widened, NewShlAmtC);

	  SDLoc SraDL(Sra);
	  SDValue NewSraAmtC = DAG.getConstant(NewSraAmt, SraDL, ShiftVT);
	  return DAG.getNode(ISD::SRA, SraDL, VT, NewShl, NewSraAmtC);
	}

	// Simplify a merge node whose first operand is (a bitcast of) an all-zeros
	// BUILD_VECTOR.
	SDValue SystemZTargetLowering::combineMERGE(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue First = N->getOperand(0);
	  SDValue Second = N->getOperand(1);
	  if (First.getOpcode() == ISD::BITCAST)
	    First = First.getOperand(0);
	  if (!ISD::isBuildVectorAllZeros(First.getNode()))
	    return SDValue();

	  // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF
	  // for v4f32.
	  if (Second == N->getOperand(0))
	    return Second;

	  // (z_merge_? 0, X) -> (z_unpackl_? 0, X), for elements of at most 4 bytes.
	  EVT VT = Second.getValueType();
	  unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
	  if (ElemBytes > 4)
	    return SDValue();

	  unsigned UnpackOpcode;
	  if (N->getOpcode() == SystemZISD::MERGE_HIGH)
	    UnpackOpcode = SystemZISD::UNPACKL_HIGH;
	  else
	    UnpackOpcode = SystemZISD::UNPACKL_LOW;

	  SDLoc DL(N);
	  EVT InVT = VT.changeVectorElementTypeToInteger();
	  // The unpack result has elements twice as wide and half as many of them.
	  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
	                               SystemZ::VectorBytes / ElemBytes / 2);
	  if (VT != InVT) {
	    Second = DAG.getNode(ISD::BITCAST, DL, InVT, Second);
	    DCI.AddToWorklist(Second.getNode());
	  }
	  SDValue Unpacked = DAG.getNode(UnpackOpcode, DL, OutVT, Second);
	  DCI.AddToWorklist(Unpacked.getNode());
	  return DAG.getNode(ISD::BITCAST, DL, VT, Unpacked);
	}

	// If a scalar load feeds a REPLICATE and also has other users of its
	// loaded value, make those other users read element 0 of the REPLICATE
	// instead of the load.  Otherwise instruction selection will not produce a
	// VLREP.  To avoid extracting to a GPR, vector and integer loads are left
	// alone.
	SDValue SystemZTargetLowering::combineLOAD(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT LdVT = N->getValueType(0);
	  if (LdVT.isVector() || LdVT.isInteger())
	    return SDValue();

	  // Sort the users into the single REPLICATE and the remaining users of
	  // result 0.
	  SDValue Replicate;
	  SmallVector<SDNode *, 8> OtherUses;
	  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != SystemZISD::REPLICATE) {
	      if (UI.getUse().getResNo() == 0)
	        OtherUses.push_back(*UI);
	      continue;
	    }
	    if (Replicate)
	      return SDValue(); // Should never happen
	    Replicate = SDValue(*UI, 0);
	  }
	  if (!Replicate || OtherUses.empty())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
	  SDValue Extract0 =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT, Replicate, ZeroIdx);

	  // Swap the loaded value for the extraction in each other user; chain
	  // operands are left as they were.
	  for (SDNode *User : OtherUses) {
	    SmallVector<SDValue, 8> NewOps;
	    for (SDValue Operand : User->ops()) {
	      bool IsLoadedValue =
	          Operand.getNode() == N && Operand.getResNo() == 0;
	      NewOps.push_back(IsLoadedValue ? Extract0 : Operand);
	    }
	    DAG.UpdateNodeOperands(User, NewOps);
	  }
	  return SDValue(N, 0);
	}

	// Return true if VT is i16, i32 or i64, or if the subtarget has
	// vector-enhancements-2 and VT is v8i16, v4i32 or v2i64.
	bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const {
	  bool IsScalarCandidate =
	      VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
	  if (IsScalarCandidate)
	    return true;
	  bool IsVectorCandidate =
	      VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64;
	  return IsVectorCandidate && Subtarget.hasVectorEnhancements2();
	}

	// Return true if shuffle mask M reverses the element order of VT, which
	// must be a simple 128-bit vector with byte-multiple element width.
	// Undefined (negative) mask entries match anything.
	static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
	  bool IsSimpleVector = VT.isVector() && VT.isSimple();
	  if (!IsSimpleVector || VT.getSizeInBits() != 128 ||
	      VT.getScalarSizeInBits() % 8 != 0)
	    return false;

	  unsigned NumElts = VT.getVectorNumElements();
	  for (unsigned Pos = 0; Pos != NumElts; ++Pos) {
	    int Src = M[Pos];
	    // Position Pos must read from the mirrored position.
	    if (Src >= 0 && (unsigned)Src != NumElts - 1 - Pos)
	      return false;
	  }
	  return true;
	}

	// Combine a store with the node that produces its stored value:
	//  - truncating integer store of an extracted element: do the extraction
	//    on narrower elements;
	//  - store of a single-use BSWAP: emit STRV;
	//  - store of a single-use element-reversing shuffle: emit VSTER.
	SDValue SystemZTargetLowering::combineSTORE(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  auto *SN = cast<StoreSDNode>(N);
	  SDValue StoredVal = N->getOperand(1);
	  EVT MemVT = SN->getMemoryVT();
	  SDLoc DL(N);

	  if (SN->isTruncatingStore()) {
	    // For (truncstoreiN (extract_vector_elt X, Y), Z) it is better to do
	    // the extraction on a vMiN value, so that VSTE can be used.  If X has
	    // wider elements, convert to
	    // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
	    if (!MemVT.isInteger())
	      return SDValue();
	    SDValue Value = combineTruncateExtract(DL, MemVT, SN->getValue(), DCI);
	    if (!Value)
	      return SDValue();
	    DCI.AddToWorklist(Value.getNode());
	    // Re-emit the store using the new form of the stored value.
	    return DAG.getTruncStore(SN->getChain(), DL, Value, SN->getBasePtr(),
	                             MemVT, SN->getMemOperand());
	  }

	  // Both remaining combines require the producer to have no other users.
	  if (!StoredVal.getNode()->hasOneUse())
	    return SDValue();

	  unsigned ValOpcode = StoredVal.getOpcode();

	  // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR
	  if (ValOpcode == ISD::BSWAP) {
	    if (!canLoadStoreByteSwapped(StoredVal.getValueType()))
	      return SDValue();

	    SDValue BSwapOp = StoredVal.getOperand(0);
	    // An i16 source is any-extended to i32 first.
	    if (BSwapOp.getValueType() == MVT::i16)
	      BSwapOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BSwapOp);

	    SDValue StrvOps[] = { N->getOperand(0), BSwapOp, N->getOperand(2) };
	    return DAG.getMemIntrinsicNode(SystemZISD::STRV, DL,
	                                   DAG.getVTList(MVT::Other), StrvOps, MemVT,
	                                   SN->getMemOperand());
	  }

	  // Combine STORE (element-swap) into VSTER
	  if (ValOpcode == ISD::VECTOR_SHUFFLE &&
	      Subtarget.hasVectorEnhancements2()) {
	    auto *SVN = cast<ShuffleVectorSDNode>(StoredVal.getNode());
	    if (isVectorElementSwap(SVN->getMask(), StoredVal.getValueType())) {
	      SDValue VsterOps[] = { N->getOperand(0), StoredVal.getOperand(0),
	                             N->getOperand(2) };
	      return DAG.getMemIntrinsicNode(SystemZISD::VSTER, DL,
	                                     DAG.getVTList(MVT::Other), VsterOps,
	                                     MemVT, SN->getMemOperand());
	    }
	  }

	  return SDValue();
	}

	// Combine an element-reversing shuffle of a single-use, non-extending load
	// into a VLER element-swapping load (requires vector-enhancements-2).
	SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Load = N->getOperand(0);
	  if (!ISD::isNON_EXTLoad(Load.getNode()) || !Load.hasOneUse() ||
	      !Subtarget.hasVectorEnhancements2())
	    return SDValue();

	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  if (!isVectorElementSwap(Mask, N->getValueType(0)))
	    return SDValue();

	  LoadSDNode *LD = cast<LoadSDNode>(Load);

	  // Build the element-swapping load from the original chain and pointer.
	  SDValue VlerOps[] = { LD->getChain(), LD->getBasePtr() };
	  SDVTList VlerTys = DAG.getVTList(LD->getValueType(0), MVT::Other);
	  SDValue ESLoad =
	      DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N), VlerTys, VlerOps,
	                              LD->getMemoryVT(), LD->getMemOperand());

	  // Replace the shuffle first; that leaves the value produced by the
	  // original load dead.
	  DCI.CombineTo(N, ESLoad);

	  // Then replace the load itself.  Its value result gets a bogus
	  // replacement (it is dead because the shuffle is dead) while its chain
	  // result gets the real chain.
	  DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1));

	  // Return N so it doesn't get rechecked!
	  return SDValue(N, 0);
	}

	// Combine an EXTRACT_VECTOR_ELT: hoist a single-use vector BSWAP so that it
	// is applied to the extracted scalar, or otherwise try combineExtract on
	// constant-index extractions.  Does nothing without vector support.
	SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;

	  if (!Subtarget.hasVector())
	    return SDValue();

	  // Step over a bitcast between vectors with equal element counts.
	  SDValue Src = N->getOperand(0);
	  if (Src.getOpcode() == ISD::BITCAST) {
	    EVT CastVT = Src.getValueType();
	    EVT InnerVT = Src.getOperand(0).getValueType();
	    if (CastVT.isVector() && InnerVT.isVector() &&
	        CastVT.getVectorNumElements() == InnerVT.getVectorNumElements())
	      Src = Src.getOperand(0);
	  }

	  EVT ResVT = N->getValueType(0);

	  // Pull BSWAP out of a vector extraction.
	  if (Src.getOpcode() == ISD::BSWAP && Src.hasOneUse()) {
	    SDLoc DL(N);
	    EVT EltVT = Src.getValueType().getVectorElementType();
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
	                              Src.getOperand(0), N->getOperand(1));
	    DCI.AddToWorklist(Elt.getNode());
	    SDValue Swapped = DAG.getNode(ISD::BSWAP, DL, EltVT, Elt);
	    if (EltVT == ResVT)
	      return Swapped;
	    DCI.AddToWorklist(Swapped.getNode());
	    return DAG.getNode(ISD::BITCAST, DL, ResVT, Swapped);
	  }

	  // Try to simplify a vector extraction.  This works on the original
	  // operand, not the one with the bitcast stripped.
	  auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!IndexN)
	    return SDValue();
	  SDValue Vec = N->getOperand(0);
	  return combineExtract(SDLoc(N), ResVT, Vec.getValueType(), Vec,
	                        IndexN->getZExtValue(), DCI, false);
	}

	// (join_dwords X, X) == (replicate X)
	SDValue SystemZTargetLowering::combineJOIN_DWORDS(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue High = N->getOperand(0);
	  if (High != N->getOperand(1))
	    return SDValue();
	  return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
	                     High);
	}

	// Combine a pair of scalar f64->f32 roundings of both elements of one
	// v2f64 vector into a single VROUND:
	//
	// (fpround (extract_vector_elt X 0))
	// (fpround (extract_vector_elt X 1)) ->
	// (extract_vector_elt (VROUND X) 0)
	// (extract_vector_elt (VROUND X) 2)
	//
	// This is a special case since the target doesn't really support v2f32s.
	SDValue SystemZTargetLowering::combineFP_ROUND(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  if (!Subtarget.hasVector())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Op0 = N->getOperand(0);

	  // N must be the f32 rounding of a single-use extraction of element 0 of
	  // a v2f64.
	  if (N->getValueType(0) != MVT::f32 || !Op0.hasOneUse() ||
	      Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      Op0.getOperand(0).getValueType() != MVT::v2f64 ||
	      Op0.getOperand(1).getOpcode() != ISD::Constant ||
	      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() != 0)
	    return SDValue();

	  SDValue Vec = Op0.getOperand(0);
	  for (auto *U : Vec->uses()) {
	    // Find the matching single-use extraction of element 1.
	    bool IsElt1Extract =
	        U != Op0.getNode() && U->hasOneUse() &&
	        U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	        U->getOperand(0) == Vec &&
	        U->getOperand(1).getOpcode() == ISD::Constant &&
	        cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1;
	    if (!IsElt1Extract)
	      continue;

	    // Its only user must be an f32 rounding as well.
	    SDValue OtherRound = SDValue(*U->use_begin(), 0);
	    if (OtherRound.getOpcode() != ISD::FP_ROUND ||
	        OtherRound.getOperand(0) != SDValue(U, 0) ||
	        OtherRound.getValueType() != MVT::f32)
	      continue;

	    SDValue VRound =
	        DAG.getNode(SystemZISD::VROUND, SDLoc(N), MVT::v4f32, Vec);
	    DCI.AddToWorklist(VRound.getNode());

	    // The other rounding is replaced by element 2 of the VROUND...
	    SDValue Extract1 =
	        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32, VRound,
	                    DAG.getConstant(2, SDLoc(U), MVT::i32));
	    DCI.AddToWorklist(Extract1.getNode());
	    DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);

	    // ...and N itself by element 0.
	    SDValue Extract0 =
	        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, VRound,
	                    DAG.getConstant(0, SDLoc(Op0), MVT::i32));
	    return Extract0;
	  }
	  return SDValue();
	}

	// DAG combine for ISD::FP_EXTEND.
	//
	// Looks for a pair of f32 -> f64 extensions whose inputs are elements 0 and 2
	// of the same v4f32 vector and replaces both with extracts from a single
	// SystemZISD::VEXTEND of that vector. Returns the replacement for N, or an
	// empty SDValue if the pattern is not found or the subtarget has no vector
	// support.
	SDValue SystemZTargetLowering::combineFP_EXTEND(
	    SDNode *N, DAGCombinerInfo &DCI) const {

	  if (!Subtarget.hasVector())
	    return SDValue();

	  // (fpextend (extract_vector_elt X 0))
	  // (fpextend (extract_vector_elt X 2)) ->
	  // (extract_vector_elt (VEXTEND X) 0)
	  // (extract_vector_elt (VEXTEND X) 1)
	  //
	  // This is a special case since the target doesn't really support v2f32s.
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Src = N->getOperand(0);

	  // N must be an f64 extension of a single-use (extract_vector_elt v4f32, 0).
	  if (N->getValueType(0) != MVT::f64)
	    return SDValue();
	  if (!Src.hasOneUse())
	    return SDValue();
	  if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();
	  if (Src.getOperand(0).getValueType() != MVT::v4f32)
	    return SDValue();
	  if (Src.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();
	  if (cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue() != 0)
	    return SDValue();

	  SDValue Vec = Src.getOperand(0);
	  for (auto *User : Vec->uses()) {
	    // Find the sibling single-use (extract_vector_elt Vec, 2).
	    if (User == Src.getNode())
	      continue;
	    if (!User->hasOneUse())
	      continue;
	    if (User->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      continue;
	    if (User->getOperand(0) != Vec)
	      continue;
	    if (User->getOperand(1).getOpcode() != ISD::Constant)
	      continue;
	    if (cast<ConstantSDNode>(User->getOperand(1))->getZExtValue() != 2)
	      continue;

	    // Its only user must itself be an f64 extension of that extract.
	    SDValue OtherExtend = SDValue(*User->use_begin(), 0);
	    if (OtherExtend.getOpcode() != ISD::FP_EXTEND)
	      continue;
	    if (OtherExtend.getOperand(0) != SDValue(User, 0))
	      continue;
	    if (OtherExtend.getValueType() != MVT::f64)
	      continue;

	    // Extend the whole vector once, then pick the two results out of it.
	    SDValue VExtend =
	        DAG.getNode(SystemZISD::VEXTEND, SDLoc(N), MVT::v2f64, Vec);
	    DCI.AddToWorklist(VExtend.getNode());

	    SDValue Extract1 =
	        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(User), MVT::f64, VExtend,
	                    DAG.getConstant(1, SDLoc(User), MVT::i32));
	    DCI.AddToWorklist(Extract1.getNode());
	    DAG.ReplaceAllUsesOfValueWith(OtherExtend, Extract1);

	    SDValue Extract0 =
	        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src), MVT::f64, VExtend,
	                    DAG.getConstant(0, SDLoc(Src), MVT::i32));
	    return Extract0;
	  }
	  return SDValue();
	}

	// DAG combine for ISD::BSWAP. Three rewrites are attempted, in this order:
	//  1. BSWAP of a single-use, non-extending load whose type passes
	//     canLoadStoreByteSwapped() becomes a SystemZISD::LRV memory node.
	//  2. BSWAP of a single-use INSERT_VECTOR_ELT is pushed into both the vector
	//     and the inserted element when at least one of them then simplifies.
	//  3. BSWAP of a single-use vector shuffle is pushed into both shuffle inputs
	//     when at least one of them then simplifies.
	// Returns the replacement value, SDValue(N, 0) when the replacement was
	// already performed through DCI.CombineTo, or an empty SDValue if nothing
	// matched.
	SDValue SystemZTargetLowering::combineBSWAP(
	SDNode *N, DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	// Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR
	if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
	N->getOperand(0).hasOneUse() &&
	canLoadStoreByteSwapped(N->getValueType(0))) {
	SDValue Load = N->getOperand(0);
	LoadSDNode *LD = cast<LoadSDNode>(Load);

	// Create the byte-swapping load.
	SDValue Ops[] = {
	LD->getChain(), // Chain
	LD->getBasePtr() // Ptr
	};
	// An i16 result is produced as i32 by the LRV node and truncated below;
	// the memory VT is still taken from the original load.
	EVT LoadVT = N->getValueType(0);
	if (LoadVT == MVT::i16)
	LoadVT = MVT::i32;
	SDValue BSLoad =
	DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
	DAG.getVTList(LoadVT, MVT::Other),
	Ops, LD->getMemoryVT(), LD->getMemOperand());

	// If this is an i16 load, insert the truncate.
	SDValue ResVal = BSLoad;
	if (N->getValueType(0) == MVT::i16)
	ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);

	// First, combine the bswap away. This makes the value produced by the
	// load dead.
	DCI.CombineTo(N, ResVal);

	// Next, combine the load away, we give it a bogus result value but a real
	// chain result. The result value is dead because the bswap is dead.
	DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

	// Return N so it doesn't get rechecked!
	return SDValue(N, 0);
	}

	// Look through bitcasts that retain the number of vector elements.
	SDValue Op = N->getOperand(0);
	if (Op.getOpcode() == ISD::BITCAST &&
	Op.getValueType().isVector() &&
	Op.getOperand(0).getValueType().isVector() &&
	Op.getValueType().getVectorNumElements() ==
	Op.getOperand(0).getValueType().getVectorNumElements())
	Op = Op.getOperand(0);

	// Push BSWAP into a vector insertion if at least one side then simplifies.
	if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) {
	SDValue Vec = Op.getOperand(0);
	SDValue Elt = Op.getOperand(1);
	SDValue Idx = Op.getOperand(2);

	// A side counts as simplifying when it is a constant, undef, already a
	// BSWAP, or (element only) a single-use non-extending load of a type that
	// can be loaded byte-swapped.
	if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) \|\|
	Vec.getOpcode() == ISD::BSWAP \|\| Vec.isUndef() \|\|
	DAG.isConstantIntBuildVectorOrConstantInt(Elt) \|\|
	Elt.getOpcode() == ISD::BSWAP \|\| Elt.isUndef() \|\|
	(canLoadStoreByteSwapped(N->getValueType(0)) &&
	ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) {
	EVT VecVT = N->getValueType(0);
	EVT EltVT = N->getValueType(0).getVectorElementType();
	// Because a bitcast may have been looked through above, bring both
	// operands back to the types of N before swapping them.
	if (VecVT != Vec.getValueType()) {
	Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec);
	DCI.AddToWorklist(Vec.getNode());
	}
	if (EltVT != Elt.getValueType()) {
	Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt);
	DCI.AddToWorklist(Elt.getNode());
	}
	Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec);
	DCI.AddToWorklist(Vec.getNode());
	Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt);
	DCI.AddToWorklist(Elt.getNode());
	return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT,
	Vec, Elt, Idx);
	}
	}

	// Push BSWAP into a vector shuffle if at least one side then simplifies.
	ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(Op);
	if (SV && Op.hasOneUse()) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Same simplification test as above, applied to the two shuffle inputs.
	if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) \|\|
	Op0.getOpcode() == ISD::BSWAP \|\| Op0.isUndef() \|\|
	DAG.isConstantIntBuildVectorOrConstantInt(Op1) \|\|
	Op1.getOpcode() == ISD::BSWAP \|\| Op1.isUndef()) {
	EVT VecVT = N->getValueType(0);
	if (VecVT != Op0.getValueType()) {
	Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0);
	DCI.AddToWorklist(Op0.getNode());
	}
	if (VecVT != Op1.getValueType()) {
	Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1);
	DCI.AddToWorklist(Op1.getNode());
	}
	Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0);
	DCI.AddToWorklist(Op0.getNode());
	Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1);
	DCI.AddToWorklist(Op1.getNode());
	// The original shuffle mask is reused unchanged on the swapped inputs.
	return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask());
	}
	}

	return SDValue();
	}

	// Try to bypass an ICMP that merely re-tests a condition code produced
	// earlier, so that the user of CCReg can test that earlier condition code
	// directly.
	//
	// CCReg, CCValid and CCMask are in/out parameters describing the CC-producing
	// value and the masks the consumer applies to it. On success they are all
	// updated to refer to the earlier CC producer and true is returned. On
	// failure false is returned and none of them has been modified.
	static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
	// We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
	// set by the CCReg instruction using the CCValid / CCMask masks,
	// If the CCReg instruction is itself a ICMP testing the condition
	// code set by some other instruction, see whether we can directly
	// use that condition code.

	// Verify that we have an ICMP against some constant.
	if (CCValid != SystemZ::CCMASK_ICMP)
	return false;
	auto *ICmp = CCReg.getNode();
	if (ICmp->getOpcode() != SystemZISD::ICMP)
	return false;
	auto *CompareLHS = ICmp->getOperand(0).getNode();
	auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
	if (!CompareRHS)
	return false;

	// Optimize the case where CompareLHS is a SELECT_CCMASK.
	if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
	// Verify that we have an appropriate mask for a EQ or NE comparison.
	// Invert tracks whether the select's own mask must be flipped; it is
	// toggled once for NE and once more if the comparison is against the
	// select's false value.
	bool Invert = false;
	if (CCMask == SystemZ::CCMASK_CMP_NE)
	Invert = !Invert;
	else if (CCMask != SystemZ::CCMASK_CMP_EQ)
	return false;

	// Verify that the ICMP compares against one of select values.
	auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
	if (!TrueVal)
	return false;
	auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
	if (!FalseVal)
	return false;
	if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
	Invert = !Invert;
	else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
	return false;

	// Compute the effective CC mask for the new branch or select.
	// Operands 2 and 3 of the SELECT_CCMASK are its CCValid and CCMask.
	auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
	auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
	if (!NewCCValid \|\| !NewCCMask)
	return false;
	CCValid = NewCCValid->getZExtValue();
	CCMask = NewCCMask->getZExtValue();
	if (Invert)
	CCMask ^= CCValid;

	// Return the updated CCReg link.
	CCReg = CompareLHS->getOperand(4);
	return true;
	}

	// Optimize the case where CompareLHS is (SRA (SHL (IPM))).
	if (CompareLHS->getOpcode() == ISD::SRA) {
	// Both shift amounts must be the exact constants checked below.
	auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
	if (!SRACount \|\| SRACount->getZExtValue() != 30)
	return false;
	auto *SHL = CompareLHS->getOperand(0).getNode();
	if (SHL->getOpcode() != ISD::SHL)
	return false;
	auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
	if (!SHLCount \|\| SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
	return false;
	auto *IPM = SHL->getOperand(0).getNode();
	if (IPM->getOpcode() != SystemZISD::IPM)
	return false;

	// Avoid introducing CC spills (because SRA would clobber CC).
	if (!CompareLHS->hasOneUse())
	return false;
	// Verify that the ICMP compares against zero.
	if (CompareRHS->getZExtValue() != 0)
	return false;

	// Compute the effective CC mask for the new branch or select.
	// EQ and NE are kept as they are; LT/GT and LE/GE are swapped. Any other
	// mask is rejected. CCValid is left unchanged in this case.
	switch (CCMask) {
	case SystemZ::CCMASK_CMP_EQ: break;
	case SystemZ::CCMASK_CMP_NE: break;
	case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break;
	case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break;
	case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break;
	case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break;
	default: return false;
	}

	// Return the updated CCReg link.
	CCReg = IPM->getOperand(0);
	return true;
	}

	return false;
	}

	// DAG combine for SystemZISD::BR_CCMASK.
	//
	// If combineCCMask() can redirect the branch to an earlier condition code,
	// a new BR_CCMASK using the updated CCValid / CCMask / CC operands is
	// returned. The chain (operand 0) and operand 3 are carried over unchanged.
	// Returns an empty SDValue when no simplification applies.
	SDValue SystemZTargetLowering::combineBR_CCMASK(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
	  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(2));
	  if (!(ValidC && MaskC))
	    return SDValue();

	  int NewValid = ValidC->getZExtValue();
	  int NewMask = MaskC->getZExtValue();
	  SDValue NewCC = N->getOperand(4);
	  if (!combineCCMask(NewCC, NewValid, NewMask))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Chain = N->getOperand(0);
	  return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
	                     Chain,
	                     DAG.getConstant(NewValid, SDLoc(N), MVT::i32),
	                     DAG.getConstant(NewMask, SDLoc(N), MVT::i32),
	                     N->getOperand(3), NewCC);
	}

	// DAG combine for SystemZISD::SELECT_CCMASK.
	//
	// If combineCCMask() can redirect the select to an earlier condition code,
	// a new SELECT_CCMASK using the updated CCValid / CCMask / CC operands is
	// returned. The two selected values (operands 0 and 1) are carried over
	// unchanged. Returns an empty SDValue when no simplification applies.
	SDValue SystemZTargetLowering::combineSELECT_CCMASK(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  // Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
	  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(2));
	  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(3));
	  if (!(ValidC && MaskC))
	    return SDValue();

	  int NewValid = ValidC->getZExtValue();
	  int NewMask = MaskC->getZExtValue();
	  SDValue NewCC = N->getOperand(4);
	  if (!combineCCMask(NewCC, NewValid, NewMask))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0),
	                     N->getOperand(1),
	                     DAG.getConstant(NewValid, SDLoc(N), MVT::i32),
	                     DAG.getConstant(NewMask, SDLoc(N), MVT::i32),
	                     NewCC);
	}


	// DAG combine for SystemZISD::GET_CCMASK.
	//
	// When the input is a SELECT_CCMASK choosing between a zero and a non-zero
	// constant, and its masks are compatible with the ones requested by N, the
	// GET_CCMASK is replaced by the CC operand of that select. Returns an empty
	// SDValue otherwise.
	SDValue SystemZTargetLowering::combineGET_CCMASK(
	    SDNode *N, DAGCombinerInfo &DCI) const {

	  // Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible
	  auto *ValidC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(2));
	  if (!(ValidC && MaskC))
	    return SDValue();
	  int CCValidVal = ValidC->getZExtValue();
	  int CCMaskVal = MaskC->getZExtValue();

	  SDValue Select = N->getOperand(0);
	  if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
	    return SDValue();

	  auto *SelValidC = dyn_cast<ConstantSDNode>(Select->getOperand(2));
	  auto *SelMaskC = dyn_cast<ConstantSDNode>(Select->getOperand(3));
	  if (!(SelValidC && SelMaskC))
	    return SDValue();
	  int SelectCCValidVal = SelValidC->getZExtValue();
	  int SelectCCMaskVal = SelMaskC->getZExtValue();

	  auto *TrueC = dyn_cast<ConstantSDNode>(Select->getOperand(0));
	  auto *FalseC = dyn_cast<ConstantSDNode>(Select->getOperand(1));
	  if (!(TrueC && FalseC))
	    return SDValue();

	  // Exactly one of the two selected constants must be non-zero. If it is the
	  // false value, the select's mask has to be inverted within its valid bits.
	  const bool TrueIsNonZero = TrueC->getZExtValue() != 0;
	  const bool FalseIsNonZero = FalseC->getZExtValue() != 0;
	  if (TrueIsNonZero == FalseIsNonZero)
	    return SDValue();
	  if (FalseIsNonZero)
	    SelectCCMaskVal ^= SelectCCValidVal;

	  // The select's valid bits must be a subset of the requested valid bits,
	  // and the masks must agree on those bits.
	  if ((SelectCCValidVal & ~CCValidVal) != 0)
	    return SDValue();
	  if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
	    return SDValue();

	  return Select->getOperand(4);
	}

	// DAG combine for ISD::SDIV / UDIV / SREM / UREM.
	//
	// Before type legalization, a division or remainder on a legal vector type
	// whose divisor is a constant is unrolled into scalar operations. Returns an
	// empty SDValue in every other situation.
	SDValue SystemZTargetLowering::combineIntDIVREM(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  // In the case where the divisor is a vector of constants a cheaper
	  // sequence of instructions can replace the divide. BuildSDIV is called to
	  // do this during DAG combining, but it only succeeds when it can build a
	  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
	  // since it is not Legal but Custom it can only happen before
	  // legalization. Therefore we must scalarize this early before Combine
	  // 1. For widened vectors, this is already the result of type legalization.
	  if (DCI.Level != BeforeLegalizeTypes)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();
	  if (!isTypeLegal(VT))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
	    return SDValue();

	  return DAG.UnrollVectorOp(N);
	}

	// Strip a SystemZISD::PCREL_WRAPPER from N, returning the wrapped operand.
	// Any other node is returned unchanged.
	SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
	  const bool IsWrapped = N->getOpcode() == SystemZISD::PCREL_WRAPPER;
	  return IsWrapped ? N->getOperand(0) : N;
	}

	// Entry point for target-specific DAG combines: dispatches on the opcode of N
	// to the matching combine* helper. Opcodes without a helper yield an empty
	// SDValue.
	SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  const unsigned Opcode = N->getOpcode();
	  switch (Opcode) {
	  // Integer extensions.
	  case ISD::ZERO_EXTEND:
	    return combineZERO_EXTEND(N, DCI);
	  case ISD::SIGN_EXTEND:
	    return combineSIGN_EXTEND(N, DCI);
	  case ISD::SIGN_EXTEND_INREG:
	    return combineSIGN_EXTEND_INREG(N, DCI);

	  // Memory accesses.
	  case ISD::LOAD:
	    return combineLOAD(N, DCI);
	  case ISD::STORE:
	    return combineSTORE(N, DCI);

	  // Vector shuffling, merging and element access.
	  case SystemZISD::MERGE_HIGH:
	  case SystemZISD::MERGE_LOW:
	    return combineMERGE(N, DCI);
	  case ISD::VECTOR_SHUFFLE:
	    return combineVECTOR_SHUFFLE(N, DCI);
	  case ISD::EXTRACT_VECTOR_ELT:
	    return combineEXTRACT_VECTOR_ELT(N, DCI);
	  case SystemZISD::JOIN_DWORDS:
	    return combineJOIN_DWORDS(N, DCI);

	  // Floating-point conversions.
	  case ISD::FP_ROUND:
	    return combineFP_ROUND(N, DCI);
	  case ISD::FP_EXTEND:
	    return combineFP_EXTEND(N, DCI);

	  // Byte swaps.
	  case ISD::BSWAP:
	    return combineBSWAP(N, DCI);

	  // Condition-code consumers.
	  case SystemZISD::BR_CCMASK:
	    return combineBR_CCMASK(N, DCI);
	  case SystemZISD::SELECT_CCMASK:
	    return combineSELECT_CCMASK(N, DCI);
	  case SystemZISD::GET_CCMASK:
	    return combineGET_CCMASK(N, DCI);

	  // Integer division and remainder.
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::SREM:
	  case ISD::UREM:
	    return combineIntDIVREM(N, DCI);

	  default:
	    return SDValue();
	  }
	}

	// Return the demanded elements for the OpNo source operand of Op. DemandedElts
	// are for Op.
	//
	// Op must be either an ISD::INTRINSIC_WO_CHAIN node for one of the s390
	// vector intrinsics handled below, or a JOIN_DWORDS / SELECT_CCMASK node;
	// any other node reaches llvm_unreachable. For a scalar Op the element count
	// is taken to be 1. The width of the returned APInt is the element count of
	// the source operand, which may differ from the width of DemandedElts (pack
	// and unpack change the number of elements).
	static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
	                                    unsigned OpNo) {
	  EVT VT = Op.getValueType();
	  unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
	  APInt SrcDemE;
	  unsigned Opcode = Op.getOpcode();
	  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
	    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	    switch (Id) {
	    case Intrinsic::s390_vpksh:   // PACKS
	    case Intrinsic::s390_vpksf:
	    case Intrinsic::s390_vpksg:
	    case Intrinsic::s390_vpkshs:  // PACKS_CC
	    case Intrinsic::s390_vpksfs:
	    case Intrinsic::s390_vpksgs:
	    case Intrinsic::s390_vpklsh:  // PACKLS
	    case Intrinsic::s390_vpklsf:
	    case Intrinsic::s390_vpklsg:
	    case Intrinsic::s390_vpklshs: // PACKLS_CC
	    case Intrinsic::s390_vpklsfs:
	    case Intrinsic::s390_vpklsgs:
	      // VECTOR PACK truncates the elements of two source vectors into one.
	      // Each source has NumElts / 2 elements; operand 2 corresponds to the
	      // upper half of DemandedElts, the other operand to the lower half.
	      SrcDemE = DemandedElts;
	      if (OpNo == 2)
	        SrcDemE.lshrInPlace(NumElts / 2);
	      SrcDemE = SrcDemE.trunc(NumElts / 2);
	      break;
	      // VECTOR UNPACK extends half the elements of the source vector.
	      // The source has NumElts * 2 elements; the demanded bits are placed at
	      // bit 0 for the HIGH forms and at bit NumElts for the LOW forms.
	    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
	    case Intrinsic::s390_vuphh:
	    case Intrinsic::s390_vuphf:
	    case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
	    case Intrinsic::s390_vuplhh:
	    case Intrinsic::s390_vuplhf:
	      SrcDemE = APInt(NumElts * 2, 0);
	      SrcDemE.insertBits(DemandedElts, 0);
	      break;
	    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
	    case Intrinsic::s390_vuplhw:
	    case Intrinsic::s390_vuplf:
	    case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
	    case Intrinsic::s390_vupllh:
	    case Intrinsic::s390_vupllf:
	      SrcDemE = APInt(NumElts * 2, 0);
	      SrcDemE.insertBits(DemandedElts, NumElts);
	      break;
	    case Intrinsic::s390_vpdi: {
	      // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
	      SrcDemE = APInt(NumElts, 0);
	      // Result element OpNo - 1 comes from source operand OpNo; if it is not
	      // demanded, nothing of this source is.
	      if (!DemandedElts[OpNo - 1])
	        break;
	      unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	      unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
	      // Demand input element 0 or 1, given by the mask bit value.
	      SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
	      break;
	    }
	    case Intrinsic::s390_vsldb: {
	      // VECTOR SHIFT LEFT DOUBLE BY BYTE
	      assert(VT == MVT::v16i8 && "Unexpected type.");
	      unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	      assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
	      // The low NumSrc0Els result elements map to operand 1 starting at
	      // element FirstIdx; the remaining ones map to operand 2 starting at 0.
	      unsigned NumSrc0Els = 16 - FirstIdx;
	      SrcDemE = APInt(NumElts, 0);
	      if (OpNo == 1) {
	        APInt DemEls = DemandedElts.trunc(NumSrc0Els);
	        SrcDemE.insertBits(DemEls, FirstIdx);
	      } else {
	        APInt DemEls = DemandedElts.lshr(NumSrc0Els);
	        SrcDemE.insertBits(DemEls, 0);
	      }
	      break;
	    }
	    case Intrinsic::s390_vperm:
	      // A permute may read any source element, so every element has to be
	      // demanded. APInt(NumElts, 1) would be the *value* 1, i.e. only
	      // element 0, which under-reports the demanded set.
	      SrcDemE = APInt::getAllOnesValue(NumElts);
	      break;
	    default:
	      llvm_unreachable("Unhandled intrinsic.");
	      break;
	    }
	  } else {
	    switch (Opcode) {
	    case SystemZISD::JOIN_DWORDS:
	      // Scalar operand.
	      SrcDemE = APInt(1, 1);
	      break;
	    case SystemZISD::SELECT_CCMASK:
	      // The demanded elements are passed through unchanged to the operand.
	      SrcDemE = DemandedElts;
	      break;
	    default:
	      llvm_unreachable("Unhandled opcode.");
	      break;
	    }
	  }
	  return SrcDemE;
	}

	static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG, unsigned Depth,
	unsigned OpNo) {
	APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
	APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
	KnownBits LHSKnown =
	DAG.computeKnownBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
	KnownBits RHSKnown =
	DAG.computeKnownBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
	Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
	Known.One = LHSKnown.One & RHSKnown.One;
	}

	// Compute the known-zero / known-one bits of a target-specific node Op for
	// the elements selected by DemandedElts, storing the result in Known.
	//
	// Handled nodes are the CC result of intrinsics that set CC, a set of s390
	// pack / permute / shift / unpack intrinsics, and the JOIN_DWORDS,
	// SELECT_CCMASK and REPLICATE target nodes. For anything else Known is left
	// with nothing known.
	void
	SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	Known.resetAll();

	// Intrinsic CC result is returned in the two low bits.
	unsigned tmp0, tmp1; // not used
	if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
	Known.Zero.setBitsFrom(2);
	return;
	}
	// Only the first result of a typed node is analysed beyond this point.
	EVT VT = Op.getValueType();
	if (Op.getResNo() != 0 \|\| VT == MVT::Untyped)
	return;
	assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
	"KnownBits does not match VT in bitwidth");
	assert ((!VT.isVector() \|\|
	(DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
	"DemandedElts does not match VT number of elements");
	unsigned BitWidth = Known.getBitWidth();
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
	// IsLogical selects zero extension (logical unpacks) rather than sign
	// extension of the source's known bits.
	bool IsLogical = false;
	unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	switch (Id) {
	case Intrinsic::s390_vpksh: // PACKS
	case Intrinsic::s390_vpksf:
	case Intrinsic::s390_vpksg:
	case Intrinsic::s390_vpkshs: // PACKS_CC
	case Intrinsic::s390_vpksfs:
	case Intrinsic::s390_vpksgs:
	case Intrinsic::s390_vpklsh: // PACKLS
	case Intrinsic::s390_vpklsf:
	case Intrinsic::s390_vpklsg:
	case Intrinsic::s390_vpklshs: // PACKLS_CC
	case Intrinsic::s390_vpklsfs:
	case Intrinsic::s390_vpklsgs:
	case Intrinsic::s390_vpdi:
	case Intrinsic::s390_vsldb:
	case Intrinsic::s390_vperm:
	// The two data sources are operands 1 and 2 (operand 0 is the intrinsic
	// id).
	computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
	break;
	case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
	case Intrinsic::s390_vuplhh:
	case Intrinsic::s390_vuplhf:
	case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
	case Intrinsic::s390_vupllh:
	case Intrinsic::s390_vupllf:
	IsLogical = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::s390_vuphb: // VECTOR UNPACK HIGH
	case Intrinsic::s390_vuphh:
	case Intrinsic::s390_vuphf:
	case Intrinsic::s390_vuplb: // VECTOR UNPACK LOW
	case Intrinsic::s390_vuplhw:
	case Intrinsic::s390_vuplf: {
	SDValue SrcOp = Op.getOperand(1);
	// NOTE(review): OpNo 0 is passed here while ComputeNumSignBitsForTargetNode
	// passes 1 for the same intrinsics; getDemandedSrcElements does not read
	// OpNo for the unpack cases, so the result is the same either way.
	APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
	Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
	if (IsLogical) {
	Known = Known.zext(BitWidth, true);
	} else
	Known = Known.sext(BitWidth);
	break;
	}
	default:
	break;
	}
	} else {
	switch (Opcode) {
	case SystemZISD::JOIN_DWORDS:
	case SystemZISD::SELECT_CCMASK:
	// Both nodes take their two data sources as operands 0 and 1.
	computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
	break;
	case SystemZISD::REPLICATE: {
	SDValue SrcOp = Op.getOperand(0);
	Known = DAG.computeKnownBits(SrcOp, Depth + 1);
	if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
	Known = Known.sext(BitWidth); // VREPI sign extends the immediate.
	break;
	}
	default:
	break;
	}
	}

	// Known has the width of the source operand(s). Adjust if needed to match
	// the passed bitwidth.
	if (Known.getBitWidth() != BitWidth)
	Known = Known.zextOrTrunc(BitWidth, false);
	}

	// Compute the number of sign bits of a node whose result is drawn from two
	// source operands, OpNo and OpNo + 1. The answer is the smaller of the two
	// sources' counts; when the sources are wider than the result (a pack), the
	// bits that are dropped are subtracted, with a floor of 1.
	static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
	                                        const SelectionDAG &DAG, unsigned Depth,
	                                        unsigned OpNo) {
	  SDValue First = Op.getOperand(OpNo);
	  APInt FirstDemanded = getDemandedSrcElements(Op, DemandedElts, OpNo);
	  unsigned FirstBits = DAG.ComputeNumSignBits(First, FirstDemanded, Depth + 1);
	  if (FirstBits == 1)
	    return 1; // Early out.

	  SDValue Second = Op.getOperand(OpNo + 1);
	  APInt SecondDemanded = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
	  unsigned SecondBits =
	      DAG.ComputeNumSignBits(Second, SecondDemanded, Depth + 1);
	  if (SecondBits == 1)
	    return 1; // Early out.

	  unsigned Common = std::min(FirstBits, SecondBits);
	  unsigned SrcBitWidth = First.getScalarValueSizeInBits();
	  unsigned VTBits = Op.getValueType().getScalarSizeInBits();

	  if (SrcBitWidth > VTBits) { // PACK
	    unsigned SrcExtraBits = SrcBitWidth - VTBits;
	    return Common > SrcExtraBits ? Common - SrcExtraBits : 1;
	  }

	  assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
	  return Common;
	}

	// Compute the number of sign bits of a target-specific node Op for the
	// elements selected by DemandedElts. Only the first result of the node is
	// analysed; every unhandled case conservatively returns 1.
	unsigned
	SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  if (Op.getResNo() != 0)
	    return 1;

	  const unsigned Opcode = Op.getOpcode();

	  // SELECT_CCMASK takes its two data sources as operands 0 and 1.
	  if (Opcode == SystemZISD::SELECT_CCMASK)
	    return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);

	  if (Opcode != ISD::INTRINSIC_WO_CHAIN)
	    return 1;

	  const unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  switch (Id) {
	  case Intrinsic::s390_vpksh:   // PACKS
	  case Intrinsic::s390_vpksf:
	  case Intrinsic::s390_vpksg:
	  case Intrinsic::s390_vpkshs:  // PACKS_CC
	  case Intrinsic::s390_vpksfs:
	  case Intrinsic::s390_vpksgs:
	  case Intrinsic::s390_vpklsh:  // PACKLS
	  case Intrinsic::s390_vpklsf:
	  case Intrinsic::s390_vpklsg:
	  case Intrinsic::s390_vpklshs: // PACKLS_CC
	  case Intrinsic::s390_vpklsfs:
	  case Intrinsic::s390_vpklsgs:
	  case Intrinsic::s390_vpdi:
	  case Intrinsic::s390_vsldb:
	  case Intrinsic::s390_vperm:
	    // The two data sources are operands 1 and 2.
	    return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);

	  case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
	  case Intrinsic::s390_vuphh:
	  case Intrinsic::s390_vuphf:
	  case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
	  case Intrinsic::s390_vuplhw:
	  case Intrinsic::s390_vuplf: {
	    // The result gains as many sign bits as the element is widened by.
	    SDValue PackedOp = Op.getOperand(1);
	    APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
	    unsigned SrcSignBits = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
	    unsigned VTBits = Op.getValueType().getScalarSizeInBits();
	    unsigned SrcBits = PackedOp.getScalarValueSizeInBits();
	    return SrcSignBits + (VTBits - SrcBits);
	  }

	  default:
	    return 1;
	  }
	}

	//===----------------------------------------------------------------------===//
	// Custom insertion
	//===----------------------------------------------------------------------===//

	// Create a new basic block after MBB.
	//
	// The new block is created for the same IR basic block as MBB and inserted
	// into MBB's parent function immediately after MBB. No instructions or
	// successor edges are added here. Returns the new block.
	static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
	  MachineFunction &MF = *MBB->getParent();
	  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
	  MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
	  return NewMBB;
	}

	// Split MBB right after MI. Everything following MI is moved into a freshly
	// created block placed after MBB, which also takes over MBB's successors
	// (with PHIs updated). The new block is returned.
	static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
	                                          MachineBasicBlock *MBB) {
	  MachineBasicBlock *Tail = emitBlockAfter(MBB);
	  MachineBasicBlock::iterator FirstMoved =
	      std::next(MachineBasicBlock::iterator(MI));
	  Tail->splice(Tail->begin(), MBB, FirstMoved, MBB->end());
	  Tail->transferSuccessorsAndUpdatePHIs(MBB);
	  return Tail;
	}

	// Split MBB right before MI. MI and everything after it are moved into a
	// freshly created block placed after MBB, which also takes over MBB's
	// successors (with PHIs updated). The new block, now holding MI, is returned.
	static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
	                                           MachineBasicBlock *MBB) {
	  MachineBasicBlock *Tail = emitBlockAfter(MBB);
	  MachineBasicBlock::iterator InsertAt = Tail->begin();
	  Tail->splice(InsertAt, MBB, MI, MBB->end());
	  Tail->transferSuccessorsAndUpdatePHIs(MBB);
	  return Tail;
	}

	// Make sure the base value Base lives in a register and return that register.
	// A register operand is returned as is. Otherwise an LA instruction that
	// materializes Base into a new ADDR64 virtual register is inserted before MI.
	static Register forceReg(MachineInstr &MI, MachineOperand &Base,
	                         const SystemZInstrInfo *TII) {
	  if (Base.isReg())
	    return Base.getReg();

	  MachineBasicBlock *ParentMBB = MI.getParent();
	  MachineRegisterInfo &MRI = ParentMBB->getParent()->getRegInfo();
	  Register AddrReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);

	  // LA operands after the destination: Base, immediate 0, register 0
	  // (presumably displacement and "no index register" -- confirm against the
	  // LA instruction definition).
	  const DebugLoc &DL = MI.getDebugLoc();
	  BuildMI(*ParentMBB, MI, DL, TII->get(SystemZ::LA), AddrReg)
	      .add(Base)
	      .addImm(0)
	      .addReg(0);
	  return AddrReg;
	}

	// The CC operand of MI might be missing a kill marker because there
	// were multiple uses of CC, and ISel didn't know which to mark.
	// Figure out whether MI should have had a kill marker.
	//
	// Returns false if CC is read after MI in MBB before being redefined, or if
	// the end of MBB is reached without a redefinition and CC is live into a
	// successor. Returns true otherwise.
	static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
	  // Scan forward through BB for a use/def of CC.
	  bool RedefinedInBlock = false;
	  MachineBasicBlock::iterator Cur = std::next(MachineBasicBlock::iterator(MI));
	  const MachineBasicBlock::iterator End = MBB->end();
	  while (Cur != End) {
	    const MachineInstr &Inst = *Cur;
	    if (Inst.readsRegister(SystemZ::CC))
	      return false;
	    if (Inst.definesRegister(SystemZ::CC)) {
	      RedefinedInBlock = true; // Should have kill-flag.
	      break;
	    }
	    ++Cur;
	  }

	  // CC was redefined before any further read: MI is the last user.
	  if (RedefinedInBlock)
	    return true;

	  // We hit the end of the block; CC must not be live into any successor.
	  for (auto Succ = MBB->succ_begin(), SuccEnd = MBB->succ_end();
	       Succ != SuccEnd; ++Succ) {
	    if ((*Succ)->isLiveIn(SystemZ::CC))
	      return false;
	  }
	  return true;
	}

	// Tell whether MI is one of the Select pseudo-opcodes that may be cascaded
	// together with other Select pseudo-opcodes into a single basic-block with
	// a conditional jump around it.
	static bool isSelectPseudo(MachineInstr &MI) {
	  const unsigned Opc = MI.getOpcode();
	  switch (Opc) {
	  // Integer selects.
	  case SystemZ::Select32:
	  case SystemZ::Select64:
	  // Floating-point selects.
	  case SystemZ::SelectF32:
	  case SystemZ::SelectF64:
	  case SystemZ::SelectF128:
	  // Vector-register selects.
	  case SystemZ::SelectVR32:
	  case SystemZ::SelectVR64:
	  case SystemZ::SelectVR128:
	    return true;
	  }
	  return false;
	}

	// Insert one PHI at the start of SinkMBB for every Select in the range
	// [MIItBegin, MIItEnd), skipping debug instructions:
	// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) come from the i-th Select. The CC
	// masks of the first Select define the "true" sense; a Select using the
	// opposite mask gets its inputs swapped. The function's NoPHIs property is
	// cleared afterwards.
	static void createPHIsForSelects(MachineBasicBlock::iterator MIItBegin,
	                                 MachineBasicBlock::iterator MIItEnd,
	                                 MachineBasicBlock *TrueMBB,
	                                 MachineBasicBlock *FalseMBB,
	                                 MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

	  const unsigned CCValid = MIItBegin->getOperand(3).getImm();
	  const unsigned CCMask = MIItBegin->getOperand(4).getImm();
	  const unsigned InvertedMask = CCValid ^ CCMask;
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  MachineBasicBlock::iterator InsertPt = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later Selects may reference the results of earlier Selects, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

	  MachineBasicBlock::iterator Sel = MIItBegin;
	  while (Sel != MIItEnd) {
	    unsigned DestReg = Sel->getOperand(0).getReg();
	    unsigned TrueReg = Sel->getOperand(1).getReg();
	    unsigned FalseReg = Sel->getOperand(2).getReg();

	    // If this Select we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (Sel->getOperand(4).getImm() == InvertedMask)
	      std::swap(TrueReg, FalseReg);

	    // Look through results of earlier PHIs to their original inputs.
	    auto TrueEntry = RegRewriteTable.find(TrueReg);
	    if (TrueEntry != RegRewriteTable.end())
	      TrueReg = TrueEntry->second.first;

	    auto FalseEntry = RegRewriteTable.find(FalseReg);
	    if (FalseEntry != RegRewriteTable.end())
	      FalseReg = FalseEntry->second.second;

	    BuildMI(*SinkMBB, InsertPt, DL, TII->get(SystemZ::PHI), DestReg)
	        .addReg(TrueReg).addMBB(TrueMBB)
	        .addReg(FalseReg).addMBB(FalseMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);

	    Sel = skipDebugInstructionsForward(++Sel, MIItEnd);
	  }

	  MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
	}

	// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
	//
	// MBB is split before MI into StartMBB (the original block) and JoinMBB, and
	// an empty FalseMBB is placed between them. StartMBB conditionally branches
	// to JoinMBB and otherwise falls through FalseMBB into JoinMBB, where the
	// Select(s) are replaced by PHIs. Consecutive Select pseudos that use the same
	// CCValid and the same or the inverted CCMask are expanded together. Returns
	// JoinMBB.
	MachineBasicBlock *
	SystemZTargetLowering::emitSelect(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	const SystemZInstrInfo *TII =
	static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());

	unsigned CCValid = MI.getOperand(3).getImm();
	unsigned CCMask = MI.getOperand(4).getImm();
	DebugLoc DL = MI.getDebugLoc();

	// If we have a sequence of Select* pseudo instructions using the
	// same condition code value, we want to expand all of them into
	// a single pair of basic blocks using the same condition.
	MachineInstr *LastMI = &MI;
	MachineBasicBlock::iterator NextMIIt = skipDebugInstructionsForward(
	std::next(MachineBasicBlock::iterator(MI)), MBB->end());

	// Extend the group only while the next non-debug instruction is a cascadable
	// Select with matching CCValid and an equal or inverted CCMask.
	if (isSelectPseudo(MI))
	while (NextMIIt != MBB->end() && isSelectPseudo(*NextMIIt) &&
	NextMIIt->getOperand(3).getImm() == CCValid &&
	(NextMIIt->getOperand(4).getImm() == CCMask \|\|
	NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask))) {
	LastMI = &*NextMIIt;
	NextMIIt = skipDebugInstructionsForward(++NextMIIt, MBB->end());
	}

	// After the split, MI and everything following it live in JoinMBB.
	MachineBasicBlock *StartMBB = MBB;
	MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
	MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);

	// Unless CC was killed in the last Select instruction, mark it as
	// live-in to both FalseMBB and JoinMBB.
	if (!LastMI->killsRegister(SystemZ::CC) && !checkCCKill(*LastMI, JoinMBB)) {
	FalseMBB->addLiveIn(SystemZ::CC);
	JoinMBB->addLiveIn(SystemZ::CC);
	}

	// StartMBB:
	// BRC CCMask, JoinMBB
	// # fallthrough to FalseMBB
	MBB = StartMBB;
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
	MBB->addSuccessor(JoinMBB);
	MBB->addSuccessor(FalseMBB);

	// FalseMBB:
	// # fallthrough to JoinMBB
	MBB = FalseMBB;
	MBB->addSuccessor(JoinMBB);

	// JoinMBB:
	// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
	// ...
	MBB = JoinMBB;
	MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	MachineBasicBlock::iterator MIItEnd = skipDebugInstructionsForward(
	std::next(MachineBasicBlock::iterator(LastMI)), MBB->end());
	// StartMBB plays the role of the "true" predecessor for the PHIs.
	createPHIsForSelects(MIItBegin, MIItEnd, StartMBB, FalseMBB, MBB);

	// Remove the expanded Select pseudos (the whole [MIItBegin, MIItEnd) range).
	// NOTE(review): the range was spliced into JoinMBB by splitBlockBefore, yet
	// erase is invoked on StartMBB -- confirm this is intended.
	StartMBB->erase(MIItBegin, MIItEnd);
	return JoinMBB;
	}

	// Custom-inserter expansion of a CondStore* pseudo instruction MI.
	//
	// StoreOpcode is the plain store to emit; it is adjusted for the displacement
	// through getOpcodeForOffset. STOCOpcode is the STORE ON CONDITION opcode to
	// prefer, or 0 when there is none. Invert asks for the store to happen when
	// the condition is false rather than true.
	//
	// Returns MBB itself when a single STORE ON CONDITION was emitted, otherwise
	// the join block obtained from splitBlockBefore.
	MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
	                                                        MachineBasicBlock *MBB,
	                                                        unsigned StoreOpcode,
	                                                        unsigned STOCOpcode,
	                                                        bool Invert) const {
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());

	  // Pull everything we need out of the pseudo before it is erased.
	  unsigned ValueReg = MI.getOperand(0).getReg();
	  MachineOperand BaseOp = MI.getOperand(1);
	  int64_t Offset = MI.getOperand(2).getImm();
	  unsigned IdxReg = MI.getOperand(3).getReg();
	  unsigned ValidBits = MI.getOperand(4).getImm();
	  unsigned Mask = MI.getOperand(5).getImm();
	  DebugLoc Loc = MI.getDebugLoc();

	  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Offset);

	  // Prefer STOCOpcode when we can. Different store patterns could avoid
	  // matching the index register, but the performance trade-offs might be
	  // more complicated in that case.
	  const bool UseStoreOnCond =
	      STOCOpcode && !IdxReg && Subtarget.hasLoadStoreOnCond();
	  if (UseStoreOnCond) {
	    unsigned StoreMask = Invert ? (Mask ^ ValidBits) : Mask;

	    // ISel pattern matching also attaches a load memory operand for the
	    // same address, so pick out the first operand that is really a store.
	    MachineMemOperand *StoreMMO = nullptr;
	    for (MachineMemOperand *Candidate : MI.memoperands()) {
	      if (!Candidate->isStore())
	        continue;
	      StoreMMO = Candidate;
	      break;
	    }

	    BuildMI(*MBB, MI, Loc, TII->get(STOCOpcode))
	        .addReg(ValueReg)
	        .add(BaseOp)
	        .addImm(Offset)
	        .addImm(ValidBits)
	        .addImm(StoreMask)
	        .addMemOperand(StoreMMO);

	    MI.eraseFromParent();
	    return MBB;
	  }

	  // The branch must skip the store, so it needs the opposite condition of
	  // the one under which the store happens.
	  unsigned SkipMask = Invert ? Mask : (Mask ^ ValidBits);

	  MachineBasicBlock *HeadMBB = MBB;
	  MachineBasicBlock *TailMBB = splitBlockBefore(MI, MBB);
	  MachineBasicBlock *StoreMBB = emitBlockAfter(HeadMBB);

	  // CC stays live into both new blocks unless the CondStore (or something
	  // checkCCKill finds in TailMBB) kills it.
	  if (!(MI.killsRegister(SystemZ::CC) || checkCCKill(MI, TailMBB))) {
	    StoreMBB->addLiveIn(SystemZ::CC);
	    TailMBB->addLiveIn(SystemZ::CC);
	  }

	  // HeadMBB:
	  //   BRC SkipMask, TailMBB
	  //   # fallthrough to StoreMBB
	  BuildMI(HeadMBB, Loc, TII->get(SystemZ::BRC))
	      .addImm(ValidBits)
	      .addImm(SkipMask)
	      .addMBB(TailMBB);
	  HeadMBB->addSuccessor(TailMBB);
	  HeadMBB->addSuccessor(StoreMBB);

	  // StoreMBB:
	  //   store %ValueReg, %Offset(%IdxReg,%BaseOp)
	  //   # fallthrough to TailMBB
	  BuildMI(StoreMBB, Loc, TII->get(StoreOpcode))
	      .addReg(ValueReg)
	      .add(BaseOp)
	      .addImm(Offset)
	      .addReg(IdxReg);
	  StoreMBB->addSuccessor(TailMBB);

	  MI.eraseFromParent();
	  return TailMBB;
	}

	// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
	// or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that
	// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
	// BitSize is the width of the field in bits, or 0 if this is a partword
	// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
	// is one of the operands. Invert says whether the field should be
	// inverted after performing BinOpcode (e.g. for NAND).
	//
	// The pseudo is replaced by a load followed by a compare-and-swap retry
	// loop. Returns DoneMBB, the block obtained from splitBlockBefore.
	MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
	    MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
	    unsigned BitSize, bool Invert) const {
	  MachineFunction &MF = *MBB->getParent();
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  // Callers pass 0 for the partword forms, so this must be computed before
	  // BitSize is overwritten with the real field width below.
	  bool IsSubWord = (BitSize < 32);

	  // Extract the operands. Base can be a register or a frame index.
	  // Src2 can be a register or immediate.
	  Register Dest = MI.getOperand(0).getReg();
	  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
	  int64_t Disp = MI.getOperand(2).getImm();
	  MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
	  Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register();
	  Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register();
	  DebugLoc DL = MI.getDebugLoc();
	  if (IsSubWord)
	    BitSize = MI.getOperand(6).getImm();

	  // Subword operations use 32-bit registers.
	  const TargetRegisterClass *RC = (BitSize <= 32 ?
	                                   &SystemZ::GR32BitRegClass :
	                                   &SystemZ::GR64BitRegClass);
	  unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
	  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;

	  // Get the right opcodes for the displacement.
	  LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
	  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
	  assert(LOpcode && CSOpcode && "Displacement out of range");

	  // Create virtual registers for temporary results. A full-word swap needs
	  // no computed value: Src2 itself is what gets stored. For full-word
	  // operations the "rotated" registers are simply aliases of the unrotated
	  // ones, because no rotation is emitted.
	  Register OrigVal = MRI.createVirtualRegister(RC);
	  Register OldVal = MRI.createVirtualRegister(RC);
	  Register NewVal = (BinOpcode || IsSubWord ?
	                     MRI.createVirtualRegister(RC) : Src2.getReg());
	  Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
	  Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);

	  // Insert a basic block for the main loop.
	  MachineBasicBlock *StartMBB = MBB;
	  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
	  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);

	  //  StartMBB:
	  //   ...
	  //   %OrigVal = L Disp(%Base)
	  //   # fall through to LoopMBB
	  MBB = StartMBB;
	  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
	  MBB->addSuccessor(LoopMBB);

	  //  LoopMBB:
	  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
	  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
	  //   %RotatedNewVal = OP %RotatedOldVal, %Src2
	  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
	  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
	  //   JNE LoopMBB
	  //   # fall through to DoneMBB
	  MBB = LoopMBB;
	  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
	    .addReg(OrigVal).addMBB(StartMBB)
	    .addReg(Dest).addMBB(LoopMBB);
	  if (IsSubWord)
	    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
	      .addReg(OldVal).addReg(BitShift).addImm(0);
	  if (Invert) {
	    // Perform the operation normally and then invert every bit of the field.
	    Register Tmp = MRI.createVirtualRegister(RC);
	    BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
	    if (BitSize <= 32)
	      // XILF with the upper BitSize bits set.
	      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
	        .addReg(Tmp).addImm(-1U << (32 - BitSize));
	    else {
	      // Use LCGR and add -1 to the result (-x - 1 == ~x), which is more
	      // compact than an XILF, XILH pair.
	      Register Tmp2 = MRI.createVirtualRegister(RC);
	      BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
	      BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
	        .addReg(Tmp2).addImm(-1);
	    }
	  } else if (BinOpcode)
	    // A simple binary operation.
	    BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
	        .addReg(RotatedOldVal)
	        .add(Src2);
	  else if (IsSubWord)
	    // Use RISBG to rotate Src2 into position and use it to replace the
	    // field in RotatedOldVal.
	    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
	      .addReg(RotatedOldVal).addReg(Src2.getReg())
	      .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
	  if (IsSubWord)
	    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
	      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
	  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
	      .addReg(OldVal)
	      .addReg(NewVal)
	      .add(Base)
	      .addImm(Disp);
	  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
	  MBB->addSuccessor(LoopMBB);
	  MBB->addSuccessor(DoneMBB);

	  MI.eraseFromParent();
	  return DoneMBB;
	}

	// Implement EmitInstrWithCustomInserter for pseudo
	// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI. CompareOpcode is the
	// instruction that should be used to compare the current field with the
	// minimum or maximum value. KeepOldMask is the BRC condition-code mask
	// for when the current field should be kept. BitSize is the width of
	// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
	//
	// The pseudo is replaced by a load followed by a three-block
	// compare-and-swap retry loop (LoopMBB, UseAltMBB, UpdateMBB). Returns
	// DoneMBB, the block obtained from splitBlockBefore.
	MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
	MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
	unsigned KeepOldMask, unsigned BitSize) const {
	MachineFunction &MF = *MBB->getParent();
	const SystemZInstrInfo *TII =
	static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	MachineRegisterInfo &MRI = MF.getRegInfo();
	// Must be computed before BitSize is replaced by the real field width below.
	bool IsSubWord = (BitSize < 32);

	// Extract the operands. Base can be a register or a frame index.
	unsigned Dest = MI.getOperand(0).getReg();
	MachineOperand Base = earlyUseOperand(MI.getOperand(1));
	int64_t Disp = MI.getOperand(2).getImm();
	Register Src2 = MI.getOperand(3).getReg();
	Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register());
	Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register());
	DebugLoc DL = MI.getDebugLoc();
	if (IsSubWord)
	BitSize = MI.getOperand(6).getImm();

	// Subword operations use 32-bit registers.
	const TargetRegisterClass *RC = (BitSize <= 32 ?
	&SystemZ::GR32BitRegClass :
	&SystemZ::GR64BitRegClass);
	unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG;
	unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;

	// Get the right opcodes for the displacement.
	LOpcode = TII->getOpcodeForOffset(LOpcode, Disp);
	CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
	assert(LOpcode && CSOpcode && "Displacement out of range");

	// Create virtual registers for temporary results. For full-word operations
	// no rotation is emitted, so the "rotated" names alias OldVal, Src2 and
	// NewVal instead of getting registers of their own.
	Register OrigVal = MRI.createVirtualRegister(RC);
	Register OldVal = MRI.createVirtualRegister(RC);
	Register NewVal = MRI.createVirtualRegister(RC);
	Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
	Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
	Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);

	// Insert 3 basic blocks for the loop.
	MachineBasicBlock *StartMBB = MBB;
	MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
	MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
	MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
	MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);

	// StartMBB:
	// ...
	// %OrigVal = L Disp(%Base)
	// # fall through to LoopMBB
	MBB = StartMBB;
	BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
	MBB->addSuccessor(LoopMBB);

	// LoopMBB:
	// %OldVal = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
	// %RotatedOldVal = RLL %OldVal, 0(%BitShift)
	// CompareOpcode %RotatedOldVal, %Src2
	// BRC KeepOldMask, UpdateMBB
	// # fall through to UseAltMBB
	MBB = LoopMBB;
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
	.addReg(OrigVal).addMBB(StartMBB)
	.addReg(Dest).addMBB(UpdateMBB);
	if (IsSubWord)
	BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
	.addReg(OldVal).addReg(BitShift).addImm(0);
	BuildMI(MBB, DL, TII->get(CompareOpcode))
	.addReg(RotatedOldVal).addReg(Src2);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
	MBB->addSuccessor(UpdateMBB);
	MBB->addSuccessor(UseAltMBB);

	// UseAltMBB:
	// %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
	// # fall through to UpdateMBB
	//
	// The block is left empty for full-word operations, where RotatedAltVal
	// is Src2 itself.
	MBB = UseAltMBB;
	if (IsSubWord)
	BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
	.addReg(RotatedOldVal).addReg(Src2)
	.addImm(32).addImm(31 + BitSize).addImm(0);
	MBB->addSuccessor(UpdateMBB);

	// UpdateMBB:
	// %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
	// [ %RotatedAltVal, UseAltMBB ]
	// %NewVal = RLL %RotatedNewVal, 0(%NegBitShift)
	// %Dest = CS %OldVal, %NewVal, Disp(%Base)
	// JNE LoopMBB
	// # fall through to DoneMBB
	MBB = UpdateMBB;
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
	.addReg(RotatedOldVal).addMBB(LoopMBB)
	.addReg(RotatedAltVal).addMBB(UseAltMBB);
	if (IsSubWord)
	BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
	.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
	BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
	.addReg(OldVal)
	.addReg(NewVal)
	.add(Base)
	.addImm(Disp);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
	MBB->addSuccessor(LoopMBB);
	MBB->addSuccessor(DoneMBB);

	MI.eraseFromParent();
	return DoneMBB;
	}

	// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
	// instruction MI.
	//
	// The pseudo is replaced by a full-word load followed by a two-block loop
	// (LoopMBB, SetMBB) that compares the field and retries the CS until it
	// succeeds or the comparison fails. Returns DoneMBB, the block obtained
	// from splitBlockBefore.
	MachineBasicBlock *
	SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
	MachineBasicBlock *MBB) const {

	MachineFunction &MF = *MBB->getParent();
	const SystemZInstrInfo *TII =
	static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	MachineRegisterInfo &MRI = MF.getRegInfo();

	// Extract the operands. Base can be a register or a frame index.
	unsigned Dest = MI.getOperand(0).getReg();
	MachineOperand Base = earlyUseOperand(MI.getOperand(1));
	int64_t Disp = MI.getOperand(2).getImm();
	unsigned OrigCmpVal = MI.getOperand(3).getReg();
	unsigned OrigSwapVal = MI.getOperand(4).getReg();
	unsigned BitShift = MI.getOperand(5).getReg();
	unsigned NegBitShift = MI.getOperand(6).getReg();
	int64_t BitSize = MI.getOperand(7).getImm();
	DebugLoc DL = MI.getDebugLoc();

	// All temporaries are 32-bit: the loop always operates on the full word.
	const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;

	// Get the right opcodes for the displacement.
	unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp);
	unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
	assert(LOpcode && CSOpcode && "Displacement out of range");

	// Create virtual registers for temporary results.
	unsigned OrigOldVal = MRI.createVirtualRegister(RC);
	unsigned OldVal = MRI.createVirtualRegister(RC);
	unsigned CmpVal = MRI.createVirtualRegister(RC);
	unsigned SwapVal = MRI.createVirtualRegister(RC);
	unsigned StoreVal = MRI.createVirtualRegister(RC);
	unsigned RetryOldVal = MRI.createVirtualRegister(RC);
	unsigned RetryCmpVal = MRI.createVirtualRegister(RC);
	unsigned RetrySwapVal = MRI.createVirtualRegister(RC);

	// Insert 2 basic blocks for the loop.
	MachineBasicBlock *StartMBB = MBB;
	MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
	MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
	MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);

	// StartMBB:
	// ...
	// %OrigOldVal = L Disp(%Base)
	// # fall through to LoopMBB
	MBB = StartMBB;
	BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
	.add(Base)
	.addImm(Disp)
	.addReg(0);
	MBB->addSuccessor(LoopMBB);

	// LoopMBB:
	// %OldVal = phi [ %OrigOldVal, StartMBB ], [ %RetryOldVal, SetMBB ]
	// %CmpVal = phi [ %OrigCmpVal, StartMBB ], [ %RetryCmpVal, SetMBB ]
	// %SwapVal = phi [ %OrigSwapVal, StartMBB ], [ %RetrySwapVal, SetMBB ]
	// %Dest = RLL %OldVal, BitSize(%BitShift)
	// ^^ The low BitSize bits contain the field
	// of interest.
	// %RetryCmpVal = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
	// ^^ Replace the upper 32-BitSize bits of the
	// comparison value with those that we loaded,
	// so that we can use a full word comparison.
	// CR %Dest, %RetryCmpVal
	// JNE DoneMBB
	// # Fall through to SetMBB
	MBB = LoopMBB;
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
	.addReg(OrigOldVal).addMBB(StartMBB)
	.addReg(RetryOldVal).addMBB(SetMBB);
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
	.addReg(OrigCmpVal).addMBB(StartMBB)
	.addReg(RetryCmpVal).addMBB(SetMBB);
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
	.addReg(OrigSwapVal).addMBB(StartMBB)
	.addReg(RetrySwapVal).addMBB(SetMBB);
	BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
	.addReg(OldVal).addReg(BitShift).addImm(BitSize);
	BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
	.addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
	BuildMI(MBB, DL, TII->get(SystemZ::CR))
	.addReg(Dest).addReg(RetryCmpVal);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_ICMP)
	.addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
	MBB->addSuccessor(DoneMBB);
	MBB->addSuccessor(SetMBB);

	// SetMBB:
	// %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
	// ^^ Replace the upper 32-BitSize bits of the new
	// value with those that we loaded.
	// %StoreVal = RLL %RetrySwapVal, -BitSize(%NegBitShift)
	// ^^ Rotate the new field to its proper position.
	// %RetryOldVal = CS %OldVal, %StoreVal, Disp(%Base)
	// ^^ The comparand is the unrotated word that was
	// loaded (%OldVal), not the rotated %Dest.
	// JNE LoopMBB
	// # fall through to DoneMBB
	MBB = SetMBB;
	BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
	.addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
	BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
	.addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
	BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
	.addReg(OldVal)
	.addReg(StoreVal)
	.add(Base)
	.addImm(Disp);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
	MBB->addSuccessor(LoopMBB);
	MBB->addSuccessor(DoneMBB);

	// If the CC def wasn't dead in the ATOMIC_CMP_SWAPW, mark CC as live-in
	// to the block after the loop. At this point, CC may have been defined
	// either by the CR in LoopMBB or by the CS in SetMBB.
	if (!MI.registerDefIsDead(SystemZ::CC))
	DoneMBB->addLiveIn(SystemZ::CC);

	MI.eraseFromParent();
	return DoneMBB;
	}

	// Expand pseudo MI into a GR128 assembled from two GR64 values: operand 1
	// goes into the subreg_h64 half and operand 2 into the subreg_l64 half of
	// operand 0. The instructions are inserted before MI in MBB, MI is erased,
	// and MBB is returned unchanged.
	MachineBasicBlock *
	SystemZTargetLowering::emitPair128(MachineInstr &MI,
	                                   MachineBasicBlock *MBB) const {
	  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  DebugLoc Loc = MI.getDebugLoc();

	  unsigned PairReg = MI.getOperand(0).getReg();
	  unsigned HighReg = MI.getOperand(1).getReg();
	  unsigned LowReg = MI.getOperand(2).getReg();

	  const TargetRegisterClass *PairRC = &SystemZ::GR128BitRegClass;
	  unsigned UndefPair = RegInfo.createVirtualRegister(PairRC);
	  unsigned HighOnlyPair = RegInfo.createVirtualRegister(PairRC);

	  // Start from an undefined pair, then fill in the high and low halves.
	  BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
	  BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::INSERT_SUBREG), HighOnlyPair)
	      .addReg(UndefPair)
	      .addReg(HighReg)
	      .addImm(SystemZ::subreg_h64);
	  BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::INSERT_SUBREG), PairReg)
	      .addReg(HighOnlyPair)
	      .addReg(LowReg)
	      .addImm(SystemZ::subreg_l64);

	  MI.eraseFromParent();
	  return MBB;
	}

	// Expand pseudo MI into an extension of a GR64 (operand 1) to a GR128
	// (operand 0), placing the source in the subreg_l64 half. When ClearEven is
	// true the subreg_h64 half is explicitly set to zero; when false it is left
	// undefined ("don't care"). MI is erased and MBB is returned unchanged.
	MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
	                                                     MachineBasicBlock *MBB,
	                                                     bool ClearEven) const {
	  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  DebugLoc Loc = MI.getDebugLoc();

	  unsigned WideReg = MI.getOperand(0).getReg();
	  unsigned NarrowReg = MI.getOperand(1).getReg();

	  // PairSoFar tracks the partially built 128-bit value.
	  unsigned PairSoFar = RegInfo.createVirtualRegister(&SystemZ::GR128BitRegClass);
	  BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::IMPLICIT_DEF), PairSoFar);

	  if (ClearEven) {
	    unsigned ClearedPair =
	        RegInfo.createVirtualRegister(&SystemZ::GR128BitRegClass);
	    unsigned ZeroReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);

	    // Materialize zero and insert it as the high half.
	    BuildMI(*MBB, MI, Loc, TII->get(SystemZ::LLILL), ZeroReg)
	        .addImm(0);
	    BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::INSERT_SUBREG), ClearedPair)
	        .addReg(PairSoFar)
	        .addReg(ZeroReg)
	        .addImm(SystemZ::subreg_h64);
	    PairSoFar = ClearedPair;
	  }

	  BuildMI(*MBB, MI, Loc, TII->get(TargetOpcode::INSERT_SUBREG), WideReg)
	      .addReg(PairSoFar)
	      .addReg(NarrowReg)
	      .addImm(SystemZ::subreg_l64);

	  MI.eraseFromParent();
	  return MBB;
	}

	// Expand memory-to-memory pseudo instruction MI into one or more Opcode
	// instructions, each covering at most 256 bytes.
	//
	// Operands 0-1 are the destination base and displacement, operands 2-3 the
	// source base and displacement, and operand 4 the total length. If MI has
	// more than five explicit operands, operand 5 is a trip-count register and
	// a loop handling 256 bytes per iteration is emitted first; the remaining
	// (Length & 255) bytes are then handled with straight-line code.
	//
	// For SystemZ::CLC with Length > 256, every CLC but the last is followed by
	// a branch to a common end block when a difference is found.
	//
	// Returns the block that ends up holding the code after MI: EndMBB when one
	// was created, otherwise the last block written to.
	MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
	MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
	MachineFunction &MF = *MBB->getParent();
	const SystemZInstrInfo *TII =
	static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	MachineRegisterInfo &MRI = MF.getRegInfo();
	DebugLoc DL = MI.getDebugLoc();

	MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
	uint64_t DestDisp = MI.getOperand(1).getImm();
	MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
	uint64_t SrcDisp = MI.getOperand(3).getImm();
	uint64_t Length = MI.getOperand(4).getImm();

	// When generating more than one CLC, all but the last will need to
	// branch to the end when a difference is found.
	MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
	splitBlockAfter(MI, MBB) : nullptr);

	// Check for the loop form, in which operand 5 is the trip count.
	if (MI.getNumExplicitOperands() > 5) {
	// With identical bases, one set of address registers serves both sides.
	bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);

	Register StartCountReg = MI.getOperand(5).getReg();
	Register StartSrcReg = forceReg(MI, SrcBase, TII);
	Register StartDestReg = (HaveSingleBase ? StartSrcReg :
	forceReg(MI, DestBase, TII));

	const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
	Register ThisSrcReg = MRI.createVirtualRegister(RC);
	Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
	MRI.createVirtualRegister(RC));
	Register NextSrcReg = MRI.createVirtualRegister(RC);
	Register NextDestReg = (HaveSingleBase ? NextSrcReg :
	MRI.createVirtualRegister(RC));

	RC = &SystemZ::GR64BitRegClass;
	Register ThisCountReg = MRI.createVirtualRegister(RC);
	Register NextCountReg = MRI.createVirtualRegister(RC);

	// NextMBB is a separate block only when the loop body can exit early to
	// EndMBB; otherwise the whole loop lives in LoopMBB.
	MachineBasicBlock *StartMBB = MBB;
	MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
	MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
	MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);

	// StartMBB:
	// # fall through to LoopMBB
	MBB->addSuccessor(LoopMBB);

	// LoopMBB:
	// %ThisDestReg = phi [ %StartDestReg, StartMBB ],
	// [ %NextDestReg, NextMBB ]
	// %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
	// [ %NextSrcReg, NextMBB ]
	// %ThisCountReg = phi [ %StartCountReg, StartMBB ],
	// [ %NextCountReg, NextMBB ]
	// ( PFD 2, 768+DestDisp(%ThisDestReg) )
	// Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
	// ( JLH EndMBB )
	//
	// The prefetch is used only for MVC. The JLH is used only for CLC.
	MBB = LoopMBB;

	BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
	.addReg(StartDestReg).addMBB(StartMBB)
	.addReg(NextDestReg).addMBB(NextMBB);
	if (!HaveSingleBase)
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
	.addReg(StartSrcReg).addMBB(StartMBB)
	.addReg(NextSrcReg).addMBB(NextMBB);
	BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
	.addReg(StartCountReg).addMBB(StartMBB)
	.addReg(NextCountReg).addMBB(NextMBB);
	if (Opcode == SystemZ::MVC)
	BuildMI(MBB, DL, TII->get(SystemZ::PFD))
	.addImm(SystemZ::PFD_WRITE)
	.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
	BuildMI(MBB, DL, TII->get(Opcode))
	.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
	.addReg(ThisSrcReg).addImm(SrcDisp);
	if (EndMBB) {
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
	.addMBB(EndMBB);
	MBB->addSuccessor(EndMBB);
	MBB->addSuccessor(NextMBB);
	}

	// NextMBB:
	// %NextDestReg = LA 256(%ThisDestReg)
	// %NextSrcReg = LA 256(%ThisSrcReg)
	// %NextCountReg = AGHI %ThisCountReg, -1
	// CGHI %NextCountReg, 0
	// JLH LoopMBB
	// # fall through to DoneMBB
	//
	// The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
	MBB = NextMBB;

	BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
	.addReg(ThisDestReg).addImm(256).addReg(0);
	if (!HaveSingleBase)
	BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
	.addReg(ThisSrcReg).addImm(256).addReg(0);
	BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
	.addReg(ThisCountReg).addImm(-1);
	BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
	.addReg(NextCountReg).addImm(0);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
	.addMBB(LoopMBB);
	MBB->addSuccessor(LoopMBB);
	MBB->addSuccessor(DoneMBB);

	// The straight-line tail continues from the addresses the loop ended on
	// and only has to cover what is left over after the 256-byte chunks.
	DestBase = MachineOperand::CreateReg(NextDestReg, false);
	SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
	Length &= 255;
	if (EndMBB && !Length)
	// If the loop handled the whole CLC range, DoneMBB will be empty with
	// CC live-through into EndMBB, so add it as live-in.
	DoneMBB->addLiveIn(SystemZ::CC);
	MBB = DoneMBB;
	}
	// Handle any remaining bytes with straight-line code.
	while (Length > 0) {
	uint64_t ThisLength = std::min(Length, uint64_t(256));
	// The previous iteration might have created out-of-range displacements.
	// Apply them using LAY if so.
	if (!isUInt<12>(DestDisp)) {
	unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
	BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
	.add(DestBase)
	.addImm(DestDisp)
	.addReg(0);
	DestBase = MachineOperand::CreateReg(Reg, false);
	DestDisp = 0;
	}
	if (!isUInt<12>(SrcDisp)) {
	unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
	BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
	.add(SrcBase)
	.addImm(SrcDisp)
	.addReg(0);
	SrcBase = MachineOperand::CreateReg(Reg, false);
	SrcDisp = 0;
	}
	BuildMI(*MBB, MI, DL, TII->get(Opcode))
	.add(DestBase)
	.addImm(DestDisp)
	.addImm(ThisLength)
	.add(SrcBase)
	.addImm(SrcDisp)
	.setMemRefs(MI.memoperands());
	DestDisp += ThisLength;
	SrcDisp += ThisLength;
	Length -= ThisLength;
	// If there's another CLC to go, branch to the end if a difference
	// was found.
	if (EndMBB && Length > 0) {
	MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
	BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
	.addMBB(EndMBB);
	MBB->addSuccessor(EndMBB);
	MBB->addSuccessor(NextMBB);
	MBB = NextMBB;
	}
	}
	// The final block falls through into EndMBB, which receives CC from
	// whichever compare reached it.
	if (EndMBB) {
	MBB->addSuccessor(EndMBB);
	MBB = EndMBB;
	MBB->addLiveIn(SystemZ::CC);
	}

	MI.eraseFromParent();
	return MBB;
	}

	// Decompose string pseudo-instruction MI into a loop that continually performs
	// Opcode until CC != 3.
	//
	// Operand 0 is the register defined with the first end address, operands 1
	// and 2 are the two start-address registers, and operand 3 is the register
	// copied into R0L before each Opcode. Returns DoneMBB, the block obtained
	// from splitBlockBefore, with CC added as live-in.
	MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
	    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
	  MachineFunction &MF = *MBB->getParent();
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // Register numbers are held as Register, like elsewhere in this file,
	  // rather than as 64-bit integers.
	  Register End1Reg = MI.getOperand(0).getReg();
	  Register Start1Reg = MI.getOperand(1).getReg();
	  Register Start2Reg = MI.getOperand(2).getReg();
	  Register CharReg = MI.getOperand(3).getReg();

	  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
	  Register This1Reg = MRI.createVirtualRegister(RC);
	  Register This2Reg = MRI.createVirtualRegister(RC);
	  Register End2Reg = MRI.createVirtualRegister(RC);

	  MachineBasicBlock *StartMBB = MBB;
	  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
	  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);

	  //  StartMBB:
	  //   # fall through to LoopMBB
	  MBB->addSuccessor(LoopMBB);

	  //  LoopMBB:
	  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
	  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
	  //   R0L = %CharReg
	  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
	  //   JO LoopMBB
	  //   # fall through to DoneMBB
	  //
	  // The load of R0L can be hoisted by post-RA LICM.
	  MBB = LoopMBB;

	  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
	    .addReg(Start1Reg).addMBB(StartMBB)
	    .addReg(End1Reg).addMBB(LoopMBB);
	  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
	    .addReg(Start2Reg).addMBB(StartMBB)
	    .addReg(End2Reg).addMBB(LoopMBB);
	  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
	  BuildMI(MBB, DL, TII->get(Opcode))
	    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
	    .addReg(This1Reg).addReg(This2Reg);
	  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
	    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
	  MBB->addSuccessor(LoopMBB);
	  MBB->addSuccessor(DoneMBB);

	  // The CC set by the final Opcode is what code after the loop consumes.
	  DoneMBB->addLiveIn(SystemZ::CC);

	  MI.eraseFromParent();
	  return DoneMBB;
	}

	// Update TBEGIN instruction with final opcode and register clobbers.
	//
	// MI is modified in place rather than replaced: its descriptor is switched
	// to Opcode, its control immediate (operand 2) is adjusted, and clobber
	// operands are appended for every register the control value does not
	// cover. NoFloat suppresses the FPR/VR clobbers. Returns MBB unchanged.
	MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
	    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
	    bool NoFloat) const {
	  MachineFunction &MF = *MBB->getParent();
	  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
	  const SystemZInstrInfo *TII = Subtarget.getInstrInfo();

	  // Update opcode.
	  MI.setDesc(TII->get(Opcode));

	  // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
	  // Make sure to add the corresponding GRSM bits if they are missing.
	  // Each bit in the table covers an even/odd pair of GPRs.
	  uint64_t Control = MI.getOperand(2).getImm();
	  static const unsigned GPRControlBit[16] = {
	    0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
	    0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
	  };
	  Control |= GPRControlBit[15];
	  if (TFI->hasFP(MF))
	    Control |= GPRControlBit[11];
	  MI.getOperand(2).setImm(Control);

	  // Add GPR clobbers: an implicit def for every GPR whose bit is clear.
	  for (int I = 0; I < 16; I++) {
	    if ((Control & GPRControlBit[I]) == 0) {
	      unsigned Reg = SystemZMC::GR64Regs[I];
	      MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
	    }
	  }

	  // Add FPR/VR clobbers.
	  // NOTE(review): bit value 4 of Control is presumably the "allow
	  // floating-point operations" control -- confirm against the TBEGIN
	  // definition.
	  if (!NoFloat && (Control & 4) != 0) {
	    if (Subtarget.hasVector()) {
	      for (int I = 0; I < 32; I++) {
	        unsigned Reg = SystemZMC::VR128Regs[I];
	        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
	      }
	    } else {
	      for (int I = 0; I < 16; I++) {
	        unsigned Reg = SystemZMC::FP64Regs[I];
	        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
	      }
	    }
	  }

	  return MBB;
	}

	// Replace pseudo MI with Opcode applied to MI's operand 0, giving the new
	// instruction an explicit result in a fresh virtual register of the same
	// register class as the source. MI is erased and MBB is returned unchanged.
	MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
	    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
	  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
	  const SystemZInstrInfo *TII =
	      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
	  DebugLoc Loc = MI.getDebugLoc();

	  unsigned InputReg = MI.getOperand(0).getReg();

	  // The result register mirrors the source's register class.
	  unsigned ResultReg =
	      RegInfo.createVirtualRegister(RegInfo.getRegClass(InputReg));

	  // Emit a normal load-and-test that models the def as well, in place of
	  // the pseudo.
	  BuildMI(*MBB, MI, Loc, TII->get(Opcode), ResultReg)
	      .addReg(InputReg);
	  MI.eraseFromParent();

	  return MBB;
	}

	MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
	MachineInstr &MI, MachineBasicBlock *MBB) const {
	switch (MI.getOpcode()) {
	case SystemZ::Select32:
	case SystemZ::Select64:
	case SystemZ::SelectF32:
	case SystemZ::SelectF64:
	case SystemZ::SelectF128:
	case SystemZ::SelectVR32:
	case SystemZ::SelectVR64:
	case SystemZ::SelectVR128:
	return emitSelect(MI, MBB);

	case SystemZ::CondStore8Mux:
	return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
	case SystemZ::CondStore8MuxInv:
	return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
	case SystemZ::CondStore16Mux:
	return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
	case SystemZ::CondStore16MuxInv:
	return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
	case SystemZ::CondStore32Mux:
	return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
	case SystemZ::CondStore32MuxInv:
	return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
	case SystemZ::CondStore8:
	return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
	case SystemZ::CondStore8Inv:
	return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
	case SystemZ::CondStore16:
	return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
	case SystemZ::CondStore16Inv:
	return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
	case SystemZ::CondStore32:
	return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
	case SystemZ::CondStore32Inv:
	return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
	case SystemZ::CondStore64:
	return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
	case SystemZ::CondStore64Inv:
	return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
	case SystemZ::CondStoreF32:
	return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
	case SystemZ::CondStoreF32Inv:
	return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
	case SystemZ::CondStoreF64:
	return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
	case SystemZ::CondStoreF64Inv:
	return emitCondStore(MI, MBB, SystemZ::STD, 0, true);

	case SystemZ::PAIR128:
	return emitPair128(MI, MBB);
	case SystemZ::AEXT128:
	return emitExt128(MI, MBB, false);
	case SystemZ::ZEXT128:
	return emitExt128(MI, MBB, true);

	case SystemZ::ATOMIC_SWAPW:
	return emitAtomicLoadBinary(MI, MBB, 0, 0);
	case SystemZ::ATOMIC_SWAP_32:
	return emitAtomicLoadBinary(MI, MBB, 0, 32);
	case SystemZ::ATOMIC_SWAP_64:
	return emitAtomicLoadBinary(MI, MBB, 0, 64);

	case SystemZ::ATOMIC_LOADW_AR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
	case SystemZ::ATOMIC_LOADW_AFI:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
	case SystemZ::ATOMIC_LOAD_AR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
	case SystemZ::ATOMIC_LOAD_AHI:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
	case SystemZ::ATOMIC_LOAD_AFI:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
	case SystemZ::ATOMIC_LOAD_AGR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
	case SystemZ::ATOMIC_LOAD_AGHI:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
	case SystemZ::ATOMIC_LOAD_AGFI:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);

	case SystemZ::ATOMIC_LOADW_SR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
	case SystemZ::ATOMIC_LOAD_SR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
	case SystemZ::ATOMIC_LOAD_SGR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);

	case SystemZ::ATOMIC_LOADW_NR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
	case SystemZ::ATOMIC_LOADW_NILH:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
	case SystemZ::ATOMIC_LOAD_NR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
	case SystemZ::ATOMIC_LOAD_NILL:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
	case SystemZ::ATOMIC_LOAD_NILH:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
	case SystemZ::ATOMIC_LOAD_NILF:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
	case SystemZ::ATOMIC_LOAD_NGR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
	case SystemZ::ATOMIC_LOAD_NILL64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
	case SystemZ::ATOMIC_LOAD_NILH64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
	case SystemZ::ATOMIC_LOAD_NIHL64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
	case SystemZ::ATOMIC_LOAD_NIHH64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
	case SystemZ::ATOMIC_LOAD_NILF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
	case SystemZ::ATOMIC_LOAD_NIHF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);

	case SystemZ::ATOMIC_LOADW_OR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
	case SystemZ::ATOMIC_LOADW_OILH:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
	case SystemZ::ATOMIC_LOAD_OR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
	case SystemZ::ATOMIC_LOAD_OILL:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
	case SystemZ::ATOMIC_LOAD_OILH:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
	case SystemZ::ATOMIC_LOAD_OILF:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
	case SystemZ::ATOMIC_LOAD_OGR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
	case SystemZ::ATOMIC_LOAD_OILL64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
	case SystemZ::ATOMIC_LOAD_OILH64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
	case SystemZ::ATOMIC_LOAD_OIHL64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
	case SystemZ::ATOMIC_LOAD_OIHH64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
	case SystemZ::ATOMIC_LOAD_OILF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
	case SystemZ::ATOMIC_LOAD_OIHF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);

	case SystemZ::ATOMIC_LOADW_XR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
	case SystemZ::ATOMIC_LOADW_XILF:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
	case SystemZ::ATOMIC_LOAD_XR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
	case SystemZ::ATOMIC_LOAD_XILF:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
	case SystemZ::ATOMIC_LOAD_XGR:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
	case SystemZ::ATOMIC_LOAD_XILF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
	case SystemZ::ATOMIC_LOAD_XIHF64:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);

	case SystemZ::ATOMIC_LOADW_NRi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
	case SystemZ::ATOMIC_LOADW_NILHi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
	case SystemZ::ATOMIC_LOAD_NRi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
	case SystemZ::ATOMIC_LOAD_NILLi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
	case SystemZ::ATOMIC_LOAD_NILHi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
	case SystemZ::ATOMIC_LOAD_NILFi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
	case SystemZ::ATOMIC_LOAD_NGRi:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
	case SystemZ::ATOMIC_LOAD_NILL64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
	case SystemZ::ATOMIC_LOAD_NILH64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
	case SystemZ::ATOMIC_LOAD_NIHL64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
	case SystemZ::ATOMIC_LOAD_NIHH64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
	case SystemZ::ATOMIC_LOAD_NILF64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
	case SystemZ::ATOMIC_LOAD_NIHF64i:
	return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);

	case SystemZ::ATOMIC_LOADW_MIN:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
	SystemZ::CCMASK_CMP_LE, 0);
	case SystemZ::ATOMIC_LOAD_MIN_32:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
	SystemZ::CCMASK_CMP_LE, 32);
	case SystemZ::ATOMIC_LOAD_MIN_64:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
	SystemZ::CCMASK_CMP_LE, 64);

	case SystemZ::ATOMIC_LOADW_MAX:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
	SystemZ::CCMASK_CMP_GE, 0);
	case SystemZ::ATOMIC_LOAD_MAX_32:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
	SystemZ::CCMASK_CMP_GE, 32);
	case SystemZ::ATOMIC_LOAD_MAX_64:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
	SystemZ::CCMASK_CMP_GE, 64);

	case SystemZ::ATOMIC_LOADW_UMIN:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
	SystemZ::CCMASK_CMP_LE, 0);
	case SystemZ::ATOMIC_LOAD_UMIN_32:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
	SystemZ::CCMASK_CMP_LE, 32);
	case SystemZ::ATOMIC_LOAD_UMIN_64:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
	SystemZ::CCMASK_CMP_LE, 64);

	case SystemZ::ATOMIC_LOADW_UMAX:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
	SystemZ::CCMASK_CMP_GE, 0);
	case SystemZ::ATOMIC_LOAD_UMAX_32:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
	SystemZ::CCMASK_CMP_GE, 32);
	case SystemZ::ATOMIC_LOAD_UMAX_64:
	return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
	SystemZ::CCMASK_CMP_GE, 64);

	case SystemZ::ATOMIC_CMP_SWAPW:
	return emitAtomicCmpSwapW(MI, MBB);
	case SystemZ::MVCSequence:
	case SystemZ::MVCLoop:
	return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
	case SystemZ::NCSequence:
	case SystemZ::NCLoop:
	return emitMemMemWrapper(MI, MBB, SystemZ::NC);
	case SystemZ::OCSequence:
	case SystemZ::OCLoop:
	return emitMemMemWrapper(MI, MBB, SystemZ::OC);
	case SystemZ::XCSequence:
	case SystemZ::XCLoop:
	return emitMemMemWrapper(MI, MBB, SystemZ::XC);
	case SystemZ::CLCSequence:
	case SystemZ::CLCLoop:
	return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
	case SystemZ::CLSTLoop:
	return emitStringWrapper(MI, MBB, SystemZ::CLST);
	case SystemZ::MVSTLoop:
	return emitStringWrapper(MI, MBB, SystemZ::MVST);
	case SystemZ::SRSTLoop:
	return emitStringWrapper(MI, MBB, SystemZ::SRST);
	case SystemZ::TBEGIN:
	return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
	case SystemZ::TBEGIN_nofloat:
	return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
	case SystemZ::TBEGINC:
	return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
	case SystemZ::LTEBRCompare_VecPseudo:
	return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
	case SystemZ::LTDBRCompare_VecPseudo:
	return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
	case SystemZ::LTXBRCompare_VecPseudo:
	return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);

	case TargetOpcode::STACKMAP:
	case TargetOpcode::PATCHPOINT:
	return emitPatchPoint(MI, MBB);

	default:
	llvm_unreachable("Unexpected instr type to insert");
	}
	}

	// Representative register class query. Only the isel schedulers call this,
	// and the MVT::Untyped case exists solely to keep the compiler from crashing
	// when list-ilp is used.
	const TargetRegisterClass *
	SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
	  // Every type other than Untyped is delegated to the generic lowering.
	  return VT == MVT::Untyped ? &SystemZ::ADDR128BitRegClass
	                            : TargetLowering::getRepRegClassFor(VT);
	}
	Index: vendor/llvm/dist-release_90/lib/Target/X86/X86.td
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/X86/X86.td (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/X86/X86.td (revision 351303)
	@@ -1,1253 +1,1254 @@
	//===-- X86.td - Target definition file for the Intel X86 --- tablegen --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This is a target description file for the Intel i386 architecture, referred
	// to here as the "X86" architecture.
	//
	//===----------------------------------------------------------------------===//

	// Get the target-independent interfaces which we are implementing...
	//
	include "llvm/Target/Target.td"

	//===----------------------------------------------------------------------===//
	// X86 Subtarget state
	//

	// Operating-mode selectors. Each record pairs a feature name ("64bit-mode",
	// "32bit-mode", "16bit-mode") with its own subtarget field (In64BitMode,
	// In32BitMode, In16BitMode) and the value "true". None of them lists an
	// implied feature.
	def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
	"64-bit mode (x86_64)">;
	def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true",
	"32-bit mode (80386)">;
	def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true",
	"16-bit mode (i8086)">;

	//===----------------------------------------------------------------------===//
	// X86 Subtarget features
	//===----------------------------------------------------------------------===//

	def FeatureX87 : SubtargetFeature<"x87","HasX87", "true",
	"Enable X87 float instructions">;

	def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true",
	"Enable NOPL instruction">;

	def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
	"Enable conditional move instructions">;

	def FeatureCMPXCHG8B : SubtargetFeature<"cx8", "HasCmpxchg8b", "true",
	"Support CMPXCHG8B instructions">;

	def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
	"Support POPCNT instruction">;

	def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true",
	"Support fxsave/fxrestore instructions">;

	def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
	"Support xsave instructions">;

	def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
	"Support xsaveopt instructions">;

	def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
	"Support xsavec instructions">;

	def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
	"Support xsaves instructions">;

	// SSE levels. All of these records target the same X86SSELevel field and
	// differ only in the value they assign (SSE1 ... SSE42). Each level lists the
	// previous one as its implied feature, forming the chain
	// sse <- sse2 <- sse3 <- ssse3 <- sse4.1 <- sse4.2.
	def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
	"Enable SSE instructions">;
	def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
	"Enable SSE2 instructions",
	[FeatureSSE1]>;
	def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
	"Enable SSE3 instructions",
	[FeatureSSE2]>;
	def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
	"Enable SSSE3 instructions",
	[FeatureSSE3]>;
	def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
	"Enable SSE 4.1 instructions",
	[FeatureSSSE3]>;
	def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
	"Enable SSE 4.2 instructions",
	[FeatureSSE41]>;
	// The MMX subtarget feature is separate from the rest of the SSE features
	// because it's important (for odd compatibility reasons) to be able to
	// turn it off explicitly while allowing SSE+ to be on.
	def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
	"Enable MMX instructions">;
	def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
	"Enable 3DNow! instructions",
	[FeatureMMX]>;
	def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
	"Enable 3DNow! Athlon instructions",
	[Feature3DNow]>;
	// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
	// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
	// without disabling 64-bit mode. Nothing should imply this feature bit. It
	// is used to enforce that only 64-bit capable CPUs are used in 64-bit mode.
	def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
	"Support 64-bit instructions">;
	def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
	- "64-bit with cmpxchg16b">;
	+ "64-bit with cmpxchg16b",
	+ [FeatureCMPXCHG8B]>;
	def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
	"SHLD instruction is slow">;
	def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
	"PMULLD instruction is slow">;
	def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
	"true",
	"PMADDWD is slower than PMULLD">;
	// FIXME: This should not apply to CPUs that do not have SSE.
	def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
	"IsUAMem16Slow", "true",
	"Slow unaligned 16-byte memory access">;
	def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
	"IsUAMem32Slow", "true",
	"Slow unaligned 32-byte memory access">;
	def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
	"Support SSE 4a instructions",
	[FeatureSSE3]>;

	def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
	"Enable AVX instructions",
	[FeatureSSE42]>;
	def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
	"Enable AVX2 instructions",
	[FeatureAVX]>;
	// Three-operand fused multiply-add. Sets HasFMA and lists AVX as an implied
	// feature.
	def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
	"Enable three-operand fused multiply-add",
	[FeatureAVX]>;
	def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
	"Support 16-bit floating point conversion instructions",
	[FeatureAVX]>;
	def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
	"Enable AVX-512 instructions",
	[FeatureAVX2, FeatureFMA, FeatureF16C]>;
	def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
	"Enable AVX-512 Exponential and Reciprocal Instructions",
	[FeatureAVX512]>;
	def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
	"Enable AVX-512 Conflict Detection Instructions",
	[FeatureAVX512]>;
	def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
	"true", "Enable AVX-512 Population Count Instructions",
	[FeatureAVX512]>;
	def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
	"Enable AVX-512 PreFetch Instructions",
	[FeatureAVX512]>;
	def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
	"true",
	"Prefetch with Intent to Write and T1 Hint">;
	def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
	"Enable AVX-512 Doubleword and Quadword Instructions",
	[FeatureAVX512]>;
	def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true",
	"Enable AVX-512 Byte and Word Instructions",
	[FeatureAVX512]>;
	def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
	"Enable AVX-512 Vector Length eXtensions",
	[FeatureAVX512]>;
	def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
	"Enable AVX-512 Vector Byte Manipulation Instructions",
	[FeatureBWI]>;
	def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
	"Enable AVX-512 further Vector Byte Manipulation Instructions",
	[FeatureBWI]>;
	// AVX-512 integer fused multiply-add. Sets HasIFMA and lists AVX512 as an
	// implied feature.
	def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
	"Enable AVX-512 Integer Fused Multiply-Add",
	[FeatureAVX512]>;
	def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
	"Enable protection keys">;
	def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
	"Enable AVX-512 Vector Neural Network Instructions",
	[FeatureAVX512]>;
	def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
	"Support bfloat16 floating point",
	[FeatureBWI]>;
	def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
	"Enable AVX-512 Bit Algorithms",
	[FeatureBWI]>;
	def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect",
	"HasVP2INTERSECT", "true",
	"Enable AVX-512 vp2intersect",
	[FeatureAVX512]>;
	def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
	"Enable packed carry-less multiplication instructions",
	[FeatureSSE2]>;
	def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
	"Enable Galois Field Arithmetic Instructions",
	[FeatureSSE2]>;
	def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
	"Enable vpclmulqdq instructions",
	[FeatureAVX, FeaturePCLMUL]>;
	// Four-operand fused multiply-add. Sets HasFMA4 and lists AVX and SSE4A as
	// implied features.
	def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
	"Enable four-operand fused multiply-add",
	[FeatureAVX, FeatureSSE4A]>;
	def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true",
	"Enable XOP instructions",
	[FeatureFMA4]>;
	def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
	"HasSSEUnalignedMem", "true",
	"Allow unaligned memory operands with SSE instructions">;
	def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
	"Enable AES instructions",
	[FeatureSSE2]>;
	def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
	"Promote selected AES instructions to AVX512/AVX registers",
	[FeatureAVX, FeatureAES]>;
	def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
	"Enable TBM instructions">;
	def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
	"Enable LWP instructions">;
	def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
	"Support MOVBE instruction">;
	def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
	"Support RDRAND instruction">;
	def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
	"Support FS/GS Base instructions">;
	def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
	"Support LZCNT instruction">;
	def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
	"Support BMI instructions">;
	def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
	"Support BMI2 instructions">;
	def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
	"Support RTM instructions">;
	def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
	"Support ADX instructions">;
	def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
	"Enable SHA instructions",
	[FeatureSSE2]>;
	def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
	"Support CET Shadow-Stack instructions">;
	def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
	"Support PRFCHW instructions">;
	def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
	"Support RDSEED instruction">;
	def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
	"Support LAHF and SAHF instructions">;
	def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
	"Enable MONITORX/MWAITX timer functionality">;
	def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
	"Enable Cache Line Zero">;
	def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
	"Enable Cache Demote">;
	def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
	"Support ptwrite instruction">;
	def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
	"Support MPX instructions">;
	def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
	"Use LEA for adjusting the stack pointer">;
	def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
	"HasSlowDivide32", "true",
	"Use 8-bit divide for positive values less than 256">;
	def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
	"HasSlowDivide64", "true",
	"Use 32-bit divide for positive values less than 2^32">;
	def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
	"PadShortFunctions", "true",
	"Pad short functions">;
	def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true",
	"Invalidate Process-Context Identifier">;
	def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
	"Enable Software Guard Extensions">;
	def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
	"Flush A Cache Line Optimized">;
	def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
	"Cache Line Write Back">;
	def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true",
	"Write Back No Invalidate">;
	def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true",
	"Support RDPID instructions">;
	def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
	"Wait and pause enhancements">;
	def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
	"Has ENQCMD instructions">;
	// On some processors, instructions that implicitly take two memory operands are
	// slow. In practice, this means that CALL, PUSH, and POP with memory operands
	// should be avoided in favor of a MOV + register CALL/PUSH/POP.
	def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
	"SlowTwoMemOps", "true",
	"Two memory operand instructions are slow">;
	def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
	"LEA instruction needs inputs at AG stage">;
	def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
	"LEA instruction with certain arguments is slow">;
	def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
	"LEA instruction with 3 ops or certain registers is slow">;
	def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
	"INC and DEC instructions are slower than ADD and SUB">;
	def FeatureSoftFloat
	: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
	"Use software floating point features">;
	def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
	"HasPOPCNTFalseDeps", "true",
	"POPCNT has a false dependency on dest register">;
	def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
	"HasLZCNTFalseDeps", "true",
	"LZCNT/TZCNT have a false dependency on dest register">;
	def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true",
	"platform configuration instruction">;
	// On recent X86 (port bound) processors, it is preferable to combine into a
	// single shuffle using a variable mask rather than multiple fixed shuffles.
	def FeatureFastVariableShuffle
	: SubtargetFeature<"fast-variable-shuffle",
	"HasFastVariableShuffle",
	"true", "Shuffles with variable masks are fast">;
	// On some X86 processors, there is no performance hazard to writing only the
	// lower parts of a YMM or ZMM register without clearing the upper part.
	def FeatureFastPartialYMMorZMMWrite
	: SubtargetFeature<"fast-partial-ymm-or-zmm-write",
	"HasFastPartialYMMorZMMWrite",
	"true", "Partial writes to YMM/ZMM registers are fast">;
	// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
	// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
	// vector FSQRT has higher throughput than the corresponding NR code.
	// The idea is that throughput bound code is likely to be vectorized, so for
	// vectorized code we should care about the throughput of SQRT operations.
	// But if the code is scalar that probably means that the code has some kind of
	// dependency and we should care more about reducing the latency.
	def FeatureFastScalarFSQRT
	: SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
	"true", "Scalar SQRT is fast (disable Newton-Raphson)">;
	def FeatureFastVectorFSQRT
	: SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
	"true", "Vector SQRT is fast (disable Newton-Raphson)">;
	// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
	// be used to replace test/set sequences.
	def FeatureFastLZCNT
	: SubtargetFeature<
	"fast-lzcnt", "HasFastLZCNT", "true",
	"LZCNT instructions are as fast as most simple integer ops">;
	// Set if the target can efficiently decode NOPs up to 11 bytes in length.
	def FeatureFast11ByteNOP
	: SubtargetFeature<
	"fast-11bytenop", "HasFast11ByteNOP", "true",
	"Target can quickly decode up to 11 byte NOPs">;
	// Set if the target can efficiently decode NOPs up to 15 bytes in length.
	def FeatureFast15ByteNOP
	: SubtargetFeature<
	"fast-15bytenop", "HasFast15ByteNOP", "true",
	"Target can quickly decode up to 15 byte NOPs">;
	// Sandy Bridge and newer processors can use SHLD with the same source on both
	// inputs to implement rotate to avoid the partial flag update of the normal
	// rotate instructions.
	def FeatureFastSHLDRotate
	: SubtargetFeature<
	"fast-shld-rotate", "HasFastSHLDRotate", "true",
	"SHLD can be used as a faster rotate">;

	// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
	// "string operations"). See "REP String Enhancement" in the Intel Software
	// Development Manual. This feature essentially means that REP MOVSB will copy
	// using the largest available size instead of copying bytes one by one, making
	// it at least as fast as REPMOVS{W,D,Q}.
	def FeatureERMSB
	: SubtargetFeature<
	"ermsb", "HasERMSB", "true",
	"REP MOVS/STOS are fast">;

	// Bulldozer and newer processors can merge CMP/TEST (but not other
	// instructions) with conditional branches.
	def FeatureBranchFusion
	: SubtargetFeature<"branchfusion", "HasBranchFusion", "true",
	"CMP/TEST can be fused with conditional branches">;

	// Sandy Bridge and newer processors have many instructions that can be
	// fused with conditional branches and pass through the CPU as a single
	// operation.
	def FeatureMacroFusion
	: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
	"Various instructions can be fused with conditional branches">;

	// Gather is available since Haswell (AVX2 set). So technically, we can
	// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
	// Skylake Client processor has faster Gathers than HSW and performance is
	// similar to Skylake Server (AVX-512).
	def FeatureHasFastGather
	: SubtargetFeature<"fast-gather", "HasFastGather", "true",
	"Indicates if gather is reasonably fast">;

	def FeaturePrefer256Bit
	: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
	"Prefer 256-bit AVX instructions">;

	// Lower indirect calls using a special construct called a `retpoline` to
	// mitigate potential Spectre v2 attacks against them.
	def FeatureRetpolineIndirectCalls
	: SubtargetFeature<
	"retpoline-indirect-calls", "UseRetpolineIndirectCalls", "true",
	"Remove speculation of indirect calls from the generated code">;

	// Lower indirect branches and switches either using conditional branch trees
	// or using a special construct called a `retpoline` to mitigate potential
	// Spectre v2 attacks against them.
	def FeatureRetpolineIndirectBranches
	: SubtargetFeature<
	"retpoline-indirect-branches", "UseRetpolineIndirectBranches", "true",
	"Remove speculation of indirect branches from the generated code">;

	// Deprecated umbrella feature for enabling both `retpoline-indirect-calls` and
	// `retpoline-indirect-branches` above.
	def FeatureRetpoline
	: SubtargetFeature<"retpoline", "DeprecatedUseRetpoline", "true",
	"Remove speculation of indirect branches from the "
	"generated code, either by avoiding them entirely or "
	"lowering them with a speculation blocking construct",
	[FeatureRetpolineIndirectCalls,
	FeatureRetpolineIndirectBranches]>;

	// Rely on external thunks for the emitted retpoline calls. This allows users
	// to provide their own custom thunk definitions in highly specialized
	// environments such as a kernel that does boot-time hot patching.
	def FeatureRetpolineExternalThunk
	: SubtargetFeature<
	"retpoline-external-thunk", "UseRetpolineExternalThunk", "true",
	"When lowering an indirect call or branch using a `retpoline`, rely "
	"on the specified user provided thunk rather than emitting one "
	"ourselves. Only has effect when combined with some other retpoline "
	"feature", [FeatureRetpolineIndirectCalls]>;

	// Direct Move instructions.
	def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
	"Support movdiri instruction">;
	def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
	"Support movdir64b instruction">;

	def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
	"Indicates that the BEXTR instruction is implemented as a single uop "
	"with good throughput">;

	// Combine vector math operations with shuffles into horizontal math
	// instructions if a CPU implements horizontal operations (introduced with
	// SSE3) with better latency/throughput than the alternative sequence.
	def FeatureFastHorizontalOps
	: SubtargetFeature<
	"fast-hops", "HasFastHorizontalOps", "true",
	"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
	"normal vector instructions with shuffles", [FeatureSSE3]>;

	def FeatureFastScalarShiftMasks
	: SubtargetFeature<
	"fast-scalar-shift-masks", "HasFastScalarShiftMasks", "true",
	"Prefer a left/right scalar logical shift pair over a shift+and pair">;

	def FeatureFastVectorShiftMasks
	: SubtargetFeature<
	"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
	"Prefer a left/right vector logical shift pair over a shift+and pair">;

	// Merge branches using three-way conditional code.
	def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
	"ThreewayBranchProfitable", "true",
	"Merge branches to a three-way "
	"conditional branch">;

	// Bonnell
	def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
	// Silvermont
	def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
	// Goldmont
	def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">;
	// Goldmont Plus
	def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">;
	// Tremont
	def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">;

	//===----------------------------------------------------------------------===//
	// Register File Description
	//===----------------------------------------------------------------------===//

	include "X86RegisterInfo.td"
	include "X86RegisterBanks.td"

	//===----------------------------------------------------------------------===//
	// Instruction Descriptions
	//===----------------------------------------------------------------------===//

	include "X86Schedule.td"
	include "X86InstrInfo.td"
	include "X86SchedPredicates.td"

	def X86InstrInfo : InstrInfo;

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Models
	//===----------------------------------------------------------------------===//

	include "X86ScheduleAtom.td"
	include "X86SchedSandyBridge.td"
	include "X86SchedHaswell.td"
	include "X86SchedBroadwell.td"
	include "X86ScheduleSLM.td"
	include "X86ScheduleZnver1.td"
	include "X86ScheduleBdVer2.td"
	include "X86ScheduleBtVer2.td"
	include "X86SchedSkylakeClient.td"
	include "X86SchedSkylakeServer.td"

	//===----------------------------------------------------------------------===//
	// X86 Processor Feature Lists
	//===----------------------------------------------------------------------===//

	// Per-microarchitecture feature lists that the processor definitions refer to
	// as ProcessorFeatures.<name>. Most families follow one naming scheme, where
	// <P> is the processor prefix:
	//   <P>AdditionalFeatures  - features added on top of the predecessor.
	//   <P>InheritableFeatures - the predecessor's inheritable list concatenated
	//                            with <P>AdditionalFeatures; successors build on it.
	//   <P>SpecificFeatures    - features attached to this processor only; they
	//                            are never concatenated into a successor's list.
	//   <P>Features            - <P>InheritableFeatures plus <P>SpecificFeatures;
	//                            the complete list for the processor.
	def ProcessorFeatures {
	// Nehalem
	list<SubtargetFeature> NHMInheritableFeatures = [FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSE42,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeaturePOPCNT,
	FeatureLAHFSAHF,
	FeatureMacroFusion];
	list<SubtargetFeature> NHMSpecificFeatures = [];
	list<SubtargetFeature> NHMFeatures =
	!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);

	// Westmere
	list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
	list<SubtargetFeature> WSMSpecificFeatures = [];
	list<SubtargetFeature> WSMInheritableFeatures =
	!listconcat(NHMInheritableFeatures, WSMAdditionalFeatures);
	list<SubtargetFeature> WSMFeatures =
	!listconcat(WSMInheritableFeatures, WSMSpecificFeatures);

	// Sandybridge
	list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
	FeatureSlowDivide64,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureSlow3OpsLEA,
	FeatureFastScalarFSQRT,
	FeatureFastSHLDRotate,
	FeatureMergeToThreeWayBranch];
	list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> SNBInheritableFeatures =
	!listconcat(WSMInheritableFeatures, SNBAdditionalFeatures);
	list<SubtargetFeature> SNBFeatures =
	!listconcat(SNBInheritableFeatures, SNBSpecificFeatures);

	// Ivybridge
	list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
	FeatureF16C,
	FeatureFSGSBase];
	list<SubtargetFeature> IVBSpecificFeatures = [FeatureSlowUAMem32,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> IVBInheritableFeatures =
	!listconcat(SNBInheritableFeatures, IVBAdditionalFeatures);
	list<SubtargetFeature> IVBFeatures =
	!listconcat(IVBInheritableFeatures, IVBSpecificFeatures);

	// Haswell
	list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
	FeatureBMI,
	FeatureBMI2,
	FeatureERMSB,
	FeatureFMA,
	FeatureINVPCID,
	FeatureLZCNT,
	FeatureMOVBE,
	FeatureFastVariableShuffle];
	list<SubtargetFeature> HSWSpecificFeatures = [FeaturePOPCNTFalseDeps,
	FeatureLZCNTFalseDeps];
	list<SubtargetFeature> HSWInheritableFeatures =
	!listconcat(IVBInheritableFeatures, HSWAdditionalFeatures);
	list<SubtargetFeature> HSWFeatures =
	!listconcat(HSWInheritableFeatures, HSWSpecificFeatures);

	// Broadwell
	list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
	FeatureRDSEED,
	FeaturePRFCHW];
	list<SubtargetFeature> BDWSpecificFeatures = [FeaturePOPCNTFalseDeps,
	FeatureLZCNTFalseDeps];
	list<SubtargetFeature> BDWInheritableFeatures =
	!listconcat(HSWInheritableFeatures, BDWAdditionalFeatures);
	list<SubtargetFeature> BDWFeatures =
	!listconcat(BDWInheritableFeatures, BDWSpecificFeatures);

	// Skylake
	list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
	FeatureMPX,
	FeatureXSAVEC,
	FeatureXSAVES,
	FeatureCLFLUSHOPT,
	FeatureFastVectorFSQRT];
	list<SubtargetFeature> SKLSpecificFeatures = [FeatureHasFastGather,
	FeaturePOPCNTFalseDeps,
	FeatureSGX];
	list<SubtargetFeature> SKLInheritableFeatures =
	!listconcat(BDWInheritableFeatures, SKLAdditionalFeatures);
	list<SubtargetFeature> SKLFeatures =
	!listconcat(SKLInheritableFeatures, SKLSpecificFeatures);

	// Skylake-AVX512
	list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
	FeatureCDI,
	FeatureDQI,
	FeatureBWI,
	FeatureVLX,
	FeaturePKU,
	FeatureCLWB];
	list<SubtargetFeature> SKXSpecificFeatures = [FeatureHasFastGather,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> SKXInheritableFeatures =
	!listconcat(SKLInheritableFeatures, SKXAdditionalFeatures);
	list<SubtargetFeature> SKXFeatures =
	!listconcat(SKXInheritableFeatures, SKXSpecificFeatures);

	// Cascadelake
	list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
	list<SubtargetFeature> CLXSpecificFeatures = [FeatureHasFastGather,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> CLXInheritableFeatures =
	!listconcat(SKXInheritableFeatures, CLXAdditionalFeatures);
	list<SubtargetFeature> CLXFeatures =
	!listconcat(CLXInheritableFeatures, CLXSpecificFeatures);

	// Cooperlake
	list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
	list<SubtargetFeature> CPXSpecificFeatures = [FeatureHasFastGather,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> CPXInheritableFeatures =
	!listconcat(CLXInheritableFeatures, CPXAdditionalFeatures);
	list<SubtargetFeature> CPXFeatures =
	!listconcat(CPXInheritableFeatures, CPXSpecificFeatures);

	// Cannonlake
	// Note: derives from SKLInheritableFeatures (not from the SKX chain), so the
	// AVX512-related features are listed again here.
	list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
	FeatureCDI,
	FeatureDQI,
	FeatureBWI,
	FeatureVLX,
	FeaturePKU,
	FeatureVBMI,
	FeatureIFMA,
	FeatureSHA,
	FeatureSGX];
	list<SubtargetFeature> CNLSpecificFeatures = [FeatureHasFastGather];
	list<SubtargetFeature> CNLInheritableFeatures =
	!listconcat(SKLInheritableFeatures, CNLAdditionalFeatures);
	list<SubtargetFeature> CNLFeatures =
	!listconcat(CNLInheritableFeatures, CNLSpecificFeatures);

	// Icelake
	list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
	FeatureVAES,
	FeatureVBMI2,
	FeatureVNNI,
	FeatureVPCLMULQDQ,
	FeatureVPOPCNTDQ,
	FeatureGFNI,
	FeatureCLWB,
	FeatureRDPID];
	list<SubtargetFeature> ICLSpecificFeatures = [FeatureHasFastGather];
	list<SubtargetFeature> ICLInheritableFeatures =
	!listconcat(CNLInheritableFeatures, ICLAdditionalFeatures);
	list<SubtargetFeature> ICLFeatures =
	!listconcat(ICLInheritableFeatures, ICLSpecificFeatures);

	// Icelake Server
	// Built directly on ICLInheritableFeatures; no ICX inheritable list is defined.
	list<SubtargetFeature> ICXSpecificFeatures = [FeaturePCONFIG,
	FeatureWBNOINVD,
	FeatureHasFastGather];
	list<SubtargetFeature> ICXFeatures =
	!listconcat(ICLInheritableFeatures, ICXSpecificFeatures);

	// Atom
	list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSSE3,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeatureMOVBE,
	FeatureSlowTwoMemOps,
	FeatureLAHFSAHF];
	list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
	FeatureSlowUAMem16,
	FeatureLEAForSP,
	FeatureSlowDivide32,
	FeatureSlowDivide64,
	FeatureLEAUsesAG,
	FeaturePadShortFunctions];
	list<SubtargetFeature> AtomFeatures =
	!listconcat(AtomInheritableFeatures, AtomSpecificFeatures);

	// Silvermont
	list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
	FeaturePOPCNT,
	FeaturePCLMUL,
	FeaturePRFCHW,
	FeatureSlowLEA,
	FeatureSlowIncDec,
	FeatureRDRAND];
	list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
	FeatureSlowDivide64,
	FeatureSlowPMULLD,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> SLMInheritableFeatures =
	!listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
	list<SubtargetFeature> SLMFeatures =
	!listconcat(SLMInheritableFeatures, SLMSpecificFeatures);

	// Goldmont
	list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
	FeatureMPX,
	FeatureSHA,
	FeatureRDSEED,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureXSAVEC,
	FeatureXSAVES,
	FeatureCLFLUSHOPT,
	FeatureFSGSBase];
	list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM,
	FeaturePOPCNTFalseDeps];
	list<SubtargetFeature> GLMInheritableFeatures =
	!listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
	list<SubtargetFeature> GLMFeatures =
	!listconcat(GLMInheritableFeatures, GLMSpecificFeatures);

	// Goldmont Plus
	list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
	FeatureRDPID,
	FeatureSGX];
	list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP];
	list<SubtargetFeature> GLPInheritableFeatures =
	!listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
	list<SubtargetFeature> GLPFeatures =
	!listconcat(GLPInheritableFeatures, GLPSpecificFeatures);

	// Tremont
	// Only the final list is formed here (three-way concatenation); no
	// TRMInheritableFeatures is defined.
	list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
	FeatureGFNI,
	FeatureMOVDIRI,
	FeatureMOVDIR64B,
	FeatureWAITPKG];
	list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM];
	list<SubtargetFeature> TRMFeatures =
	!listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
	TRMSpecificFeatures);

	// Knights Landing
	// Flat list: it is not derived from any of the lists above.
	list<SubtargetFeature> KNLFeatures = [FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeaturePOPCNT,
	FeatureSlowDivide64,
	FeaturePCLMUL,
	FeatureXSAVE,
	FeatureXSAVEOPT,
	FeatureLAHFSAHF,
	FeatureSlow3OpsLEA,
	FeatureSlowIncDec,
	FeatureAES,
	FeatureRDRAND,
	FeatureF16C,
	FeatureFSGSBase,
	FeatureAVX512,
	FeatureERI,
	FeatureCDI,
	FeaturePFI,
	FeaturePREFETCHWT1,
	FeatureADX,
	FeatureRDSEED,
	FeatureMOVBE,
	FeatureLZCNT,
	FeatureBMI,
	FeatureBMI2,
	FeatureFMA,
	FeaturePRFCHW,
	FeatureSlowTwoMemOps,
	FeatureFastPartialYMMorZMMWrite,
	FeatureHasFastGather,
	FeatureSlowPMADDWD];
	// KNM is KNL plus FeatureVPOPCNTDQ.
	// TODO: Add the AVX5124FMAPS/AVX5124VNNIW features.
	list<SubtargetFeature> KNMFeatures =
	!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);


	// Bobcat
	list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSSE3,
	FeatureSSE4A,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeaturePRFCHW,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureSlowSHLD,
	FeatureLAHFSAHF,
	FeatureFast15ByteNOP,
	FeatureFastScalarShiftMasks,
	FeatureFastVectorShiftMasks];
	list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;

	// Jaguar
	list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
	FeatureAES,
	FeaturePCLMUL,
	FeatureBMI,
	FeatureF16C,
	FeatureMOVBE,
	FeatureXSAVE,
	FeatureXSAVEOPT];
	list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
	FeatureFastBEXTR,
	FeatureFastPartialYMMorZMMWrite,
	FeatureFastHorizontalOps];
	list<SubtargetFeature> BtVer2InheritableFeatures =
	!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
	list<SubtargetFeature> BtVer2Features =
	!listconcat(BtVer2InheritableFeatures, BtVer2SpecificFeatures);

	// Bulldozer
	// The BdVer* family has no SpecificFeatures lists: each <P>Features is simply
	// its <P>InheritableFeatures.
	list<SubtargetFeature> BdVer1InheritableFeatures = [FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureXOP,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeatureAES,
	FeaturePRFCHW,
	FeaturePCLMUL,
	FeatureMMX,
	FeatureFXSR,
	FeatureNOPL,
	FeatureLZCNT,
	FeaturePOPCNT,
	FeatureXSAVE,
	FeatureLWP,
	FeatureSlowSHLD,
	FeatureLAHFSAHF,
	FeatureFast11ByteNOP,
	FeatureFastScalarShiftMasks,
	FeatureBranchFusion];
	list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;

	// PileDriver
	list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
	FeatureBMI,
	FeatureTBM,
	FeatureFMA,
	FeatureFastBEXTR];
	list<SubtargetFeature> BdVer2InheritableFeatures =
	!listconcat(BdVer1InheritableFeatures, BdVer2AdditionalFeatures);
	list<SubtargetFeature> BdVer2Features = BdVer2InheritableFeatures;

	// Steamroller
	list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
	FeatureFSGSBase];
	list<SubtargetFeature> BdVer3InheritableFeatures =
	!listconcat(BdVer2InheritableFeatures, BdVer3AdditionalFeatures);
	list<SubtargetFeature> BdVer3Features = BdVer3InheritableFeatures;

	// Excavator
	list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
	FeatureBMI2,
	FeatureMWAITX];
	list<SubtargetFeature> BdVer4InheritableFeatures =
	!listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
	list<SubtargetFeature> BdVer4Features = BdVer4InheritableFeatures;


	// AMD Zen Processors common ISAs
	// ZNFeatures is a flat list; ZN2Features extends it with ZN2AdditionalFeatures.
	list<SubtargetFeature> ZNFeatures = [FeatureADX,
	FeatureAES,
	FeatureAVX2,
	FeatureBMI,
	FeatureBMI2,
	FeatureCLFLUSHOPT,
	FeatureCLZERO,
	FeatureCMOV,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeatureF16C,
	FeatureFMA,
	FeatureFSGSBase,
	FeatureFXSR,
	FeatureNOPL,
	FeatureFastLZCNT,
	FeatureLAHFSAHF,
	FeatureLZCNT,
	FeatureFastBEXTR,
	FeatureFast15ByteNOP,
	FeatureBranchFusion,
	FeatureFastScalarShiftMasks,
	FeatureMMX,
	FeatureMOVBE,
	FeatureMWAITX,
	FeaturePCLMUL,
	FeaturePOPCNT,
	FeaturePRFCHW,
	FeatureRDRAND,
	FeatureRDSEED,
	FeatureSHA,
	FeatureSSE4A,
	FeatureSlowSHLD,
	FeatureX87,
	FeatureXSAVE,
	FeatureXSAVEC,
	FeatureXSAVEOPT,
	FeatureXSAVES];
	list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
	FeatureRDPID,
	FeatureWBNOINVD];
	list<SubtargetFeature> ZN2Features =
	!listconcat(ZNFeatures, ZN2AdditionalFeatures);
	}

	//===----------------------------------------------------------------------===//
	// X86 processors supported.
	//===----------------------------------------------------------------------===//

	// Shorthand for a processor that uses the generic scheduling model: a
	// ProcessorModel with the given name and feature list, and GenericModel as its
	// scheduling model.
	class Proc<string Name, list<SubtargetFeature> Features>
	: ProcessorModel<Name, GenericModel, Features>;

	// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
	// if i386/i486 is specifically requested.
	// Processors below use Proc, i.e. GenericModel scheduling. None of them lists
	// Feature64Bit. "generic" carries the same features as i586/pentium.
	def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
	FeatureCMPXCHG8B]>;
	// i386 and i486 are the only two entries in this group without CMPXCHG8B.
	def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
	def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
	FeatureCMPXCHG8B]>;
	def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
	FeatureCMPXCHG8B]>;
	def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
	FeatureCMPXCHG8B, FeatureMMX]>;

	// i686 adds CMOV; pentiumpro additionally lists NOPL.
	def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureCMOV]>;
	def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureCMOV, FeatureNOPL]>;

	def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureCMOV, FeatureFXSR,
	FeatureNOPL]>;

	// Both names get the same anonymous record: pentium2's features plus SSE1.
	foreach P = ["pentium3", "pentium3m"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
	FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
	}

	// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
	// The intent is to enable it for pentium4 which is the current default
	// processor in a vanilla 32-bit clang compilation when no specific
	// architecture is specified. This generally gives a nice performance
	// increase on silvermont, with largely neutral behavior on other
	// contemporary large core processors.
	// pentium-m, pentium4m, prescott and nocona are included as a preventative
	// measure to avoid performance surprises, in case clang's default cpu
	// changes slightly.

	// pentium-m carries exactly the same feature list as pentium4/pentium4m below.
	def : ProcessorModel<"pentium-m", GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
	FeatureCMOV]>;

	foreach P = ["pentium4", "pentium4m"] in {
	def : ProcessorModel<P, GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
	FeatureCMOV]>;
	}

	// Intel Quark.
	// Empty feature list: not even FeatureX87 is enabled for this processor.
	def : Proc<"lakemont", []>;

	// Intel Core Duo.
	// Scheduled with SandyBridgeModel; same features as prescott below.
	def : ProcessorModel<"yonah", SandyBridgeModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
	FeatureCMOV]>;

	// NetBurst.
	def : ProcessorModel<"prescott", GenericPostRAModel,
	[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
	FeatureCMOV]>;
	// nocona is prescott's feature set plus Feature64Bit and FeatureCMPXCHG16B.
	def : ProcessorModel<"nocona", GenericPostRAModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSE3,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B
	]>;

	// Intel Core 2 Solo/Duo.
	// core2 and penryn are both scheduled with SandyBridgeModel and differ only
	// in the SSE level (FeatureSSSE3 vs. FeatureSSE41).
	def : ProcessorModel<"core2", SandyBridgeModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSSE3,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;
	def : ProcessorModel<"penryn", SandyBridgeModel, [
	FeatureX87,
	FeatureSlowUAMem16,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSE41,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureCMPXCHG16B,
	FeatureLAHFSAHF,
	FeatureMacroFusion
	]>;

	// Atom CPUs.
	// Each foreach below registers several CPU names (aliases) for one
	// scheduling-model / feature-list pair.
	foreach P = ["bonnell", "atom"] in {
	def : ProcessorModel<P, AtomModel, ProcessorFeatures.AtomFeatures>;
	}

	foreach P = ["silvermont", "slm"] in {
	def : ProcessorModel<P, SLMModel, ProcessorFeatures.SLMFeatures>;
	}

	// goldmont, goldmont-plus and tremont are all scheduled with SLMModel.
	def : ProcessorModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures>;
	def : ProcessorModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures>;
	def : ProcessorModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures>;

	// "Arrandale" along with corei3 and corei5
	foreach P = ["nehalem", "corei7"] in {
	def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures>;
	}

	// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
	def : ProcessorModel<"westmere", SandyBridgeModel,
	ProcessorFeatures.WSMFeatures>;

	foreach P = ["sandybridge", "corei7-avx"] in {
	def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures>;
	}

	foreach P = ["ivybridge", "core-avx-i"] in {
	def : ProcessorModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures>;
	}

	foreach P = ["haswell", "core-avx2"] in {
	def : ProcessorModel<P, HaswellModel, ProcessorFeatures.HSWFeatures>;
	}

	def : ProcessorModel<"broadwell", BroadwellModel,
	ProcessorFeatures.BDWFeatures>;

	def : ProcessorModel<"skylake", SkylakeClientModel,
	ProcessorFeatures.SKLFeatures>;

	// FIXME: define KNL scheduler model
	// Until then knl and knm are scheduled with HaswellModel.
	def : ProcessorModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures>;
	def : ProcessorModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures>;

	foreach P = ["skylake-avx512", "skx"] in {
	def : ProcessorModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures>;
	}

	// All of the following, including cannonlake and icelake-client, are
	// scheduled with SkylakeServerModel.
	def : ProcessorModel<"cascadelake", SkylakeServerModel,
	ProcessorFeatures.CLXFeatures>;
	def : ProcessorModel<"cooperlake", SkylakeServerModel,
	ProcessorFeatures.CPXFeatures>;
	def : ProcessorModel<"cannonlake", SkylakeServerModel,
	ProcessorFeatures.CNLFeatures>;
	def : ProcessorModel<"icelake-client", SkylakeServerModel,
	ProcessorFeatures.ICLFeatures>;
	def : ProcessorModel<"icelake-server", SkylakeServerModel,
	ProcessorFeatures.ICXFeatures>;

	// AMD CPUs.

	// k6-2 and k6-3 share one feature list; they list Feature3DNow where k6 lists
	// FeatureMMX.
	def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX]>;
	def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	Feature3DNow]>;
	def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	Feature3DNow]>;

	foreach P = ["athlon", "athlon-tbird"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
	Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
	}

	// Same as the athlon group above plus FeatureSSE1 and FeatureFXSR.
	foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
	FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
	FeatureSlowSHLD]>;
	}

	// First group in this section that lists Feature64Bit.
	foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
	Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
	FeatureFastScalarShiftMasks]>;
	}

	// Relative to the k8 group: FeatureSSE3 instead of FeatureSSE2, and
	// FeatureCMPXCHG16B added.
	foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
	def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
	Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
	FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
	FeatureFastScalarShiftMasks]>;
	}

	// Unlike the groups above, this list does not contain FeatureSlowUAMem16.
	foreach P = ["amdfam10", "barcelona"] in {
	def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
	FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
	FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
	Feature64Bit, FeatureFastScalarShiftMasks]>;
	}

	// Bobcat
	def : Proc<"btver1", ProcessorFeatures.BtVer1Features>;
	// Jaguar
	def : ProcessorModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features>;

	// Bulldozer
	// Scheduled with BdVer2Model, the same model bdver2 uses.
	def : ProcessorModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features>;
	// Piledriver
	def : ProcessorModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features>;
	// Steamroller
	// bdver3 and bdver4 go through Proc, so they are scheduled with GenericModel.
	def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
	// Excavator
	def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;

	// znver2 is scheduled with Znver1Model, the same model znver1 uses.
	def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
	def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;

	def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	Feature3DNowA]>;

	// winchip-c6, winchip2 and c3 do not list FeatureCMPXCHG8B; c3-2 does.
	def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
	def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
	def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
	def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
	FeatureMMX, FeatureSSE1, FeatureFXSR,
	FeatureCMOV]>;

	// We also provide a generic 64-bit specific x86 processor model which tries to
	// be good for modern chips without enabling instruction set encodings past the
	// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
	// modern 64-bit x86 chip, and enables features that are generally beneficial.
	//
	// We currently use the Sandy Bridge model as the default scheduling model as
	// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
	// covers a huge swath of x86 processors. If there are specific scheduling
	// knobs which need to be tuned differently for AMD chips, we might consider
	// forming a common base for them.
	// Feature notes: the ISA level stops at FeatureSSE2 plus Feature64Bit;
	// FeatureCMPXCHG16B, FeatureLAHFSAHF and FeatureSlowUAMem16 are not listed.
	// The remaining entries (Slow3OpsLEA, SlowIncDec, MacroFusion) are tuning
	// features rather than instruction-set extensions, judging by their names.
	def : ProcessorModel<"x86-64", SandyBridgeModel, [
	FeatureX87,
	FeatureCMPXCHG8B,
	FeatureCMOV,
	FeatureMMX,
	FeatureSSE2,
	FeatureFXSR,
	FeatureNOPL,
	Feature64Bit,
	FeatureSlow3OpsLEA,
	FeatureSlowIncDec,
	FeatureMacroFusion
	]>;

	//===----------------------------------------------------------------------===//
	// Calling Conventions
	//===----------------------------------------------------------------------===//

	include "X86CallingConv.td"


	//===----------------------------------------------------------------------===//
	// Assembly Parser
	//===----------------------------------------------------------------------===//

	// Parser variant 0, named "att": '#' starts a comment and registers are
	// written with a '%' prefix. The variant number matches ATTAsmWriter's.
	def ATTAsmParserVariant : AsmParserVariant {
	int Variant = 0;

	// Variant name.
	string Name = "att";

	// Discard comments in assembly strings.
	string CommentDelimiter = "#";

	// Recognize hard coded registers.
	string RegisterPrefix = "%";
	}

	// Parser variant 1, named "intel": ';' starts a comment and registers carry
	// no prefix (empty RegisterPrefix). The variant number matches IntelAsmWriter's.
	def IntelAsmParserVariant : AsmParserVariant {
	int Variant = 1;

	// Variant name.
	string Name = "intel";

	// Discard comments in assembly strings.
	string CommentDelimiter = ";";

	// Recognize hard coded registers.
	string RegisterPrefix = "";
	}

	//===----------------------------------------------------------------------===//
	// Assembly Printers
	//===----------------------------------------------------------------------===//

	// The X86 target supports two different syntaxes for emitting machine code.
	// This is controlled by the -x86-asm-syntax={att|intel} option.
	// Writer for syntax variant 0; the printer class is named "ATTInstPrinter".
	def ATTAsmWriter : AsmWriter {
	string AsmWriterClassName = "ATTInstPrinter";
	int Variant = 0;
	}
	// Writer for syntax variant 1; the printer class is named "IntelInstPrinter".
	def IntelAsmWriter : AsmWriter {
	string AsmWriterClassName = "IntelInstPrinter";
	int Variant = 1;
	}

	// Top-level target record: ties together the instruction set, the two
	// assembly parser variants and the two assembly writers defined in this file.
	def X86 : Target {
	// Information about the instructions...
	let InstructionSet = X86InstrInfo;
	// Both lists are ordered AT&T first, Intel second, matching the Variant
	// numbers 0 and 1 declared on the individual records.
	let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
	let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
	let AllowRegisterRenaming = 1;
	}

	//===----------------------------------------------------------------------===//
	// Pfm Counters
	//===----------------------------------------------------------------------===//

	include "X86PfmCounters.td"
	Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 351303)
	@@ -1,5045 +1,5080 @@
	//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines a DAG pattern matching instruction selector for X86,
	// converting from a legalized dag to a X86 dag.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86RegisterInfo.h"
	#include "X86Subtarget.h"
	#include "X86TargetMachine.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/SelectionDAGISel.h"
	#include "llvm/Config/llvm-config.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <stdint.h>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

	// Hidden boolean command-line option "x86-and-imm-shrink"; enabled by default.
	static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
	cl::desc("Enable setting constant bits to reduce size of mask immediates"),
	cl::Hidden);

	//===----------------------------------------------------------------------===//
	// Pattern Matcher Implementation
	//===----------------------------------------------------------------------===//

	namespace {
	/// This corresponds to X86AddressMode, but uses SDValue's instead of register
	/// numbers for the leaves of the matched tree.
	struct X86ISelAddressMode {
	enum {
	RegBase,
	FrameIndexBase
	} BaseType;

	// This is really a union, discriminated by BaseType!
	SDValue Base_Reg;
	int Base_FrameIndex;

	unsigned Scale;
	SDValue IndexReg;
	int32_t Disp;
	SDValue Segment;
	const GlobalValue *GV;
	const Constant *CP;
	const BlockAddress *BlockAddr;
	const char *ES;
	MCSymbol *MCSym;
	int JT;
	unsigned Align; // CP alignment.
	unsigned char SymbolFlags; // X86II::MO_*
	bool NegateIndex = false;

	X86ISelAddressMode()
	: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
	Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
	MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

	bool hasSymbolicDisplacement() const {
	return GV != nullptr \|\| CP != nullptr \|\| ES != nullptr \|\|
	MCSym != nullptr \|\| JT != -1 \|\| BlockAddr != nullptr;
	}

	bool hasBaseOrIndexReg() const {
	return BaseType == FrameIndexBase \|\|
	IndexReg.getNode() != nullptr \|\| Base_Reg.getNode() != nullptr;
	}

	/// Return true if this addressing mode is already RIP-relative.
	bool isRIPRelative() const {
	if (BaseType != RegBase) return false;
	if (RegisterSDNode *RegNode =
	dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
	return RegNode->getReg() == X86::RIP;
	return false;
	}

	void setBaseReg(SDValue Reg) {
	BaseType = RegBase;
	Base_Reg = Reg;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	void dump(SelectionDAG *DAG = nullptr) {
	dbgs() << "X86ISelAddressMode " << this << '\n';
	dbgs() << "Base_Reg ";
	if (Base_Reg.getNode())
	Base_Reg.getNode()->dump(DAG);
	else
	dbgs() << "nul\n";
	if (BaseType == FrameIndexBase)
	dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
	dbgs() << " Scale " << Scale << '\n'
	<< "IndexReg ";
	if (NegateIndex)
	dbgs() << "negate ";
	if (IndexReg.getNode())
	IndexReg.getNode()->dump(DAG);
	else
	dbgs() << "nul\n";
	dbgs() << " Disp " << Disp << '\n'
	<< "GV ";
	if (GV)
	GV->dump();
	else
	dbgs() << "nul";
	dbgs() << " CP ";
	if (CP)
	CP->dump();
	else
	dbgs() << "nul";
	dbgs() << '\n'
	<< "ES ";
	if (ES)
	dbgs() << ES;
	else
	dbgs() << "nul";
	dbgs() << " MCSym ";
	if (MCSym)
	dbgs() << MCSym;
	else
	dbgs() << "nul";
	dbgs() << " JT" << JT << " Align" << Align << '\n';
	}
	#endif
	};
	}

	namespace {
	//===--------------------------------------------------------------------===//
	/// ISel - X86-specific code to select X86 machine instructions for
	/// SelectionDAG operations.
	///
	class X86DAGToDAGISel final : public SelectionDAGISel {
	/// Keep a pointer to the X86Subtarget around so that we can
	/// make the right decision when generating code for different targets.
	const X86Subtarget *Subtarget;

	/// If true, selector should try to optimize for code size instead of
	/// performance.
	bool OptForSize;

	/// If true, selector should try to optimize for minimum code size.
	bool OptForMinSize;

	/// Disable direct TLS access through segment registers.
	bool IndirectTlsSegRefs;

	public:
	/// Construct the selector for target machine \p tm at \p OptLevel. The
	/// subtarget pointer and the per-function flags start out null/false; they
	/// are assigned in runOnMachineFunction().
	explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
	: SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
	OptForMinSize(false), IndirectTlsSegRefs(false) {}

	/// Human-readable name of this pass.
	StringRef getPassName() const override {
	return "X86 DAG->DAG Instruction Selection";
	}

	/// Refresh the per-function state (subtarget, indirect-TLS flag and the two
	/// size-optimization flags) from \p MF, then run the generic SelectionDAG
	/// selector on it. Always returns true.
	bool runOnMachineFunction(MachineFunction &MF) override {
	  // Reset the subtarget each time through.
	  Subtarget = &MF.getSubtarget<X86Subtarget>();
	  IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
	      "indirect-tls-seg-refs");

	  // OptFor[Min]Size are used in pattern predicates that isel is matching.
	  OptForSize = MF.getFunction().hasOptSize();
	  OptForMinSize = MF.getFunction().hasMinSize();
	  assert((!OptForMinSize || OptForSize) &&
	         "OptForMinSize implies OptForSize");

	  SelectionDAGISel::runOnMachineFunction(MF);
	  return true;
	}

	void EmitFunctionEntryCode() override;

	bool IsProfitableToFold(SDValue N, SDNode U, SDNode Root) const override;

	void PreprocessISelDAG() override;
	void PostprocessISelDAG() override;

	// Include the pieces autogenerated from the target description.
	#include "X86GenDAGISel.inc"

	private:
	void Select(SDNode *N) override;

	bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
	bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
	bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
	bool matchAddress(SDValue N, X86ISelAddressMode &AM);
	bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
	bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
	bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
	unsigned Depth);
	bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
	bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index, SDValue &Disp,
	SDValue &Segment);
	bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index, SDValue &Disp,
	SDValue &Segment);
	bool selectMOV64Imm32(SDValue N, SDValue &Imm);
	bool selectLEAAddr(SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index, SDValue &Disp,
	SDValue &Segment);
	bool selectLEA64_32Addr(SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index, SDValue &Disp,
	SDValue &Segment);
	bool selectTLSADDRAddr(SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index, SDValue &Disp,
	SDValue &Segment);
	bool selectScalarSSELoad(SDNode Root, SDNode Parent, SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment,
	SDValue &NodeWithChain);
	bool selectRelocImm(SDValue N, SDValue &Op);

	bool tryFoldLoad(SDNode Root, SDNode P, SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment);

	// Convenience method where P is also root.
	bool tryFoldLoad(SDNode *P, SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment) {
	return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
	}

	/// Implement addressing mode selection for inline asm expressions.
	bool SelectInlineAsmMemoryOperand(const SDValue &Op,
	unsigned ConstraintID,
	std::vector<SDValue> &OutOps) override;

	void emitSpecialCodeForMain();

	/// Produce the five memory operands (Base, Scale, Index, Disp, Segment)
	/// described by \p AM.
	///
	/// \p VT is the type used for register operands created here. When
	/// AM.NegateIndex is set, a NEG machine node is built around the index and
	/// written back into AM.IndexReg, so \p AM is modified by this call.
	inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
	                               MVT VT, SDValue &Base, SDValue &Scale,
	                               SDValue &Index, SDValue &Disp,
	                               SDValue &Segment) {
	  // Base: a frame index, the recorded register, or register number 0.
	  if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) {
	    Base = CurDAG->getTargetFrameIndex(
	        AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
	  } else {
	    Base = AM.Base_Reg.getNode() ? AM.Base_Reg : CurDAG->getRegister(0, VT);
	  }

	  Scale = getI8Imm(AM.Scale, DL);

	  // Negate the index if needed; the negated value replaces AM.IndexReg.
	  if (AM.NegateIndex) {
	    unsigned NegOpc = X86::NEG32r;
	    if (VT == MVT::i64)
	      NegOpc = X86::NEG64r;
	    SDNode *NegNode =
	        CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, AM.IndexReg);
	    AM.IndexReg = SDValue(NegNode, 0);
	  }

	  Index = AM.IndexReg.getNode() ? AM.IndexReg : CurDAG->getRegister(0, VT);

	  // Displacement. The first symbolic kind present wins, in this order:
	  // GV, CP, ES, MCSym, JT, BlockAddr; otherwise a plain constant is used.
	  // These are 32-bit even in 64-bit mode since RIP-relative offset
	  // is 32-bit.
	  if (AM.GV) {
	    Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp,
	                                          AM.SymbolFlags);
	  } else if (AM.CP) {
	    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp,
	                                         AM.SymbolFlags);
	  } else if (AM.ES) {
	    assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
	    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
	  } else if (AM.MCSym) {
	    assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
	    assert(AM.SymbolFlags == 0 && "oo");
	    Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
	  } else if (AM.JT != -1) {
	    assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
	    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
	  } else if (AM.BlockAddr) {
	    Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
	                                         AM.SymbolFlags);
	  } else {
	    Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
	  }

	  // Segment: the recorded one, or register number 0 typed as i16.
	  Segment = AM.Segment.getNode() ? AM.Segment
	                                 : CurDAG->getRegister(0, MVT::i16);
	}

	// Utility function to determine whether we should avoid selecting
	// immediate forms of instructions for better code size or not.
	// At a high level, we'd like to avoid such instructions when
	// we have similar constants used within the same basic block
	// that can be kept in a register.
	//
	bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
	  uint32_t UseCount = 0;

	  // Do not want to hoist if we're not optimizing for size.
	  // TODO: We'd like to remove this restriction.
	  // See the comment in X86InstrInfo.td for more info.
	  if (!OptForSize)
	    return false;

	  // Walk all the users of the immediate. Two counted uses are enough to
	  // decide, so the scan stops as soon as the count reaches 2.
	  for (SDNode::use_iterator UI = N->use_begin(),
	       UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

	    SDNode *User = *UI;

	    // This user is already selected. Count it as a legitimate use and
	    // move on.
	    if (User->isMachineOpcode()) {
	      UseCount++;
	      continue;
	    }

	    // We want to count stores of immediates as real uses.
	    if (User->getOpcode() == ISD::STORE &&
	        User->getOperand(1).getNode() == N) {
	      UseCount++;
	      continue;
	    }

	    // We don't currently match users that have > 2 operands (except
	    // for stores, which are handled above).
	    // Those instructions won't match in ISEL, for now, and would
	    // be counted incorrectly.
	    // This may change in the future as we add additional instruction
	    // types.
	    if (User->getNumOperands() != 2)
	      continue;

	    // Immediates that are used for offsets as part of stack
	    // manipulation should be left alone. These are typically
	    // used to indicate SP offsets for argument passing and
	    // will get pulled into stores/pushes (implicitly).
	    if (User->getOpcode() == X86ISD::ADD ||
	        User->getOpcode() == ISD::ADD ||
	        User->getOpcode() == X86ISD::SUB ||
	        User->getOpcode() == ISD::SUB) {

	      // Find the other operand of the add/sub.
	      SDValue OtherOp = User->getOperand(0);
	      if (OtherOp.getNode() == N)
	        OtherOp = User->getOperand(1);

	      // Don't count if the other operand is SP.
	      RegisterSDNode *RegNode;
	      if (OtherOp->getOpcode() == ISD::CopyFromReg &&
	          (RegNode = dyn_cast_or_null<RegisterSDNode>(
	               OtherOp->getOperand(1).getNode())))
	        if ((RegNode->getReg() == X86::ESP) ||
	            (RegNode->getReg() == X86::RSP))
	          continue;
	    }

	    // ... otherwise, count this and move on.
	    UseCount++;
	  }

	  // If we have more than 1 use, then recommend for hoisting.
	  return (UseCount > 1);
	}

	/// Return a target constant with the specified value of type i8.
	inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
	  SDValue ImmNode = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
	  return ImmNode;
	}

	/// Return a target constant with the specified value, of type i32.
	inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
	  SDValue ImmNode = CurDAG->getTargetConstant(Imm, DL, MVT::i32);
	  return ImmNode;
	}

	/// Return a target constant with the specified value, of type i64.
	inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
	  SDValue ImmNode = CurDAG->getTargetConstant(Imm, DL, MVT::i64);
	  return ImmNode;
	}

	/// Build the i8 immediate for a subvector extract described by \p N:
	/// the constant element index (operand 1) scaled by the scalar size of the
	/// source vector (operand 0) and divided by \p VecWidth.
	SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
	                                    const SDLoc &DL) {
	  assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
	  uint64_t Index = N->getConstantOperandVal(1);
	  MVT VecVT = N->getOperand(0).getSimpleValueType();
	  // Element index * element bits = bit offset; divide to count in chunks
	  // of VecWidth bits.
	  return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
	}

	/// Build the i8 immediate for a subvector insert described by \p N:
	/// the constant element index (operand 2) scaled by the scalar size of the
	/// node's result type and divided by \p VecWidth.
	SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
	                                  const SDLoc &DL) {
	  assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
	  uint64_t Index = N->getConstantOperandVal(2);
	  MVT VecVT = N->getSimpleValueType(0);
	  // Element index * element bits = bit offset; divide to count in chunks
	  // of VecWidth bits.
	  return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
	}

	// Helper to detect unneeded and instructions on shift amounts. Called
	// from PatFrags in tablegen.
	bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
	  assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
	  const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

	  // The mask constant alone already keeps the low Width bits.
	  if (Val.countTrailingOnes() >= Width)
	    return true;

	  // Bits of the other operand that are known to be zero may be treated as
	  // set in the mask, since ANDing them changes nothing.
	  APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
	  return Mask.countTrailingOnes() >= Width;
	}

	/// Return an SDNode that returns the value of the global base register.
	/// Output instructions required to initialize the global base register,
	/// if necessary.
	SDNode *getGlobalBaseReg();

	/// Return a reference to the TargetMachine, casted to the target-specific
	/// type.
	const X86TargetMachine &getTargetMachine() const {
	  const auto &X86TM = static_cast<const X86TargetMachine &>(TM);
	  return X86TM;
	}

	/// Return a pointer to the X86-specific instruction info, obtained from
	/// the current subtarget.
	const X86InstrInfo *getInstrInfo() const {
	return Subtarget->getInstrInfo();
	}

	/// Address-mode matching performs shift-of-and to and-of-shift
	/// reassociation in order to expose more scaled addressing
	/// opportunities.
	bool ComplexPatternFuncMutatesDAG() const override { return true; }

	bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

	/// Returns whether this is a relocatable immediate in the range
	/// [-2^Width .. 2^Width-1].
	template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
	  auto *CN = dyn_cast<ConstantSDNode>(N);
	  // Anything that is not a plain constant is handed to the absolute-symbol
	  // check instead.
	  if (!CN)
	    return isSExtAbsoluteSymbolRef(Width, N);
	  return isInt<Width>(CN->getSExtValue());
	}

	// Indicates we should prefer to use a non-temporal load for this load.
	bool useNonTemporalLoad(LoadSDNode *N) const {
	  if (!N->isNonTemporal())
	    return false;

	  const unsigned NumBytes = N->getMemoryVT().getStoreSize();

	  // A load whose alignment is below its store size never qualifies.
	  if (N->getAlignment() < NumBytes)
	    return false;

	  // 4- and 8-byte loads never qualify; wider ones depend on the subtarget
	  // feature checked for that size.
	  switch (NumBytes) {
	  case 4:
	  case 8:
	    return false;
	  case 16:
	    return Subtarget->hasSSE41();
	  case 32:
	    return Subtarget->hasAVX2();
	  case 64:
	    return Subtarget->hasAVX512();
	  default:
	    break;
	  }
	  llvm_unreachable("Unsupported store size");
	}

	bool foldLoadStoreIntoMemOperand(SDNode *Node);
	MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
	bool matchBitExtract(SDNode *Node);
	bool shrinkAndImmediate(SDNode *N);
	bool isMaskZeroExtended(SDNode *N) const;
	bool tryShiftAmountMod(SDNode *N);
	bool tryShrinkShlLogicImm(SDNode *N);
	bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);

	MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
	const SDLoc &dl, MVT VT, SDNode *Node);
	MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
	const SDLoc &dl, MVT VT, SDNode *Node,
	SDValue &InFlag);

	bool tryOptimizeRem8Extend(SDNode *N);

	bool onlyUsesZeroFlag(SDValue Flags) const;
	bool hasNoSignFlagUses(SDValue Flags) const;
	bool hasNoCarryFlagUses(SDValue Flags) const;
	};
	}


	// Returns true if this masked compare can be implemented legally with this
	// type.
	static bool isLegalMaskCompare(SDNode N, const X86Subtarget Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode == X86ISD::CMPM \|\| Opcode == ISD::SETCC \|\|
	Opcode == X86ISD::CMPM_SAE \|\| Opcode == X86ISD::VFPCLASS) {
	// We can get 256-bit 8 element types here without VLX being enabled. When
	// this happens we will use 512-bit operations and the mask will not be
	// zero extended.
	EVT OpVT = N->getOperand(0).getValueType();
	if (OpVT.is256BitVector() \|\| OpVT.is128BitVector())
	return Subtarget->hasVLX();

	return true;
	}
	// Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
	if (Opcode == X86ISD::VFPCLASSS \|\| Opcode == X86ISD::FSETCCM \|\|
	Opcode == X86ISD::FSETCCM_SAE)
	return true;

	return false;
	}

	// Returns true if we can assume the writer of the mask has zero extended it
	// for us.
	bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
	  // If this is an AND, check if we have a compare on either side. As long as
	  // one side guarantees the mask is zero extended, the AND will preserve those
	  // zeros.
	  if (N->getOpcode() == ISD::AND)
	    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
	           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

	  return isLegalMaskCompare(N, Subtarget);
	}

	bool
	X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode U, SDNode Root) const {
	if (OptLevel == CodeGenOpt::None) return false;

	if (!N.hasOneUse())
	return false;

	if (N.getOpcode() != ISD::LOAD)
	return true;

	// Don't fold non-temporal loads if we have an instruction for them.
	if (useNonTemporalLoad(cast<LoadSDNode>(N)))
	return false;

	// If N is a load, do additional profitability checks.
	if (U == Root) {
	switch (U->getOpcode()) {
	default: break;
	case X86ISD::ADD:
	case X86ISD::ADC:
	case X86ISD::SUB:
	case X86ISD::SBB:
	case X86ISD::AND:
	case X86ISD::XOR:
	case X86ISD::OR:
	case ISD::ADD:
	case ISD::ADDCARRY:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	SDValue Op1 = U->getOperand(1);

	// If the other operand is a 8-bit immediate we should fold the immediate
	// instead. This reduces code size.
	// e.g.
	// movl 4(%esp), %eax
	// addl $4, %eax
	// vs.
	// movl $4, %eax
	// addl 4(%esp), %eax
	// The former is 2 bytes shorter. In case where the increment is 1, then
	// the saving can be 4 bytes (by using incl %eax).
	if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
	if (Imm->getAPIntValue().isSignedIntN(8))
	return false;

	// If this is a 64-bit AND with an immediate that fits in 32-bits,
	// prefer using the smaller and over folding the load. This is needed to
	// make sure immediates created by shrinkAndImmediate are always folded.
	// Ideally we would narrow the load during DAG combine and get the
	// best of both worlds.
	if (U->getOpcode() == ISD::AND &&
	Imm->getAPIntValue().getBitWidth() == 64 &&
	Imm->getAPIntValue().isIntN(32))
	return false;

	// If this really a zext_inreg that can be represented with a movzx
	// instruction, prefer that.
	// TODO: We could shrink the load and fold if it is non-volatile.
	if (U->getOpcode() == ISD::AND &&
	(Imm->getAPIntValue() == UINT8_MAX \|\|
	Imm->getAPIntValue() == UINT16_MAX \|\|
	Imm->getAPIntValue() == UINT32_MAX))
	return false;

	// ADD/SUB with can negate the immediate and use the opposite operation
	// to fit 128 into a sign extended 8 bit immediate.
	if ((U->getOpcode() == ISD::ADD \|\| U->getOpcode() == ISD::SUB) &&
	(-Imm->getAPIntValue()).isSignedIntN(8))
	return false;
	}

	// If the other operand is a TLS address, we should fold it instead.
	// This produces
	// movl %gs:0, %eax
	// leal i@NTPOFF(%eax), %eax
	// instead of
	// movl $i@NTPOFF, %eax
	// addl %gs:0, %eax
	// if the block also has an access to a second TLS address this will save
	// a load.
	// FIXME: This is probably also true for non-TLS addresses.
	if (Op1.getOpcode() == X86ISD::Wrapper) {
	SDValue Val = Op1.getOperand(0);
	if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
	return false;
	}

	// Don't fold load if this matches the BTS/BTR/BTC patterns.
	// BTS: (or X, (shl 1, n))
	// BTR: (and X, (rotl -2, n))
	// BTC: (xor X, (shl 1, n))
	if (U->getOpcode() == ISD::OR \|\| U->getOpcode() == ISD::XOR) {
	if (U->getOperand(0).getOpcode() == ISD::SHL &&
	isOneConstant(U->getOperand(0).getOperand(0)))
	return false;

	if (U->getOperand(1).getOpcode() == ISD::SHL &&
	isOneConstant(U->getOperand(1).getOperand(0)))
	return false;
	}
	if (U->getOpcode() == ISD::AND) {
	SDValue U0 = U->getOperand(0);
	SDValue U1 = U->getOperand(1);
	if (U0.getOpcode() == ISD::ROTL) {
	auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
	if (C && C->getSExtValue() == -2)
	return false;
	}

	if (U1.getOpcode() == ISD::ROTL) {
	auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
	if (C && C->getSExtValue() == -2)
	return false;
	}
	}

	break;
	}
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL:
	// Don't fold a load into a shift by immediate. The BMI2 instructions
	// support folding a load, but not an immediate. The legacy instructions
	// support folding an immediate, but can't fold a load. Folding an
	// immediate is preferable to folding a load.
	if (isa<ConstantSDNode>(U->getOperand(1)))
	return false;

	break;
	}
	}

	// Prevent folding a load if this can implemented with an insert_subreg or
	// a move that implicitly zeroes.
	if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
	isNullConstant(Root->getOperand(2)) &&
	(Root->getOperand(0).isUndef() \|\|
	ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
	return false;

	return true;
	}

	/// Replace the original chain operand of the call with
	/// load's chain operand and move load below the call's chain operand.
	static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
	                               SDValue Call, SDValue OrigChain) {
	  SDNode *LoadNode = Load.getNode();
	  SDValue LoadInChain = Load.getOperand(0);
	  SDValue Chain = OrigChain.getOperand(0);

	  // Step 1: rebuild OrigChain's first operand so that it refers to the
	  // load's own input chain instead of the load.
	  SmallVector<SDValue, 8> Ops;
	  if (Chain.getNode() != LoadNode) {
	    assert(Chain.getOpcode() == ISD::TokenFactor &&
	           "Unexpected chain operand");
	    for (unsigned Idx = 0, NumOps = Chain.getNumOperands(); Idx != NumOps;
	         ++Idx) {
	      SDValue Op = Chain.getOperand(Idx);
	      Ops.push_back(Op.getNode() == LoadNode ? LoadInChain : Op);
	    }
	    SDValue NewChain =
	        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
	    Ops.clear();
	    Ops.push_back(NewChain);
	  } else {
	    Ops.push_back(LoadInChain);
	  }
	  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
	  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);

	  // Step 2: make the load use the call's current first operand as its
	  // first operand; its other two operands are kept.
	  CurDAG->UpdateNodeOperands(LoadNode, Call.getOperand(0),
	                             Load.getOperand(1), Load.getOperand(2));

	  // Step 3: make the call's first operand the load's result #1.
	  Ops.clear();
	  Ops.push_back(SDValue(LoadNode, 1));
	  Ops.append(Call->op_begin() + 1, Call->op_end());
	  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
	}

	/// Return true if call address is a load and it can be
	/// moved below CALLSEQ_START and the chains leading up to the call.
	/// Return the CALLSEQ_START by reference as a second output.
	/// In the case of a tail call, there isn't a callseq node between the call
	/// chain and the load.
	static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
	  // The transformation is somewhat dangerous if the call's chain was glued to
	  // the call. After MoveBelowOrigChain the load is moved between the call and
	  // the chain, this can create a cycle if the load is not folded. So it is
	  // really important that we are sure the load will be folded.
	  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
	    return false;
	  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
	  if (!LD ||
	      LD->isVolatile() ||
	      LD->getAddressingMode() != ISD::UNINDEXED ||
	      LD->getExtensionType() != ISD::NON_EXTLOAD)
	    return false;

	  // Now let's find the callseq_start. Chain is an in/out parameter: it is
	  // advanced up the chain operands until CALLSEQ_START is reached.
	  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
	    if (!Chain.hasOneUse())
	      return false;
	    Chain = Chain.getOperand(0);
	  }

	  if (!Chain.getNumOperands())
	    return false;
	  // Since we are not checking for AA here, conservatively abort if the chain
	  // writes to memory. It's not safe to move the callee (a load) across a store.
	  if (isa<MemSDNode>(Chain.getNode()) &&
	      cast<MemSDNode>(Chain.getNode())->writeMem())
	    return false;
	  if (Chain.getOperand(0).getNode() == Callee.getNode())
	    return true;
	  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
	      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
	      Callee.getValue(1).hasOneUse())
	    return true;
	  return false;
	}

	void X86DAGToDAGISel::PreprocessISelDAG() {
	for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
	E = CurDAG->allnodes_end(); I != E; ) {
	SDNode N = &I++; // Preincrement iterator to avoid invalidation issues.

	// If this is a target specific AND node with no flag usages, turn it back
	// into ISD::AND to enable test instruction matching.
	if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
	SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
	N->getOperand(0), N->getOperand(1));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}

	switch (N->getOpcode()) {
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	// Replace vector fp_to_s/uint with their X86 specific equivalent so we
	// don't need 2 sets of patterns.
	if (!N->getSimpleValueType(0).isVector())
	break;

	unsigned NewOpc;
	switch (N->getOpcode()) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
	case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
	}
	SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
	N->getOperand(0));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL: {
	// Replace vector shifts with their X86 specific equivalent so we don't
	// need 2 sets of patterns.
	if (!N->getValueType(0).isVector())
	break;

	unsigned NewOpc;
	switch (N->getOpcode()) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
	case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
	case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
	}
	SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
	N->getOperand(0), N->getOperand(1));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}
	case ISD::ANY_EXTEND:
	case ISD::ANY_EXTEND_VECTOR_INREG: {
	// Replace vector any extend with the zero extend equivalents so we don't
	// need 2 sets of patterns. Ignore vXi1 extensions.
	if (!N->getValueType(0).isVector() \|\|
	N->getOperand(0).getScalarValueSizeInBits() == 1)
	break;

	unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
	? ISD::ZERO_EXTEND
	: ISD::ZERO_EXTEND_VECTOR_INREG;

	SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
	N->getOperand(0));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}
	case ISD::FCEIL:
	case ISD::FFLOOR:
	case ISD::FTRUNC:
	case ISD::FNEARBYINT:
	case ISD::FRINT: {
	// Replace fp rounding with their X86 specific equivalent so we don't
	// need 2 sets of patterns.
	unsigned Imm;
	switch (N->getOpcode()) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::FCEIL: Imm = 0xA; break;
	case ISD::FFLOOR: Imm = 0x9; break;
	case ISD::FTRUNC: Imm = 0xB; break;
	case ISD::FNEARBYINT: Imm = 0xC; break;
	case ISD::FRINT: Imm = 0x4; break;
	}
	SDLoc dl(N);
	SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
	N->getValueType(0),
	N->getOperand(0),
	CurDAG->getConstant(Imm, dl, MVT::i8));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}
	case X86ISD::FANDN:
	case X86ISD::FAND:
	case X86ISD::FOR:
	case X86ISD::FXOR: {
	// Widen scalar fp logic ops to vector to reduce isel patterns.
	// FIXME: Can we do this during lowering/combine.
	MVT VT = N->getSimpleValueType(0);
	if (VT.isVector() \|\| VT == MVT::f128)
	break;

	MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
	SDLoc dl(N);
	SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
	N->getOperand(0));
	SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
	N->getOperand(1));

	SDValue Res;
	if (Subtarget->hasSSE2()) {
	EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
	Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
	Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
	unsigned Opc;
	switch (N->getOpcode()) {
	default: llvm_unreachable("Unexpected opcode!");
	case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
	case X86ISD::FAND: Opc = ISD::AND; break;
	case X86ISD::FOR: Opc = ISD::OR; break;
	case X86ISD::FXOR: Opc = ISD::XOR; break;
	}
	Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
	Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
	} else {
	Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
	}
	Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
	CurDAG->getIntPtrConstant(0, dl));
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
	++I;
	CurDAG->DeleteNode(N);
	continue;
	}
	}

	if (OptLevel != CodeGenOpt::None &&
	// Only do this when the target can fold the load into the call or
	// jmp.
	!Subtarget->useRetpolineIndirectCalls() &&
	((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) \|\|
	(N->getOpcode() == X86ISD::TC_RETURN &&
	(Subtarget->is64Bit() \|\|
	!getTargetMachine().isPositionIndependent())))) {
	/// Also try moving call address load from outside callseq_start to just
	/// before the call to allow it to be folded.
	///
	/// [Load chain]
	/// ^
	/// \|
	/// [Load]
	/// ^ ^
	/// \| \|
	/// / \--
	/// / \|
	///[CALLSEQ_START] \|
	/// ^ \|
	/// \| \|
	/// [LOAD/C2Reg] \|
	/// \| \|
	/// \ /
	/// \ /
	/// [CALL]
	bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
	SDValue Chain = N->getOperand(0);
	SDValue Load = N->getOperand(1);
	if (!isCalleeLoad(Load, Chain, HasCallSeq))
	continue;
	moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
	++NumLoadMoved;
	continue;
	}

	// Lower fpround and fpextend nodes that target the FP stack to be store and
	// load to the stack. This is a gross hack. We would like to simply mark
	// these as being illegal, but when we do that, legalize produces these when
	// it expands calls, then expands these in the same legalize pass. We would
	// like dag combine to be able to hack on these between the call expansion
	// and the node legalization. As such this pass basically does "really
	// late" legalization of these inline with the X86 isel pass.
	// FIXME: This should only happen when not compiled with -O0.
	switch (N->getOpcode()) {
	default: continue;
	case ISD::FP_ROUND:
	case ISD::FP_EXTEND:
	{
	MVT SrcVT = N->getOperand(0).getSimpleValueType();
	MVT DstVT = N->getSimpleValueType(0);

	// If any of the sources are vectors, no fp stack involved.
	if (SrcVT.isVector() \|\| DstVT.isVector())
	continue;

	// If the source and destination are SSE registers, then this is a legal
	// conversion that should not be lowered.
	const X86TargetLowering *X86Lowering =
	static_cast<const X86TargetLowering *>(TLI);
	bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
	bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
	if (SrcIsSSE && DstIsSSE)
	continue;

	if (!SrcIsSSE && !DstIsSSE) {
	// If this is an FPStack extension, it is a noop.
	if (N->getOpcode() == ISD::FP_EXTEND)
	continue;
	// If this is a value-preserving FPStack truncation, it is a noop.
	if (N->getConstantOperandVal(1))
	continue;
	}

	// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
	// FPStack has extload and truncstore. SSE can fold direct loads into other
	// operations. Based on this, decide what we want to do.
	MVT MemVT;
	if (N->getOpcode() == ISD::FP_ROUND)
	MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
	else
	MemVT = SrcIsSSE ? SrcVT : DstVT;

	SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
	SDLoc dl(N);

	// FIXME: optimize the case where the src/dest is a load or store?

	SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
	MemTmp, MachinePointerInfo(), MemVT);
	SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
	MachinePointerInfo(), MemVT);

	// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
	// extload we created. This will cause general havok on the dag because
	// anything below the conversion could be folded into other existing nodes.
	// To avoid invalidating 'I', back it up to the convert node.
	--I;
	CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
	break;
	}

	//The sequence of events for lowering STRICT_FP versions of these nodes requires
	//dealing with the chain differently, as there is already a preexisting chain.
	case ISD::STRICT_FP_ROUND:
	case ISD::STRICT_FP_EXTEND:
	{
	MVT SrcVT = N->getOperand(1).getSimpleValueType();
	MVT DstVT = N->getSimpleValueType(0);

	// If any of the sources are vectors, no fp stack involved.
	if (SrcVT.isVector() \|\| DstVT.isVector())
	continue;

	// If the source and destination are SSE registers, then this is a legal
	// conversion that should not be lowered.
	const X86TargetLowering *X86Lowering =
	static_cast<const X86TargetLowering *>(TLI);
	bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
	bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
	if (SrcIsSSE && DstIsSSE)
	continue;

	if (!SrcIsSSE && !DstIsSSE) {
	// If this is an FPStack extension, it is a noop.
	if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
	continue;
	// If this is a value-preserving FPStack truncation, it is a noop.
	if (N->getConstantOperandVal(2))
	continue;
	}

	// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
	// FPStack has extload and truncstore. SSE can fold direct loads into other
	// operations. Based on this, decide what we want to do.
	MVT MemVT;
	if (N->getOpcode() == ISD::STRICT_FP_ROUND)
	MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
	else
	MemVT = SrcIsSSE ? SrcVT : DstVT;

	SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
	SDLoc dl(N);

	// FIXME: optimize the case where the src/dest is a load or store?

	//Since the operation is StrictFP, use the preexisting chain.
	SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
	MemTmp, MachinePointerInfo(), MemVT);
	SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
	MachinePointerInfo(), MemVT);

	// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
	// extload we created. This will cause general havok on the dag because
	// anything below the conversion could be folded into other existing nodes.
	// To avoid invalidating 'I', back it up to the convert node.
	--I;
	CurDAG->ReplaceAllUsesWith(N, Result.getNode());
	break;
	}
	}


	// Now that we did that, the node is dead. Increment the iterator to the
	// next node to process, then delete N.
	++I;
	CurDAG->DeleteNode(N);
	}

	// The load+call transform above can leave some dead nodes in the graph. Make
	// sure we remove them. Its possible some of the other transforms do to so
	// just remove dead nodes unconditionally.
	CurDAG->RemoveDeadNodes();
	}

	// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
	bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
	  unsigned Opc = N->getMachineOpcode();
	  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
	      Opc != X86::MOVSX64rr8)
	    return false;

	  SDValue N0 = N->getOperand(0);

	  // We need to be extracting the low 8-bit subregister of an extend.
	  if (!N0.isMachineOpcode() ||
	      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
	      N0.getConstantOperandVal(1) != X86::sub_8bit)
	    return false;

	  // We're looking for either a movsx or movzx to match the original opcode.
	  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
	                                                : X86::MOVSX32rr8_NOREX;
	  SDValue N00 = N0.getOperand(0);
	  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
	    return false;

	  if (Opc == X86::MOVSX64rr8) {
	    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
	    // to 64.
	    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
	                                                   MVT::i64, N00);
	    ReplaceUses(N, Extend);
	  } else {
	    // Ok we can drop this extend and just use the original extend.
	    ReplaceUses(N, N00.getNode());
	  }

	  return true;
	}

	void X86DAGToDAGISel::PostprocessISelDAG() {
	// Skip peepholes at -O0.
	if (TM.getOptLevel() == CodeGenOpt::None)
	return;

	SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

	bool MadeChange = false;
	while (Position != CurDAG->allnodes_begin()) {
	SDNode N = &--Position;
	// Skip dead nodes and any non-machine opcodes.
	if (N->use_empty() \|\| !N->isMachineOpcode())
	continue;

	if (tryOptimizeRem8Extend(N)) {
	MadeChange = true;
	continue;
	}

	// Look for a TESTrr+ANDrr pattern where both operands of the test are
	// the same. Rewrite to remove the AND.
	unsigned Opc = N->getMachineOpcode();
	if ((Opc == X86::TEST8rr \|\| Opc == X86::TEST16rr \|\|
	Opc == X86::TEST32rr \|\| Opc == X86::TEST64rr) &&
	N->getOperand(0) == N->getOperand(1) &&
	N->isOnlyUserOf(N->getOperand(0).getNode()) &&
	N->getOperand(0).isMachineOpcode()) {
	SDValue And = N->getOperand(0);
	unsigned N0Opc = And.getMachineOpcode();
	if (N0Opc == X86::AND8rr \|\| N0Opc == X86::AND16rr \|\|
	N0Opc == X86::AND32rr \|\| N0Opc == X86::AND64rr) {
	MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
	MVT::i32,
	And.getOperand(0),
	And.getOperand(1));
	ReplaceUses(N, Test);
	MadeChange = true;
	continue;
	}
	if (N0Opc == X86::AND8rm \|\| N0Opc == X86::AND16rm \|\|
	N0Opc == X86::AND32rm \|\| N0Opc == X86::AND64rm) {
	unsigned NewOpc;
	switch (N0Opc) {
	case X86::AND8rm: NewOpc = X86::TEST8mr; break;
	case X86::AND16rm: NewOpc = X86::TEST16mr; break;
	case X86::AND32rm: NewOpc = X86::TEST32mr; break;
	case X86::AND64rm: NewOpc = X86::TEST64mr; break;
	}

	// Need to swap the memory and register operand.
	SDValue Ops[] = { And.getOperand(1),
	And.getOperand(2),
	And.getOperand(3),
	And.getOperand(4),
	And.getOperand(5),
	And.getOperand(0),
	And.getOperand(6) /* Chain */ };
	MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
	MVT::i32, MVT::Other, Ops);
	ReplaceUses(N, Test);
	MadeChange = true;
	continue;
	}
	}

	// Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
	// used. We're doing this late so we can prefer to fold the AND into masked
	// comparisons. Doing that can be better for the live range of the mask
	// register.
	if ((Opc == X86::KORTESTBrr \|\| Opc == X86::KORTESTWrr \|\|
	Opc == X86::KORTESTDrr \|\| Opc == X86::KORTESTQrr) &&
	N->getOperand(0) == N->getOperand(1) &&
	N->isOnlyUserOf(N->getOperand(0).getNode()) &&
	N->getOperand(0).isMachineOpcode() &&
	onlyUsesZeroFlag(SDValue(N, 0))) {
	SDValue And = N->getOperand(0);
	unsigned N0Opc = And.getMachineOpcode();
	// KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
	// KAND instructions and KTEST use the same ISA feature.
	if (N0Opc == X86::KANDBrr \|\|
	(N0Opc == X86::KANDWrr && Subtarget->hasDQI()) \|\|
	N0Opc == X86::KANDDrr \|\| N0Opc == X86::KANDQrr) {
	unsigned NewOpc;
	switch (Opc) {
	default: llvm_unreachable("Unexpected opcode!");
	case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
	case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
	case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
	case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
	}
	MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
	MVT::i32,
	And.getOperand(0),
	And.getOperand(1));
	ReplaceUses(N, KTest);
	MadeChange = true;
	continue;
	}
	}

	// Attempt to remove vectors moves that were inserted to zero upper bits.
	if (Opc != TargetOpcode::SUBREG_TO_REG)
	continue;

	unsigned SubRegIdx = N->getConstantOperandVal(2);
	if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
	continue;

	SDValue Move = N->getOperand(1);
	if (!Move.isMachineOpcode())
	continue;

	// Make sure its one of the move opcodes we recognize.
	switch (Move.getMachineOpcode()) {
	default:
	continue;
	case X86::VMOVAPDrr: case X86::VMOVUPDrr:
	case X86::VMOVAPSrr: case X86::VMOVUPSrr:
	case X86::VMOVDQArr: case X86::VMOVDQUrr:
	case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
	case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
	case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
	case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
	case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
	case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
	case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
	case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
	case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
	case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
	case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
	break;
	}

	SDValue In = Move.getOperand(0);
	if (!In.isMachineOpcode() \|\|
	In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
	continue;

	// Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
	// the SHA instructions which use a legacy encoding.
	uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
	if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
	(TSFlags & X86II::EncodingMask) != X86II::EVEX &&
	(TSFlags & X86II::EncodingMask) != X86II::XOP)
	continue;

	// Producing instruction is another vector instruction. We can drop the
	// move.
	CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
	MadeChange = true;
	}

	if (MadeChange)
	CurDAG->RemoveDeadNodes();
	}


	/// Emit any code that needs to be executed only in the main function.
	void X86DAGToDAGISel::emitSpecialCodeForMain() {
	if (Subtarget->isTargetCygMing()) {
	TargetLowering::ArgListTy Args;
	auto &DL = CurDAG->getDataLayout();

	TargetLowering::CallLoweringInfo CLI(*CurDAG);
	CLI.setChain(CurDAG->getRoot())
	.setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
	CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
	std::move(Args));
	const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
	std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
	CurDAG->setRoot(Result.second);
	}
	}

	void X86DAGToDAGISel::EmitFunctionEntryCode() {
	// If this is main, emit special code for main.
	const Function &F = MF->getFunction();
	if (F.hasExternalLinkage() && F.getName() == "main")
	emitSpecialCodeForMain();
	}

	/// Return true if \p Val may be used as an explicit displacement together
	/// with a frame-index base.
	static bool isDispSafeForFrameIndex(int64_t Val) {
	  // A frame index contributes its own displacement, and on 64-bit platforms
	  // the sum of that and the explicit displacement could overflow the
	  // displacement field. If we assume the frame index part fits in a signed
	  // 31-bit integer (only slightly stronger than the existing fundamental
	  // assumption that it fits in 32 bits), then an explicit displacement that
	  // also fits in 31 bits is always safe.
	  constexpr unsigned SafeDispBits = 31;
	  return isInt<SafeDispBits>(Val);
	}

	bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
	X86ISelAddressMode &AM) {
	// If there's no offset to fold, we don't need to do any work.
	if (Offset == 0)
	return false;

	// Cannot combine ExternalSymbol displacements with integer offsets.
	if (AM.ES \|\| AM.MCSym)
	return true;

	int64_t Val = AM.Disp + Offset;
	CodeModel::Model M = TM.getCodeModel();
	if (Subtarget->is64Bit()) {
	if (!X86::isOffsetSuitableForCodeModel(Val, M,
	AM.hasSymbolicDisplacement()))
	return true;
	// In addition to the checks required for a register base, check that
	// we do not try to use an unsafe Disp with a frame index.
	if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
	!isDispSafeForFrameIndex(Val))
	return true;
	}
	AM.Disp = Val;
	return false;

	}

	/// Try to turn a load of address 0 from the GS/FS address spaces into a
	/// segment-register reference in \p AM.
	///
	/// \returns false if the segment was recorded in \p AM, true otherwise.
	bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
	  // load gs:0 -> GS segment register.
	  // load fs:0 -> FS segment register.
	  //
	  // This optimization is valid because the GNU TLS model defines that
	  // gs:0 (or fs:0 on X86-64) contains its own address.
	  // For more information see http://people.redhat.com/drepper/tls.pdf
	  SDValue Address = N->getOperand(1);
	  auto *AddrC = dyn_cast<ConstantSDNode>(Address);

	  // Only a constant-zero address is interesting.
	  if (!AddrC || AddrC->getSExtValue() != 0)
	    return true;

	  // A segment must not have been chosen yet, and indirect TLS segment
	  // references must not have been requested.
	  if (AM.Segment.getNode() != nullptr || IndirectTlsSegRefs)
	    return true;

	  // Restrict the fold to the targets this code knows about.
	  const bool SupportedTarget = Subtarget->isTargetGlibc() ||
	                               Subtarget->isTargetAndroid() ||
	                               Subtarget->isTargetFuchsia();
	  if (!SupportedTarget)
	    return true;

	  switch (N->getPointerInfo().getAddrSpace()) {
	  case 256:
	    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	    return false;
	  case 257:
	    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	    return false;
	  default:
	    // Address space 258 is not handled here, because it is not used to
	    // address TLS areas.
	    break;
	  }

	  return true;
	}

	/// Fold an X86ISD::Wrapper / X86ISD::WrapperRIP node into \p AM. Such nodes
	/// wrap something that eventually resolves to a symbol reference.
	///
	/// \returns true if no match is possible (\p AM is left as it was on entry),
	/// false if the symbol was folded into \p AM.
	bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
	  // Once the addressing mode has a symbolic displacement, a second symbol can
	  // never be added.
	  if (AM.hasSymbolicDisplacement())
	    return true;

	  const bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
	  const bool IsRIPRelTLS =
	      IsRIPRel &&
	      N.getOperand(0).getOpcode() == ISD::TargetGlobalTLSAddress;

	  // We can't use an addressing mode in the 64-bit large code model; global
	  // TLS addressing is the exception. In the medium code model we can use a
	  // mode when RIP wrappers are present, since that signifies access to
	  // globals that are known to be "near", such as the GOT itself.
	  const CodeModel::Model CM = TM.getCodeModel();
	  if (Subtarget->is64Bit()) {
	    if (CM == CodeModel::Large && !IsRIPRelTLS)
	      return true;
	    if (CM == CodeModel::Medium && !IsRIPRel)
	      return true;
	  }

	  // Using %rip as the base requires that base and index are both unused.
	  if (IsRIPRel && AM.hasBaseOrIndexReg())
	    return true;

	  // Keep a copy so the fold can be undone if the offset doesn't fit.
	  X86ISelAddressMode Saved = AM;

	  int64_t SymOffset = 0;
	  SDValue Sym = N.getOperand(0);
	  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Sym)) {
	    AM.GV = GA->getGlobal();
	    AM.SymbolFlags = GA->getTargetFlags();
	    SymOffset = GA->getOffset();
	  } else if (auto *Pool = dyn_cast<ConstantPoolSDNode>(Sym)) {
	    AM.CP = Pool->getConstVal();
	    AM.Align = Pool->getAlignment();
	    AM.SymbolFlags = Pool->getTargetFlags();
	    SymOffset = Pool->getOffset();
	  } else if (auto *Ext = dyn_cast<ExternalSymbolSDNode>(Sym)) {
	    AM.ES = Ext->getSymbol();
	    AM.SymbolFlags = Ext->getTargetFlags();
	  } else if (auto *MCS = dyn_cast<MCSymbolSDNode>(Sym)) {
	    AM.MCSym = MCS->getMCSymbol();
	  } else if (auto *Table = dyn_cast<JumpTableSDNode>(Sym)) {
	    AM.JT = Table->getIndex();
	    AM.SymbolFlags = Table->getTargetFlags();
	  } else if (auto *Block = dyn_cast<BlockAddressSDNode>(Sym)) {
	    AM.BlockAddr = Block->getBlockAddress();
	    AM.SymbolFlags = Block->getTargetFlags();
	    SymOffset = Block->getOffset();
	  } else {
	    llvm_unreachable("Unhandled symbol reference node.");
	  }

	  // The symbol's own offset must also fit; otherwise roll everything back.
	  if (foldOffsetIntoAddress(SymOffset, AM)) {
	    AM = Saved;
	    return true;
	  }

	  if (IsRIPRel)
	    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

	  // The fold is safe, so the changes made to AM stand.
	  return false;
	}

	/// Pattern-match \p N into the addressing mode \p AM and then apply two
	/// encoding-size post-processing steps.
	///
	/// \returns true if \p N cannot be added to the addressing mode.
	bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
	  if (matchAddressRecursively(N, AM, 0))
	    return true;

	  // Post-processing: rewrite lea(,%reg,2) as lea(%reg,%reg), which has a
	  // smaller encoding and avoids a scaled index.
	  const bool ScaledByTwoNoBase =
	      AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase &&
	      AM.Base_Reg.getNode() == nullptr;
	  if (ScaledByTwoNoBase) {
	    AM.Base_Reg = AM.IndexReg;
	    AM.Scale = 1;
	  }

	  // Post-processing: rewrite foo as foo(%rip), even in non-PIC mode, because
	  // it has a smaller encoding.
	  // TODO: Which other code models can use this?
	  const CodeModel::Model CM = TM.getCodeModel();
	  if (CM == CodeModel::Small || CM == CodeModel::Kernel) {
	    const bool BareSymbol =
	        Subtarget->is64Bit() && AM.Scale == 1 &&
	        AM.BaseType == X86ISelAddressMode::RegBase &&
	        AM.Base_Reg.getNode() == nullptr &&
	        AM.IndexReg.getNode() == nullptr &&
	        AM.SymbolFlags == X86II::MO_NO_FLAG &&
	        AM.hasSymbolicDisplacement();
	    if (BareSymbol)
	      AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
	  }

	  return false;
	}

	/// Try to fold both operands of the add-like node \p N into \p AM.
	///
	/// \param N     The node being matched. Taken by reference; on the later
	///              exit paths it is re-read from the tracking handle.
	/// \param AM    Addressing mode being built up.
	/// \param Depth Current recursion depth; operands are matched at Depth+1.
	/// \returns false if something was folded into \p AM; true if nothing could
	/// be folded, in which case \p AM holds the value it had on entry.
	bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
	unsigned Depth) {
	// Add an artificial use to this node so that we can keep track of
	// it if it gets CSE'd with a different node.
	HandleSDNode Handle(N);

	// Snapshot the addressing mode so each failed attempt can be rolled back.
	X86ISelAddressMode Backup = AM;
	// First attempt: operand 0, then operand 1. After the first recursive call
	// the operands are read through the handle rather than through N
	// (presumably because N may have been replaced meanwhile).
	if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
	!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
	return false;
	AM = Backup;

	// Try again after commuting the operands.
	if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
	!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
	return false;
	AM = Backup;

	// If we couldn't fold both operands into the address at the same time,
	// see if we can just put each operand into a register and fold at least
	// the add.
	if (AM.BaseType == X86ISelAddressMode::RegBase &&
	!AM.Base_Reg.getNode() &&
	!AM.IndexReg.getNode()) {
	// Base and index are both free: use operand 0 as base and operand 1 as an
	// unscaled index.
	N = Handle.getValue();
	AM.Base_Reg = N.getOperand(0);
	AM.IndexReg = N.getOperand(1);
	AM.Scale = 1;
	return false;
	}
	// Nothing matched; hand the (possibly updated) node back to the caller.
	N = Handle.getValue();
	return true;
	}

	// Make sure N sits no later than Pos in the DAG's node ordering.
	//
	// If N has no node ID yet (-1), or its uninvalidated ID is greater than
	// that of Pos, N is repositioned in front of Pos and given Pos's ID.
	// Otherwise nothing happens. Note that this does not preserve the
	// uniqueness of node IDs! The selection DAG must no longer depend on their
	// uniqueness when this is used.
	static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
	  // Already numbered and not ordered after Pos: leave it where it is.
	  const bool HasId = N->getNodeId() != -1;
	  if (HasId && SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) <=
	                   SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))
	    return;

	  DAG.RepositionNode(Pos->getIterator(), N.getNode());

	  // Mark Node as invalid for pruning as after this it may be a successor to a
	  // selected node but otherwise be in the same position of Pos.
	  // Conservatively mark it with the same -abs(Id) to assure node id
	  // invariant is preserved.
	  N->setNodeId(Pos->getNodeId());
	  SelectionDAGISel::InvalidateNodeId(N.getNode());
	}

	// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
	// safe. This allows us to convert the shift and the AND into an h-register
	// extract and a scaled index. Returns false if the simplification is
	// performed.
	//
	// N is the AND node, Mask its constant mask, Shift the AND's shifted
	// operand and X the value being shifted. On success N is replaced in the
	// DAG by the new SHL, AM.IndexReg is set to the new AND and AM.Scale to
	// 1 << C1. On failure (return true) neither the DAG nor AM is modified.
	static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
	uint64_t Mask,
	SDValue Shift, SDValue X,
	X86ISelAddressMode &AM) {
	// Only a single-use logical right shift by a constant qualifies.
	if (Shift.getOpcode() != ISD::SRL \|\|
	!isa<ConstantSDNode>(Shift.getOperand(1)) \|\|
	!Shift.hasOneUse())
	return true;

	// The shift amount is (8 - C1), so this recovers C1. It must be 1, 2 or 3,
	// and the mask must be exactly 0xff shifted left by that amount.
	int ScaleLog = 8 - Shift.getConstantOperandVal(1);
	if (ScaleLog <= 0 \|\| ScaleLog >= 4 \|\|
	Mask != (0xffu << ScaleLog))
	return true;

	// Build ((X >> 8) & 0xff) << C1.
	MVT VT = N.getSimpleValueType();
	SDLoc DL(N);
	SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
	SDValue NewMask = DAG.getConstant(0xff, DL, VT);
	SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
	SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
	SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

	// Insert the new nodes into the topological ordering. We must do this in
	// a valid topological ordering as nothing is going to go back and re-sort
	// these nodes. We continually insert before 'N' in sequence as this is
	// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	// hierarchy left to express.
	insertDAGNode(DAG, N, Eight);
	insertDAGNode(DAG, N, Srl);
	insertDAGNode(DAG, N, NewMask);
	insertDAGNode(DAG, N, And);
	insertDAGNode(DAG, N, ShlCount);
	insertDAGNode(DAG, N, Shl);
	DAG.ReplaceAllUsesWith(N, Shl);
	DAG.RemoveDeadNode(N.getNode());
	// The SHL is absorbed by the addressing mode: index = the new AND,
	// scale = 2^C1.
	AM.IndexReg = And;
	AM.Scale = (1 << ScaleLog);
	return false;
	}

	// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
	// allows us to fold the shift into this addressing mode. Returns false if the
	// transform succeeded.
	//
	// N is the AND node; its operand 1 must already be a ConstantSDNode because
	// it is cast<> unconditionally below. On success N is replaced in the DAG by
	// the new SHL, AM.IndexReg is set to the new AND and AM.Scale to 1 << C1.
	// On failure (return true) neither the DAG nor AM is modified.
	static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
	X86ISelAddressMode &AM) {
	SDValue Shift = N.getOperand(0);

	// Use a signed mask so that shifting right will insert sign bits. These
	// bits will be removed when we shift the result left so it doesn't matter
	// what we use. This might allow a smaller immediate encoding.
	int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

	// If we have an any_extend feeding the AND, look through it to see if there
	// is a shift behind it. But only if the AND doesn't use the extended bits.
	// FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
	bool FoundAnyExtend = false;
	if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
	Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
	isUInt<32>(Mask)) {
	FoundAnyExtend = true;
	Shift = Shift.getOperand(0);
	}

	// What is left must be a left shift by a constant amount.
	if (Shift.getOpcode() != ISD::SHL \|\|
	!isa<ConstantSDNode>(Shift.getOperand(1)))
	return true;

	SDValue X = Shift.getOperand(0);

	// Not likely to be profitable if either the AND or SHIFT node has more
	// than one use (unless all uses are for address computation). Besides,
	// isel mechanism requires their node ids to be reused.
	if (!N.hasOneUse() \|\| !Shift.hasOneUse())
	return true;

	// Verify that the shift amount is something we can fold.
	unsigned ShiftAmt = Shift.getConstantOperandVal(1);
	if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
	return true;

	MVT VT = N.getSimpleValueType();
	SDLoc DL(N);
	if (FoundAnyExtend) {
	// The shift was found behind an any_extend, so re-create the extend
	// directly on X to bring it to the AND's type.
	SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
	insertDAGNode(DAG, N, NewX);
	X = NewX;
	}

	// Build (X & (C2 >> C1)) << C1, reusing the original shift-amount node.
	SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
	SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
	SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

	// Insert the new nodes into the topological ordering. We must do this in
	// a valid topological ordering as nothing is going to go back and re-sort
	// these nodes. We continually insert before 'N' in sequence as this is
	// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	// hierarchy left to express.
	insertDAGNode(DAG, N, NewMask);
	insertDAGNode(DAG, N, NewAnd);
	insertDAGNode(DAG, N, NewShift);
	DAG.ReplaceAllUsesWith(N, NewShift);
	DAG.RemoveDeadNode(N.getNode());

	// The shift becomes the scale; the masked value becomes the index.
	AM.Scale = 1 << ShiftAmt;
	AM.IndexReg = NewAnd;
	return false;
	}

	// Implement some heroics to detect shifts of masked values where the mask can
	// be replaced by extending the shift and undoing that in the addressing mode
	// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
	// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
	// the addressing mode. This results in code such as:
	//
	//   int f(short *y, int *lookup_table) {
	//     ...
	//     return *y + lookup_table[*y >> 11];
	//   }
	//
	// Turning into:
	//   movzwl (%rdi), %eax
	//   movl %eax, %ecx
	//   shrl $11, %ecx
	//   addl (%rsi,%rcx,4), %eax
	//
	// Instead of:
	//   movzwl (%rdi), %eax
	//   movl %eax, %ecx
	//   shrl $9, %ecx
	//   andl $124, %rcx
	//   addl (%rsi,%rcx), %eax
	//
	// Note that this function assumes the mask is provided as a mask after the
	// value is shifted. The input chain may or may not match that, but computing
	// such a mask is trivial.
	//
	// N is the node to replace, Shift the SRL-by-constant being widened and X
	// the value it shifts. Returns false if the transform was performed: N is
	// then replaced in the DAG by a new SHL, AM.IndexReg is set to the new SRL
	// and AM.Scale to 1 << (trailing zeros of Mask). Returns true, with the DAG
	// and AM untouched, if the pattern does not apply.
	static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
	                                    uint64_t Mask,
	                                    SDValue Shift, SDValue X,
	                                    X86ISelAddressMode &AM) {
	  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
	      !isa<ConstantSDNode>(Shift.getOperand(1)))
	    return true;

	  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
	  unsigned MaskLZ = countLeadingZeros(Mask);
	  unsigned MaskTZ = countTrailingZeros(Mask);

	  // The amount of shift we're trying to fit into the addressing mode is taken
	  // from the trailing zeros of the mask.
	  unsigned AMShiftAmt = MaskTZ;

	  // There is nothing we can do here unless the mask is removing some bits.
	  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
	  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

	  // We also need to ensure that mask is a continuous run of bits.
	  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

	  // Scale the leading zero count down based on the actual size of the value.
	  // Also scale it down based on the size of the shift.
	  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
	  if (MaskLZ < ScaleDown)
	    return true;
	  MaskLZ -= ScaleDown;

	  // The final check is to ensure that any masked out high bits of X are
	  // already known to be zero. Otherwise, the mask has a semantic impact
	  // other than masking out a couple of low bits. Unfortunately, because of
	  // the mask, zero extensions will be removed from operands in some cases.
	  // This code works extra hard to look through extensions because we can
	  // replace them with zero extensions cheaply if necessary.
	  bool ReplacingAnyExtend = false;
	  if (X.getOpcode() == ISD::ANY_EXTEND) {
	    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
	                          X.getOperand(0).getSimpleValueType().getSizeInBits();
	    // Assume that we'll replace the any-extend with a zero-extend, and
	    // narrow the search to the extended value.
	    X = X.getOperand(0);
	    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
	    ReplacingAnyExtend = true;
	  }
	  APInt MaskedHighBits =
	      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
	  // Only the masked-out bits have to be known zero. Requiring the known-zero
	  // set to be *exactly* those bits would needlessly reject values for which
	  // additional bits happen to be known zero as well.
	  if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
	    return true;

	  // We've identified a pattern that can be transformed into a single shift
	  // and an addressing mode. Make it so.
	  MVT VT = N.getSimpleValueType();
	  if (ReplacingAnyExtend) {
	    assert(X.getValueType() != VT);
	    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
	    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
	    insertDAGNode(DAG, N, NewX);
	    X = NewX;
	  }
	  SDLoc DL(N);
	  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
	  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
	  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
	  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

	  // Insert the new nodes into the topological ordering. We must do this in
	  // a valid topological ordering as nothing is going to go back and re-sort
	  // these nodes. We continually insert before 'N' in sequence as this is
	  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	  // hierarchy left to express.
	  insertDAGNode(DAG, N, NewSRLAmt);
	  insertDAGNode(DAG, N, NewSRL);
	  insertDAGNode(DAG, N, NewSHLAmt);
	  insertDAGNode(DAG, N, NewSHL);
	  DAG.ReplaceAllUsesWith(N, NewSHL);
	  DAG.RemoveDeadNode(N.getNode());

	  // The trailing SHL is absorbed by the addressing mode's scale.
	  AM.Scale = 1 << AMShiftAmt;
	  AM.IndexReg = NewSRL;
	  return false;
	}

	// Transform "(X >> SHIFT) & (MASK << C1)" to
	// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
	// matched to a BEXTR later. Returns false if the simplification is performed.
	//
	// N is the AND node, Mask its constant mask (MASK << C1), Shift the AND's
	// SRL operand and X the value being shifted. On success N is replaced in the
	// DAG by the new SHL, AM.IndexReg is set to the new AND and AM.Scale to
	// 1 << C1. On failure (return true) neither the DAG nor AM is modified.
	static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
	uint64_t Mask,
	SDValue Shift, SDValue X,
	X86ISelAddressMode &AM,
	const X86Subtarget &Subtarget) {
	// Both the AND and the constant-amount SRL must be single-use.
	if (Shift.getOpcode() != ISD::SRL \|\|
	!isa<ConstantSDNode>(Shift.getOperand(1)) \|\|
	!Shift.hasOneUse() \|\| !N.hasOneUse())
	return true;

	// Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
	if (!Subtarget.hasTBM() &&
	!(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
	return true;

	// We need to ensure that mask is a continuous run of bits.
	if (!isShiftedMask_64(Mask)) return true;

	unsigned ShiftAmt = Shift.getConstantOperandVal(1);

	// The amount of shift we're trying to fit into the addressing mode is taken
	// from the trailing zeros of the mask.
	unsigned AMShiftAmt = countTrailingZeros(Mask);

	// There is nothing we can do here unless the mask is removing some bits.
	// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
	if (AMShiftAmt <= 0 \|\| AMShiftAmt > 3) return true;

	// Build ((X >> (SHIFT + C1)) & MASK) << C1.
	MVT VT = N.getSimpleValueType();
	SDLoc DL(N);
	SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
	SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
	SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
	SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
	SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
	SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);

	// Insert the new nodes into the topological ordering. We must do this in
	// a valid topological ordering as nothing is going to go back and re-sort
	// these nodes. We continually insert before 'N' in sequence as this is
	// essentially a pre-flattened and pre-sorted sequence of nodes. There is no
	// hierarchy left to express.
	insertDAGNode(DAG, N, NewSRLAmt);
	insertDAGNode(DAG, N, NewSRL);
	insertDAGNode(DAG, N, NewMask);
	insertDAGNode(DAG, N, NewAnd);
	insertDAGNode(DAG, N, NewSHLAmt);
	insertDAGNode(DAG, N, NewSHL);
	DAG.ReplaceAllUsesWith(N, NewSHL);
	DAG.RemoveDeadNode(N.getNode());

	// The trailing SHL is absorbed by the addressing mode's scale; the masked,
	// shifted value becomes the index.
	AM.Scale = 1 << AMShiftAmt;
	AM.IndexReg = NewAnd;
	return false;
	}

	/// Recursively try to fold the address expression \p N into \p AM.
	///
	/// \param N     Node to fold into the addressing mode.
	/// \param AM    Addressing mode being built up; updated on success.
	/// \param Depth Recursion depth. Beyond a depth of 5 no further pattern
	///              matching is attempted and \p N is handed straight to
	///              matchAddressBase.
	/// \returns false if \p N was absorbed into \p AM, true if it could not be
	/// matched.
	bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
	                                              unsigned Depth) {
	  LLVM_DEBUG({
	    dbgs() << "MatchAddress: ";
	    AM.dump(CurDAG);
	  });
	  // Limit recursion.
	  if (Depth > 5)
	    return matchAddressBase(N, AM);

	  // If this is already a %rip relative address, we can only merge immediates
	  // into it. Instead of handling this in every case, we handle it here.
	  // RIP relative addressing: %rip + 32-bit displacement!
	  if (AM.isRIPRelative()) {
	    // FIXME: JumpTable and ExternalSymbol address currently don't like
	    // displacements. It isn't very important, but this should be fixed for
	    // consistency.
	    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
	      return true;

	    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
	      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
	        return false;
	    return true;
	  }

	  switch (N.getOpcode()) {
	  default: break;
	  case ISD::LOCAL_RECOVER: {
	    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
	      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
	        // Use the symbol and don't prefix it.
	        AM.MCSym = ESNode->getMCSymbol();
	        return false;
	      }
	    break;
	  }
	  case ISD::Constant: {
	    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
	    if (!foldOffsetIntoAddress(Val, AM))
	      return false;
	    break;
	  }

	  case X86ISD::Wrapper:
	  case X86ISD::WrapperRIP:
	    if (!matchWrapper(N, AM))
	      return false;
	    break;

	  case ISD::LOAD:
	    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
	      return false;
	    break;

	  case ISD::FrameIndex:
	    if (AM.BaseType == X86ISelAddressMode::RegBase &&
	        AM.Base_Reg.getNode() == nullptr &&
	        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
	      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
	      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
	      return false;
	    }
	    break;

	  case ISD::SHL:
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
	      break;

	    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
	      unsigned Val = CN->getZExtValue();
	      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
	      // that the base operand remains free for further matching. If
	      // the base doesn't end up getting used, a post-processing step
	      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
	      if (Val == 1 || Val == 2 || Val == 3) {
	        AM.Scale = 1 << Val;
	        SDValue ShVal = N.getOperand(0);

	        // Okay, we know that we have a scale by now. However, if the scaled
	        // value is an add of something and a constant, we can fold the
	        // constant into the disp field here.
	        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
	          AM.IndexReg = ShVal.getOperand(0);
	          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
	          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
	          if (!foldOffsetIntoAddress(Disp, AM))
	            return false;
	        }

	        AM.IndexReg = ShVal;
	        return false;
	      }
	    }
	    break;

	  case ISD::SRL: {
	    // Scale must not be used already.
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

	    // We only handle up to 64-bit values here as those are what matter for
	    // addressing mode optimizations.
	    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
	           "Unexpected value size!");

	    SDValue And = N.getOperand(0);
	    if (And.getOpcode() != ISD::AND) break;
	    SDValue X = And.getOperand(0);

	    // The mask used for the transform is expected to be post-shift, but we
	    // found the shift first so just apply the shift to the mask before passing
	    // it down.
	    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
	        !isa<ConstantSDNode>(And.getOperand(1)))
	      break;
	    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

	    // Try to fold the mask and shift into the scale, and return false if we
	    // succeed.
	    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
	      return false;
	    break;
	  }

	  case ISD::SMUL_LOHI:
	  case ISD::UMUL_LOHI:
	    // A mul_lohi where we need the low part can be folded as a plain multiply.
	    if (N.getResNo() != 0) break;
	    LLVM_FALLTHROUGH;
	  case ISD::MUL:
	  case X86ISD::MUL_IMM:
	    // X*[3,5,9] -> X+X*[2,4,8]
	    if (AM.BaseType == X86ISelAddressMode::RegBase &&
	        AM.Base_Reg.getNode() == nullptr &&
	        AM.IndexReg.getNode() == nullptr) {
	      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
	        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
	            CN->getZExtValue() == 9) {
	          AM.Scale = unsigned(CN->getZExtValue())-1;

	          SDValue MulVal = N.getOperand(0);
	          SDValue Reg;

	          // Okay, we know that we have a scale by now. However, if the scaled
	          // value is an add of something and a constant, we can fold the
	          // constant into the disp field here.
	          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
	              isa<ConstantSDNode>(MulVal.getOperand(1))) {
	            Reg = MulVal.getOperand(0);
	            ConstantSDNode *AddVal =
	                cast<ConstantSDNode>(MulVal.getOperand(1));
	            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
	            // If the scaled constant doesn't fit, fall back to using the
	            // whole multiplicand as the register.
	            if (foldOffsetIntoAddress(Disp, AM))
	              Reg = N.getOperand(0);
	          } else {
	            Reg = N.getOperand(0);
	          }

	          AM.IndexReg = AM.Base_Reg = Reg;
	          return false;
	        }
	    }
	    break;

	  case ISD::SUB: {
	    // Given A-B, if A can be completely folded into the address and
	    // the index field with the index field unused, use -B as the index.
	    // This is a win if a has multiple parts that can be folded into
	    // the address. Also, this saves a mov if the base register has
	    // other uses, since it avoids a two-address sub instruction, however
	    // it costs an additional mov if the index register has other uses.

	    // Add an artificial use to this node so that we can keep track of
	    // it if it gets CSE'd with a different node.
	    HandleSDNode Handle(N);

	    // Test if the LHS of the sub can be folded.
	    X86ISelAddressMode Backup = AM;
	    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
	      N = Handle.getValue();
	      AM = Backup;
	      break;
	    }
	    N = Handle.getValue();
	    // Test if the index field is free for use.
	    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
	      AM = Backup;
	      break;
	    }

	    int Cost = 0;
	    SDValue RHS = N.getOperand(1);
	    // If the RHS involves a register with multiple uses, this
	    // transformation incurs an extra mov, due to the neg instruction
	    // clobbering its operand.
	    if (!RHS.getNode()->hasOneUse() ||
	        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
	        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
	        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
	        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
	         RHS.getOperand(0).getValueType() == MVT::i32))
	      ++Cost;
	    // If the base is a register with multiple uses, this
	    // transformation may save a mov.
	    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
	         !AM.Base_Reg.getNode()->hasOneUse()) ||
	        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
	      --Cost;
	    // If the folded LHS was interesting, this transformation saves
	    // address arithmetic.
	    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
	        ((AM.Disp != 0) && (Backup.Disp == 0)) +
	        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
	      --Cost;
	    // If it doesn't look like it may be an overall win, don't do it.
	    if (Cost >= 0) {
	      AM = Backup;
	      break;
	    }

	    // Ok, the transformation is legal and appears profitable. Go for it.
	    // Negation will be emitted later to avoid creating dangling nodes if this
	    // was an unprofitable LEA.
	    AM.IndexReg = RHS;
	    AM.NegateIndex = true;
	    AM.Scale = 1;
	    return false;
	  }

	  case ISD::ADD:
	    if (!matchAdd(N, AM, Depth))
	      return false;
	    break;

	  case ISD::OR:
	    // We want to look through a transform in InstCombine and DAGCombiner that
	    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
	    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
	    // An 'lea' can then be used to match the shift (multiply) and add:
	    // and $1, %esi
	    // lea (%rsi, %rdi, 8), %rax
	    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
	        !matchAdd(N, AM, Depth))
	      return false;
	    break;

	  case ISD::AND: {
	    // Perform some heroic transforms on an and of a constant-count shift
	    // with a constant to enable use of the scaled offset field.

	    // Scale must not be used already.
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

	    // We only handle up to 64-bit values here as those are what matter for
	    // addressing mode optimizations.
	    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
	           "Unexpected value size!");

	    if (!isa<ConstantSDNode>(N.getOperand(1)))
	      break;

	    if (N.getOperand(0).getOpcode() == ISD::SRL) {
	      SDValue Shift = N.getOperand(0);
	      SDValue X = Shift.getOperand(0);

	      uint64_t Mask = N.getConstantOperandVal(1);

	      // Try to fold the mask and shift into an extract and scale.
	      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
	        return false;

	      // Try to fold the mask and shift directly into the scale.
	      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
	        return false;

	      // Try to fold the mask and shift into BEXTR and scale. The helper
	      // takes the DAG and the subtarget by reference, so both pointers are
	      // dereferenced here.
	      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
	        return false;
	    }

	    // Try to swap the mask and shift to place shifts which can be done as
	    // a scale on the outside of the mask.
	    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
	      return false;

	    break;
	  }
	  case ISD::ZERO_EXTEND: {
	    // Try to widen a zexted shift left to the same size as its use, so we can
	    // match the shift as a scale factor.
	    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
	      break;
	    if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
	      break;

	    // Give up if the shift is not a valid scale factor [1,2,3].
	    SDValue Shl = N.getOperand(0);
	    auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
	    if (!ShAmtC || ShAmtC->getZExtValue() > 3)
	      break;

	    // The narrow shift must only shift out zero bits (it must be 'nuw').
	    // That makes it safe to widen to the destination type.
	    APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
	                                            ShAmtC->getZExtValue());
	    if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
	      break;

	    // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
	    MVT VT = N.getSimpleValueType();
	    SDLoc DL(N);
	    SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
	    SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));

	    // Convert the shift to scale factor.
	    AM.Scale = 1 << ShAmtC->getZExtValue();
	    AM.IndexReg = Zext;

	    insertDAGNode(*CurDAG, N, Zext);
	    insertDAGNode(*CurDAG, N, NewShl);
	    CurDAG->ReplaceAllUsesWith(N, NewShl);
	    CurDAG->RemoveDeadNode(N.getNode());
	    return false;
	  }
	  }

	  // No special pattern applied: use the node as a plain base or index.
	  return matchAddressBase(N, AM);
	}

	/// Helper for MatchAddress. Place \p N into \p AM as a plain register, with
	/// no further recursion: as the base register if that slot is free,
	/// otherwise as an unscaled index.
	///
	/// \returns false if \p N was placed, true if both slots are already taken.
	bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
	  const bool BaseIsFree = AM.BaseType == X86ISelAddressMode::RegBase &&
	                          !AM.Base_Reg.getNode();
	  if (BaseIsFree) {
	    // Default, generate it as a register.
	    AM.BaseType = X86ISelAddressMode::RegBase;
	    AM.Base_Reg = N;
	    return false;
	  }

	  // The base is occupied. If the index is taken too, we cannot select it.
	  if (AM.IndexReg.getNode())
	    return true;

	  // Fall back to the (unscaled) index register.
	  AM.IndexReg = N;
	  AM.Scale = 1;
	  return false;
	}

	/// Helper for selectVectorAddr. Folds what it can of \p N into the base and
	/// displacement of a gather/scatter address. The index register and scale
	/// should have already been handled by the caller.
	///
	/// \returns false if \p N was folded into \p AM, true otherwise.
	bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
	  // TODO: Support other operations.
	  const unsigned Opc = N.getOpcode();
	  if (Opc == ISD::Constant) {
	    // A constant can become (part of) the displacement.
	    uint64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
	    if (!foldOffsetIntoAddress(Imm, AM))
	      return false;
	  } else if (Opc == X86ISD::Wrapper) {
	    // A wrapped symbol can become the symbolic displacement.
	    if (!matchWrapper(N, AM))
	      return false;
	  }

	  // Anything else, or a failed fold above, is treated as a plain register.
	  return matchAddressBase(N, AM);
	}

	/// Select the address operands for a masked gather/scatter node.
	///
	/// The index and scale are taken directly from Parent; N is matched into the
	/// base and displacement fields.
	///
	/// \returns true and fills Base/Scale/Index/Disp/Segment on success, false
	/// if N could not be matched.
	bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                                       SDValue &Scale, SDValue &Index,
	                                       SDValue &Disp, SDValue &Segment) {
	  X86ISelAddressMode AM;

	  auto *GatherScatter = cast<X86MaskedGatherScatterSDNode>(Parent);
	  AM.IndexReg = GatherScatter->getIndex();
	  AM.Scale = cast<ConstantSDNode>(GatherScatter->getScale())->getZExtValue();

	  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
	  switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
	  case 256:
	    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	    break;
	  case 257:
	    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	    break;
	  case 258:
	    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
	    break;
	  default:
	    break;
	  }

	  SDLoc DL(N);
	  MVT VT = N.getSimpleValueType();

	  // Try to match into the base and displacement fields.
	  const bool MatchFailed = matchVectorAddress(N, AM);
	  if (MatchFailed)
	    return false;

	  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
	  return true;
	}

	/// Pattern match an addressing mode for N.
	///
	/// On success the operands that make up the maximal addressing mode that
	/// could be matched are returned by reference and the result is true.
	///
	/// Parent is the parent node of the addr operand that is being matched. It
	/// is always a load, store, atomic node, or null. It is only null when
	/// checking memory operands for inline asm nodes.
	bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
	                                 SDValue &Scale, SDValue &Index,
	                                 SDValue &Disp, SDValue &Segment) {
	  X86ISelAddressMode AM;

	  // The opcodes listed below are all the nodes that have an "addr:$ptr"
	  // operand but are not a MemSDNode, and thus don't have proper addrspace
	  // info. Every other non-null parent is queried for its address space.
	  bool HasAddrSpaceInfo = Parent != nullptr;
	  if (Parent) {
	    switch (Parent->getOpcode()) {
	    case ISD::INTRINSIC_W_CHAIN:     // unaligned loads, fixme
	    case ISD::INTRINSIC_VOID:        // nontemporal stores
	    case X86ISD::TLSCALL:            // Fixme
	    case X86ISD::ENQCMD:             // Fixme
	    case X86ISD::ENQCMDS:            // Fixme
	    case X86ISD::EH_SJLJ_SETJMP:     // setjmp
	    case X86ISD::EH_SJLJ_LONGJMP:    // longjmp
	      HasAddrSpaceInfo = false;
	      break;
	    default:
	      break;
	    }
	  }

	  if (HasAddrSpaceInfo) {
	    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
	    switch (cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace()) {
	    case 256:
	      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
	      break;
	    case 257:
	      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
	      break;
	    case 258:
	      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
	      break;
	    default:
	      break;
	    }
	  }

	  // Save the DL and VT before calling matchAddress, it can invalidate N.
	  SDLoc DL(N);
	  MVT VT = N.getSimpleValueType();

	  const bool MatchFailed = matchAddress(N, AM);
	  if (MatchFailed)
	    return false;

	  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
	  return true;
	}

	// We can only fold a load if all nodes between it and the root node have a
	// single use. If there are additional uses, we could end up duplicating the
	// load.
	static bool hasSingleUsesFromRoot(SDNode Root, SDNode User) {
	while (User != Root) {
	if (!User->hasOneUse())
	return false;
	User = *User->use_begin();
	}

	return true;
	}

	/// Match a scalar SSE load. In particular, we want to match a load whose top
	/// elements are either undef or zeros. The load flavor is derived from the
	/// type of N, which is either v4f32 or v2f64.
	///
	/// We also return:
	/// PatternChainNode: this is the matched node that has a chain input and
	/// output.
	bool X86DAGToDAGISel::selectScalarSSELoad(SDNode Root, SDNode Parent,
	SDValue N, SDValue &Base,
	SDValue &Scale, SDValue &Index,
	SDValue &Disp, SDValue &Segment,
	SDValue &PatternNodeWithChain) {
	if (!hasSingleUsesFromRoot(Root, Parent))
	return false;

	// We can allow a full vector load here since narrowing a load is ok unless
	// it's volatile.
	if (ISD::isNON_EXTLoad(N.getNode())) {
	LoadSDNode *LD = cast<LoadSDNode>(N);
	if (!LD->isVolatile() &&
	IsProfitableToFold(N, LD, Root) &&
	IsLegalToFold(N, Parent, Root, OptLevel)) {
	PatternNodeWithChain = N;
	return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// We can also match the special zero extended load opcode.
	if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
	PatternNodeWithChain = N;
	if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
	auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
	return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	// Need to make sure that the SCALAR_TO_VECTOR and load are both only used
	// once. Otherwise the load might get duplicated and the chain output of the
	// duplicate load will not be observed by all dependencies.
	if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
	PatternNodeWithChain = N.getOperand(0);
	if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
	IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
	IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
	LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
	return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
	Segment);
	}
	}

	return false;
	}


	/// Decide whether N can be used as the immediate of a 64-bit move that only
	/// takes a 32-bit immediate. On success the operand is returned in Imm.
	///
	/// Constants qualify when they fit in 32 unsigned bits. Wrapped symbols
	/// (other than TLS addresses) qualify under the small code model, or when the
	/// global's absolute symbol range is known to stay below 2^32.
	bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
	  if (const auto *ConstNode = dyn_cast<ConstantSDNode>(N)) {
	    uint64_t Value = ConstNode->getZExtValue();
	    if (isUInt<32>(Value)) {
	      Imm = CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i64);
	      return true;
	    }
	    return false;
	  }

	  // In static codegen with small code model, we can get the address of a label
	  // into a register with 'movl'
	  if (N->getOpcode() != X86ISD::Wrapper)
	    return false;

	  SDValue Wrapped = N.getOperand(0);
	  const unsigned WrappedOpc = Wrapped->getOpcode();

	  // At least GNU as does not accept 'movl' for TPOFF relocations.
	  // FIXME: We could use 'movl' when we know we are targeting MC.
	  if (WrappedOpc == ISD::TargetGlobalTLSAddress)
	    return false;

	  // Imm is written from here on, whatever the final answer turns out to be.
	  Imm = Wrapped;

	  auto isSmallCodeModel = [this] {
	    return TM.getCodeModel() == CodeModel::Small;
	  };

	  if (WrappedOpc != ISD::TargetGlobalAddress)
	    return isSmallCodeModel();

	  Optional<ConstantRange> Range =
	      cast<GlobalAddressSDNode>(Wrapped)->getGlobal()->getAbsoluteSymbolRange();
	  if (Range)
	    return Range->getUnsignedMax().ult(1ull << 32);

	  return isSmallCodeModel();
	}

	/// Select an LEA address via selectLEAAddr and then rewrite the resulting
	/// Base and Index operands as i64 values.
	///
	/// A "no register" operand (register number 0) is recreated as an i64
	/// register 0; a 32-bit register operand is inserted into the low 32 bits of
	/// an IMPLICIT_DEF i64 value.
	bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
	                                         SDValue &Scale, SDValue &Index,
	                                         SDValue &Disp, SDValue &Segment) {
	  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
	  SDLoc DL(N);

	  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
	    return false;

	  // True when V is a RegisterSDNode that names register 0.
	  auto isNoRegister = [](SDValue V) {
	    RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(V);
	    return Reg && Reg->getReg() == 0;
	  };

	  // Place a 32-bit value into the sub_32bit lane of an undefined i64.
	  auto widenTo64 = [this, &DL](SDValue Narrow) {
	    SDValue ImplDef =
	        SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0);
	    return CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
	                                         Narrow);
	  };

	  if (isNoRegister(Base)) {
	    Base = CurDAG->getRegister(0, MVT::i64);
	  } else if (Base.getValueType() == MVT::i32 &&
	             !isa<FrameIndexSDNode>(Base)) {
	    // Base could already be %rip, particularly in the x32 ABI.
	    Base = widenTo64(Base);
	  }

	  if (isNoRegister(Index)) {
	    Index = CurDAG->getRegister(0, MVT::i64);
	  } else {
	    assert(Index.getValueType() == MVT::i32 &&
	           "Expect to be extending 32-bit registers for use in LEA");
	    Index = widenTo64(Index);
	  }

	  return true;
	}

	/// Calls SelectAddr and determines if the maximal addressing
	/// mode it matches can be cost effectively emitted as an LEA instruction.
	///
	/// A complexity score is accumulated from the matched components (base,
	/// index, scale greater than 1, symbolic displacement, flag-producing ADD
	/// operands, non-zero displacement). The match is rejected unless the score
	/// is greater than 2.
	///
	/// \returns true and fills Base/Scale/Index/Disp/Segment if an LEA should be
	/// used, false if matching failed or the LEA is not considered worthwhile.
	bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment) {
	X86ISelAddressMode AM;

	// Save the DL and VT before calling matchAddress, it can invalidate N.
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();

	// Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
	// segments.
	SDValue Copy = AM.Segment;
	SDValue T = CurDAG->getRegister(0, MVT::i32);
	AM.Segment = T;
	if (matchAddress(N, AM))
	return false;
	// The placeholder segment must have survived matching untouched.
	assert (T == AM.Segment);
	// Put back the segment value that was saved before matching.
	AM.Segment = Copy;

	// Score the base component: a register base counts 1, a frame index 4.
	unsigned Complexity = 0;
	if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
	Complexity = 1;
	else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
	Complexity = 4;

	if (AM.IndexReg.getNode())
	Complexity++;

	// Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
	// a simple shift.
	if (AM.Scale > 1)
	Complexity++;

	// FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
	// to a LEA. This is determined with some experimentation but is by no means
	// optimal (especially for code size consideration). LEA is nice because of
	// its three-address nature. Tweak the cost function again when we can run
	// convertToThreeAddress() at register allocation time.
	if (AM.hasSymbolicDisplacement()) {
	// For X86-64, always use LEA to materialize RIP-relative addresses.
	if (Subtarget->is64Bit())
	Complexity = 4;
	else
	Complexity += 2;
	}

	+ // Heuristic: try harder to form an LEA from ADD if the operands set flags.
	+ // Unlike ADD, LEA does not affect flags, so we will be less likely to require
	+ // duplicating flag-producing instructions later in the pipeline.
	+ if (N.getOpcode() == ISD::ADD) {
	+ auto isMathWithFlags = [](SDValue V) {
	+ switch (V.getOpcode()) {
	+ case X86ISD::ADD:
	+ case X86ISD::SUB:
	+ case X86ISD::ADC:
	+ case X86ISD::SBB:
	+ /* TODO: These opcodes can be added safely, but we may want to justify
	+ their inclusion for different reasons (better for reg-alloc).
	+ case X86ISD::SMUL:
	+ case X86ISD::UMUL:
	+ case X86ISD::OR:
	+ case X86ISD::XOR:
	+ case X86ISD::AND:
	+ */
	+ // Value 1 is the flag output of the node - verify it's not dead.
	+ return !SDValue(V.getNode(), 1).use_empty();
	+ default:
	+ return false;
	+ }
	+ };
	+ // TODO: This could be an 'or' rather than 'and' to make the transform more
	+ // likely to happen. We might want to factor in whether there's a
	+ // load folding opportunity for the math op that disappears with LEA.
	+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
	+ Complexity++;
	+ }
	+
	// A non-zero constant displacement adds one more point.
	if (AM.Disp)
	Complexity++;

	// If it isn't worth using an LEA, reject it.
	if (Complexity <= 2)
	return false;

	getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
	return true;
	}

	/// Build the address operands for a TLS address node.
	///
	/// This is only run on TargetGlobalTLSAddress nodes. The addressing mode is
	/// made of the global, its offset and its target flags; for an i32 address
	/// EBX is additionally used as the index register with a scale of 1.
	/// Always returns true.
	bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
	                                        SDValue &Scale, SDValue &Index,
	                                        SDValue &Disp, SDValue &Segment) {
	  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
	  const auto *GlobalAddr = cast<GlobalAddressSDNode>(N);
	  MVT VT = N.getSimpleValueType();

	  X86ISelAddressMode AM;
	  AM.GV = GlobalAddr->getGlobal();
	  AM.Disp += GlobalAddr->getOffset();
	  AM.SymbolFlags = GlobalAddr->getTargetFlags();

	  const bool Is32Bit = (VT == MVT::i32);
	  if (Is32Bit) {
	    AM.Scale = 1;
	    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
	  }

	  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
	  return true;
	}

	/// Select N as an immediate operand that may need a relocation.
	///
	/// Constants become target constants. A wrapped symbol is used directly when
	/// it was not truncated. A truncated wrapped global is only accepted when its
	/// absolute symbol range is known to fit into the truncated type, in which
	/// case a narrow target global address is created.
	///
	/// \returns true if Op holds a usable operand. Note that Op may be written
	/// even when false is returned (truncated non-global symbol).
	bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
	  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
	    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
	                                   N.getValueType());
	    return true;
	  }

	  // Keep track of the original value type and whether this value was
	  // truncated. If we see a truncation from pointer type to VT that truncates
	  // bits that are known to be zero, we can use a narrow reference.
	  EVT VT = N.getValueType();
	  bool WasTruncated = false;
	  if (N.getOpcode() == ISD::TRUNCATE) {
	    WasTruncated = true;
	    N = N.getOperand(0);
	  }

	  if (N.getOpcode() != X86ISD::Wrapper)
	    return false;

	  // We can only use non-GlobalValues as immediates if they were not truncated,
	  // as we do not have any range information. If we have a GlobalValue and the
	  // address was not truncated, we can select it as an operand directly.
	  unsigned Opc = N.getOperand(0)->getOpcode();
	  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
	    Op = N.getOperand(0);
	    // We can only select the operand directly if we didn't have to look past a
	    // truncate.
	    return !WasTruncated;
	  }

	  // Check that the global's range fits into VT.
	  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
	  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
	  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
	    return false;

	  // Okay, we can use a narrow reference.
	  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
	                                      GA->getOffset(), GA->getTargetFlags());
	  return true;
	}

	bool X86DAGToDAGISel::tryFoldLoad(SDNode Root, SDNode P, SDValue N,
	SDValue &Base, SDValue &Scale,
	SDValue &Index, SDValue &Disp,
	SDValue &Segment) {
	if (!ISD::isNON_EXTLoad(N.getNode()) \|\|
	!IsProfitableToFold(N, P, Root) \|\|
	!IsLegalToFold(N, P, Root, OptLevel))
	return false;

	return selectAddr(N.getNode(),
	N.getOperand(1), Base, Scale, Index, Disp, Segment);
	}

	/// Return an SDNode that returns the value of the global base register.
	/// Output instructions required to initialize the global base register,
	/// if necessary.
	///
	/// The register number is obtained from the instruction info for the current
	/// function and wrapped in a register node of pointer type.
	SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
	  // Query the register first; the data layout lookup follows it.
	  const unsigned BaseReg = getInstrInfo()->getGlobalBaseReg(MF);
	  const auto PtrVT = TLI->getPointerTy(MF->getDataLayout());
	  SDValue RegValue = CurDAG->getRegister(BaseReg, PtrVT);
	  return RegValue.getNode();
	}

	bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
	if (N->getOpcode() == ISD::TRUNCATE)
	N = N->getOperand(0).getNode();
	if (N->getOpcode() != X86ISD::Wrapper)
	return false;

	auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
	if (!GA)
	return false;

	Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
	return CR && CR->getSignedMin().sge(-1ull << Width) &&
	CR->getSignedMax().slt(1ull << Width);
	}

	/// Extract the condition code operand from an already selected machine node.
	///
	/// Handles JCC_1, SETCCr, SETCCm and the 16/32/64-bit CMOVrr/CMOVrm opcodes;
	/// the operand index that carries the condition differs per opcode. Any other
	/// machine opcode yields X86::COND_INVALID.
	static X86::CondCode getCondFromNode(SDNode *N) {
	  assert(N->isMachineOpcode() && "Unexpected node");

	  // Index of the operand that holds the condition code.
	  unsigned CCOpNo;
	  switch (N->getMachineOpcode()) {
	  case X86::JCC_1:
	    CCOpNo = 1;
	    break;
	  case X86::SETCCr:
	    CCOpNo = 0;
	    break;
	  case X86::SETCCm:
	    CCOpNo = 5;
	    break;
	  case X86::CMOV16rr:
	  case X86::CMOV32rr:
	  case X86::CMOV64rr:
	    CCOpNo = 2;
	    break;
	  case X86::CMOV16rm:
	  case X86::CMOV32rm:
	  case X86::CMOV64rm:
	    CCOpNo = 6;
	    break;
	  default:
	    return X86::COND_INVALID;
	  }

	  return static_cast<X86::CondCode>(N->getConstantOperandVal(CCOpNo));
	}

	/// Test whether the given X86ISD::CMP node has any users that use a flag
	/// other than ZF.
	///
	/// \returns true only if every user of the flag result is a CopyToReg to
	/// EFLAGS whose flag consumers are machine nodes with a COND_E or COND_NE
	/// condition. Anything else is treated conservatively and yields false.
	bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
	  // Examine each user of the node.
	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    // Only check things that use the flags.
	    if (UI.getUse().getResNo() != Flags.getResNo())
	      continue;
	    // Only examine CopyToReg uses that copy to EFLAGS.
	    if (UI->getOpcode() != ISD::CopyToReg ||
	        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
	      return false;
	    // Examine each user of the CopyToReg use.
	    for (SDNode::use_iterator FlagUI = UI->use_begin(),
	           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
	      // Only examine the Flag result.
	      if (FlagUI.getUse().getResNo() != 1) continue;
	      // Anything unusual: assume conservatively.
	      if (!FlagUI->isMachineOpcode()) return false;
	      // Examine the condition code of the user.
	      X86::CondCode CC = getCondFromNode(*FlagUI);

	      switch (CC) {
	      // Comparisons which only use the zero flag.
	      case X86::COND_E: case X86::COND_NE:
	        continue;
	      // Anything else: assume conservatively.
	      default:
	        return false;
	      }
	    }
	  }
	  return true;
	}

	/// Test whether the given X86ISD::CMP node has any uses which require the SF
	/// flag to be accurate.
	///
	/// \returns true only if every user of the flag result is a CopyToReg to
	/// EFLAGS whose flag consumers are machine nodes with one of the condition
	/// codes listed below. Anything else is treated conservatively and yields
	/// false.
	bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
	  // Examine each user of the node.
	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    // Only check things that use the flags.
	    if (UI.getUse().getResNo() != Flags.getResNo())
	      continue;
	    // Only examine CopyToReg uses that copy to EFLAGS.
	    if (UI->getOpcode() != ISD::CopyToReg ||
	        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
	      return false;
	    // Examine each user of the CopyToReg use.
	    for (SDNode::use_iterator FlagUI = UI->use_begin(),
	           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
	      // Only examine the Flag result.
	      if (FlagUI.getUse().getResNo() != 1) continue;
	      // Anything unusual: assume conservatively.
	      if (!FlagUI->isMachineOpcode()) return false;
	      // Examine the condition code of the user.
	      X86::CondCode CC = getCondFromNode(*FlagUI);

	      switch (CC) {
	      // Comparisons which don't examine the SF flag.
	      case X86::COND_A: case X86::COND_AE:
	      case X86::COND_B: case X86::COND_BE:
	      case X86::COND_E: case X86::COND_NE:
	      case X86::COND_O: case X86::COND_NO:
	      case X86::COND_P: case X86::COND_NP:
	        continue;
	      // Anything else: assume conservatively.
	      default:
	        return false;
	      }
	    }
	  }
	  return true;
	}

	/// Conservatively report whether a condition code may read CF.
	///
	/// Only the condition codes explicitly listed below are reported as not
	/// reading the carry flag; every other value yields true.
	static bool mayUseCarryFlag(X86::CondCode CC) {
	  switch (CC) {
	  default:
	    // Anything not listed below: assume conservatively.
	    return true;

	  // Comparisons which don't examine the CF flag.
	  case X86::COND_O:
	  case X86::COND_NO:
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_S:
	  case X86::COND_NS:
	  case X86::COND_P:
	  case X86::COND_NP:
	  case X86::COND_L:
	  case X86::COND_GE:
	  case X86::COND_G:
	  case X86::COND_LE:
	    return false;
	  }
	}

	/// Test whether the given node which sets flags has any uses which require the
	/// CF flag to be accurate.
	///
	/// Flag users may be CopyToReg nodes feeding already selected machine nodes,
	/// or not yet selected SETCC / SETCC_CARRY / CMOV / BRCOND nodes. Any other
	/// kind of user makes the answer conservatively false.
	bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
	  // Walk every user of the node.
	  for (auto UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) {
	    // Skip users of results other than the flags.
	    if (UI.getUse().getResNo() != Flags.getResNo())
	      continue;

	    const unsigned UserOpc = UI->getOpcode();

	    if (UserOpc != ISD::CopyToReg) {
	      // This might be an unselected node. So look for the pre-isel opcodes
	      // that use flags and find where they keep their condition code.
	      unsigned CCOpNo;
	      switch (UserOpc) {
	      case X86ISD::SETCC:
	      case X86ISD::SETCC_CARRY:
	        CCOpNo = 0;
	        break;
	      case X86ISD::CMOV:
	      case X86ISD::BRCOND:
	        CCOpNo = 2;
	        break;
	      default:
	        // Something unusual. Be conservative.
	        return false;
	      }

	      const auto CC =
	          static_cast<X86::CondCode>(UI->getConstantOperandVal(CCOpNo));
	      if (mayUseCarryFlag(CC))
	        return false;
	      continue;
	    }

	    // Only CopyToReg uses that copy to EFLAGS are understood.
	    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
	      return false;

	    // Inspect each consumer of the CopyToReg's flag result.
	    for (auto FlagUI = UI->use_begin(), FlagUE = UI->use_end();
	         FlagUI != FlagUE; ++FlagUI) {
	      // Only the Flag result matters.
	      if (FlagUI.getUse().getResNo() != 1)
	        continue;
	      // Anything unusual: assume conservatively.
	      if (!FlagUI->isMachineOpcode())
	        return false;
	      // Check the condition code of the consumer.
	      if (mayUseCarryFlag(getCondFromNode(*FlagUI)))
	        return false;
	    }
	    // This CopyToReg is ok. Move on to the next user.
	  }
	  return true;
	}

	/// Check whether or not the chain ending in StoreNode is suitable for doing
	/// the {load; op; store} to modify transformation.
	///
	/// \param StoreNode  the store that ends the chain.
	/// \param StoredVal  the value being stored (result 0 of the operation).
	/// \param CurDAG     DAG used to create the merged input chain.
	/// \param LoadOpNo   operand index of StoredVal that is expected to be the
	///                   load.
	/// \param LoadNode   [out] set to the load once operand LoadOpNo is known to
	///                   be a normal load; may be set even if false is returned.
	/// \param InputChain [out] on success, a TokenFactor of the chains the fused
	///                   node has to depend on.
	/// \returns true if the pattern can be fused.
	static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
	                                        SDValue StoredVal, SelectionDAG *CurDAG,
	                                        unsigned LoadOpNo,
	                                        LoadSDNode *&LoadNode,
	                                        SDValue &InputChain) {
	  // Is the stored value result 0 of the operation?
	  if (StoredVal.getResNo() != 0) return false;

	  // Are there other uses of the operation other than the store?
	  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;

	  // Is the store non-extending and non-indexed?
	  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
	    return false;

	  SDValue Load = StoredVal->getOperand(LoadOpNo);
	  // Is the stored value a non-extending and non-indexed load?
	  if (!ISD::isNormalLoad(Load.getNode())) return false;

	  // Return LoadNode by reference.
	  LoadNode = cast<LoadSDNode>(Load);

	  // Is store the only read of the loaded value?
	  if (!Load.hasOneUse())
	    return false;

	  // Is the address of the store the same as the load?
	  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
	      LoadNode->getOffset() != StoreNode->getOffset())
	    return false;

	  bool FoundLoad = false;
	  SmallVector<SDValue, 4> ChainOps;
	  SmallVector<const SDNode *, 4> LoopWorklist;
	  SmallPtrSet<const SDNode *, 16> Visited;
	  const unsigned int Max = 1024;

	  // Visualization of Load-Op-Store fusion:
	  // -------------------------
	  // Legend:
	  //    *-lines = Chain operand dependencies.
	  //    |-lines = Normal operand dependencies.
	  //    Dependencies flow down and right. n-suffix references multiple nodes.
	  //
	  //        C                        Xn  C
	  //        *                         *  *
	  //        *                          * *
	  //  Xn  A-LD    Yn                    TF         Yn
	  //   *    * \   |                       *        |
	  //    *   *  \  |                        *       |
	  //     *  *   \ |             =>       A--LD_OP_ST
	  //      * *    \|                                 \
	  //       TF    OP                                  \
	  //         *   | \                                  Zn
	  //          *  |  \
	  //         A-ST    Zn
	  //

	  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
	  //                                      #2: Yn -> LD
	  //                                      #3: ST -> Zn

	  // Ensure the transform is safe by checking for the dual
	  // dependencies to make sure we do not induce a loop.

	  // As LD is a predecessor to both OP and ST we can do this by checking:
	  //  a). if LD is a predecessor to a member of Xn or Yn.
	  //  b). if a Zn is a predecessor to ST.

	  // However, (b) can only occur through being a chain predecessor to
	  // ST, which is the same as Zn being a member or predecessor of Xn,
	  // which is a subset of LD being a predecessor of Xn. So it's
	  // subsumed by check (a).

	  SDValue Chain = StoreNode->getChain();

	  // Gather X elements in ChainOps.
	  if (Chain == Load.getValue(1)) {
	    FoundLoad = true;
	    ChainOps.push_back(Load.getOperand(0));
	  } else if (Chain.getOpcode() == ISD::TokenFactor) {
	    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
	      SDValue Op = Chain.getOperand(i);
	      if (Op == Load.getValue(1)) {
	        FoundLoad = true;
	        // Drop Load, but keep its chain. No cycle check necessary.
	        ChainOps.push_back(Load.getOperand(0));
	        continue;
	      }
	      LoopWorklist.push_back(Op.getNode());
	      ChainOps.push_back(Op);
	    }
	  }

	  if (!FoundLoad)
	    return false;

	  // Worklist is currently Xn. Add Yn to worklist.
	  for (SDValue Op : StoredVal->ops())
	    if (Op.getNode() != LoadNode)
	      LoopWorklist.push_back(Op.getNode());

	  // Check (a) if Load is a predecessor to Xn + Yn
	  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
	                                   true))
	    return false;

	  InputChain =
	      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
	  return true;
	}

	// Change a chain of {load; op; store} of the same value into a simple op
	// through memory of that value, if the uses of the modified value and its
	// address are suitable.
	//
	// The tablegen pattern memory operand pattern is currently not able to match
	// the case where the EFLAGS on the original operation are used.
	//
	// To move this to tablegen, we'll need to improve tablegen to allow flags to
	// be transferred from a node in the pattern to the result node, probably with
	// a new keyword. For example, we have this
	// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
	//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
	//   (implicit EFLAGS)]>;
	// but maybe need something like this
	// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
	//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
	//   (transferrable EFLAGS)]>;
	//
	// Until then, we manually fold these and instruction select the operation
	// here.
	//
	// Node must be a store. Returns true if the store, the operation and the
	// load were replaced by a single memory-operand machine node (in which case
	// Node has been removed), false if nothing was changed.
	bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
	  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
	  SDValue StoredVal = StoreNode->getOperand(1);
	  unsigned Opc = StoredVal->getOpcode();

	  // Before we try to select anything, make sure this is memory operand size
	  // and opcode we can handle. Note that this must match the code below that
	  // actually lowers the opcodes.
	  EVT MemVT = StoreNode->getMemoryVT();
	  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
	      MemVT != MVT::i8)
	    return false;

	  bool IsCommutable = false;
	  bool IsNegate = false;
	  switch (Opc) {
	  default:
	    return false;
	  case X86ISD::SUB:
	    // 0 - load is a negate; the load is then operand 1 instead of operand 0.
	    IsNegate = isNullConstant(StoredVal.getOperand(0));
	    break;
	  case X86ISD::SBB:
	    break;
	  case X86ISD::ADD:
	  case X86ISD::ADC:
	  case X86ISD::AND:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	    IsCommutable = true;
	    break;
	  }

	  unsigned LoadOpNo = IsNegate ? 1 : 0;
	  LoadSDNode *LoadNode = nullptr;
	  SDValue InputChain;
	  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
	                                   LoadNode, InputChain)) {
	    if (!IsCommutable)
	      return false;

	    // This operation is commutable, try the other operand.
	    LoadOpNo = 1;
	    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
	                                     LoadNode, InputChain))
	      return false;
	  }

	  SDValue Base, Scale, Index, Disp, Segment;
	  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
	                  Segment))
	    return false;

	  // Pick the opcode variant that matches the width of the memory operand.
	  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
	                          unsigned Opc8) {
	    switch (MemVT.getSimpleVT().SimpleTy) {
	    case MVT::i64:
	      return Opc64;
	    case MVT::i32:
	      return Opc32;
	    case MVT::i16:
	      return Opc16;
	    case MVT::i8:
	      return Opc8;
	    default:
	      llvm_unreachable("Invalid size!");
	    }
	  };

	  MachineSDNode *Result;
	  switch (Opc) {
	  case X86ISD::SUB:
	    // Handle negate.
	    if (IsNegate) {
	      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
	                                     X86::NEG8m);
	      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
	      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
	                                      MVT::Other, Ops);
	      break;
	    }
	    LLVM_FALLTHROUGH;
	  case X86ISD::ADD:
	    // Try to match inc/dec.
	    if (!Subtarget->slowIncDec() || OptForSize) {
	      bool IsOne = isOneConstant(StoredVal.getOperand(1));
	      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
	      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
	      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
	        // ADD 1 and SUB -1 are increments; ADD -1 and SUB 1 are decrements.
	        unsigned NewOpc =
	          ((Opc == X86ISD::ADD) == IsOne)
	              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
	              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
	        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
	        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
	                                        MVT::Other, Ops);
	        break;
	      }
	    }
	    LLVM_FALLTHROUGH;
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::AND:
	  case X86ISD::OR:
	  case X86ISD::XOR: {
	    // Memory-destination opcode with a register source operand.
	    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
	      switch (Opc) {
	      case X86ISD::ADD:
	        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
	                            X86::ADD8mr);
	      case X86ISD::ADC:
	        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
	                            X86::ADC8mr);
	      case X86ISD::SUB:
	        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
	                            X86::SUB8mr);
	      case X86ISD::SBB:
	        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
	                            X86::SBB8mr);
	      case X86ISD::AND:
	        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
	                            X86::AND8mr);
	      case X86ISD::OR:
	        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
	      case X86ISD::XOR:
	        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
	                            X86::XOR8mr);
	      default:
	        llvm_unreachable("Invalid opcode!");
	      }
	    };
	    // Memory-destination opcode with an 8-bit immediate. There is no such
	    // form for i8 memory operands, hence the 0 placeholder; callers only use
	    // this when MemVT is not i8.
	    auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
	      switch (Opc) {
	      case X86ISD::ADD:
	        return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
	      case X86ISD::ADC:
	        return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
	      case X86ISD::SUB:
	        return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
	      case X86ISD::SBB:
	        return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
	      case X86ISD::AND:
	        return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
	      case X86ISD::OR:
	        return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
	      case X86ISD::XOR:
	        return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
	      default:
	        llvm_unreachable("Invalid opcode!");
	      }
	    };
	    // Memory-destination opcode with a full-width (32-bit for i64) immediate.
	    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
	      switch (Opc) {
	      case X86ISD::ADD:
	        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
	                            X86::ADD8mi);
	      case X86ISD::ADC:
	        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
	                            X86::ADC8mi);
	      case X86ISD::SUB:
	        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
	                            X86::SUB8mi);
	      case X86ISD::SBB:
	        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
	                            X86::SBB8mi);
	      case X86ISD::AND:
	        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
	                            X86::AND8mi);
	      case X86ISD::OR:
	        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
	                            X86::OR8mi);
	      case X86ISD::XOR:
	        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
	                            X86::XOR8mi);
	      default:
	        llvm_unreachable("Invalid opcode!");
	      }
	    };

	    unsigned NewOpc = SelectRegOpcode(Opc);
	    // The operand that is not the load.
	    SDValue Operand = StoredVal->getOperand(1-LoadOpNo);

	    // See if the operand is a constant that we can fold into an immediate
	    // operand.
	    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
	      int64_t OperandV = OperandC->getSExtValue();

	      // Check if we can shrink the operand enough to fit in an immediate (or
	      // fit into a smaller immediate) by negating it and switching the
	      // operation.
	      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
	          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
	           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
	            isInt<32>(-OperandV))) &&
	          hasNoCarryFlagUses(StoredVal.getValue(1))) {
	        OperandV = -OperandV;
	        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
	      }

	      // First try to fit this into an Imm8 operand. If it doesn't fit, then try
	      // the larger immediate operand.
	      if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
	        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
	        NewOpc = SelectImm8Opcode(Opc);
	      } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
	        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
	        NewOpc = SelectImmOpcode(Opc);
	      }
	    }

	    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
	      // ADC/SBB consume incoming flags: copy operand 2 of the original
	      // operation into EFLAGS and glue the copy to the new node.
	      SDValue CopyTo =
	          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
	                               StoredVal.getOperand(2), SDValue());

	      const SDValue Ops[] = {Base, Scale, Index, Disp,
	                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
	      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
	                                      Ops);
	    } else {
	      const SDValue Ops[] = {Base, Scale, Index, Disp,
	                             Segment, Operand, InputChain};
	      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
	                                      Ops);
	    }
	    break;
	  }
	  default:
	    llvm_unreachable("Invalid opcode!");
	  }

	  // The fused node both reads and writes memory, so it carries the memory
	  // operands of the store and of the load.
	  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
	                                 LoadNode->getMemOperand()};
	  CurDAG->setNodeMemRefs(Result, MemOps);

	  // Update Load Chain uses as well.
	  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
	  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
	  // Users of the original operation's flag result now read result 0 of the
	  // new node.
	  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
	  CurDAG->RemoveDeadNode(Node);
	  return true;
	}

	// See if this is an X & Mask that we can match to BEXTR/BZHI.
	// Where Mask is one of the following patterns:
	// a) x & (1 << nbits) - 1
	// b) x & ~(-1 << nbits)
	// c) x & (-1 >> (32 - y))
	// d) x << (32 - y) >> (32 - y)
	//
	// Node must be an ISD::AND or an ISD::SRL (asserted below). On a match the
	// node is replaced with an X86ISD::BZHI node (when BMI2 is available) or
	// with an X86ISD::BEXTR node (optionally followed by a truncate), the
	// replacement is passed to SelectCode(), and true is returned. Returns false
	// when the subtarget has neither BMI nor BMI2, when the result type is not
	// i32/i64, or when none of the patterns above matches.
	bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
	assert(
	(Node->getOpcode() == ISD::AND \|\| Node->getOpcode() == ISD::SRL) &&
	"Should be either an and-mask, or right-shift after clearing high bits.");

	// BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
	if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
	return false;

	MVT NVT = Node->getSimpleValueType(0);

	// Only supported for 32 and 64 bits.
	if (NVT != MVT::i32 && NVT != MVT::i64)
	return false;

	// The bit count ('nbits' / 'y' in the patterns above). It is filled in by
	// whichever pattern matcher below succeeds.
	SDValue NBits;

	// If we have BMI2's BZHI, we are ok with multi-use patterns.
	// Else, if we only have BMI1's BEXTR, we require one-use.
	const bool CanHaveExtraUses = Subtarget->hasBMI2();
	// True if extra uses are tolerated, or if the value Op has exactly NUses
	// uses.
	auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
	return CanHaveExtraUses \|\|
	Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
	};
	auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
	auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };

	// If V is a truncate that passes the one-use check, return its operand;
	// otherwise return V unchanged.
	auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
	if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
	assert(V.getSimpleValueType() == MVT::i32 &&
	V.getOperand(0).getSimpleValueType() == MVT::i64 &&
	"Expected i64 -> i32 truncation");
	V = V.getOperand(0);
	}
	return V;
	};

	// a) x & ((1 << nbits) + (-1))
	auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
	&NBits](SDValue Mask) -> bool {
	// Match `add`. Must only have one use!
	if (Mask->getOpcode() != ISD::ADD \|\| !checkOneUse(Mask))
	return false;
	// We should be adding all-ones constant (i.e. subtracting one.)
	if (!isAllOnesConstant(Mask->getOperand(1)))
	return false;
	// Match `1 << nbits`. Might be truncated. Must only have one use!
	SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
	if (M0->getOpcode() != ISD::SHL \|\| !checkOneUse(M0))
	return false;
	if (!isOneConstant(M0->getOperand(0)))
	return false;
	NBits = M0->getOperand(1);
	return true;
	};

	// True if V (looking through a one-use truncation) is known to have all of
	// its low NVT-width bits set. Bits above that width are not checked.
	auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
	V = peekThroughOneUseTruncation(V);
	return CurDAG->MaskedValueIsAllOnes(
	V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
	NVT.getSizeInBits()));
	};

	// b) x & ~(-1 << nbits)
	auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
	&NBits](SDValue Mask) -> bool {
	// Match `~()`. Must only have one use!
	if (Mask.getOpcode() != ISD::XOR \|\| !checkOneUse(Mask))
	return false;
	// The -1 only has to be all-ones for the final Node's NVT.
	if (!isAllOnes(Mask->getOperand(1)))
	return false;
	// Match `-1 << nbits`. Might be truncated. Must only have one use!
	SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
	if (M0->getOpcode() != ISD::SHL \|\| !checkOneUse(M0))
	return false;
	// The -1 only has to be all-ones for the final Node's NVT.
	if (!isAllOnes(M0->getOperand(0)))
	return false;
	NBits = M0->getOperand(1);
	return true;
	};

	// Match potentially-truncated (bitwidth - y)
	// On success NBits is set to 'y'.
	auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
	unsigned Bitwidth) {
	// Skip over a truncate of the shift amount.
	if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
	ShiftAmt = ShiftAmt.getOperand(0);
	// The trunc should have been the only user of the real shift amount.
	if (!checkOneUse(ShiftAmt))
	return false;
	}
	// Match the shift amount as: (bitwidth - y). It should go away, too.
	if (ShiftAmt.getOpcode() != ISD::SUB)
	return false;
	auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
	if (!V0 \|\| V0->getZExtValue() != Bitwidth)
	return false;
	NBits = ShiftAmt.getOperand(1);
	return true;
	};

	// c) x & (-1 >> (32 - y))
	auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
	matchShiftAmt](SDValue Mask) -> bool {
	// The mask itself may be truncated.
	Mask = peekThroughOneUseTruncation(Mask);
	unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
	// Match `l>>`. Must only have one use!
	if (Mask.getOpcode() != ISD::SRL \|\| !checkOneUse(Mask))
	return false;
	// We should be shifting truly all-ones constant.
	if (!isAllOnesConstant(Mask.getOperand(0)))
	return false;
	SDValue M1 = Mask.getOperand(1);
	// The shift amount should not be used externally.
	if (!checkOneUse(M1))
	return false;
	return matchShiftAmt(M1, Bitwidth);
	};

	// The value the low bits are extracted from ('x' in the patterns above).
	SDValue X;

	// d) x << (32 - y) >> (32 - y)
	auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
	&X](SDNode *Node) -> bool {
	if (Node->getOpcode() != ISD::SRL)
	return false;
	SDValue N0 = Node->getOperand(0);
	if (N0->getOpcode() != ISD::SHL \|\| !checkOneUse(N0))
	return false;
	unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
	SDValue N1 = Node->getOperand(1);
	SDValue N01 = N0->getOperand(1);
	// Both of the shifts must be by the exact same value.
	// There should not be any uses of the shift amount outside of the pattern.
	if (N1 != N01 \|\| !checkTwoUse(N1))
	return false;
	if (!matchShiftAmt(N1, Bitwidth))
	return false;
	X = N0->getOperand(0);
	return true;
	};

	// Tries patterns a), b) and c) in that order.
	auto matchLowBitMask = [matchPatternA, matchPatternB,
	matchPatternC](SDValue Mask) -> bool {
	return matchPatternA(Mask) \|\| matchPatternB(Mask) \|\| matchPatternC(Mask);
	};

	if (Node->getOpcode() == ISD::AND) {
	X = Node->getOperand(0);
	SDValue Mask = Node->getOperand(1);

	if (matchLowBitMask(Mask)) {
	// Great.
	} else {
	// 'and' is commutative: retry with the operands exchanged.
	std::swap(X, Mask);
	if (!matchLowBitMask(Mask))
	return false;
	}
	} else if (!matchPatternD(Node))
	return false;

	SDLoc DL(Node);

	// Truncate the shift amount.
	NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
	insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

	// Insert 8-bit NBits into lowest 8 bits of 32-bit register.
	// All the other bits are undefined, we do not care about them.
	SDValue ImplDef = SDValue(
	CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
	insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
	- NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
	- NBits);
	+
	+ SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
	+ insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
	+ NBits = SDValue(
	+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
	+ NBits, SRIdxVal), 0);
	insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

	if (Subtarget->hasBMI2()) {
	// Great, just emit the BZHI.
	if (NVT != MVT::i32) {
	// But have to place the bit count into the wide-enough register first.
	NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
	insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
	}

	SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
	ReplaceNode(Node, Extract.getNode());
	SelectCode(Extract.getNode());
	return true;
	}

	// Else, if we do NOT have BMI2, let's find out if the 'X' is
	// logically shifted (potentially with one-use trunc in between),
	// and the truncation was the only use of the shift,
	// and if so look past one-use truncation.
	{
	SDValue RealX = peekThroughOneUseTruncation(X);
	// FIXME: only if the shift is one-use?
	if (RealX != X && RealX.getOpcode() == ISD::SRL)
	X = RealX;
	}

	// Type of the value BEXTR will operate on; it can differ from NVT when the
	// truncation was looked through above.
	MVT XVT = X.getSimpleValueType();

	// Else, emitting BEXTR requires one more step.
	// The 'control' of BEXTR has the pattern of:
	// [15...8 bit][ 7...0 bit] location
	// [ bit count][ shift] name
	// I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11

	// Shift NBits left by 8 bits, thus producing 'control'.
	// This makes the low 8 bits to be zero.
	SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
	SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
	insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

	// If the 'X' is logically shifted, we can fold that shift into 'control'.
	// FIXME: only if the shift is one-use?
	if (X.getOpcode() == ISD::SRL) {
	SDValue ShiftAmt = X.getOperand(1);
	X = X.getOperand(0);

	assert(ShiftAmt.getValueType() == MVT::i8 &&
	"Expected shift amount to be i8");

	// Now, zero-extend the shift amount. The bits 8...15 must be zero!
	// We could zext to i16 in some form, but we intentionally don't do that.
	SDValue OrigShiftAmt = ShiftAmt;
	ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
	insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

	// And now 'or' these low 8 bits of shift amount into the 'control'.
	Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
	insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
	}

	// But have to place the 'control' into the wide-enough register first.
	if (XVT != MVT::i32) {
	Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
	insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
	}

	// And finally, form the BEXTR itself.
	SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

	// The 'X' was originally truncated. Do that now.
	if (XVT != NVT) {
	insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
	Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
	}

	ReplaceNode(Node, Extract.getNode());
	SelectCode(Extract.getNode());

	return true;
	}

	// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
	//
	// Node is the 'and' node; operand 0 is expected to be the shift and
	// operand 1 the mask. Returns the newly created machine node, or nullptr
	// when the pattern does not apply. This function does not replace Node
	// itself; the only use it rewires is the chain result of a load that was
	// folded into the new instruction.
	MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
	  MVT NVT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  SDValue N0 = Node->getOperand(0);
	  SDValue N1 = Node->getOperand(1);

	  // If we have TBM we can use an immediate for the control. If we have BMI
	  // we should only do this if the BEXTR instruction is implemented well.
	  // Otherwise moving the control into a register makes this more costly.
	  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of
	  // LICM hoisting the move immediate would make it worthwhile with a less
	  // optimal BEXTR?
	  if (!Subtarget->hasTBM() &&
	      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
	    return nullptr;

	  // Must have a shift right.
	  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
	    return nullptr;

	  // Shift can't have additional users.
	  if (!N0->hasOneUse())
	    return nullptr;

	  // Only supported for 32 and 64 bits.
	  if (NVT != MVT::i32 && NVT != MVT::i64)
	    return nullptr;

	  // Shift amount and RHS of and must be constant.
	  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
	  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	  if (!MaskCst || !ShiftCst)
	    return nullptr;

	  // And RHS must be a mask.
	  uint64_t Mask = MaskCst->getZExtValue();
	  if (!isMask_64(Mask))
	    return nullptr;

	  uint64_t Shift = ShiftCst->getZExtValue();
	  uint64_t MaskSize = countPopulation(Mask);

	  // Don't interfere with something that can be handled by extracting AH.
	  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
	  if (Shift == 8 && MaskSize == 8)
	    return nullptr;

	  // Make sure we are only using bits that were in the original value, not
	  // shifted in.
	  if (Shift + MaskSize > NVT.getSizeInBits())
	    return nullptr;

	  // BEXTR control word: start bit in bits 7..0, bit count in bits 15..8.
	  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
	  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
	  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;

	  // BMI requires the immediate to be placed in a register.
	  if (!Subtarget->hasTBM()) {
	    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
	    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
	    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
	    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
	  }

	  MachineSDNode *NewNode;
	  SDValue Input = N0->getOperand(0);
	  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
	    // Memory form: address operands, control, then the load's input chain.
	    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
	    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
	    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	    // Update the chain.
	    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
	    // Record the mem-refs
	    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
	  } else {
	    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
	  }

	  return NewNode;
	}

	// Emit a PCMPISTR(I/M) instruction.
	//
	// ROpc is used for the register form and MOpc for the form with a folded
	// load of operand 1; the load is only folded when MayFoldLoad is set and
	// tryFoldLoad() succeeds. Operand 2 of Node must be a constant; it is turned
	// into a target constant. Returns the created machine node.
	MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
	                                             bool MayFoldLoad, const SDLoc &dl,
	                                             MVT VT, SDNode *Node) {
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(1);

	  // Re-materialize the control byte as a target constant.
	  SDValue CtrlOp = Node->getOperand(2);
	  const ConstantInt *CtrlVal =
	      cast<ConstantSDNode>(CtrlOp)->getConstantIntValue();
	  SDValue CtrlImm =
	      CurDAG->getTargetConstant(*CtrlVal, SDLoc(Node), CtrlOp.getValueType());

	  // Try to fold a load. No need to check alignment.
	  SDValue Base, Scale, Index, Disp, Segment;
	  const bool Folded =
	      MayFoldLoad &&
	      tryFoldLoad(Node, RHS, Base, Scale, Index, Disp, Segment);

	  if (!Folded) {
	    // Plain register form.
	    SDValue RegOps[] = { LHS, RHS, CtrlImm };
	    SDVTList RegVTs = CurDAG->getVTList(VT, MVT::i32);
	    return CurDAG->getMachineNode(ROpc, dl, RegVTs, RegOps);
	  }

	  // Memory form: address operands replace the loaded value and the load's
	  // input chain is appended.
	  SDValue MemOps[] = { LHS,   Base,    Scale,   Index,
	                       Disp,  Segment, CtrlImm, RHS.getOperand(0) };
	  SDVTList MemVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
	  MachineSDNode *MemNode = CurDAG->getMachineNode(MOpc, dl, MemVTs, MemOps);
	  // Users of the load's chain now hang off the new node's chain result.
	  ReplaceUses(RHS.getValue(1), SDValue(MemNode, 2));
	  // Carry over the memory operand of the folded load.
	  CurDAG->setNodeMemRefs(MemNode, {cast<LoadSDNode>(RHS)->getMemOperand()});
	  return MemNode;
	}

	// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
	// need to emit a second instruction after this one. This is needed since we
	// have two copyToReg nodes glued before this and we need to continue that
	// glue through.
	//
	// InFlag is consumed as the incoming glue operand and is overwritten with
	// the glue result of the node that gets created. Operands 0, 2 and 4 of Node
	// are used; operand 4 must be a constant.
	MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
	                                             bool MayFoldLoad, const SDLoc &dl,
	                                             MVT VT, SDNode *Node,
	                                             SDValue &InFlag) {
	  SDValue LHS = Node->getOperand(0);
	  SDValue RHS = Node->getOperand(2);

	  // Re-materialize the control byte as a target constant.
	  SDValue CtrlOp = Node->getOperand(4);
	  const ConstantInt *CtrlVal =
	      cast<ConstantSDNode>(CtrlOp)->getConstantIntValue();
	  SDValue CtrlImm =
	      CurDAG->getTargetConstant(*CtrlVal, SDLoc(Node), CtrlOp.getValueType());

	  // Try to fold a load. No need to check alignment.
	  SDValue Base, Scale, Index, Disp, Segment;
	  const bool Folded =
	      MayFoldLoad &&
	      tryFoldLoad(Node, RHS, Base, Scale, Index, Disp, Segment);

	  if (!Folded) {
	    // Plain register form; the glue is result number 2.
	    SDValue RegOps[] = { LHS, RHS, CtrlImm, InFlag };
	    SDVTList RegVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
	    MachineSDNode *RegNode = CurDAG->getMachineNode(ROpc, dl, RegVTs, RegOps);
	    InFlag = SDValue(RegNode, 2);
	    return RegNode;
	  }

	  // Memory form: address operands replace the loaded value, followed by the
	  // load's input chain and the incoming glue. The glue is result number 3.
	  SDValue MemOps[] = { LHS,     Base,    Scale,             Index, Disp,
	                       Segment, CtrlImm, RHS.getOperand(0), InFlag };
	  SDVTList MemVTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
	  MachineSDNode *MemNode = CurDAG->getMachineNode(MOpc, dl, MemVTs, MemOps);
	  InFlag = SDValue(MemNode, 3);
	  // Users of the load's chain now hang off the new node's chain result.
	  ReplaceUses(RHS.getValue(1), SDValue(MemNode, 2));
	  // Carry over the memory operand of the folded load.
	  CurDAG->setNodeMemRefs(MemNode, {cast<LoadSDNode>(RHS)->getMemOperand()});
	  return MemNode;
	}

	/// Try to simplify the amount operand of the scalar shift \p N when it has
	/// the form X +/- C or C - X with C being a multiple of the shift width.
	/// The add/sub is then dropped (or turned into a negate) and the amount is
	/// re-masked to Size - 1. Returns true if \p N was replaced or selected,
	/// false if the amount does not have one of those forms.
	bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
	  EVT VT = N->getValueType(0);

	  // Only handle scalar shifts.
	  if (VT.isVector())
	    return false;

	  // Narrower shifts only mask to 5 bits in hardware.
	  const unsigned Size = VT == MVT::i64 ? 64 : 32;

	  SDLoc DL(N);
	  SDValue OrigShiftAmt = N->getOperand(1);

	  // Look through a truncate of the shift amount, if present.
	  SDValue ShiftAmt = OrigShiftAmt;
	  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
	    ShiftAmt = ShiftAmt->getOperand(0);

	  // This function is called after X86DAGToDAGISel::matchBitExtract(),
	  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

	  const unsigned AmtOpc = ShiftAmt->getOpcode();
	  if (AmtOpc != ISD::ADD && AmtOpc != ISD::SUB)
	    return false;

	  SDValue LHS = ShiftAmt->getOperand(0);
	  SDValue RHS = ShiftAmt->getOperand(1);
	  auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
	  auto *RHSC = dyn_cast<ConstantSDNode>(RHS);

	  SDValue NewShiftAmt;
	  if (RHSC && RHSC->getZExtValue() % Size == 0) {
	    // Shifting by X+/-N where N == 0 mod Size: just shift by X to avoid the
	    // ADD/SUB.
	    NewShiftAmt = LHS;
	  } else if (AmtOpc == ISD::SUB && LHSC && LHSC->getZExtValue() != 0 &&
	             LHSC->getZExtValue() % Size == 0) {
	    // Shifting by N-X where N == 0 mod Size: shift by -X to generate a NEG
	    // instead of a SUB of a constant.
	    // TODO: This isn't guaranteed to replace the sub if there is a logic cone
	    // that uses it that's not a shift.
	    EVT SubVT = ShiftAmt.getValueType();
	    SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
	    SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, RHS);
	    NewShiftAmt = Neg;

	    // Insert these operands into a valid topological order so they can
	    // get selected independently.
	    insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
	    insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
	  } else {
	    return false;
	  }

	  // The shift amount has to be an i8; truncate if it is anything else.
	  if (NewShiftAmt.getValueType() != MVT::i8) {
	    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
	    // Add to a correct topological ordering.
	    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
	  }

	  // Insert a new mask to keep the shift amount legal. This should be removed
	  // by isel patterns.
	  SDValue AmtMask = CurDAG->getConstant(Size - 1, DL, MVT::i8);
	  NewShiftAmt =
	      CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, AmtMask);
	  // Place in a correct topological ordering.
	  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

	  SDNode *UpdatedNode =
	      CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt);
	  if (UpdatedNode != N) {
	    // If we found an existing node, we should replace ourselves with that
	    // node and wait for it to be selected after its other users.
	    ReplaceNode(N, UpdatedNode);
	    return true;
	  }

	  // If the original shift amount is now dead, delete it so that we don't run
	  // it through isel.
	  SDNode *OldAmtNode = OrigShiftAmt.getNode();
	  if (OldAmtNode->use_empty())
	    CurDAG->RemoveDeadNode(OldAmtNode);

	  // Now that we've optimized the shift amount, defer to normal isel to get
	  // load folding and legacy vs BMI2 selection without repeating it here.
	  SelectCode(N);
	  return true;
	}

	/// For operations of the form (x << C1) op C2, check if we can use a smaller
	/// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
	/// \p N is the logic node; its opcode is reused for the new inner operation.
	/// Returns true if \p N was replaced and the replacement selected.
	bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
	  MVT NVT = N->getSimpleValueType(0);
	  const unsigned Opcode = N->getOpcode();
	  const bool IsAnd = Opcode == ISD::AND;
	  SDLoc dl(N);

	  SDValue Shift = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  // The right-hand side has to be a constant.
	  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
	  if (!Cst)
	    return false;

	  const int64_t Val = Cst->getSExtValue();

	  // If we have an any_extend feeding the AND, look through it to see if there
	  // is a shift behind it. But only if the AND doesn't use the extended bits.
	  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
	  bool FoundAnyExtend = false;
	  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
	      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
	      isUInt<32>(Val)) {
	    FoundAnyExtend = true;
	    Shift = Shift.getOperand(0);
	  }

	  // We need a single-use shift-left ...
	  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
	    return false;

	  // i8 is unshrinkable, i16 should be promoted to i32.
	  if (NVT != MVT::i32 && NVT != MVT::i64)
	    return false;

	  // ... by a constant amount.
	  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!ShlCst)
	    return false;

	  const uint64_t ShAmt = ShlCst->getZExtValue();

	  // Make sure that we don't change the operation by removing bits.
	  // This only matters for OR and XOR, AND is unaffected.
	  const uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
	  if (!IsAnd && (Val & RemovedBitsMask) != 0)
	    return false;

	  // Check the minimum bitwidth for the new constant.
	  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
	  // The two candidate constants: shifted down logically or arithmetically.
	  const int64_t LogicalShifted = static_cast<uint64_t>(Val) >> ShAmt;
	  const int64_t ArithShifted = Val >> ShAmt;

	  // A 64-bit constant that only fits in a zero-extended 32-bit immediate
	  // once it has been shifted down.
	  const bool GainsUImm32 = NVT == MVT::i64 && !isUInt<32>(Val) &&
	                           isUInt<32>(LogicalShifted);
	  // A constant that only fits a sign-extended 8/32-bit immediate once it has
	  // been shifted down.
	  const bool GainsSImm = (!isInt<8>(Val) && isInt<8>(ArithShifted)) ||
	                         (!isInt<32>(Val) && isInt<32>(ArithShifted));

	  int64_t ShiftedVal;
	  if (IsAnd && (GainsUImm32 || LogicalShifted == UINT8_MAX ||
	                LogicalShifted == UINT16_MAX)) {
	    // AND32ri is the same as AND64ri32 with zext imm; this is tried before
	    // the sign extended immediates. Also swap order when the AND can become
	    // MOVZX.
	    ShiftedVal = LogicalShifted;
	  } else if (GainsSImm) {
	    ShiftedVal = ArithShifted;
	  } else if (!IsAnd && GainsUImm32) {
	    // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
	    ShiftedVal = LogicalShifted;
	  } else {
	    return false;
	  }

	  // Ok, we can reorder to get a smaller immediate.

	  // But, it's possible the original immediate allowed an AND to become MOVZX.
	  // Checked this late so that the MaskedValueIsZero call happens as late as
	  // possible.
	  if (IsAnd) {
	    // Find the smallest zext this could possibly be.
	    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
	    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

	    // Figure out which bits need to be zero to achieve that mask.
	    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), ZExtWidth);
	    NeededMask &= ~Cst->getAPIntValue();

	    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
	      return false;
	  }

	  // Rebuild as (X op ShiftedVal) << ShAmt, re-applying the any_extend to X
	  // if one was looked through above.
	  SDValue X = Shift.getOperand(0);
	  if (FoundAnyExtend) {
	    SDValue ExtX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
	    insertDAGNode(*CurDAG, SDValue(N, 0), ExtX);
	    X = ExtX;
	  }

	  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
	  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
	  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
	  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
	  SDValue NewSHL =
	      CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, Shift.getOperand(1));
	  ReplaceNode(N, NewSHL.getNode());
	  SelectCode(NewSHL.getNode());
	  return true;
	}

	/// If the high bits of an 'and' operand are known zero, try setting the
	/// high bits of an 'and' constant operand to produce a smaller encoding by
	/// creating a small, sign-extended negative immediate rather than a large
	/// positive one. This reverses a transform in SimplifyDemandedBits that
	/// shrinks mask constants by clearing bits. There is also a possibility that
	/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
	/// case, just replace the 'and'. Return 'true' if the node is replaced.
	bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
	  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
	  // have immediate operands.
	  MVT VT = And->getSimpleValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return false;
	  const bool Is64Bit = VT == MVT::i64;

	  auto *MaskC = dyn_cast<ConstantSDNode>(And->getOperand(1));
	  if (!MaskC)
	    return false;

	  // Bail out if the mask constant is already negative. It can't shrink more.
	  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
	  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
	  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
	  // are negative too.
	  APInt MaskVal = MaskC->getAPIntValue();
	  unsigned LeadingZeros = MaskVal.countLeadingZeros();
	  if (LeadingZeros == 0)
	    return false;
	  if (Is64Bit && LeadingZeros == 32)
	    return false;

	  // Don't extend into the upper 32 bits of a 64 bit mask.
	  if (Is64Bit && LeadingZeros >= 32) {
	    LeadingZeros -= 32;
	    MaskVal = MaskVal.trunc(32);
	  }

	  SDValue Src = And->getOperand(0);
	  APInt HighBits = APInt::getHighBitsSet(MaskVal.getBitWidth(), LeadingZeros);
	  APInt NegMask = MaskVal | HighBits;

	  // If a negative constant would not allow a smaller encoding, there's no
	  // need to continue. Only change the constant when we know it's a win.
	  const unsigned NegWidth = NegMask.getMinSignedBits();
	  if (NegWidth > 32)
	    return false;
	  if (NegWidth > 8 && MaskVal.getMinSignedBits() <= 32)
	    return false;

	  // Extend masks if we truncated above.
	  if (Is64Bit && MaskVal.getBitWidth() < 64) {
	    NegMask = NegMask.zext(64);
	    HighBits = HighBits.zext(64);
	  }

	  // The variable operand must be all zeros in the top bits to allow using
	  // the new, negative constant as the mask.
	  if (!CurDAG->MaskedValueIsZero(Src, HighBits))
	    return false;

	  // Check if the mask is -1. In that case, this is an unnecessary instruction
	  // that escaped earlier analysis.
	  if (NegMask.isAllOnesValue()) {
	    ReplaceNode(And, Src.getNode());
	    return true;
	  }

	  // A negative mask allows a smaller encoding. Create a new 'and' node.
	  SDLoc DL(And);
	  SDValue NewMask = CurDAG->getConstant(NegMask, DL, VT);
	  SDValue NewAnd = CurDAG->getNode(ISD::AND, DL, VT, Src, NewMask);
	  ReplaceNode(And, NewAnd.getNode());
	  SelectCode(NewAnd.getNode());
	  return true;
	}

	/// Map a vector type and a set of flags to a VPTESTM/VPTESTNM opcode.
	///
	/// \p IsTestN picks the VPTESTNM flavour over VPTESTM. \p Masked picks the
	/// 'k' (masked) forms. \p FoldedLoad picks the rm/rmk forms and is checked
	/// before \p FoldedBCast, which picks the rmb/rmbk forms (only defined for
	/// vectors of i32/i64 elements); otherwise the rr/rrk forms are used. Any
	/// type not listed is unreachable.
	static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
	                              bool FoldedBCast, bool Masked) {
	// One switch case: choose between the VPTESTNM and VPTESTM spelling.
	#define VPTESTM_CASE(VT, STEM)                                                 \
	  case MVT::VT:                                                                \
	    return IsTestN ? X86::VPTESTNM##STEM : X86::VPTESTM##STEM;

	// Cases for vectors of 8- and 16-bit elements.
	#define VPTESTM_BW_CASES(SFX)                                                  \
	  VPTESTM_CASE(v16i8, BZ128##SFX)                                              \
	  VPTESTM_CASE(v8i16, WZ128##SFX)                                              \
	  VPTESTM_CASE(v32i8, BZ256##SFX)                                              \
	  VPTESTM_CASE(v16i16, WZ256##SFX)                                             \
	  VPTESTM_CASE(v64i8, BZ##SFX)                                                 \
	  VPTESTM_CASE(v32i16, WZ##SFX)

	// Cases for vectors of 32- and 64-bit elements.
	#define VPTESTM_DQ_CASES(SFX)                                                  \
	  VPTESTM_CASE(v4i32, DZ128##SFX)                                              \
	  VPTESTM_CASE(v2i64, QZ128##SFX)                                              \
	  VPTESTM_CASE(v8i32, DZ256##SFX)                                              \
	  VPTESTM_CASE(v4i64, QZ256##SFX)                                              \
	  VPTESTM_CASE(v16i32, DZ##SFX)                                                \
	  VPTESTM_CASE(v8i64, QZ##SFX)

	  if (Masked) {
	    if (FoldedLoad) {
	      switch (TestVT.SimpleTy) {
	      default: llvm_unreachable("Unexpected VT!");
	      VPTESTM_BW_CASES(rmk)
	      VPTESTM_DQ_CASES(rmk)
	      }
	    }

	    if (FoldedBCast) {
	      switch (TestVT.SimpleTy) {
	      default: llvm_unreachable("Unexpected VT!");
	      VPTESTM_DQ_CASES(rmbk)
	      }
	    }

	    switch (TestVT.SimpleTy) {
	    default: llvm_unreachable("Unexpected VT!");
	    VPTESTM_BW_CASES(rrk)
	    VPTESTM_DQ_CASES(rrk)
	    }
	  }

	  if (FoldedLoad) {
	    switch (TestVT.SimpleTy) {
	    default: llvm_unreachable("Unexpected VT!");
	    VPTESTM_BW_CASES(rm)
	    VPTESTM_DQ_CASES(rm)
	    }
	  }

	  if (FoldedBCast) {
	    switch (TestVT.SimpleTy) {
	    default: llvm_unreachable("Unexpected VT!");
	    VPTESTM_DQ_CASES(rmb)
	    }
	  }

	  switch (TestVT.SimpleTy) {
	  default: llvm_unreachable("Unexpected VT!");
	  VPTESTM_BW_CASES(rr)
	  VPTESTM_DQ_CASES(rr)
	  }

	#undef VPTESTM_DQ_CASES
	#undef VPTESTM_BW_CASES
	#undef VPTESTM_CASE
	}

	// Try to create VPTESTM instruction. If InMask is not null, it will be used
	// to form a masked operation.
	bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
	SDValue InMask) {
	assert(Subtarget->hasAVX512() && "Expected AVX512!");
	assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	"Unexpected VT!");

	// Look for equal and not equal compares.
	ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
	if (CC != ISD::SETEQ && CC != ISD::SETNE)
	return false;

	// See if we're comparing against zero. This should have been canonicalized
	// to RHS during lowering.
	if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
	return false;

	SDValue N0 = Setcc.getOperand(0);

	MVT CmpVT = N0.getSimpleValueType();
	MVT CmpSVT = CmpVT.getVectorElementType();

	// Start with both operands the same. We'll try to refine this.
	SDValue Src0 = N0;
	SDValue Src1 = N0;

	{
	// Look through single use bitcasts.
	SDValue N0Temp = N0;
	if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
	N0Temp = N0.getOperand(0);

	// Look for single use AND.
	if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
	Src0 = N0Temp.getOperand(0);
	Src1 = N0Temp.getOperand(1);
	}
	}

	// Without VLX we need to widen the load.
	bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

	// We can only fold loads if the sources are unique.
	bool CanFoldLoads = Src0 != Src1;

	// Try to fold loads unless we need to widen.
	bool FoldedLoad = false;
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
	if (!Widen && CanFoldLoads) {
	Load = Src1;
	FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
	Tmp4);
	if (!FoldedLoad) {
	// And is computative.
	Load = Src0;
	FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
	Tmp3, Tmp4);
	if (FoldedLoad)
	std::swap(Src0, Src1);
	}
	}

	auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
	// Look through single use bitcasts.
	if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
	Src = Src.getOperand(0);

	if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
	Parent = Src.getNode();
	Src = Src.getOperand(0);
	if (Src.getSimpleValueType() == CmpSVT)
	return Src;
	}

	return SDValue();
	};

	// If we didn't fold a load, try to match broadcast. No widening limitation
	// for this. But only 32 and 64 bit types are supported.
	bool FoldedBCast = false;
	if (!FoldedLoad && CanFoldLoads &&
	(CmpSVT == MVT::i32 \|\| CmpSVT == MVT::i64)) {
	SDNode *ParentNode = nullptr;
	if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
	FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
	Tmp1, Tmp2, Tmp3, Tmp4);
	}

	// Try the other operand.
	if (!FoldedBCast) {
	if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
	FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
	Tmp1, Tmp2, Tmp3, Tmp4);
	if (FoldedBCast)
	std::swap(Src0, Src1);
	}
	}
	}

	auto getMaskRC = [](MVT MaskVT) {
	switch (MaskVT.SimpleTy) {
	default: llvm_unreachable("Unexpected VT!");
	case MVT::v2i1: return X86::VK2RegClassID;
	case MVT::v4i1: return X86::VK4RegClassID;
	case MVT::v8i1: return X86::VK8RegClassID;
	case MVT::v16i1: return X86::VK16RegClassID;
	case MVT::v32i1: return X86::VK32RegClassID;
	case MVT::v64i1: return X86::VK64RegClassID;
	}
	};

	bool IsMasked = InMask.getNode() != nullptr;

	SDLoc dl(Root);

	MVT ResVT = Setcc.getSimpleValueType();
	MVT MaskVT = ResVT;
	if (Widen) {
	// Widen the inputs using insert_subreg or copy_to_regclass.
	unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
	unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
	unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
	CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
	MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
	CmpVT), 0);
	Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);

	assert(!FoldedLoad && "Shouldn't have folded the load");
	if (!FoldedBCast)
	Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);

	if (IsMasked) {
	// Widen the mask.
	unsigned RegClass = getMaskRC(MaskVT);
	SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
	InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
	dl, MaskVT, InMask, RC), 0);
	}
	}

	bool IsTestN = CC == ISD::SETEQ;
	unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
	IsMasked);

	MachineSDNode *CNode;
	if (FoldedLoad \|\| FoldedBCast) {
	SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

	if (IsMasked) {
	SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
	Load.getOperand(0) };
	CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
	} else {
	SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
	Load.getOperand(0) };
	CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
	}

	// Update the chain.
	ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
	// Record the mem-refs
	CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
	} else {
	if (IsMasked)
	CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
	else
	CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
	}

	// If we widened, we need to shrink the mask VT.
	if (Widen) {
	unsigned RegClass = getMaskRC(ResVT);
	SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
	CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
	dl, ResVT, SDValue(CNode, 0), RC);
	}

	ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
	CurDAG->RemoveDeadNode(Root);
	return true;
	}

	/// Hand-written selection for a single DAG node.
	///
	/// Nodes that already carry a machine opcode are only marked as selected.
	/// For the opcodes listed in the switch below a custom lowering is attempted
	/// and the function returns as soon as one succeeds. Every path that breaks
	/// out of the switch (including opcodes that are not listed at all) ends in
	/// the SelectCode(Node) call at the bottom of the function.
	void X86DAGToDAGISel::Select(SDNode *Node) {
	MVT NVT = Node->getSimpleValueType(0);
	unsigned Opcode = Node->getOpcode();
	SDLoc dl(Node);

	if (Node->isMachineOpcode()) {
	LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
	Node->setNodeId(-1);
	return; // Already selected.
	}

	switch (Opcode) {
	default: break;
	case ISD::INTRINSIC_VOID: {
	// Operand 1 carries the intrinsic ID; only MONITOR, MONITORX and CLZERO
	// are handled here.
	unsigned IntNo = Node->getConstantOperandVal(1);
	switch (IntNo) {
	default: break;
	case Intrinsic::x86_sse3_monitor:
	case Intrinsic::x86_monitorx:
	case Intrinsic::x86_clzero: {
	// The width of operand 2 decides between the 32-bit and 64-bit forms.
	bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

	// Opc stays 0 when the subtarget lacks the required feature; in that case
	// nothing is emitted here and the node falls through to SelectCode().
	unsigned Opc = 0;
	switch (IntNo) {
	case Intrinsic::x86_sse3_monitor:
	if (!Subtarget->hasSSE3())
	break;
	Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
	break;
	case Intrinsic::x86_monitorx:
	if (!Subtarget->hasMWAITX())
	break;
	Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
	break;
	case Intrinsic::x86_clzero:
	if (!Subtarget->hasCLZERO())
	break;
	Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
	break;
	}

	if (Opc) {
	// Operand 2 is copied into EAX/RAX, glued to the instruction below.
	unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
	SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
	Node->getOperand(2), SDValue());
	SDValue InFlag = Chain.getValue(1);

	if (IntNo == Intrinsic::x86_sse3_monitor \|\|
	IntNo == Intrinsic::x86_monitorx) {
	// Copy the other two operands to ECX and EDX.
	Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
	InFlag);
	InFlag = Chain.getValue(1);
	Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
	InFlag);
	InFlag = Chain.getValue(1);
	}

	MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
	{ Chain, InFlag});
	ReplaceNode(Node, CNode);
	return;
	}
	}
	}

	break;
	}
	case ISD::BRIND: {
	if (Subtarget->isTargetNaCl())
	// NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
	// leave the instruction alone.
	break;
	if (Subtarget->isTarget64BitILP32()) {
	// Converts a 32-bit register to a 64-bit, zero-extended version of
	// it. This is needed because x86-64 can do many things, but jmp %r32
	// ain't one of them.
	const SDValue &Target = Node->getOperand(1);
	assert(Target.getSimpleValueType() == llvm::MVT::i32);
	SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
	SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
	Node->getOperand(0), ZextTarget);
	ReplaceNode(Node, Brind.getNode());
	// Both freshly created nodes are selected explicitly here.
	SelectCode(ZextTarget.getNode());
	SelectCode(Brind.getNode());
	return;
	}
	break;
	}
	case X86ISD::GlobalBaseReg:
	ReplaceNode(Node, getGlobalBaseReg());
	return;

	case ISD::BITCAST:
	// Just drop all 128/256/512-bit bitcasts (and bitcasts producing f128).
	if (NVT.is512BitVector() \|\| NVT.is256BitVector() \|\| NVT.is128BitVector() \|\|
	NVT == MVT::f128) {
	ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	break;

	case ISD::VSELECT: {
	// Replace VSELECT with non-mask conditions with BLENDV.
	// Conditions with i1 elements are left to the default selection.
	if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
	break;

	assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
	SDValue Blendv = CurDAG->getNode(
	X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
	Node->getOperand(1), Node->getOperand(2));
	ReplaceNode(Node, Blendv.getNode());
	SelectCode(Blendv.getNode());
	// We already called ReplaceUses.
	return;
	}

	case ISD::SRL:
	if (matchBitExtract(Node))
	return;
	LLVM_FALLTHROUGH;
	case ISD::SRA:
	case ISD::SHL:
	if (tryShiftAmountMod(Node))
	return;
	break;

	case ISD::AND:
	if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
	// Try to form a masked VPTESTM. Operands can be in either order.
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);
	if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
	tryVPTESTM(Node, N0, N1))
	return;
	if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
	tryVPTESTM(Node, N1, N0))
	return;
	}

	if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
	ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	if (matchBitExtract(Node))
	return;
	if (AndImmShrink && shrinkAndImmediate(Node))
	return;

	LLVM_FALLTHROUGH;
	case ISD::OR:
	case ISD::XOR:
	if (tryShrinkShlLogicImm(Node))
	return;

	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::SUB: {
	// Try to avoid folding immediates with multiple uses for optsize.
	// This code tries to select to register form directly to avoid going
	// through the isel table which might fold the immediate. We can't change
	// the patterns on the add/sub/and/or/xor with immediate patterns in the
	// tablegen files to check immediate use count without making the patterns
	// unavailable to the fast-isel table.
	if (!OptForSize)
	break;

	// Only handle i8/i16/i32/i64.
	if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
	break;

	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
	if (!Cst)
	break;

	int64_t Val = Cst->getSExtValue();

	// Make sure it's an immediate that is considered foldable.
	// FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
	if (!isInt<8>(Val) && !isInt<32>(Val))
	break;

	// Check if we should avoid folding this immediate.
	if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
	break;

	// We should not fold the immediate. So we need a register form instead.
	// ROpc is the reg/reg opcode, MOpc the reg/mem opcode used when a load
	// can be folded below.
	unsigned ROpc, MOpc;
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unexpected VT!");
	case MVT::i8:
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
	case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
	case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
	case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
	case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
	}
	break;
	case MVT::i16:
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
	case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
	case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
	case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
	case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
	}
	break;
	case MVT::i32:
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
	case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
	case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
	case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
	case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
	}
	break;
	case MVT::i64:
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
	case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
	case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
	case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
	case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
	}
	break;
	}

	// Ok this is an AND/OR/XOR/ADD/SUB with constant.

	// If this is not a subtract, we can still try to fold a load.
	// (The load candidate is operand 0, which ends up as the memory operand
	// of the rm form with the constant N1 as the register operand.)
	if (Opcode != ISD::SUB) {
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
	SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
	SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
	MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	// Update the chain.
	ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
	// Record the mem-refs
	CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
	ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	}

	CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
	return;
	}

	case X86ISD::SMUL:
	// i16/i32/i64 are handled with isel patterns.
	if (NVT != MVT::i8)
	break;
	LLVM_FALLTHROUGH;
	case X86ISD::UMUL: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	// LoReg receives the first multiplicand (N0) via the glued copy below.
	unsigned LoReg, ROpc, MOpc;
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8:
	LoReg = X86::AL;
	ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
	MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
	break;
	case MVT::i16:
	LoReg = X86::AX;
	ROpc = X86::MUL16r;
	MOpc = X86::MUL16m;
	break;
	case MVT::i32:
	LoReg = X86::EAX;
	ROpc = X86::MUL32r;
	MOpc = X86::MUL32m;
	break;
	case MVT::i64:
	LoReg = X86::RAX;
	ROpc = X86::MUL64r;
	MOpc = X86::MUL64m;
	break;
	}

	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	// Multiply is commutative.
	if (!FoldedLoad) {
	FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	// After the swap N1 is always the operand whose load was folded.
	if (FoldedLoad)
	std::swap(N0, N1);
	}

	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
	N0, SDValue()).getValue(1);

	MachineSDNode *CNode;
	if (FoldedLoad) {
	// i16/i32/i64 use an instruction that produces a low and high result even
	// though only the low result is used.
	SDVTList VTs;
	if (NVT == MVT::i8)
	VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
	else
	VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
	InFlag };
	CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

	// Update the chain.
	// The chain is the last result: index 2 for i8, 3 otherwise (see VTs).
	ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
	// Record the mem-refs
	CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
	} else {
	// i16/i32/i64 use an instruction that produces a low and high result even
	// though only the low result is used.
	SDVTList VTs;
	if (NVT == MVT::i8)
	VTs = CurDAG->getVTList(NVT, MVT::i32);
	else
	VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

	CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
	}

	// Result 0 maps to the low product; result 1 maps to the i32 result of
	// CNode, which sits at index 1 for i8 and index 2 for the wider types.
	ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
	ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case ISD::SMUL_LOHI:
	case ISD::UMUL_LOHI: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	// Opc is the register form, MOpc the memory form of the multiply.
	unsigned Opc, MOpc;
	bool isSigned = Opcode == ISD::SMUL_LOHI;
	if (!isSigned) {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
	case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
	}
	} else {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
	case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
	}
	}

	// SrcReg holds the first multiplicand; LoReg/HiReg are read back below
	// for the low and high halves of the product.
	unsigned SrcReg, LoReg, HiReg;
	switch (Opc) {
	default: llvm_unreachable("Unknown MUL opcode!");
	case X86::IMUL32r:
	case X86::MUL32r:
	SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
	break;
	case X86::IMUL64r:
	case X86::MUL64r:
	SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
	break;
	}

	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	// Multiply is commutative.
	if (!foldedLoad) {
	foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	if (foldedLoad)
	std::swap(N0, N1);
	}

	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
	N0, SDValue()).getValue(1);
	if (foldedLoad) {
	SDValue Chain;
	MachineSDNode *CNode = nullptr;
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
	InFlag };
	SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
	CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
	Chain = SDValue(CNode, 0);
	InFlag = SDValue(CNode, 1);

	// Update the chain.
	ReplaceUses(N1.getValue(1), Chain);
	// Record the mem-refs
	CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
	} else {
	SDValue Ops[] = { N1, InFlag };
	SDVTList VTs = CurDAG->getVTList(MVT::Glue);
	SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
	InFlag = SDValue(CNode, 0);
	}

	// Copy the low half of the result, if it is needed.
	if (!SDValue(Node, 0).use_empty()) {
	assert(LoReg && "Register for low half is not defined!");
	SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
	NVT, InFlag);
	InFlag = ResLo.getValue(2);
	ReplaceUses(SDValue(Node, 0), ResLo);
	LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
	dbgs() << '\n');
	}
	// Copy the high half of the result, if it is needed.
	if (!SDValue(Node, 1).use_empty()) {
	assert(HiReg && "Register for high half is not defined!");
	SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
	NVT, InFlag);
	InFlag = ResHi.getValue(2);
	ReplaceUses(SDValue(Node, 1), ResHi);
	LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
	dbgs() << '\n');
	}

	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case ISD::SDIVREM:
	case ISD::UDIVREM: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	// Opc is the register form, MOpc the memory form of the divide.
	unsigned Opc, MOpc;
	bool isSigned = Opcode == ISD::SDIVREM;
	if (!isSigned) {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
	case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
	case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
	case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
	}
	} else {
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
	case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
	case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
	case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
	}
	}

	// LoReg takes the dividend and is read back as result 0; HiReg is read
	// back as result 1; ClrReg is the register zeroed when the dividend is
	// zero-extended; SExtOpcode is used when it must be sign-extended.
	unsigned LoReg, HiReg, ClrReg;
	unsigned SExtOpcode;
	switch (NVT.SimpleTy) {
	default: llvm_unreachable("Unsupported VT!");
	case MVT::i8:
	LoReg = X86::AL; ClrReg = HiReg = X86::AH;
	SExtOpcode = X86::CBW;
	break;
	case MVT::i16:
	LoReg = X86::AX; HiReg = X86::DX;
	ClrReg = X86::DX;
	SExtOpcode = X86::CWD;
	break;
	case MVT::i32:
	LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
	SExtOpcode = X86::CDQ;
	break;
	case MVT::i64:
	LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
	SExtOpcode = X86::CQO;
	break;
	}

	// Only the divisor (N1) is a candidate for folding into the divide.
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
	bool signBitIsZero = CurDAG->SignBitIsZero(N0);

	SDValue InFlag;
	if (NVT == MVT::i8 && (!isSigned \|\| signBitIsZero)) {
	// Special case for div8, just use a move with zero extension to AX to
	// clear the upper 8 bits (AH).
	// Note: these temporaries shadow the outer Tmp0..Tmp4, so the operands
	// computed for the divisor above are left untouched.
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
	MachineSDNode *Move;
	if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
	Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
	MVT::Other, Ops);
	Chain = SDValue(Move, 1);
	ReplaceUses(N0.getValue(1), Chain);
	// Record the mem-refs
	CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
	} else {
	Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
	Chain = CurDAG->getEntryNode();
	}
	Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
	SDValue());
	InFlag = Chain.getValue(1);
	} else {
	InFlag =
	CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
	LoReg, N0, SDValue()).getValue(1);
	if (isSigned && !signBitIsZero) {
	// Sign extend the low part into the high part.
	InFlag =
	SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
	} else {
	// Zero out the high part, effectively zero extending the input.
	// A MOV32r0 is created first and then adapted to the operand width.
	SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
	switch (NVT.SimpleTy) {
	case MVT::i16:
	ClrNode =
	SDValue(CurDAG->getMachineNode(
	TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
	CurDAG->getTargetConstant(X86::sub_16bit, dl,
	MVT::i32)),
	0);
	break;
	case MVT::i32:
	break;
	case MVT::i64:
	ClrNode =
	SDValue(CurDAG->getMachineNode(
	TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
	CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
	CurDAG->getTargetConstant(X86::sub_32bit, dl,
	MVT::i32)),
	0);
	break;
	default:
	llvm_unreachable("Unexpected division source");
	}

	InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
	ClrNode, InFlag).getValue(1);
	}
	}

	if (foldedLoad) {
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
	InFlag };
	MachineSDNode *CNode =
	CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
	InFlag = SDValue(CNode, 1);
	// Update the chain.
	ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
	// Record the mem-refs
	CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
	} else {
	InFlag =
	SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
	}

	// Prevent use of AH in a REX instruction by explicitly copying it to
	// an ABCD_L register.
	//
	// The current assumption of the register allocator is that isel
	// won't generate explicit references to the GR8_ABCD_H registers. If
	// the allocator and/or the backend get enhanced to be more robust in
	// that regard, this can be, and should be, removed.
	if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
	SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
	unsigned AHExtOpcode =
	isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

	SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
	MVT::Glue, AHCopy, InFlag);
	SDValue Result(RNode, 0);
	InFlag = SDValue(RNode, 1);

	Result =
	CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

	// After this replacement result 1 has no uses left, so the generic
	// remainder copy further down is skipped for the AH case.
	ReplaceUses(SDValue(Node, 1), Result);
	LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
	dbgs() << '\n');
	}
	// Copy the division (low) result, if it is needed.
	if (!SDValue(Node, 0).use_empty()) {
	SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
	LoReg, NVT, InFlag);
	InFlag = Result.getValue(2);
	ReplaceUses(SDValue(Node, 0), Result);
	LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
	dbgs() << '\n');
	}
	// Copy the remainder (high) result, if it is needed.
	if (!SDValue(Node, 1).use_empty()) {
	SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
	HiReg, NVT, InFlag);
	InFlag = Result.getValue(2);
	ReplaceUses(SDValue(Node, 1), Result);
	LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
	dbgs() << '\n');
	}
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case X86ISD::CMP: {
	SDValue N0 = Node->getOperand(0);
	SDValue N1 = Node->getOperand(1);

	// Optimizations for TEST compares.
	// Only compares against a zero constant are handled here.
	if (!isNullConstant(N1))
	break;

	// Save the original VT of the compare.
	MVT CmpVT = N0.getSimpleValueType();

	// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
	// by a test instruction. The test should be removed later by
	// analyzeCompare if we are using only the zero flag.
	// TODO: Should we check the users and use the BEXTR flags directly?
	if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
	if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
	unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
	: X86::TEST32rr;
	SDValue BEXTR = SDValue(NewNode, 0);
	NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
	ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	}

	// We can peek through truncates, but we need to be careful below.
	if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
	N0 = N0.getOperand(0);

	// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
	// use a smaller encoding.
	// Look past the truncate if CMP is the only use of it.
	if (N0.getOpcode() == ISD::AND &&
	N0.getNode()->hasOneUse() &&
	N0.getValueType() != MVT::i8) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	if (!C) break;
	uint64_t Mask = C->getZExtValue();

	// Check if we can replace AND+IMM64 with a shift. This is possible for
	// masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
	// flag.
	if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
	onlyUsesZeroFlag(SDValue(Node, 0))) {
	// Mask is a run of ones at the top: shift the low zero bits out.
	if (isMask_64(~Mask)) {
	unsigned TrailingZeros = countTrailingZeros(Mask);
	SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
	SDValue Shift =
	SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
	N0.getOperand(0), Imm), 0);
	MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
	MVT::i32, Shift, Shift);
	ReplaceNode(Node, Test);
	return;
	}
	// Mask is a run of ones at the bottom: shift the high zero bits out.
	if (isMask_64(Mask)) {
	unsigned LeadingZeros = countLeadingZeros(Mask);
	SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
	SDValue Shift =
	SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
	N0.getOperand(0), Imm), 0);
	MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
	MVT::i32, Shift, Shift);
	ReplaceNode(Node, Test);
	return;
	}
	}

	// VT/SubRegOp describe the narrowed test; ROpc/MOpc are its reg-imm and
	// mem-imm opcodes.
	MVT VT;
	int SubRegOp;
	unsigned ROpc, MOpc;

	// For each of these checks we need to be careful if the sign flag is
	// being used. It is only safe to use the sign flag in two conditions,
	// either the sign bit in the shrunken mask is zero or the final test
	// size is equal to the original compare size.

	if (isUInt<8>(Mask) &&
	(!(Mask & 0x80) \|\| CmpVT == MVT::i8 \|\|
	hasNoSignFlagUses(SDValue(Node, 0)))) {
	// For example, convert "testl %eax, $8" to "testb %al, $8"
	VT = MVT::i8;
	SubRegOp = X86::sub_8bit;
	ROpc = X86::TEST8ri;
	MOpc = X86::TEST8mi;
	} else if (OptForMinSize && isUInt<16>(Mask) &&
	(!(Mask & 0x8000) \|\| CmpVT == MVT::i16 \|\|
	hasNoSignFlagUses(SDValue(Node, 0)))) {
	// For example, "testl %eax, $32776" to "testw %ax, $32776".
	// NOTE: We only want to form TESTW instructions if optimizing for
	// min size. Otherwise we only save one byte and possibly get a length
	// changing prefix penalty in the decoders.
	VT = MVT::i16;
	SubRegOp = X86::sub_16bit;
	ROpc = X86::TEST16ri;
	MOpc = X86::TEST16mi;
	} else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
	((!(Mask & 0x80000000) &&
	// Without minsize 16-bit Cmps can get here so we need to
	// be sure we calculate the correct sign flag if needed.
	(CmpVT != MVT::i16 \|\| !(Mask & 0x8000))) \|\|
	CmpVT == MVT::i32 \|\|
	hasNoSignFlagUses(SDValue(Node, 0)))) {
	// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
	// NOTE: We only want to run that transform if N0 is 32 or 64 bits.
	// Otherwise, we find ourselves in a position where we have to do
	// promotion. If previous passes did not promote the and, we assume
	// they had a good reason not to and do not promote here.
	VT = MVT::i32;
	SubRegOp = X86::sub_32bit;
	ROpc = X86::TEST32ri;
	MOpc = X86::TEST32mi;
	} else {
	// No eligible transformation was found.
	break;
	}

	SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
	SDValue Reg = N0.getOperand(0);

	// Emit a testl or testw.
	MachineSDNode *NewNode;
	SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
	if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
	SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
	Reg.getOperand(0) };
	NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
	// Update the chain.
	ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
	// Record the mem-refs
	CurDAG->setNodeMemRefs(NewNode,
	{cast<LoadSDNode>(Reg)->getMemOperand()});
	} else {
	// Extract the subregister if necessary.
	if (N0.getValueType() != VT)
	Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

	NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
	}
	// Replace CMP with TEST.
	ReplaceNode(Node, NewNode);
	return;
	}
	break;
	}
	case X86ISD::PCMPISTR: {
	if (!Subtarget->hasSSE42())
	break;

	// Result 0 is the index form, result 1 the mask form; emit only the
	// instruction(s) whose result is actually used.
	bool NeedIndex = !SDValue(Node, 0).use_empty();
	bool NeedMask = !SDValue(Node, 1).use_empty();
	// We can't fold a load if we are going to make two instructions.
	bool MayFoldLoad = !NeedIndex \|\| !NeedMask;

	MachineSDNode *CNode;
	if (NeedMask) {
	unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
	unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
	CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
	ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
	}
	// The index form is also emitted when neither result is used, so CNode
	// is always set before the flag replacement below.
	if (NeedIndex \|\| !NeedMask) {
	unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
	unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
	CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
	ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
	}

	// Connect the flag usage to the last instruction created.
	ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
	CurDAG->RemoveDeadNode(Node);
	return;
	}
	case X86ISD::PCMPESTR: {
	if (!Subtarget->hasSSE42())
	break;

	// Copy the two implicit register inputs.
	// Operand 1 goes to EAX and operand 3 to EDX, glued together.
	SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
	Node->getOperand(1),
	SDValue()).getValue(1);
	InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
	Node->getOperand(3), InFlag).getValue(1);

	bool NeedIndex = !SDValue(Node, 0).use_empty();
	bool NeedMask = !SDValue(Node, 1).use_empty();
	// We can't fold a load if we are going to make two instructions.
	bool MayFoldLoad = !NeedIndex \|\| !NeedMask;

	MachineSDNode *CNode;
	if (NeedMask) {
	unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
	unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
	CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
	InFlag);
	ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
	}
	if (NeedIndex \|\| !NeedMask) {
	unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
	unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
	CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
	ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
	}
	// Connect the flag usage to the last instruction created.
	ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
	CurDAG->RemoveDeadNode(Node);
	return;
	}

	case ISD::SETCC: {
	// Vector compares may become an unmasked VPTESTM (no input mask given).
	if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
	return;

	break;
	}

	case ISD::STORE:
	if (foldLoadStoreIntoMemOperand(Node))
	return;
	break;
	case ISD::FCEIL:
	case ISD::FFLOOR:
	case ISD::FTRUNC:
	case ISD::FNEARBYINT:
	case ISD::FRINT: {
	// Replace fp rounding with their X86 specific equivalent so we don't
	// need 2 sets of patterns.
	// FIXME: This can only happen when the nodes started as STRICT_* and have
	// been mutated into their non-STRICT equivalents. Eventually this
	// mutation will be removed and we should switch the STRICT_ nodes to a
	// strict version of RNDSCALE in PreProcessISelDAG.
	// Imm is the immediate operand passed to VRNDSCALE for each opcode.
	unsigned Imm;
	switch (Node->getOpcode()) {
	default: llvm_unreachable("Unexpected opcode!");
	case ISD::FCEIL: Imm = 0xA; break;
	case ISD::FFLOOR: Imm = 0x9; break;
	case ISD::FTRUNC: Imm = 0xB; break;
	case ISD::FNEARBYINT: Imm = 0xC; break;
	case ISD::FRINT: Imm = 0x4; break;
	}
	// Note: this dl shadows the function-level dl; both are built from Node.
	SDLoc dl(Node);
	SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
	Node->getValueType(0),
	Node->getOperand(0),
	CurDAG->getConstant(Imm, dl, MVT::i8));
	ReplaceNode(Node, Res.getNode());
	SelectCode(Res.getNode());
	return;
	}
	}

	// Nothing above handled the node; hand it to SelectCode().
	SelectCode(Node);
	}

	bool X86DAGToDAGISel::
	SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
	std::vector<SDValue> &OutOps) {
	SDValue Op0, Op1, Op2, Op3, Op4;
	switch (ConstraintID) {
	default:
	llvm_unreachable("Unexpected asm memory constraint");
	case InlineAsm::Constraint_i:
	// FIXME: It seems strange that 'i' is needed here since it's supposed to
	// be an immediate and not a memory constraint.
	LLVM_FALLTHROUGH;
	case InlineAsm::Constraint_o: // offsetable ??
	case InlineAsm::Constraint_v: // not offsetable ??
	case InlineAsm::Constraint_m: // memory
	case InlineAsm::Constraint_X:
	if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
	return true;
	break;
	}

	OutOps.push_back(Op0);
	OutOps.push_back(Op1);
	OutOps.push_back(Op2);
	OutOps.push_back(Op3);
	OutOps.push_back(Op4);
	return false;
	}

	/// Create the X86 DAG-to-DAG instruction selection pass, which turns a
	/// legalized DAG into an X86-specific DAG that is ready for instruction
	/// scheduling.
	///
	/// \param TM        target machine forwarded to the X86DAGToDAGISel
	///                  constructor.
	/// \param OptLevel  optimization level forwarded to the same constructor.
	/// \returns a pass object allocated with `new`; it is not freed here.
	FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
	                                     CodeGenOpt::Level OptLevel) {
	  FunctionPass *ISelPass = new X86DAGToDAGISel(TM, OptLevel);
	  return ISelPass;
	}
	Index: vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Target/X86/X86ISelLowering.cpp (revision 351303)
	@@ -1,45516 +1,45520 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	// Debug category string for this translation unit.
	// NOTE(review): presumably picked up by LLVM_DEBUG and by the STATISTIC macro
	// below - confirm against llvm/Support/Debug.h and llvm/ADT/Statistic.h.
	#define DEBUG_TYPE "x86-isel"

	// Declares the NumTailCalls statistic, described as "Number of tail calls".
	STATISTIC(NumTailCalls, "Number of tail calls");

	// Hidden boolean option "x86-experimental-vector-widening-legalization",
	// initialized to false. Per its description it enables an experimental vector
	// type legalization that widens rather than promotes.
	static cl::opt<bool> ExperimentalVectorWideningLegalization(
	"x86-experimental-vector-widening-legalization", cl::init(false),
	cl::desc("Enable an experimental vector type legalization through widening "
	"rather than promotion."),
	cl::Hidden);

	// Hidden integer option "x86-experimental-pref-loop-alignment", initialized to
	// 4. Per its description the value is a bit count: that many low bits of the
	// loop header PC will be zero.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	"x86-experimental-pref-loop-alignment", cl::init(4),
	cl::desc("Sets the preferable loop alignment for experiments "
	"(the last x86-experimental-pref-loop-alignment bits"
	" of the loop header PC will be 0)."),
	cl::Hidden);

	// Hidden boolean option "mul-constant-optimization", initialized to true. Per
	// its description it controls replacing 'mul x, Const' with other instructions
	// such as SHIFT and LEA.
	static cl::opt<bool> MulConstantOptimization(
	"mul-constant-optimization", cl::init(true),
	cl::desc("Replace 'mul x, Const' with more effective instructions like "
	"SHIFT, LEA, etc."),
	cl::Hidden);

	/// Report an "unsupported" diagnostic for something the user asked for that
	/// cannot be honoured (for example returning a double without SSE2 enabled on
	/// x86_64). Unlike report_fatal_error this is not fatal, so the calling code
	/// should attempt to recover without crashing.
	///
	/// \param DAG supplies both the LLVMContext that receives the diagnostic and
	///            the function the diagnostic is attributed to.
	/// \param dl  source of the debug location attached to the diagnostic.
	/// \param Msg message text passed to the diagnostic.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  auto *Ctx = DAG.getContext();
	  // Build the diagnostic as a temporary inside the call so that it, and any
	  // temporaries created for its arguments, live for the whole diagnose() call.
	  Ctx->diagnose(DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(false);
	setUseUnderscoreLongJmp(false);
	} else if (Subtarget.isTargetWindowsGNU()) {
	// MS runtime is weird: it exports _setjmp, but longjmp!
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(false);
	} else {
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);
	}

	// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
	// to 32 bits so the AtomicExpandPass will expand it so we don't need
	// cmpxchg8b.
	// FIXME: Should we be limiting the atomic size on other configs? Default is
	// 1024.
	if (!Subtarget.hasCmpxchg8b())
	setMaxAtomicSizeInBitsSupported(32);

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

	// Integer absolute.
	if (Subtarget.hasCMov()) {
	setOperationAction(ISD::ABS , MVT::i16 , Custom);
	setOperationAction(ISD::ABS , MVT::i32 , Custom);
	}
	setOperationAction(ISD::ABS , MVT::i64 , Custom);

	// Funnel shifts.
	for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
	setOperationAction(ShiftOp , MVT::i16 , Custom);
	setOperationAction(ShiftOp , MVT::i32 , Custom);
	if (Subtarget.is64Bit())
	setOperationAction(ShiftOp , MVT::i64 , Custom);
	}

	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
	// f32/f64 are legal, f80 is custom.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	else
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	} else if (!Subtarget.useSoftFloat()) {
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
	}

	// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// SSE has no i16 to fp conversion, only i32.
	if (X86ScalarSSEf32) {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
	}

	// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
	}

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
	}
	} else if (!Subtarget.useSoftFloat()) {
	// Since AVX is a superset of SSE3, only check for SSE here.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
	// Expand FP_TO_UINT into a select.
	// FIXME: We would like to use a Custom expander here eventually to do
	// the optimal thing for SSE vs. the default expansion in the legalizer.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
	else
	// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
	// With SSE3 we can use fisttpll to convert to a signed i64; without
	// SSE, we're stuck with a fistpll.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	}

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
	}
	}

	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	if (Subtarget.useSoftFloat() \|\| !Subtarget.hasF16C()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}

	// There's never any support for operations beyond MVT::f32.
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	else
	setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// These should be promoted to a larger select which is supported.
	setOperationAction(ISD::SELECT , MVT::i1 , Promote);
	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}

	// Custom action for SELECT MMX and expand action for SELECT_CC MMX
	setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
	// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSEPrefetch() \|\| Subtarget.has3DNow())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (!Subtarget.is64Bit())
	setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	// Disable f32->f64 extload as we can only generate this in one instruction
	// under optsize. So it's easier to pattern match (fpext (load)) for that
	// case instead of needing to emit 2 instructions for extload in the
	// non-optsize case.
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// These might be better off as horizontal vector ops.
	setOperationAction(ISD::FADD, VT, Custom);
	setOperationAction(ISD::FSUB, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	} else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 \|\| Is64Bit)) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, &X86::FR32RegClass);
	if (UseX87)
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	if (UseX87)
	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	if (UseX87)
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	if (UseX87) {
	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	}
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	}

	// Expand FP32 immediates into loads from the stack, save special cases.
	if (isTypeLegal(MVT::f32)) {
	if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	} else // SSE immediates.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	}
	// Expand FP64 immediates into loads from the stack, save special cases.
	if (isTypeLegal(MVT::f64)) {
	if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	} else // SSE immediates.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	}

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// Long double always uses X87, except f128 in MMX.
	if (UseX87) {
	if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	setOperationAction(ISD::FABS , MVT::f128, Custom);
	setOperationAction(ISD::FNEG , MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
	}

	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	setOperationAction(ISD::LROUND, MVT::f80, Expand);
	setOperationAction(ISD::LLROUND, MVT::f80, Expand);
	setOperationAction(ISD::LRINT, MVT::f80, Expand);
	setOperationAction(ISD::LLRINT, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types, we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

	setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
	setOperationAction(ISD::STORE, MVT::v2f32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
	MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
	setOperationAction(ISD::SDIV, VT, Custom);
	setOperationAction(ISD::SREM, VT, Custom);
	setOperationAction(ISD::UDIV, VT, Custom);
	setOperationAction(ISD::UREM, VT, Custom);
	}

	setOperationAction(ISD::MUL, MVT::v2i8, Custom);
	setOperationAction(ISD::MUL, MVT::v2i16, Custom);
	setOperationAction(ISD::MUL, MVT::v2i32, Custom);
	setOperationAction(ISD::MUL, MVT::v4i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i16, Custom);
	setOperationAction(ISD::MUL, MVT::v8i8, Custom);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
	}

	setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
	setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
	setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
	setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
	setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
	setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);

	if (!ExperimentalVectorWideningLegalization) {
	// Use widening instead of promotion.
	for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
	MVT::v4i16, MVT::v2i16 }) {
	setOperationAction(ISD::UADDSAT, VT, Custom);
	setOperationAction(ISD::SADDSAT, VT, Custom);
	setOperationAction(ISD::USUBSAT, VT, Custom);
	setOperationAction(ISD::SSUBSAT, VT, Custom);
	}
	}

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	// Provide custom widening for v2f32 setcc. This is really for VLX when
	// setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
	// type legalization changing the result type to v4i1 during widening.
	// It works fine for SSE2 and is probably faster so no need to qualify with
	// VLX support.
	setOperationAction(ISD::SETCC, MVT::v2i32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::ABS, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// We support custom legalizing of sext and anyext loads for specific
	// memory vector types which we can load as a scalar (or sequence of
	// scalars) and extend in-register to a legal 128-bit vector type. For sext
	// loads these must work with a single scalar load.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Custom lower v2i64 and v2f64 selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
	setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);

	// Custom legalize these to avoid over promotion or custom promotion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);

	// By marking FP_TO_SINT v8i16 as Custom, we trick type legalization into
	// promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
	// split again based on the input type, this will cause an AssertSExt i16 to
	// be emitted instead of an AssertZExt. This will allow packssdw followed by
	// packuswb to be used to truncate to v8i8. This is necessary since packusdw
	// isn't available until sse4.1.
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

	// We want to legalize this to an f64 load rather than an i64 load on
	// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
	// store.
	setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
	setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
	setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
	setOperationAction(ISD::STORE, MVT::v2i32, Custom);
	setOperationAction(ISD::STORE, MVT::v4i16, Custom);
	setOperationAction(ISD::STORE, MVT::v8i8, Custom);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
	if (!Subtarget.hasAVX512())
	setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	if (ExperimentalVectorWideningLegalization) {
	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

	setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
	} else {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
	}

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
	setOperationAction(ISD::ROTL, MVT::v8i16, Custom);

	// With AVX512, expanding (and promoting the shifts) is better.
	if (!Subtarget.hasAVX512())
	setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

	// These might be better off as horizontal vector ops.
	setOperationAction(ISD::ADD, MVT::i16, Custom);
	setOperationAction(ISD::ADD, MVT::i32, Custom);
	setOperationAction(ISD::SUB, MVT::i16, Custom);
	setOperationAction(ISD::SUB, MVT::i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	if (!ExperimentalVectorWideningLegalization) {
	// Avoid narrow result types when widening. The legal types are listed
	// in the next loop.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
	}
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	if (!ExperimentalVectorWideningLegalization)
	setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and source memory
	// operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);

	if (!Subtarget.hasAVX512())
	setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	// These types need custom splitting if their input is a 128-bit vector.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);

	setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
	setOperationAction(ISD::ROTL, MVT::v16i16, Custom);

	// With BWI, expanding (and promoting the shifts) is better.
	if (!Subtarget.hasBWI())
	setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
	setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
	setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::FMA, VT, Legal);
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	setOperationAction(ISD::ABS, MVT::v4i64, Custom);
	setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
	setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
	setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
	setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

	setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
	}

	if (HasInt256) {
	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::STORE, VT, Custom);
	}

	if (HasInt256)
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	if (HasInt256) {
	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
	setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MGATHER, VT, Custom);
	}
	}

	// This block controls legalization of the mask vector sizes that are
	// available with AVX512. 512-bit vectors are in a separate block controlled
	// by useAVX512Regs.
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

	// There is no byte sized k-register load or store without AVX512DQ.
	if (!Subtarget.hasDQI()) {
	setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
	setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

	setOperationAction(ISD::STORE, MVT::v1i1, Custom);
	setOperationAction(ISD::STORE, MVT::v2i1, Custom);
	setOperationAction(ISD::STORE, MVT::v4i1, Custom);
	setOperationAction(ISD::STORE, MVT::v8i1, Custom);
	}

	// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::UADDSAT, VT, Custom);
	setOperationAction(ISD::SADDSAT, VT, Custom);
	setOperationAction(ISD::USUBSAT, VT, Custom);
	setOperationAction(ISD::SSUBSAT, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	}

	// This block controls legalization for 512-bit operations with 32/64 bit
	// elements. 512-bits can be disabled based on prefer-vector-width and
	// required-vector-width function attributes.
	if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

	// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
	// to 512-bit rather than use the AVX2 instructions so that we can use
	// k-masks.
	if (!Subtarget.hasVLX()) {
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

	if (ExperimentalVectorWideningLegalization) {
	// Need to custom widen this if we don't have AVX512BW.
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);

	setOperationAction(ISD::SELECT, VT, Custom);
	}

	// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
	for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);
	setOperationAction(ISD::MUL, MVT::v16i32, Legal);

	setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i32, Custom);

	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	if (Subtarget.hasDQI()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);

	setOperationAction(ISD::MUL, MVT::v8i64, Legal);
	}

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Legal under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	// Need to custom split v32i16/v64i8 bitcasts.
	if (!Subtarget.hasBWI()) {
	setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
	}

	if (Subtarget.hasVBMI2()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::FSHL, VT, Custom);
	setOperationAction(ISD::FSHR, VT, Custom);
	}
	}
	}// has AVX-512

	// This block controls legalization for operations that don't have
	// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
	// narrower widths.
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
	setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MSCATTER, VT, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT, Legal);
	setOperationAction(ISD::UINT_TO_FP, VT, Legal);
	setOperationAction(ISD::FP_TO_SINT, VT, Legal);
	setOperationAction(ISD::FP_TO_UINT, VT, Legal);

	setOperationAction(ISD::MUL, VT, Legal);
	}
	}

	if (Subtarget.hasCDI()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	// This block controls legalization of v32i1/v64i1 which are available with
	// AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
	// useBWIRegs.
	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	setOperationAction(ISD::UADDSAT, VT, Custom);
	setOperationAction(ISD::SADDSAT, VT, Custom);
	setOperationAction(ISD::USUBSAT, VT, Custom);
	setOperationAction(ISD::SSUBSAT, VT, Custom);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
	for (auto VT : { MVT::v16i1, MVT::v32i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v32i1 masks to 256-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
	}

	// This block controls legalization for v32i16 and v64i8. 512-bits can be
	// disabled based on prefer-vector-width and required-vector-width function
	// attributes.
	if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	// Extends from v64i1 masks to 512-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

	setOperationAction(ISD::MUL, MVT::v32i16, Legal);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::UADDSAT, VT, Legal);
	setOperationAction(ISD::SADDSAT, VT, Legal);
	setOperationAction(ISD::USUBSAT, VT, Legal);
	setOperationAction(ISD::SSUBSAT, VT, Legal);
	setOperationAction(ISD::SELECT, VT, Custom);

	// The condition codes aren't legal in SSE/AVX and under AVX512 we use
	// setcc all the way to isel and prefer SETGT in some isel patterns.
	setCondCodeAction(ISD::SETLT, VT, Custom);
	setCondCodeAction(ISD::SETLE, VT, Custom);
	}

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	}

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v64i8, MVT::v32i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	if (Subtarget.hasVBMI2()) {
	setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
	setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
	}

	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

	if (Subtarget.hasDQI()) {
	// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
	// v2f32 UINT_TO_FP is already custom under SSE2.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
	"Unexpected operation action!");
	// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	}

	if (Subtarget.hasBWI()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}

	if (Subtarget.hasVBMI2()) {
	// TODO: Make these legal even without VLX?
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::FSHL, VT, Custom);
	setOperationAction(ISD::FSHR, VT, Custom);
	}
	}
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	setLibcallName(RTLIB::MUL_I128, nullptr);
	}

	// Combine sin / cos into _sincos_stret if it is available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() &&
	(Subtarget.isTargetWindowsMSVC() \|\| Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
	ISD::FLOG10, ISD::FPOW, ISD::FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to be benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(4); // 2^4 bytes.

	verifyIntrinsicTables();
	}

	/// Returns true only when the subtarget targets MachO and is 64-bit.
	/// This has so far only been implemented for 64-bit MachO.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.isTargetMachO())
	    return false;
	  return Subtarget.is64Bit();
	}

	bool X86TargetLowering::useStackGuardXorFP() const {
	// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
	return Subtarget.getTargetTriple().isOSMSVCRT();
	}

	/// Builds a machine node applying the XOR*_FP opcode to \p Val and returns
	/// its first result. The opcode is XOR64_FP on 64-bit subtargets and
	/// XOR32_FP otherwise; the node's result type is the pointer type.
	SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                               const SDLoc &DL) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Select the opcode that matches the subtarget's bitness.
	  unsigned Opcode = X86::XOR32_FP;
	  if (Subtarget.is64Bit())
	    Opcode = X86::XOR64_FP;

	  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
	}

	/// Chooses how the type legalizer should treat the vector type \p VT.
	///
	/// v32i1 is split when AVX-512 is available without BWI. When the
	/// ExperimentalVectorWideningLegalization option is set, vectors with more
	/// than one element and a non-i1 element type are widened. Everything else
	/// defers to the TargetLoweringBase default.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(MVT VT) const {
	  if (VT == MVT::v32i1) {
	    if (Subtarget.hasAVX512() && !Subtarget.hasBWI())
	      return TypeSplitVector;
	  }

	  if (ExperimentalVectorWideningLegalization) {
	    const bool HasSeveralElts = VT.getVectorNumElements() != 1;
	    if (HasSeveralElts && VT.getVectorElementType() != MVT::i1)
	      return TypeWidenVector;
	  }

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	/// Returns the register type used to pass \p VT under calling convention
	/// \p CC. v32i1 on AVX-512 without BWI is passed as v32i8; every other case
	/// defers to the generic TargetLowering implementation.
	MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
	                                                     CallingConv::ID CC,
	                                                     EVT VT) const {
	  // Anything that is not the "v32i1 with AVX-512 but no BWI" case is generic.
	  if (VT != MVT::v32i1 || !Subtarget.hasAVX512() || Subtarget.hasBWI())
	    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
	  return MVT::v32i8;
	}

	/// Returns how many registers are needed to pass \p VT under calling
	/// convention \p CC. v32i1 on AVX-512 without BWI takes a single register;
	/// every other case defers to the generic TargetLowering implementation.
	unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
	                                                          CallingConv::ID CC,
	                                                          EVT VT) const {
	  // Anything that is not the "v32i1 with AVX-512 but no BWI" case is generic.
	  if (VT != MVT::v32i1 || !Subtarget.hasAVX512() || Subtarget.hasBWI())
	    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
	  return 1;
	}

	/// Returns the result type of a SETCC whose operands have type \p VT.
	///
	/// Non-vector operands always yield i8. With AVX-512, vector operands yield
	/// a vXi1 mask with the same element count as \p VT when the fully legalized
	/// operand type is 512 bits wide, or when VLX is available and either BWI is
	/// present or the legalized element type is at least 32 bits wide. All
	/// remaining vectors yield \p VT with its element type changed to an
	/// integer type.
	///
	/// \param DL      Data layout (not used by this implementation).
	/// \param Context Context used for type legalization queries and for
	///                building the vXi1 type.
	/// \param VT      Operand type of the comparison.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	                                          LLVMContext& Context,
	                                          EVT VT) const {
	  if (!VT.isVector())
	    return MVT::i8;

	  if (Subtarget.hasAVX512()) {
	    const unsigned NumElts = VT.getVectorNumElements();

	    // Figure out what this type will be legalized to.
	    EVT LegalVT = VT;
	    while (getTypeAction(Context, LegalVT) != TypeLegal)
	      LegalVT = getTypeToTransformTo(Context, LegalVT);

	    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
	    if (LegalVT.getSimpleVT().is512BitVector())
	      return EVT::getVectorVT(Context, MVT::i1, NumElts);

	    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
	      // If we legalized to less than a 512-bit vector, then we will use a vXi1
	      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
	      // vXi16/vXi8.
	      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
	      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
	        return EVT::getVectorVT(Context, MVT::i1, NumElts);
	    }
	  }

	  return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: raises \p MaxAlign to the alignment
	/// required by \p Ty.
	///
	/// A 128-bit vector raises \p MaxAlign to 16. Arrays are examined through
	/// their element type and structs through each of their members, recursively.
	/// \p MaxAlign is never lowered, and the walk stops as soon as it reaches 16.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
	  // 16 is the ceiling; nothing below can change the answer.
	  if (MaxAlign == 16)
	    return;

	  if (auto *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getBitWidth() == 128)
	      MaxAlign = 16;
	    return;
	  }

	  if (auto *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    unsigned ElemAlign = 0;
	    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
	    if (MaxAlign < ElemAlign)
	      MaxAlign = ElemAlign;
	    return;
	  }

	  auto *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  for (auto *MemberTy : StructTy->elements()) {
	    unsigned MemberAlign = 0;
	    getMaxByValAlign(MemberTy, MemberAlign);
	    if (MaxAlign < MemberAlign)
	      MaxAlign = MemberAlign;
	    // No member can raise the alignment past 16.
	    if (MaxAlign == 16)
	      return;
	  }
	}

	/// Returns the desired alignment for ByVal aggregate function arguments in
	/// the caller parameter area.
	///
	/// On 64-bit subtargets this is the larger of 8 and the ABI alignment of
	/// \p Ty. Otherwise it is 4, unless SSE1 is available, in which case
	/// getMaxByValAlign may raise it (aggregates that contain SSE vectors are
	/// placed at 16-byte boundaries).
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    unsigned Result = 4;
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result;
	  }

	  // Max of 8 and alignment of type.
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign > 8 ? ABIAlign : 8;
	}

	/// Returns the target-specific optimal type for the load and store
	/// operations produced when lowering memset, memcpy, and memmove.
	///
	/// \param Size           Number of bytes being set or copied.
	/// \param DstAlign       Destination alignment; zero means the destination
	///                       alignment can satisfy any constraint.
	/// \param SrcAlign       Source alignment; zero means there is no need to
	///                       check it against an alignment requirement, probably
	///                       because the source does not need to be loaded.
	/// \param IsMemset       True when expanding a memset.
	/// \param ZeroMemset     True when expanding a memset of zero.
	/// \param MemcpyStrSrc   True when the memcpy source is constant and so does
	///                       not need to be loaded.
	/// \param FuncAttributes Function attributes; NoImplicitFloat disables every
	///                       vector and floating-point choice below.
	///
	/// For vector ops we check that the overall size isn't larger than our
	/// preferred vector width. Every path in this implementation returns a
	/// concrete type (a vector type, f64, i64, or i32).
	EVT X86TargetLowering::getOptimalMemOpType(
	    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
	    bool ZeroMemset, bool MemcpyStrSrc,
	    const AttributeList &FuncAttributes) const {
	  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
	    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
	                       ((DstAlign == 0 || DstAlign >= 16) &&
	                        (SrcAlign == 0 || SrcAlign >= 16)))) {
	      // FIXME: Check if unaligned 32-byte accesses are slow.
	      if (Size >= 32 && Subtarget.hasAVX() &&
	          (Subtarget.getPreferVectorWidth() >= 256)) {
	        // Although this isn't a well-supported type for AVX1, we'll let
	        // legalization and shuffle lowering produce the optimal codegen. If we
	        // choose an optimal type with a vector element larger than a byte,
	        // getMemsetStores() may create an intermediate splat (using an integer
	        // multiply) before we splat as a vector.
	        return MVT::v32i8;
	      }
	      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
	        return MVT::v16i8;
	      // TODO: Can SSE1 handle a byte vector?
	      // If we have SSE1 registers we should be able to use them.
	      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
	          (Subtarget.getPreferVectorWidth() >= 128))
	        return MVT::v4f32;
	    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
	               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	      // Do not use f64 to lower memcpy if source is string constant. It's
	      // better to use i32 to avoid the loads.
	      // Also, do not use f64 to lower memset unless this is a memset of zeros.
	      // The gymnastics of splatting a byte value into an XMM register and then
	      // only using 8-byte stores (because this is a CPU with slow unaligned
	      // 16-byte accesses) makes that a loser.
	      return MVT::f64;
	    }
	  }
	  // This is a compromise. If we reach here, unaligned accesses may be slow on
	  // this target. However, creating smaller, aligned accesses could be even
	  // slower and would certainly be a lot more code.
	  if (Subtarget.is64Bit() && Size >= 8)
	    return MVT::i64;
	  return MVT::i32;
	}

	/// Reports whether \p VT is safe to use for memory operations.
	/// f32 and f64 are safe only when the matching X86ScalarSSEf32 /
	/// X86ScalarSSEf64 flag is set; every other type is considered safe.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Reports whether a misaligned memory access of type \p VT is allowed and,
	/// when \p Fast is non-null, whether it is expected to be fast.
	///
	/// \param VT    Type of the access.
	/// \param Align Alignment of the access; only consulted for non-temporal
	///              vector loads.
	/// \param Flags Memory operand flags; MONonTemporal and MOLoad are examined.
	/// \param Fast  Optional out-parameter. 128-bit and 256-bit accesses are fast
	///              unless the subtarget reports slow unaligned 16-byte or
	///              32-byte memory accesses respectively; all other sizes are
	///              reported as fast.
	/// \returns false for non-temporal vector stores, and for non-temporal vector
	///          loads that are at least 16-byte aligned on SSE4.1 subtargets;
	///          true otherwise.
	bool X86TargetLowering::allowsMisalignedMemoryAccesses(
	    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
	    bool *Fast) const {
	  if (Fast) {
	    switch (VT.getSizeInBits()) {
	    default:
	      // 8-byte and under are always assumed to be fast. Note that every size
	      // other than 128 and 256 bits lands here, including wider accesses.
	      *Fast = true;
	      break;
	    case 128:
	      *Fast = !Subtarget.isUnalignedMem16Slow();
	      break;
	    case 256:
	      *Fast = !Subtarget.isUnalignedMem32Slow();
	      break;
	    // TODO: What about AVX-512 (512-bit) accesses?
	    }
	  }
	  // NonTemporal vector memory ops must be aligned.
	  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
	    // NT loads can only be vector aligned, so if its less aligned than the
	    // minimum vector size (which we can split the vector down to), we might as
	    // well use a regular unaligned vector load.
	    // We don't have any NT loads pre-SSE41.
	    if (!!(Flags & MachineMemOperand::MOLoad))
	      return (Align < 16 || !Subtarget.hasSSE41());
	    return false;
	  }
	  // Misaligned accesses of any size are always allowed.
	  return true;
	}

	/// Returns the entry encoding for a jump table in the current function.
	/// The returned value is a member of the MachineJumpTableInfo::JTEntryKind
	/// enum.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  const bool IsGOTPic = isPositionIndependent() && Subtarget.isPICStyleGOT();

	  // Outside GOT pic mode, use the normal jump table encoding heuristics.
	  if (!IsGOTPic)
	    return TargetLowering::getJumpTableEncoding();

	  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
	  // symbol.
	  return MachineJumpTableInfo::EK_Custom32;
	}

	/// Forwards the subtarget's soft-float setting.
	bool X86TargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Marks leading integer/pointer libcall arguments as passed in registers.
	///
	/// Only applies to 32-bit subtargets and the C / X86_StdCall calling
	/// conventions. The register budget is the module's register-parameter
	/// count (zero when the function has no parent module). Integer or pointer
	/// arguments of at most 8 bytes consume one register, or two when larger
	/// than 4 bytes; other arguments are left untouched. Processing stops at the
	/// first eligible argument that does not fit in the remaining budget.
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {
	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  const bool IsEligibleCC =
	      CC == CallingConv::C || CC == CallingConv::X86_StdCall;
	  if (!IsEligibleCC)
	    return;

	  unsigned RegsLeft = 0;
	  if (auto *Mod = MF->getFunction().getParent())
	    RegsLeft = Mod->getNumberRegisterParameters();

	  // Mark the first N int arguments as having reg
	  const DataLayout &DL = MF->getDataLayout();
	  for (auto &Arg : Args) {
	    Type *ArgTy = Arg.Ty;
	    if (!ArgTy->isIntOrPtrTy())
	      continue;

	    const auto AllocSize = DL.getTypeAllocSize(ArgTy);
	    if (AllocSize > 8)
	      continue;

	    const unsigned RegsNeeded = AllocSize > 4 ? 2 : 1;
	    if (RegsLeft < RegsNeeded)
	      return;
	    RegsLeft -= RegsNeeded;
	    Arg.IsInReg = true;
	  }
	}

	/// Builds the expression for a custom-encoded jump table entry: a @GOTOFF
	/// reference to the symbol of \p MBB. Asserts that the function is position
	/// independent and uses GOT-style PIC. \p MJTI and \p uid are not used.
	const MCExpr *
	X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	                                             const MachineBasicBlock *MBB,
	                                             unsigned uid,
	                                             MCContext &Ctx) const {
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
	  // entries.
	  const auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Returns the relocation base for the given PIC jumptable: \p Table itself
	/// on 64-bit subtargets, otherwise a GlobalBaseReg node of pointer type.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	                     getPointerTy(DAG.getDataLayout()));
	}

	/// Returns the relocation base for the given PIC jumptable, the same as
	/// getPICJumpTableRelocBase, but as an MCExpr.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  // Without RIP-relative PIC, the reference is relative to the PIC base.
	  if (!Subtarget.isPICStyleRIPRel())
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const {
	const TargetRegisterClass *RRC = nullptr;
	uint8_t Cost = 1;
	switch (VT.SimpleTy) {
	default:
	return TargetLowering::findRepresentativeClass(TRI, VT);
	case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
	RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
	break;
	case MVT::x86mmx:
	RRC = &X86::VR64RegClass;
	break;
	case MVT::f32: case MVT::f64:
	case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	case MVT::v4f32: case MVT::v2f64:
	case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
	case MVT::v8f32: case MVT::v4f64:
	case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
	case MVT::v16f32: case MVT::v8f64:
	RRC = &X86::VR128XRegClass;
	break;
	}
	return std::make_pair(RRC, Cost);
	}

	/// Returns the address space number used for segment-relative slots:
	/// 257 on 64-bit subtargets unless the code model is Kernel, 256 in every
	/// other case.
	unsigned X86TargetLowering::getAddressSpace() const {
	  const bool Is64BitNonKernel =
	      Subtarget.is64Bit() &&
	      getTargetMachine().getCodeModel() != CodeModel::Kernel;
	  return Is64BitNonKernel ? 257 : 256;
	}

	/// Returns true when \p TargetTriple names an OS for which this file uses a
	/// fixed TLS slot for the stack guard: glibc, Fuchsia, or Android unless
	/// isAndroidVersionLT(17) holds.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
	         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
	}

	/// Builds a constant pointer whose integer value is \p Offset (as an i32),
	/// typed as a pointer in \p AddressSpace to an i8 pointer.
	static Constant* SegmentOffset(IRBuilder<> &IRB,
	                               unsigned Offset, unsigned AddressSpace) {
	  LLVMContext &Ctx = IRB.getContext();
	  Constant *OffsetValue = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetValue, SlotPtrTy);
	}

	/// Returns the IR value addressing the stack guard.
	///
	/// When hasStackGuardSlotTLS() holds for the target triple, this is a
	/// segment-relative constant address in getAddressSpace(); otherwise the
	/// generic TargetLowering implementation is used.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
	  // tcbhead_t; use it instead of the usual global variable (see
	  // sysdeps/{i386,x86_64}/nptl/tls.h)
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  unsigned GuardOffset;
	  if (Subtarget.isTargetFuchsia()) {
	    // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
	    GuardOffset = 0x10;
	  } else {
	    // %fs:0x28, unless we're using a Kernel code model, in which case
	    // it's %gs:0x28. gs:0x14 on i386.
	    GuardOffset = Subtarget.is64Bit() ? 0x28 : 0x14;
	  }
	  return SegmentOffset(IRB, GuardOffset, getAddressSpace());
	}

	/// Inserts the stack-protector declarations needed by module \p M.
	///
	/// For Windows MSVC and Windows Itanium environments this declares the
	/// `__security_cookie` global and the `__security_check_cookie` function
	/// (X86_FastCall, with the InReg attribute added at index 1). Targets with a
	/// TLS stack guard slot need no declarations. Everything else defers to the
	/// generic TargetLowering implementation.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  // MSVC CRT provides functionalities for stack protection.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie",
	                        Type::getInt8PtrTy(M.getContext()));

	    // MSVC CRT has a function to validate security cookie.
	    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
	        "__security_check_cookie", Type::getVoidTy(M.getContext()),
	        Type::getInt8PtrTy(M.getContext()));
	    // The callee is only configured when it is a plain Function.
	    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
	      F->setCallingConv(CallingConv::X86_FastCall);
	      F->addAttribute(1, Attribute::AttrKind::InReg);
	    }
	    return;
	  }
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
	  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return;
	  TargetLowering::insertSSPDeclarations(M);
	}

	/// Returns the stack guard value for module \p M: the `__security_cookie`
	/// global variable for Windows MSVC and Windows Itanium environments, and the
	/// generic TargetLowering result otherwise.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  // MSVC CRT has a global variable holding security cookie.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    return M.getGlobalVariable("__security_cookie");
	  }
	  return TargetLowering::getSDagStackGuard(M);
	}

	/// Returns the stack guard check function for module \p M: the
	/// `__security_check_cookie` function for Windows MSVC and Windows Itanium
	/// environments, and the generic TargetLowering result otherwise.
	Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  // MSVC CRT has a function to validate security cookie.
	  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
	      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
	    return M.getFunction("__security_check_cookie");
	  }
	  return TargetLowering::getSSPStackGuardCheck(M);
	}

	/// Returns the IR value addressing the SafeStack pointer.
	///
	/// Contiki uses the default location (requested with `false`). Android and
	/// Fuchsia use a fixed segment-relative slot in getAddressSpace(). Every
	/// other target defers to the generic TargetLowering implementation.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (Subtarget.getTargetTriple().isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  const bool IsAndroid = Subtarget.isTargetAndroid();
	  if (!IsAndroid && !Subtarget.isTargetFuchsia())
	    return TargetLowering::getSafeStackPointerLocation(IRB);

	  // Fuchsia: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
	  unsigned SlotOffset = 0x18;
	  if (IsAndroid) {
	    // Android provides a fixed TLS slot for the SafeStack pointer. See the
	    // definition of TLS_SLOT_SAFESTACK in
	    // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	    //
	    // %fs:0x48, unless we're using a Kernel code model, in which case it's
	    // %gs:0x48. %gs:0x24 on i386.
	    SlotOffset = Subtarget.is64Bit() ? 0x48 : 0x24;
	  }
	  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	}

	/// Reports whether a cast from address space \p SrcAS to \p DestAS is a
	/// no-op, which holds exactly when both are below 256. The two address
	/// spaces are asserted to differ.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  if (SrcAS >= 256)
	    return false;
	  return DestAS < 256;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	/// Reports whether the return values described by \p Outs pass
	/// CCState::CheckReturn against RetCC_X86 for calling convention
	/// \p CallConv.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState State(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool Lowerable = State.CheckReturn(Outs, RetCC_X86);
	  return Lowerable;
	}

	/// Returns a pointer to a static register list holding R11 followed by a
	/// 0 entry. The calling convention argument is ignored.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  static const MCPhysReg Scratch[] = {X86::R11, 0};
	  return &Scratch[0];
	}

	/// Lowers mask values (v*i1) to the local register values.
	///
	/// \param ValArg Mask value to lower.
	/// \param ValLoc Type of the location the value must be converted to.
	/// \param Dl     Debug location for the created nodes.
	/// \param DAG    DAG the new nodes are added to.
	/// \returns DAG node after lowering to register type:
	///   - v1i1: element 0 is extracted as \p ValLoc;
	///   - v8i1 to i8/i32 and v16i1 to i16/i32: bitcast to i8/i16, then
	///     any-extended when \p ValLoc is i32;
	///   - v32i1 to i32 and v64i1 to i64: a single bitcast;
	///   - anything else: any-extended to \p ValLoc.
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	                               const SDLoc &Dl, SelectionDAG &DAG) {
	  EVT ValVT = ValArg.getValueType();

	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
	                       DAG.getIntPtrConstant(0, Dl));

	  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
	      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
	    // Two stage lowering might be required
	    // bitcast: v8i1 -> i8 / v16i1 -> i16
	    // anyextend: i8 -> i32 / i16 -> i32
	    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
	    if (ValLoc == MVT::i32)
	      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
	    return ValToCopy;
	  }

	  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
	      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
	    // One stage lowering is required
	    // bitcast: v32i1 -> i32 / v64i1 -> i64
	    return DAG.getBitcast(ValLoc, ValArg);
	  }

	  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lowers a function return: copies every outgoing value into the location
	/// chosen by RetCC_X86 and builds the return node (X86ISD::RET_FLAG, or
	/// X86ISD::IRET for CallingConv::X86_INTR).
	/// \param Chain Incoming chain.
	/// \param CallConv Calling convention of the function being lowered.
	/// \param isVarArg Forwarded to the CCState used for the analysis.
	/// \param Outs Descriptions of the returned values.
	/// \param OutVals The returned values, indexed in step with \p Outs.
	/// \param dl Debug location attached to the created nodes.
	/// \param DAG The DAG the new nodes are added to.
	/// \returns The return node. Its operands are the chain, the bytes-to-pop
	///          constant, the returned values/registers, any CSRs-via-copy
	///          registers and, when present, the glue.
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                               bool isVarArg,
	                               const SmallVectorImpl<ISD::OutputArg> &Outs,
	                               const SmallVectorImpl<SDValue> &OutVals,
	                               const SDLoc &dl, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  // In some cases we need to disable registers from the default CSR list.
	  // For example, when they are used for argument passing.
	  bool ShouldDisableCalleeSavedRegister =
	      CallConv == CallingConv::X86_RegCall ||
	      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

	  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	    report_fatal_error("X86 interrupts may not return any value");

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	  SDValue Flag;
	  SmallVector<SDValue, 6> RetOps;
	  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	  // Operand #1 = Bytes To Pop
	  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	                                         MVT::i32));

	  // Copy the result values into the output registers. I walks the locations
	  // and OutsIndex the values; I runs ahead when one value takes two locations.
	  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++OutsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    // Add the register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	    SDValue ValToCopy = OutVals[OutsIndex];
	    EVT ValVT = ValToCopy.getValueType();

	    // Promote values to the appropriate types.
	    if (VA.getLocInfo() == CCValAssign::SExt)
	      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::ZExt)
	      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::AExt) {
	      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	      else
	        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    }
	    else if (VA.getLocInfo() == CCValAssign::BCvt)
	      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	    assert(VA.getLocInfo() != CCValAssign::FPExt &&
	           "Unexpected FP-extend for return value.");

	    // If this is x86-64, and we disabled SSE, we can't return FP values,
	    // or SSE or MMX vectors.
	    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
	         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
	        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    } else if (ValVT == MVT::f64 &&
	               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
	      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
	      // llvm-gcc has never done it right and no one has noticed, so this
	      // should be OK for now.
	      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
	    // the RET instruction and handled by the FP Stackifier.
	    if (VA.getLocReg() == X86::FP0 ||
	        VA.getLocReg() == X86::FP1) {
	      // If this is a copy from an xmm register to ST(0), use an FPExtend to
	      // change the value to the FP stack register class.
	      if (isScalarFPTypeInSSEReg(VA.getValVT()))
	        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	      RetOps.push_back(ValToCopy);
	      // Don't emit a copytoreg.
	      continue;
	    }

	    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	    // which is returned in RAX / RDX.
	    if (Subtarget.is64Bit()) {
	      if (ValVT == MVT::x86mmx) {
	        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
	          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                                  ValToCopy);
	          // If we don't have SSE2 available, convert to v4f32 so the generated
	          // register is legal.
	          if (!Subtarget.hasSSE2())
	            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	        }
	      }
	    }

	    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");

	      // The split value consumes the next location as well, hence ++I.
	      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
	                         Subtarget);

	      assert(2 == RegsToPass.size() &&
	             "Expecting two registers after Passv64i1ArgInRegs");

	      // Add the second register to the CalleeSaveDisableRegs list.
	      if (ShouldDisableCalleeSavedRegister)
	        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	    } else {
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	    }

	    // Add nodes to the DAG and add the values into the RetOps list
	    for (auto &Reg : RegsToPass) {
	      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
	      Flag = Chain.getValue(1);
	      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	    }
	  }

	  // Swift calling convention does not require we copy the sret argument
	  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	  // All x86 ABIs require that for returning structs by value we copy
	  // the sret argument into %rax/%eax (depending on ABI) for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into %rax/%eax.
	  //
	  // Checking Function.hasStructRetAttr() here is insufficient because the IR
	  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	  // false, then an sret argument may be implicitly inserted in the SelDAG. In
	  // either case FuncInfo->setSRetReturnReg() will have been called.
	  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	    // When we have both sret and another return value, we should use the
	    // original Chain stored in RetOps[0], instead of the current Chain updated
	    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

	    // For the case of sret and another return value, we have
	    //   Chain_0 at the function entry
	    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
	    // If we use Chain_1 in getCopyFromReg, we will have
	    //   Val = getCopyFromReg(Chain_1)
	    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

	    // getCopyToReg(Chain_0) will be glued together with
	    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	    //   Data dependency from Unit B to Unit A due to usage of Val in
	    //     getCopyToReg(Chain_1, Val)
	    //   Chain dependency from Unit A to Unit B

	    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	                                     getPointerTy(MF.getDataLayout()));

	    unsigned RetValReg
	        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	          X86::RAX : X86::EAX;
	    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	    Flag = Chain.getValue(1);

	    // RAX/EAX now acts like a return value.
	    RetOps.push_back(
	        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	    // Add the returned register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	  }

	  // Append every register of the zero-terminated CSRs-via-copy list, if the
	  // function has one; only GR64 registers are expected here.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *I =
	      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	  if (I) {
	    for (; *I; ++I) {
	      if (X86::GR64RegClass.contains(*I))
	        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  RetOps[0] = Chain;  // Update chain.

	  // Add the flag if we have it.
	  if (Flag.getNode())
	    RetOps.push_back(Flag);

	  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	  if (CallConv == CallingConv::X86_INTR)
	    opcode = X86ISD::IRET;
	  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Checks whether the single result of \p N is consumed only by a return,
	/// either through a glue-free CopyToReg or through an FP_EXTEND.
	/// \param N The node whose single value is inspected.
	/// \param [in,out] Chain On success, replaced by the CopyToReg's chain
	///                 operand when the user is a CopyToReg; otherwise left
	///                 unchanged.
	/// \returns true if every user of the copy is an X86ISD::RET_FLAG that
	///          returns only this one value.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // N has exactly one use of value 0 (checked above); look at that user.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    // With exactly four operands the last one must be the glue.
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Picks the type a return value of type \p VT is extended to.
	/// \param Context Used to look up the register type of the minimum MVT.
	/// \param VT The type of the value being returned.
	/// \param ExtendKind Not referenced by this implementation.
	/// \returns \p VT, or the register type of the minimum return MVT when
	///          \p VT has fewer bits than it. The minimum is i8 for i1 (and
	///          for i8/i16 on non-Darwin targets) and i32 otherwise.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  MVT ReturnMVT = MVT::i32;

	  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
	  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
	    // The ABI does not require i1, i8 or i16 to be extended.
	    //
	    // On Darwin, there is code in the wild relying on Clang's old behaviour of
	    // always extending i8/i16 return values, so keep doing that for now.
	    // (PR26665).
	    ReturnMVT = MVT::i8;
	  }

	  EVT MinVT = getRegisterType(Context, ReturnMVT);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// Builds a v64i1 mask from the two 32 bit registers that carry its halves.
	/// \param VA Location of the first 32 bit half.
	/// \param NextVA Location of the second 32 bit half.
	/// \param Root Chain operand used for the register reads.
	/// \param DAG The DAG the new nodes are added to.
	/// \param Dl Debug location attached to the created nodes.
	/// \param Subtarget Only consulted by the assertions.
	/// \param [in,out] InFlag Optional glue. When non-null the physical
	///                 registers are read directly, the reads are glued
	///                 through it, and it is updated to the glue of the last
	///                 read. When null the registers are added as function
	///                 live-ins and read through the resulting registers.
	/// \return the v64i1 concatenation of the two halves.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  SDValue LowRead;
	  SDValue HighRead;

	  if (InFlag) {
	    // Glue available: read the physical registers and chain the glue from
	    // the first read into the second.
	    LowRead = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LowRead.getValue(2);
	    HighRead =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HighRead.getValue(2);
	  } else {
	    // No glue: register both physical registers as live-ins and read the
	    // registers returned by addLiveIn.
	    MachineFunction &Fn = DAG.getMachineFunction();
	    const TargetRegisterClass *HalfRC = &X86::GR32RegClass;
	    unsigned LiveInReg = Fn.addLiveIn(VA.getLocReg(), HalfRC);
	    LowRead = DAG.getCopyFromReg(Root, Dl, LiveInReg, MVT::i32);
	    LiveInReg = Fn.addLiveIn(NextVA.getLocReg(), HalfRC);
	    HighRead = DAG.getCopyFromReg(Root, Dl, LiveInReg, MVT::i32);
	  }

	  // Reinterpret each i32 as v32i1, low half first, then join them.
	  SDValue LowMask = DAG.getBitcast(MVT::v32i1, LowRead);
	  SDValue HighMask = DAG.getBitcast(MVT::v32i1, HighRead);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LowMask, HighMask);
	}

	/// Converts a value held in an integer register location into the mask
	/// type it represents (v1i1/v8i1/v16i1/v32i1/v64i1).
	/// \param ValArg The value read from the location.
	/// \param ValVT The mask type to produce.
	/// \param ValLoc The type of the location; only checked for v64i1.
	/// \returns a DAG node holding the operand after lowering to mask type.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  if (ValVT == MVT::v64i1) {
	    // On a 32 bit machine this case is handled by getv64i1Argument, so only
	    // i64 locations reach here; they need a bitcast and no truncation.
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    return DAG.getBitcast(ValVT, ValArg);
	  }

	  // Pick the integer type with as many bits as the mask has elements.
	  MVT NarrowIntVT;
	  switch (ValVT.getSimpleVT().SimpleTy) {
	  case MVT::v8i1:
	    NarrowIntVT = MVT::i8;
	    break;
	  case MVT::v16i1:
	    NarrowIntVT = MVT::i16;
	    break;
	  case MVT::v32i1:
	    NarrowIntVT = MVT::i32;
	    break;
	  default:
	    llvm_unreachable("Expecting a vector of i1 types");
	  }

	  // Truncate to that integer type, then reinterpret the bits as the mask.
	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, NarrowIntVT, ValArg);
	  return DAG.getBitcast(ValVT, Narrowed);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	/// \param Chain Incoming chain; the updated chain is returned.
	/// \param InFlag Glue threaded through the register reads.
	/// \param Ins Descriptions of the call's results, analyzed with RetCC_X86.
	/// \param InVals Receives one value per result, in order.
	/// \param RegMask Optional register mask; when non-null, the bits of every
	///                result register and its sub-registers are cleared in it.
	/// \returns The chain after all result copies.
	SDValue X86TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    uint32_t *RegMask) const {

	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  bool Is64Bit = Subtarget.is64Bit();
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	  // Copy all of the result registers out of their specified physreg. I walks
	  // the locations and InsIndex the results; I runs ahead when one result
	  // takes two locations.
	  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    EVT CopyVT = VA.getLocVT();

	    // In some calling conventions we need to remove the used registers
	    // from the register mask.
	    if (RegMask) {
	      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
	           SubRegs.isValid(); ++SubRegs)
	        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
	    }

	    // If this is x86-64, and we disabled SSE, we can't return FP values
	    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
	        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // If we prefer to use the value in xmm registers, copy it out as f80 and
	    // use a truncate to move it from fp stack reg to xmm reg.
	    bool RoundAfterCopy = false;
	    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
	        isScalarFPTypeInSSEReg(VA.getValVT())) {
	      if (!Subtarget.hasX87())
	        report_fatal_error("X87 register return with X87 disabled");
	      CopyVT = MVT::f80;
	      RoundAfterCopy = (CopyVT != VA.getLocVT());
	    }

	    SDValue Val;
	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");
	      // The split value consumes the next location as well, hence ++I.
	      Val =
	          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	    } else {
	      // Chain is set to result #1 of the copy; results #0 (the value) and #2
	      // (the glue) are then read through it.
	      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	                  .getValue(1);
	      Val = Chain.getValue(0);
	      InFlag = Chain.getValue(2);
	    }

	    if (RoundAfterCopy)
	      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	                        // This truncation won't change the value.
	                        DAG.getIntPtrConstant(1, dl));

	    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	      if (VA.getValVT().isVector() &&
	          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	      } else
	        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention seems to be the standard for many Windows
	// API routines. It differs from the C calling convention only slightly:
	// the callee cleans up the stack, not the caller. Symbols are also
	// decorated in a convention-specific way. It doesn't support any vector
	// arguments. For info on the fast calling convention see the Fast Calling
	// Convention (tail call) implementation LowerX86_32FastCCCallTo.

	/// Result of callIsStructReturn / argsAreStructReturn: whether a call or
	/// function uses struct return semantics and, if so, which flavor.
	enum StructReturnType {
	// No arguments, or the first argument is not flagged sret.
	NotStructReturn,
	// The first argument is sret and is flagged inreg, or the target is MCU.
	RegStructReturn,
	// The first argument is sret and neither of the above holds.
	StackStructReturn
	};
	/// Determines whether a call uses struct return semantics by inspecting
	/// the flags of its first outgoing argument.
	/// \param Outs The call's outgoing arguments.
	/// \param IsMCU When true, an sret argument is always RegStructReturn.
	/// \returns NotStructReturn if \p Outs is empty or its first entry is not
	///          sret; RegStructReturn if it is sret and inreg (or \p IsMCU);
	///          StackStructReturn otherwise.
	static StructReturnType
	callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
	  if (Outs.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Determines whether a function uses struct return semantics by
	/// inspecting the flags of its first incoming argument.
	/// \param Ins The function's incoming arguments.
	/// \param IsMCU When true, an sret argument is always RegStructReturn.
	/// \returns NotStructReturn if \p Ins is empty or its first entry is not
	///          sret; RegStructReturn if it is sret and inreg (or \p IsMCU);
	///          StackStructReturn otherwise.
	static StructReturnType
	argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
	  if (Ins.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Make a copy of an aggregate at address specified by "Src" to address
	/// "Dst" with size and alignment information specified by the specific
	/// parameter attribute. The copy will be passed as a byval function parameter.
	/// \param Flags Supplies the byval size and alignment used for the copy.
	/// \returns The node produced by SelectionDAG::getMemcpy.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

	  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
	                       /*isVolatile*/false, /*AlwaysInline=*/true,
	                       /*isTailCall*/false,
	                       MachinePointerInfo(), MachinePointerInfo());
	}

	/// Return true if the calling convention is one that we can guarantee TCO for.
	/// That is the case for Fast, GHC, X86_RegCall, HiPE and HHVM.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
	          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
	          CC == CallingConv::HHVM);
	}

	/// Tells whether calls using convention \p CC are ever candidates for TCO.
	/// The conventions listed below always are; any other convention is a
	/// candidate exactly when canGuaranteeTCO accepts it.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  switch (CC) {
	  default:
	    // Not in the list below: defer to the guaranteed-TCO conventions.
	    return canGuaranteeTCO(CC);
	  // C calling conventions:
	  case CallingConv::C:
	  case CallingConv::Win64:
	  case CallingConv::X86_64_SysV:
	  // Callee pop conventions:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_VectorCall:
	  case CallingConv::X86_FastCall:
	  // Swift:
	  case CallingConv::Swift:
	    return true;
	  }
	}

	/// Tells whether the function is being turned into a tail-call target by
	/// changing its ABI: the GuaranteedTailCallOpt option must be set and
	/// \p CC must be a convention canGuaranteeTCO accepts.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  // Without the option the convention is not even inspected.
	  if (!GuaranteedTailCallOpt)
	    return false;
	  return canGuaranteeTCO(CC);
	}

	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	auto Attr =
	CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	if (!CI->isTailCall() \|\| Attr.getValueAsString() == "true")
	return false;

	ImmutableCallSite CS(CI);
	CallingConv::ID CalleeCC = CS.getCallingConv();
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	return true;
	}

	/// Lowers one incoming argument that was assigned a memory location.
	/// \param Chain Chain used for the loads that are created.
	/// \param CallConv Calling convention of the function being lowered.
	/// \param Ins All incoming argument descriptions; entry \p i is used.
	/// \param VA The memory location assigned to the argument.
	/// \param MFI Frame info in which the fixed stack object is created.
	/// \param i Index into \p Ins of the argument being lowered.
	/// \returns For byval arguments, the frame index of the stack object;
	///          otherwise a load from the argument's slot, converted back to
	///          the value type when an i1-based value was extended in memory.
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	                                    const SmallVectorImpl<ISD::InputArg> &Ins,
	                                    const SDLoc &dl, SelectionDAG &DAG,
	                                    const CCValAssign &VA,
	                                    MachineFrameInfo &MFI, unsigned i) const {
	  // Create the nodes corresponding to a load from this parameter slot.
	  ISD::ArgFlagsTy Flags = Ins[i].Flags;
	  bool AlwaysUseMutable = shouldGuaranteeTCO(
	      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	  EVT ValVT;
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // If value is passed by pointer we have address passed instead of the value
	  // itself. No need to extend if the mask value and location share the same
	  // absolute size.
	  bool ExtendedInMem =
	      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
	    ValVT = VA.getLocVT();
	  else
	    ValVT = VA.getValVT();

	  // FIXME: For now, all byval parameter objects are marked mutable. This can be
	  // changed with more analysis.
	  // In case of tail call optimization mark all arguments mutable. Since they
	  // could be overwritten by lowering of arguments in case of a tail call.
	  if (Flags.isByVal()) {
	    unsigned Bytes = Flags.getByValSize();
	    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.

	    // FIXME: For now, all byval parameter objects are marked as aliasing. This
	    // can be improved with deeper analysis.
	    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
	                                   /*isAliased=*/true);
	    return DAG.getFrameIndex(FI, PtrVT);
	  }

	  // This is an argument in memory. We might be able to perform copy elision.
	  // If the argument is passed directly in memory without any extension, then we
	  // can perform copy elision. Large vector types, for example, may be passed
	  // indirectly by pointer.
	  if (Flags.isCopyElisionCandidate() &&
	      VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
	    EVT ArgVT = Ins[i].ArgVT;
	    SDValue PartAddr;
	    if (Ins[i].PartOffset == 0) {
	      // If this is a one-part value or the first part of a multi-part value,
	      // create a stack object for the entire argument value type and return a
	      // load from our portion of it. This assumes that if the first part of an
	      // argument is in memory, the rest will also be in memory.
	      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	                                     /*IsImmutable=*/false);
	      PartAddr = DAG.getFrameIndex(FI, PtrVT);
	      return DAG.getLoad(
	          ValVT, dl, Chain, PartAddr,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	    } else {
	      // This is not the first piece of an argument in memory. See if there is
	      // already a fixed stack object including this offset. If so, assume it
	      // was created by the PartOffset == 0 branch above and create a load from
	      // the appropriate offset into it.
	      int64_t PartBegin = VA.getLocMemOffset();
	      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	      int FI = MFI.getObjectIndexBegin();
	      for (; MFI.isFixedObjectIndex(FI); ++FI) {
	        int64_t ObjBegin = MFI.getObjectOffset(FI);
	        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	          break;
	      }
	      // FI is still a fixed object index only if the search above found an
	      // object covering this part; otherwise fall through to the generic path.
	      if (MFI.isFixedObjectIndex(FI)) {
	        SDValue Addr =
	            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	        return DAG.getLoad(
	            ValVT, dl, Chain, Addr,
	            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	                                              Ins[i].PartOffset));
	      }
	    }
	  }

	  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	                                 VA.getLocMemOffset(), isImmutable);

	  // Set SExt or ZExt flag.
	  if (VA.getLocInfo() == CCValAssign::ZExt) {
	    MFI.setObjectZExt(FI, true);
	  } else if (VA.getLocInfo() == CCValAssign::SExt) {
	    MFI.setObjectSExt(FI, true);
	  }

	  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	  SDValue Val = DAG.getLoad(
	      ValVT, dl, Chain, FIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	  // A value that was extended in memory is converted back to its value type.
	  return ExtendedInMem
	             ? (VA.getValVT().isVector()
	                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
	                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
	             : Val;
	}

	// FIXME: Get this from tablegen.
	/// Returns the 64-bit argument GPR list for \p CallConv: RCX, RDX, R8, R9
	/// when the subtarget reports a Win64 convention, and RDI, RSI, RDX, RCX,
	/// R8, R9 otherwise. Only valid on 64-bit subtargets.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  if (!Subtarget.isCallingConvWin64(CallConv)) {
	    static const MCPhysReg GPR64ArgRegs64Bit[] = {
	      X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	    };
	    return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
	                        std::end(GPR64ArgRegs64Bit));
	  }

	  static const MCPhysReg GPR64ArgRegsWin64[] = {
	    X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  return makeArrayRef(std::begin(GPR64ArgRegsWin64),
	                      std::end(GPR64ArgRegsWin64));
	}

	// FIXME: Get this from tablegen.
	/// Returns the XMM argument registers for \p CallConv on a 64-bit
	/// subtarget.
	/// \returns None for Win64 conventions, and when soft-float is in use, the
	///          function has the NoImplicitFloat attribute, or SSE1 is
	///          unavailable; otherwise XMM0 through XMM7.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());
	  if (Subtarget.isCallingConvWin64(CallConv)) {
	    // The XMM registers which might contain var arg parameters are shadowed
	    // in their paired GPR.  So we only need to save the GPR to their home
	    // slots.
	    // TODO: __vectorcall will change this.
	    return None;
	  }

	  const Function &F = MF.getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool isSoftFloat = Subtarget.useSoftFloat();
	  assert(!(isSoftFloat && NoImplicitFloatOps) &&
	         "SSE register cannot be used when SSE is disabled!");
	  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
	    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	    // registers.
	    return None;

	  static const MCPhysReg XMMArgRegs64Bit[] = {
	    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
	}

	#ifndef NDEBUG
	/// Checks that \p ArgLocs is ordered by non-decreasing getValNo(). Only
	/// compiled when NDEBUG is not defined.
	static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
	  auto ValNoLess = [](const CCValAssign &LHS, const CCValAssign &RHS) -> bool {
	    return LHS.getValNo() < RHS.getValNo();
	  };
	  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), ValNoLess);
	}
	#endif

	SDValue X86TargetLowering::LowerFormalArguments(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	const Function &F = MF.getFunction();
	if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
	F.getName() == "main")
	FuncInfo->setForceFramePointer(true);

	MachineFrameInfo &MFI = MF.getFrameInfo();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	assert(
	!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Ins, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	}

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	SDValue ArgValue;
	for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++InsIndex) {
	assert(InsIndex < Ins.size() && "Invalid Ins index");
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	EVT RegVT = VA.getLocVT();
	if (VA.needsCustom()) {
	assert(
	VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// v64i1 values, in regcall calling convention, that are
	// compiled to 32 bit arch, are split up into two registers.
	ArgValue =
	getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	} else {
	const TargetRegisterClass *RC;
	if (RegVT == MVT::i8)
	RC = &X86::GR8RegClass;
	else if (RegVT == MVT::i16)
	RC = &X86::GR16RegClass;
	else if (RegVT == MVT::i32)
	RC = &X86::GR32RegClass;
	else if (Is64Bit && RegVT == MVT::i64)
	RC = &X86::GR64RegClass;
	else if (RegVT == MVT::f32)
	RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	else if (RegVT == MVT::f64)
	RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	else if (RegVT == MVT::f80)
	RC = &X86::RFP80RegClass;
	else if (RegVT == MVT::f128)
	RC = &X86::VR128RegClass;
	else if (RegVT.is512BitVector())
	RC = &X86::VR512RegClass;
	else if (RegVT.is256BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	else if (RegVT.is128BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	else if (RegVT == MVT::x86mmx)
	RC = &X86::VR64RegClass;
	else if (RegVT == MVT::v1i1)
	RC = &X86::VK1RegClass;
	else if (RegVT == MVT::v8i1)
	RC = &X86::VK8RegClass;
	else if (RegVT == MVT::v16i1)
	RC = &X86::VK16RegClass;
	else if (RegVT == MVT::v32i1)
	RC = &X86::VK32RegClass;
	else if (RegVT == MVT::v64i1)
	RC = &X86::VK64RegClass;
	else
	llvm_unreachable("Unknown argument type!");

	unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	}

	// If this is an 8 or 16-bit value, it is really passed promoted to 32
	// bits. Insert an assert[sz]ext to capture this, then truncate to the
	// right size.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	if (VA.isExtInLoc()) {
	// Handle MMX values passed in XMM regs.
	if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	else if (VA.getValVT().isVector() &&
	VA.getValVT().getScalarType() == MVT::i1 &&
	((VA.getLocVT() == MVT::i64) \|\| (VA.getLocVT() == MVT::i32) \|\|
	(VA.getLocVT() == MVT::i16) \|\| (VA.getLocVT() == MVT::i8))) {
	// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	} else
	ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	}
	} else {
	assert(VA.isMemLoc());
	ArgValue =
	LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	}

	// If value is passed via pointer - do a load.
	if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
	ArgValue =
	DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	InVals.push_back(ArgValue);
	}

	for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	if (CallConv == CallingConv::Swift)
	continue;

	// All x86 ABIs require that for returning structs by value we copy the
	// sret argument into %rax/%eax (depending on ABI) for the return. Save
	// the argument into a virtual register so that we can access it from the
	// return points.
	if (Ins[I].Flags.isSRet()) {
	unsigned Reg = FuncInfo->getSRetReturnReg();
	if (!Reg) {
	MVT PtrTy = getPointerTy(DAG.getDataLayout());
	Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	FuncInfo->setSRetReturnReg(Reg);
	}
	SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	break;
	}
	}

	unsigned StackSize = CCInfo.getNextStackOffset();
	// Align stack specially for tail calls.
	if (shouldGuaranteeTCO(CallConv,
	MF.getTarget().Options.GuaranteedTailCallOpt))
	StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start. We
	// can skip this if there are no va_start calls.
	if (MFI.hasVAStart() &&
	(Is64Bit \|\| (CallConv != CallingConv::X86_FastCall &&
	CallConv != CallingConv::X86_ThisCall))) {
	FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
	}

	// Figure out if XMM registers are in use.
	assert(!(Subtarget.useSoftFloat() &&
	F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
	"SSE register cannot be used when SSE is disabled!");

	// 64-bit calling conventions support varargs and register parameters, so we
	// have to do extra work to spill them in the prologue.
	if (Is64Bit && isVarArg && MFI.hasVAStart()) {
	// Find the first unallocated argument registers.
	ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
	unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
	assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	"SSE register cannot be used when SSE is disabled!");

	// Gather all the live in physical registers.
	SmallVector<SDValue, 6> LiveGPRs;
	SmallVector<SDValue, 8> LiveXMMRegs;
	SDValue ALVal;
	for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
	LiveGPRs.push_back(
	DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
	}
	if (!ArgXMMs.empty()) {
	unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
	for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
	unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
	LiveXMMRegs.push_back(
	DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
	}
	}

	if (IsWin64) {
	// Get to the caller-allocated home save location. Add 8 to account
	// for the return address.
	int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
	FuncInfo->setRegSaveFrameIndex(
	MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	// Fixup to set vararg frame on shadow area (4 x i64).
	if (NumIntRegs < 4)
	FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	} else {
	// For X86-64, if there are vararg parameters that are passed via
	// registers, then we must store them to their spots on the stack so
	// they may be loaded by dereferencing the result of va_next.
	FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
	ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
	}

	// Store the integer parameter registers.
	SmallVector<SDValue, 8> MemOps;
	SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	getPointerTy(DAG.getDataLayout()));
	unsigned Offset = FuncInfo->getVarArgsGPOffset();
	for (SDValue Val : LiveGPRs) {
	SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	RSFIN, DAG.getIntPtrConstant(Offset, dl));
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(),
	FuncInfo->getRegSaveFrameIndex(), Offset));
	MemOps.push_back(Store);
	Offset += 8;
	}

	if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
	// Now store the XMM (fp + vector) parameter registers.
	SmallVector<SDValue, 12> SaveXMMOps;
	SaveXMMOps.push_back(Chain);
	SaveXMMOps.push_back(ALVal);
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getRegSaveFrameIndex(), dl));
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getVarArgsFPOffset(), dl));
	SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	LiveXMMRegs.end());
	MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
	MVT::Other, SaveXMMOps));
	}

	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	}

	if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
	// Find the largest legal vector type.
	MVT VecVT = MVT::Other;
	// FIXME: Only some x86_32 calling conventions support AVX512.
	if (Subtarget.hasAVX512() &&
	(Is64Bit \|\| (CallConv == CallingConv::X86_VectorCall \|\|
	CallConv == CallingConv::Intel_OCL_BI)))
	VecVT = MVT::v16f32;
	else if (Subtarget.hasAVX())
	VecVT = MVT::v8f32;
	else if (Subtarget.hasSSE2())
	VecVT = MVT::v4f32;

	// We forward some GPRs and some vector types.
	SmallVector<MVT, 2> RegParmTypes;
	MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
	RegParmTypes.push_back(IntVT);
	if (VecVT != MVT::Other)
	RegParmTypes.push_back(VecVT);

	// Compute the set of forwarded registers. The rest are scratch.
	SmallVectorImpl<ForwardedRegister> &Forwards =
	FuncInfo->getForwardedMustTailRegParms();
	CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

	// Conservatively forward AL on x86_64, since it might be used for varargs.
	if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
	unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
	}

	// Copy all forwards from physical to virtual registers.
	for (ForwardedRegister &FR : Forwards) {
	// FIXME: Can we use a less constrained schedule?
	SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
	FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
	Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
	}
	}

	// Some CCs need callee pop.
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt)) {
	FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	// X86 interrupts must pop the error code (and the alignment padding) if
	// present.
	FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	} else {
	FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	// If this is an sret function, the return should pop the hidden pointer.
	if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	FuncInfo->setBytesToPopOnReturn(4);
	}

	if (!Is64Bit) {
	// RegSaveFrameIndex is X86-64 only.
	FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	if (CallConv == CallingConv::X86_FastCall \|\|
	CallConv == CallingConv::X86_ThisCall)
	// fastcc functions can't have varargs.
	FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
	}

	FuncInfo->setArgumentStackSize(StackSize);

	if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
	if (Personality == EHPersonality::CoreCLR) {
	assert(Is64Bit);
	// TODO: Add a mechanism to frame lowering that will allow us to indicate
	// that we'd prefer this slot be allocated towards the bottom of the frame
	// (i.e. near the stack pointer after allocating the frame). Every
	// funclet needs a copy of this slot in its (mostly empty) frame, and the
	// offset from the bottom of this and each funclet's frame must be the
	// same, so the size of funclets' (mostly empty) frames is dictated by
	// how far this slot is from the bottom (since they allocate just enough
	// space to accommodate holding this slot at the correct offset).
	int PSPSymFI = MFI.CreateStackObject(8, 8, /isSS=/false);
	EHInfo->PSPSymFrameIdx = PSPSymFI;
	}
	}

	if (CallConv == CallingConv::X86_RegCall \|\|
	F.hasFnAttribute("no_caller_saved_registers")) {
	MachineRegisterInfo &MRI = MF.getRegInfo();
	for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
	MRI.disableCalleeSavedRegister(Pair.first);
	}

	return Chain;
	}

	/// Emit the memory operation that places the outgoing call argument \p Arg
	/// into its assigned stack slot. The slot address is \p StackPtr plus the
	/// memory offset recorded in \p VA. ByVal arguments are copied with
	/// CreateCopyOfByValArgument; every other argument is written with a store.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned SlotOffset = VA.getLocMemOffset();

	  // Slot address = StackPtr + SlotOffset.
	  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
	  SDValue SlotAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

	  if (!Flags.isByVal()) {
	    // Plain argument: store the value straight into the slot.
	    return DAG.getStore(
	        Chain, dl, Arg, SlotAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));
	  }

	  // ByVal argument: copy the pointed-to object into the slot.
	  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
	}

	/// Load the current return address so that a tail call can move it.
	///
	/// On return \p OutRetAddr holds the loaded return-address value. The result
	/// is value #1 of the load node, which the caller uses as the new chain.
	/// \p IsTailCall, \p Is64Bit and \p FPDiff are not consulted here.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Frame index of the slot that currently holds the return address.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

	  // Read the "old" return address out of that slot.
	  OutRetAddr =
	      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());
	  return OutRetAddr.getValue(1);
	}

	/// Store the previously loaded return address (\p RetAddrFrIdx) into the
	/// stack slot it must occupy for a tail call whose frame is shifted by
	/// \p FPDiff bytes. When \p FPDiff is zero nothing is emitted and \p Chain is
	/// returned unchanged; otherwise the chain of the new store is returned.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // The return address stays where it is when there is no frame delta.
	  if (FPDiff == 0)
	    return Chain;

	  // Create a fixed object of SlotSize bytes at offset FPDiff - SlotSize to
	  // receive the return address.
	  const int64_t SlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
	  const int RelocatedFI =
	      MF.getFrameInfo().CreateFixedObject(SlotSize, SlotOffset, false);
	  SDValue RelocatedSlot = DAG.getFrameIndex(RelocatedFI, PtrVT);

	  // Write the return address into the new slot.
	  return DAG.getStore(Chain, dl, RetAddrFrIdx, RelocatedSlot,
	                      MachinePointerInfo::getFixedStack(
	                          DAG.getMachineFunction(), RelocatedFI));
	}

	/// Build the vector shuffle of \p V1 and \p V2 used for movs{s|d} / movd
	/// style operations on type \p VT: the mask's element 0 is index NumElems
	/// and every element i >= 1 is index i.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned Width = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  // Lane 0 uses shuffle index Width.
	  ShuffleMask.push_back(Width);
	  // All remaining lanes use their own index.
	  unsigned Lane = 1;
	  while (Lane != Width) {
	    ShuffleMask.push_back(Lane);
	    ++Lane;
	  }

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	/// Lower the outgoing call described by \p CLI into SelectionDAG nodes.
	///
	/// The function assigns every outgoing argument a register or stack location
	/// with CC_X86, emits the copies/stores that place the arguments, and builds
	/// the call node: X86ISD::TC_RETURN for tail calls, otherwise X86ISD::NT_CALL
	/// or X86ISD::CALL. For non-tail calls it then closes the call sequence and
	/// forwards to LowerCallResult, which fills \p InVals.
	///
	/// CLI.IsTailCall is bound by reference and may be cleared or forced to true
	/// here. Calling a function with the X86_INTR convention is a fatal error.
	///
	/// \returns the TC_RETURN node for tail calls, otherwise the value returned
	/// by LowerCallResult.
	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	// Bound by reference: changes below are visible through CLI.IsTailCall.
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
	const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	// "no_caller_saved_registers" may be present on the call instruction or on
	// the called function.
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
	const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
	bool HasNoCfCheck =
	(CI && CI->doesNoCfCheck()) \|\| (II && II->doesNoCfCheck());
	const Module *M = MF.getMMI().getModule();
	Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");

	// Filled with (location register, ArgLocs index) entries in the argument
	// loop below, but only when Options.EnableDebugEntryValues is set.
	MachineFunction::CallSiteInfo CSInfo;

	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	// The caller's "disable-tail-calls" attribute turns tail calls off.
	if (Attr.getValueAsString() == "true")
	isTailCall = false;

	if (Subtarget.isPICStyleGOT() &&
	!MF.getTarget().Options.GuaranteedTailCallOpt) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that requires lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction().hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
	canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// But only set if delta is greater than previous delta.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	// NumBytesToPush is passed to CALLSEQ_START and NumBytesToPop to
	// CALLSEQ_END; they differ only in the inalloca case below.
	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and is right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	}

	if (!IsSibcall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization arguments are handled later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	if (isByVal) {
	// Memcpy the argument to a temporary stack slot to prevent
	// the caller from seeing any modifications the callee may make
	// as guaranteed by the `byval` attribute.
	int FrameIdx = MF.getFrameInfo().CreateStackObject(
	Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
	false);
	SDValue StackSlot =
	DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
	Chain =
	CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
	// From now on treat this as a regular pointer
	Arg = StackSlot;
	isByVal = false;
	} else {
	// Store the argument.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	}
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers. This consumes the next ArgLocs
	// entry as well, hence the ++I.
	Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
	Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	const TargetOptions &Options = DAG.getTarget().Options;
	if (Options.EnableDebugEntryValues)
	CSInfo.emplace_back(VA.getLocReg(), I);
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	unsigned ShadowReg = 0;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	// The stack pointer copy is created lazily, on the first memory argument.
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an upper bound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");

	RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	// For a variadic musttail call, pass along every register recorded in the
	// function info's forwarded-musttail list.
	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca arguments. They don't require any work.
	if (Flags.isInAlloca())
	continue;
	// Create frame index.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress \|\|
	Callee->getOpcode() == ISD::ExternalSymbol) {
	// Lower direct calls to global addresses and external symbols. Setting
	// ForCall to true here has the effect of removing WrapperRIP when possible
	// to allow direct calls to be selected without first materializing the
	// address into a register.
	Callee = LowerGlobalOrExternal(Callee, DAG, /ForCall=/true);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	// A non-sibcall tail call closes its call sequence here, before the
	// TC_RETURN node is built.
	if (!IsSibcall && isTailCall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	Ops.push_back(Chain);
	Ops.push_back(Callee);

	// Tail calls carry FPDiff as an extra i32 constant operand.
	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
	const Function &CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn.hasPersonalityFn()
	? classifyEHPersonality(CallerFn.getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask. It stays null unless
	// the branch below allocates one; it is later passed to LowerCallResult.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask.
	RegMask = MF.allocateRegMask();
	unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
	memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
	return Ret;
	}

	// Use NT_CALL when the call is marked nocf_check and the module has the
	// "cf-protection-branch" flag; otherwise emit a regular CALL.
	if (HasNoCfCheck && IsCFProtectionSupported) {
	Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
	} else {
	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	}
	InFlag = Chain.getValue(1);
	DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

	+ // Save heapallocsite metadata.
	+ if (CLI.CS)
	+ if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
	+ DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
	+
	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
	// No need to reset the stack after the call if the call doesn't return. To
	// make the MI verify, we'll pretend the callee does it for us.
	NumBytesForCalleeToPop = NumBytes;
	}

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// Like the stdcall convention, the callee cleans up its arguments, except
	// that ECX is reserved for storing the tail called function address. Only 2
	// registers are free for argument passing (inreg). Tail call optimization is
	// performed provided:
	// * tailcallopt is enabled
	// * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
	/// requirement.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
	SelectionDAG& DAG) const {
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	unsigned StackAlignment = TFI.getStackAlignment();
	uint64_t AlignMask = StackAlignment - 1;
	int64_t Offset = StackSize;
	unsigned SlotSize = RegInfo->getSlotSize();
	if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
	// Number smaller than 12 so just add the difference.
	Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
	} else {
	// Mask out lower bits, add stackalignment once plus the 12 bytes.
	Offset = ((~AlignMask) & Offset) + StackAlignment +
	(StackAlignment-SlotSize);
	}
	return Offset;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// \param Arg    Outgoing argument value.
	/// \param Offset Stack offset the callee expects the argument at.
	/// \param Flags  Argument flags (byval, zext/sext, ...).
	/// \param VA     Location assigned to the argument by the calling convention.
	static
	bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
	                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
	                         const X86InstrInfo *TII, const CCValAssign &VA) {
	  unsigned Bytes = Arg.getValueSizeInBits() / 8;

	  for (;;) {
	    // Look through nodes that don't alter the bits of the incoming value.
	    unsigned Op = Arg.getOpcode();
	    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
	      Arg = Arg.getOperand(0);
	      continue;
	    }
	    if (Op == ISD::TRUNCATE) {
	      // A truncate of an AssertZext back to the asserted type is looked
	      // through as well.
	      const SDValue &TruncInput = Arg.getOperand(0);
	      if (TruncInput.getOpcode() == ISD::AssertZext &&
	          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	              Arg.getValueType()) {
	        Arg = TruncInput.getOperand(0);
	        continue;
	      }
	    }
	    break;
	  }

	  // Find the frame index the argument value comes from, if any.
	  int FI = INT_MAX;
	  if (Arg.getOpcode() == ISD::CopyFromReg) {
	    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(VR))
	      return false;
	    MachineInstr *Def = MRI->getVRegDef(VR);
	    if (!Def)
	      return false;
	    if (!Flags.isByVal()) {
	      if (!TII->isLoadFromStackSlot(*Def, FI))
	        return false;
	    } else {
	      // For byval the register must hold the address of a frame object,
	      // produced by an LEA of a frame index.
	      unsigned Opcode = Def->getOpcode();
	      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
	           Opcode == X86::LEA64_32r) &&
	          Def->getOperand(1).isFI()) {
	        FI = Def->getOperand(1).getIndex();
	        Bytes = Flags.getByValSize();
	      } else
	        return false;
	    }
	  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	    if (Flags.isByVal())
	      // ByVal argument is passed in as a pointer but it's now being
	      // dereferenced. e.g.
	      // define @foo(%struct.X* %A) {
	      //   tail call @bar(%struct.X* byval %A)
	      // }
	      return false;
	    SDValue Ptr = Ld->getBasePtr();
	    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	    if (!FINode)
	      return false;
	    FI = FINode->getIndex();
	  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	    FI = FINode->getIndex();
	    Bytes = Flags.getByValSize();
	  } else
	    return false;

	  assert(FI != INT_MAX);
	  if (!MFI.isFixedObjectIndex(FI))
	    return false;

	  if (Offset != MFI.getObjectOffset(FI))
	    return false;

	  // If this is not byval, check that the argument stack object is immutable.
	  // inalloca and argument copy elision can create mutable argument stack
	  // objects. Byval objects can be mutated, but a byval call intends to pass the
	  // mutated memory.
	  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	    return false;

	  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	    // If the argument location is wider than the argument type, check that any
	    // extension flags match.
	    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
	        Flags.isSExt() != MFI.isObjectSExt(FI)) {
	      return false;
	    }
	  }

	  // Finally the sizes have to agree.
	  return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	///
	/// With GuaranteedTailCallOpt the answer depends only on the calling
	/// conventions; otherwise the call has to pass a series of sibcall safety
	/// checks (stack realignment, struct return, varargs, x87 results, preserved
	/// registers, stack argument layout and callee-pop agreement).
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &CallerF = MF.getFunction();

	  // If the function return type is x86_fp80 and the callee return type is not,
	  // then the FP_EXTEND of the call result is not a nop. It's not safe to
	  // perform a tailcall optimization here.
	  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	    return false;

	  CallingConv::ID CallerCC = CallerF.getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;
	  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

	  // Win64 functions have extra shadow space for argument homing. Don't do the
	  // sibcall if the caller and callee have mismatched expectations for this
	  // space.
	  if (IsCalleeWin64 != IsCallerWin64)
	    return false;

	  // If -tailcallopt is specified, only matching conventions that can
	  // guarantee TCO are tail-callable.
	  if (DAG.getTarget().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Look for obvious safe cases to perform tail call optimization that do not
	  // require ABI changes. This is what gcc calls sibcall.

	  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	  // emit a special epilogue.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  if (RegInfo->needsStackRealignment(MF))
	    return false;

	  // Also avoid sibcall optimization if either caller or callee uses struct
	  // return semantics.
	  if (isCalleeStructRet || isCallerStructRet)
	    return false;

	  // Do not sibcall optimize vararg calls unless all arguments are passed via
	  // registers.
	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // Optimizing for varargs on Win64 is unlikely to be safe without
	    // additional testing.
	    if (IsCalleeWin64 || IsCallerWin64)
	      return false;

	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	      if (!ArgLocs[i].isRegLoc())
	        return false;
	  }

	  // If the call result is in ST0 / ST1, it needs to be popped off the x87
	  // stack. Therefore, if it's not used by the call it is not safe to optimize
	  // this into a sibcall.
	  bool Unused = false;
	  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	    if (!Ins[i].Used) {
	      Unused = true;
	      break;
	    }
	  }
	  if (Unused) {
	    SmallVector<CCValAssign, 16> RVLocs;
	    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	      CCValAssign &VA = RVLocs[i];
	      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
	        return false;
	    }
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  RetCC_X86, RetCC_X86))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  unsigned StackArgsSize = 0;

	  // If the callee takes no arguments then go on to check the results of the
	  // call.
	  if (!Outs.empty()) {
	    // Check if stack adjustment is needed. For now, do not do this if any
	    // argument is passed on the stack.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    // Allocate shadow area for Win64
	    if (IsCalleeWin64)
	      CCInfo.AllocateStack(32, 8);

	    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	    StackArgsSize = CCInfo.getNextStackOffset();

	    if (CCInfo.getNextStackOffset()) {
	      // Check if the arguments are already laid out in the right way as
	      // the caller's fixed stack objects.
	      MachineFrameInfo &MFI = MF.getFrameInfo();
	      const MachineRegisterInfo *MRI = &MF.getRegInfo();
	      const X86InstrInfo *TII = Subtarget.getInstrInfo();
	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        SDValue Arg = OutVals[i];
	        ISD::ArgFlagsTy Flags = Outs[i].Flags;
	        if (VA.getLocInfo() == CCValAssign::Indirect)
	          return false;
	        if (!VA.isRegLoc()) {
	          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	                                   MFI, MRI, TII, VA))
	            return false;
	        }
	      }
	    }

	    bool PositionIndependent = isPositionIndependent();
	    // If the tailcall address may be in a register, then make sure it's
	    // possible to register allocate for it. In 32-bit, the call address can
	    // only target EAX, EDX, or ECX since the tail call must be scheduled after
	    // callee-saved registers are restored. These happen to be the same
	    // registers used to pass 'inreg' arguments so watch out for those.
	    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	                                  !isa<ExternalSymbolSDNode>(Callee)) ||
	                                 PositionIndependent)) {
	      unsigned NumInRegs = 0;
	      // In PIC we need an extra register to formulate the address computation
	      // for the callee.
	      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	        CCValAssign &VA = ArgLocs[i];
	        if (!VA.isRegLoc())
	          continue;
	        unsigned Reg = VA.getLocReg();
	        switch (Reg) {
	        default: break;
	        case X86::EAX: case X86::EDX: case X86::ECX:
	          if (++NumInRegs == MaxInRegs)
	            return false;
	          break;
	        }
	      }
	    }

	    const MachineRegisterInfo &MRI = MF.getRegInfo();
	    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	      return false;
	  }

	  bool CalleeWillPop =
	      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	                       MF.getTarget().Options.GuaranteedTailCallOpt);

	  if (unsigned BytesToPop =
	          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	    // If we have bytes to pop, the callee must pop them.
	    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	    if (!CalleePopMatches)
	      return false;
	  } else if (CalleeWillPop && StackArgsSize > 0) {
	    // If we don't have bytes to pop, make sure the callee doesn't pop any.
	    return false;
	  }

	  return true;
	}

	/// Create the FastISel instance for this target by forwarding both
	/// arguments unchanged to X86::createFastISel.
	FastISel *
	X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return X86::createFastISel(funcInfo, libInfo);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Return true if \p Op has exactly one use and its node is a normal load
	/// (as decided by ISD::isNormalLoad).
	static bool MayFoldLoad(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Return true if \p Op has exactly one use and that single user is a normal
	/// store (as decided by ISD::isNormalStore).
	static bool MayFoldIntoStore(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  // The single user is the first (and only) entry of the use list.
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Return true if \p Op has exactly one use and that single user is an
	/// ISD::ZERO_EXTEND node.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
	  return UserOpcode == ISD::ZERO_EXTEND;
	}

	/// Return true if \p Opcode is one of the X86ISD shuffle opcodes listed
	/// below.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:
	  case X86ISD::PSHUFB:
	  case X86ISD::PSHUFD:
	  case X86ISD::PSHUFHW:
	  case X86ISD::PSHUFLW:
	  case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:
	  case X86ISD::EXTRQI:
	  case X86ISD::INSERTQI:
	  case X86ISD::PALIGNR:
	  case X86ISD::VSHLDQ:
	  case X86ISD::VSRLDQ:
	  case X86ISD::MOVLHPS:
	  case X86ISD::MOVHLPS:
	  case X86ISD::MOVSHDUP:
	  case X86ISD::MOVSLDUP:
	  case X86ISD::MOVDDUP:
	  case X86ISD::MOVSS:
	  case X86ISD::MOVSD:
	  case X86ISD::UNPCKL:
	  case X86ISD::UNPCKH:
	  case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERM2X128:
	  case X86ISD::SHUF128:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPERMI:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    // Anything not listed above is not treated as a target shuffle.
	    return false;
	  }
	}

	/// Return true if \p Opcode is one of the variable-mask target shuffles
	/// listed below, or one of the 'faux' shuffles (OR, AND, ANDNP).
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  // 'Faux' Target Shuffles.
	  case ISD::OR:
	  case ISD::AND:
	  case X86ISD::ANDNP:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return a frame-index node for the return address slot. On first use
	/// (recorded RA index is 0) a fixed, slot-sized object is created at offset
	/// -SlotSize and its index is cached in the X86MachineFunctionInfo.
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  int RAIndex = X86FI->getRAIndex();
	  if (RAIndex == 0) {
	    // Set up a frame object for the return address.
	    const unsigned SlotBytes = Subtarget.getRegisterInfo()->getSlotSize();
	    RAIndex = MF.getFrameInfo().CreateFixedObject(
	        SlotBytes, -static_cast<int64_t>(SlotBytes), false);
	    X86FI->setRAIndex(RAIndex);
	  }

	  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
	}

	/// Return true if \p Offset may be used as a displacement under code model
	/// \p M.
	///
	/// \param Offset                  Displacement to check.
	/// \param M                       Code model in effect.
	/// \param hasSymbolicDisplacement True if the displacement is added to a
	///                                symbol; without one only the 32-bit
	///                                immediate range is checked.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  if (M == CodeModel::Small && Offset < 16*1024*1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Determines whether the callee is required to pop its own arguments.
	/// Callee pop is necessary to support tail calls.
	///
	/// Non-vararg calls for which shouldGuaranteeTCO() holds are always callee
	/// pop; otherwise only stdcall, fastcall, thiscall and vectorcall are, and
	/// only when not compiling for 64-bit.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
	  // can guarantee TCO.
	  const bool ForcedForTCO =
	      !IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO);
	  if (ForcedForTCO)
	    return true;

	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    return !is64Bit;
	  default:
	    return false;
	  }
	}

	/// Return true if the condition is an unsigned comparison operation.
	/// Equality conditions (E/NE) are classified with the unsigned ones; any
	/// condition code not listed here is a programming error.
	static bool isX86CCUnsigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_G:
	  case X86::COND_GE:
	  case X86::COND_L:
	  case X86::COND_LE:
	    return false;
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_B:
	  case X86::COND_A:
	  case X86::COND_BE:
	  case X86::COND_AE:
	    return true;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Map an integer ISD::CondCode onto the corresponding X86 condition code.
	/// Any condition code not listed here is a programming error.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETNE:
	    return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETGT:
	    return X86::COND_G;
	  case ISD::SETGE:
	    return X86::COND_GE;
	  case ISD::SETLT:
	    return X86::COND_L;
	  case ISD::SETLE:
	    return X86::COND_LE;
	  // Unsigned orderings.
	  case ISD::SETULT:
	    return X86::COND_B;
	  case ISD::SETUGT:
	    return X86::COND_A;
	  case ISD::SETULE:
	    return X86::COND_BE;
	  case ISD::SETUGE:
	    return X86::COND_AE;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
	/// condition code, returning the condition code and the LHS/RHS of the
	/// comparison to make.
	///
	/// \p LHS and \p RHS are in/out: the integer path may replace a constant RHS
	/// and the floating-point path may swap the two operands. For floating
	/// point, SETOEQ and SETUNE yield X86::COND_INVALID.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	bool isFP, SDValue &LHS, SDValue &RHS,
	SelectionDAG &DAG) {
	if (!isFP) {
	// Integer compares: special-case a few comparisons against constants, then
	// fall back to the plain table translation.
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
	// X > -1 -> compare X against 0 and test that the sign flag is clear.
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_NS;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
	// X < 0 -> RHS is already 0; just test that the sign flag is set.
	return X86::COND_S;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
	// X < 1 -> X <= 0
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_LE;
	}
	}

	return TranslateIntegerX86CC(SetCCOpcode);
	}

	// First determine if it is required or is profitable to flip the operands.

	// If LHS is a foldable load, but RHS is not, flip the condition.
	if (ISD::isNON_EXTLoad(LHS.getNode()) &&
	!ISD::isNON_EXTLoad(RHS.getNode())) {
	SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	std::swap(LHS, RHS);
	}

	// These four conditions are handled by swapping the operands only; the
	// switch further down maps them to the condition code of the mirrored
	// comparison (marked "flipped" there).
	switch (SetCCOpcode) {
	default: break;
	case ISD::SETOLT:
	case ISD::SETOLE:
	case ISD::SETUGT:
	case ISD::SETUGE:
	std::swap(LHS, RHS);
	break;
	}

	// On a floating point condition, the flags are set as follows:
	// ZF | PF | CF | op
	//  0 |  0 |  0 | X > Y
	//  0 |  0 |  1 | X < Y
	//  1 |  0 |  0 | X == Y
	//  1 |  1 |  1 | unordered
	switch (SetCCOpcode) {
	default: llvm_unreachable("Condcode should be pre-legalized away");
	case ISD::SETUEQ:
	case ISD::SETEQ: return X86::COND_E;
	case ISD::SETOLT: // flipped
	case ISD::SETOGT:
	case ISD::SETGT: return X86::COND_A;
	case ISD::SETOLE: // flipped
	case ISD::SETOGE:
	case ISD::SETGE: return X86::COND_AE;
	case ISD::SETUGT: // flipped
	case ISD::SETULT:
	case ISD::SETLT: return X86::COND_B;
	case ISD::SETUGE: // flipped
	case ISD::SETULE:
	case ISD::SETLE: return X86::COND_BE;
	case ISD::SETONE:
	case ISD::SETNE: return X86::COND_NE;
	case ISD::SETUO: return X86::COND_P;
	case ISD::SETO: return X86::COND_NP;
	// No single condition code is returned for these two; presumably the
	// caller handles COND_INVALID specially - verify against callers.
	case ISD::SETOEQ:
	case ISD::SETUNE: return X86::COND_INVALID;
	}
	}

	/// Is there a floating point cmov for the specific X86 condition code?
	/// Current x86 isa includes the following FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	static bool hasFPCMov(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_E:
	  case X86::COND_P:
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_NE:
	  case X86::COND_NP:
	    return true;
	  default:
	    // Every other condition code has no FP cmov form.
	    return false;
	  }
	}


	/// Fill \p Info with the memory-access description of the chained intrinsic
	/// \p Intrinsic called by \p I.
	///
	/// Only truncating stores, gathers and scatters are described. Returns
	/// false when the intrinsic is not found by getIntrinsicWithChain() or is
	/// of any other type; \p Info may already have been partially written in
	/// the latter case.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {

	  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
	  if (!IntrData)
	    return false;

	  Info.flags = MachineMemOperand::MONone;
	  Info.offset = 0;

	  switch (IntrData->Type) {
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    Info.ptrVal = I.getArgOperand(0);
	    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
	    // The stored element type is given by the intrinsic type; the element
	    // count comes from the value operand.
	    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
	      ScalarVT = MVT::i8;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
	      ScalarVT = MVT::i16;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
	      ScalarVT = MVT::i32;

	    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  case GATHER:
	  case GATHER_AVX2: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.ptrVal = nullptr;
	    MVT DataVT = MVT::getVT(I.getType());
	    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
	    // The accessed element count is the smaller of the data and index
	    // vector lengths.
	    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
	                                IndexVT.getVectorNumElements());
	    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOLoad;
	    break;
	  }
	  case SCATTER: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    Info.ptrVal = nullptr;
	    MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
	    MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
	    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
	                                IndexVT.getVectorNumElements());
	    Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  default:
	    return false;
	  }

	  return true;
	}

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	///
	/// The immediate is legal when it is bitwise equal to one of the entries
	/// of LegalFPImmediates; \p VT and \p ForCodeSize are not consulted.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
	                                     bool ForCodeSize) const {
	  for (const auto &LegalImm : LegalFPImmediates)
	    if (Imm.bitwiseIsEqual(LegalImm))
	      return true;
	  return false;
	}

	/// Return true if it is worthwhile to narrow \p Load.
	///
	/// Narrowing is refused for loads through a MO_GOTTPOFF global address and
	/// for multi-use 256/512-bit vector loads whose value uses are all
	/// extract-subvector + store pairs. \p ExtTy and \p NewVT are not consulted.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
	  // relocation target a movq or addq instruction: don't let the load shrink.
	  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
	  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
	    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
	      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

	  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
	  // those uses are extracted directly into a store, then the extract + store
	  // can be store-folded. Therefore, it's probably not worth splitting the load.
	  EVT VT = Load->getValueType(0);
	  if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
	    for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
	      // Skip uses of the chain value. Result 0 of the node is the load value.
	      if (UI.getUse().getResNo() != 0)
	        continue;

	      // If this use is not an extract + store, it's probably worth splitting.
	      if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
	          UI->use_begin()->getOpcode() != ISD::STORE)
	        return true;
	    }
	    // All non-chain uses are extract + store.
	    return false;
	  }

	  return true;
	}

	/// Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// That is the case for every integer type with a non-zero primitive size
	/// of at most 64 bits; \p Imm itself is not consulted.
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  if (BitSize == 0 || BitSize > 64)
	    return false;
	  return true;
	}

	/// Return false only when the select condition is a floating-point compare,
	/// the target is 64-bit LP64 and AVX is available; true otherwise.
	bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
	  // If we are using XMM registers in the ABI and the condition of the select is
	  // a floating-point compare and we have blendv or conditional move, then it is
	  // cheaper to select instead of doing a cross-register move and creating a
	  // load that depends on the compare result.
	  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
	}

	/// Return true unless \p VT is a vector type and the subtarget has AVX512.
	bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
	  // TODO: It might be a win to ease or lift this restriction, but the generic
	  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
	  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
	  return !IsAVX512Vector;
	}

	/// Return true if a multiply of type \p VT by the constant \p C should be
	/// decomposed into shift and add/sub operations.
	///
	/// Only constant splat vectors are considered, and only when the multiply
	/// itself is not legal for \p VT.
	bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
	  // TODO: We handle scalars using custom code, but generic combining could make
	  // that unnecessary.
	  APInt MulC;
	  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
	    return false;

	  // If vector multiply is legal, assume that's faster than shl + add/sub.
	  // TODO: Multiply is a complex op with higher latency and lower throughput in
	  //       most implementations, so this check could be loosened based on type
	  //       and/or a CPU attribute.
	  if (isOperationLegal(ISD::MUL, VT))
	    return false;

	  // shl+add, shl+sub, shl+add+neg
	  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
	         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
	}

	/// Return true only for an unsigned conversion from f80 when the subtarget
	/// has CMOV; \p IntVT is not consulted.
	bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
	                                                 bool IsSigned) const {
	  // The unsigned f80 conversion is more efficient using Strict code if FCMOV
	  // is available.
	  if (IsSigned || FpVT != MVT::f80)
	    return false;
	  return Subtarget.hasCMov();
	}

	/// Return true if extracting a \p ResVT subvector at element \p Index from
	/// a \p SrcVT vector is considered cheap.
	///
	/// EXTRACT_SUBVECTOR must be legal or custom for \p ResVT. For non-mask
	/// vectors the index has to be a multiple of the result element count.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  // Mask vectors support all subregister combinations and operations that
	  // extract half of vector.
	  // NOTE(review): the size test below requires the result to be twice as
	  // wide as the source, which looks inverted for "extract half of vector" -
	  // confirm before relying on the second alternative.
	  if (ResVT.getVectorElementType() == MVT::i1)
	    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
	                          (Index == ResVT.getVectorNumElements()));

	  return (Index % ResVT.getVectorNumElements()) == 0;
	}

	/// Return true if the vector binary operation \p VecOp should be converted
	/// to its scalar form.
	bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
	  const unsigned Opcode = VecOp.getOpcode();

	  // Assume target opcodes can't be scalarized.
	  // TODO - do we have any exceptions?
	  if (Opcode >= ISD::BUILTIN_OP_END)
	    return false;

	  EVT VectorVT = VecOp.getValueType();

	  // An unsupported vector op is always worth converting. A supported one is
	  // only converted when the scalar op is supported as well; otherwise the
	  // transform may not be worthwhile.
	  const bool VectorOpSupported =
	      isOperationLegalOrCustomOrPromote(Opcode, VectorVT);
	  if (!VectorOpSupported)
	    return true;
	  return isOperationLegalOrCustomOrPromote(Opcode, VectorVT.getScalarType());
	}

	/// Return true if an overflow-checking operation \p Opcode should be formed
	/// for type \p VT: never for vectors, otherwise when \p VT is a simple type
	/// or the operation is not marked Expand for it.
	bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
	  // TODO: Allow vectors?
	  if (VT.isVector())
	    return false;
	  return VT.isSimple() || !isOperationExpand(Opcode, VT);
	}

	/// Return true if cttz is cheap to speculate, which is reported exactly
	/// when the subtarget has BMI.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	// Speculate cttz only if we can directly use TZCNT.
	return Subtarget.hasBMI();
	}

	/// Return true if ctlz is cheap to speculate, which is reported exactly
	/// when the subtarget has LZCNT.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	// Speculate ctlz only if we can directly use LZCNT.
	return Subtarget.hasLZCNT();
	}

	/// Return true if a load of \p LoadVT that is immediately bitcast to
	/// \p BitcastVT is better performed as a load of \p BitcastVT. Cases not
	/// decided here are delegated to the TargetLowering base implementation.
	bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
	                                                const SelectionDAG &DAG,
	                                                const MachineMemOperand &MMO) const {
	  const bool LoadIsVector = LoadVT.isVector();
	  const bool CastIsVector = BitcastVT.isVector();

	  // Without AVX512, refuse to turn a scalar load into an i1-vector load.
	  if (!Subtarget.hasAVX512() && !LoadIsVector && CastIsVector &&
	      BitcastVT.getVectorElementType() == MVT::i1)
	    return false;

	  // Without DQI, refuse the i8 -> v8i1 case as well.
	  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
	    return false;

	  // If both types are legal vectors, it's always ok to convert them.
	  if (LoadIsVector && CastIsVector && isTypeLegal(LoadVT) &&
	      isTypeLegal(BitcastVT))
	    return true;

	  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
	}

	bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	const SelectionDAG &DAG) const {
	// Do not merge to float value size (128 bytes) if no implicit
	// float attribute is set.
	bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);

	if (NoFloat) {
	unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
	return (MemVT.getSizeInBits() <= MaxIntSize);
	}
	// Make sure we don't merge greater than our preferred vector
	// width.
	if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
	return false;
	return true;
	}

	/// Return true if ctlz is fast, which is reported exactly when the
	/// subtarget reports fast LZCNT.
	bool X86TargetLowering::isCtlzFast() const {
	return Subtarget.hasFastLZCNT();
	}

	/// Always returns true; \p AndI is not inspected.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	const Instruction &AndI) const {
	return true;
	}

	/// Return true if an and-not compare is available for \p Y: a non-constant
	/// scalar i32 or i64 value on a subtarget with BMI.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  EVT VT = Y.getValueType();

	  // There are only 32-bit and 64-bit forms for 'andn', and no vector form is
	  // accepted here.
	  const bool IsAndnType =
	      !VT.isVector() && (VT == MVT::i32 || VT == MVT::i64);
	  if (!IsAndnType || !Subtarget.hasBMI())
	    return false;

	  // Constant operands are rejected.
	  return !isa<ConstantSDNode>(Y);
	}

	/// Return true if an and-not operation is available for \p Y.
	///
	/// Scalars defer to hasAndNotCompare(). Vectors need SSE1 and at least 128
	/// bits; v4i32 is accepted with SSE1 alone, every other type needs SSE2.
	bool X86TargetLowering::hasAndNot(SDValue Y) const {
	  EVT VT = Y.getValueType();

	  if (!VT.isVector())
	    return hasAndNotCompare(Y);

	  // Vector.

	  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
	    return false;

	  if (VT == MVT::v4i32)
	    return true;

	  return Subtarget.hasSSE2();
	}

	/// Return true if the shift pair rooted at \p N (shl of srl, or srl of shl)
	/// should be folded into a mask.
	///
	/// When the subtarget has fast shift masks for the value's kind (vector or
	/// scalar), the fold is only wanted if both shift amounts are the same
	/// operand; otherwise the base class decides.
	bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
	    const SDNode *N, CombineLevel Level) const {
	  assert(((N->getOpcode() == ISD::SHL &&
	           N->getOperand(0).getOpcode() == ISD::SRL) ||
	          (N->getOpcode() == ISD::SRL &&
	           N->getOperand(0).getOpcode() == ISD::SHL)) &&
	         "Expected shift-shift mask");
	  EVT VT = N->getValueType(0);
	  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
	      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
	    // Only fold if the shift values are equal - so it folds to AND.
	    // TODO - we should fold if either is a non-uniform vector but we don't do
	    // the fold for non-splats yet.
	    return N->getOperand(1) == N->getOperand(0).getOperand(1);
	  }
	  return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
	}

	/// Return true if a mask of \p Y's type should be folded into a pair of
	/// variable shifts: only for scalars, and not for i64 on 32-bit targets.
	bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
	  EVT VT = Y.getValueType();

	  // For vectors, we don't have a preference, but we probably want a mask.
	  const bool IsVector = VT.isVector();

	  // 64-bit shifts on 32-bit targets produce really bad bloated code.
	  const bool IsI64OnNon64BitTarget = VT == MVT::i64 && !Subtarget.is64Bit();

	  return !IsVector && !IsI64OnNon64BitTarget;
	}

	/// Return true exactly when \p VT is a legal type.
	bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
	// Any legal vector type can be splatted more efficiently than
	// loading/spilling from memory.
	return isTypeLegal(VT);
	}

	/// Return the type to use for a fast equality compare of \p NumBits bits,
	/// or MVT::INVALID_SIMPLE_VALUE_TYPE if there is none. A legal integer type
	/// of that width is preferred; 128 and 256 bits may fall back to v16i8 and
	/// v32i8 when those are legal.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;

	  switch (NumBits) {
	  case 128:
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	    break;
	  case 256:
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	    break;
	  default:
	    break;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Return true if \p Val is the undef sentinel value or equal to
	/// \p CmpVal.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
	}

	/// Return true if \p Val is either the undef or the zero sentinel value.
	static bool isUndefOrZero(int Val) {
	  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
	}

	/// Return true if each of the \p Size mask elements starting at position
	/// \p Pos is the undef sentinel value.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  for (unsigned Idx = 0; Idx != Size; ++Idx)
	    if (Mask[Pos + Idx] != SM_SentinelUndef)
	      return false;
	  return true;
	}

	/// Return true if the mask creates a vector whose lower half is undefined,
	/// i.e. the first NumElts / 2 mask elements are all undef.
	static bool isUndefLowerHalf(ArrayRef<int> Mask) {
	  const unsigned NumElts = Mask.size();
	  const unsigned HalfElts = NumElts / 2;
	  return isUndefInRange(Mask, 0, HalfElts);
	}

	/// Return true if the mask creates a vector whose upper half is undefined,
	/// i.e. the NumElts / 2 mask elements starting at NumElts / 2 are all undef.
	static bool isUndefUpperHalf(ArrayRef<int> Mask) {
	  const unsigned NumElts = Mask.size();
	  const unsigned HalfElts = NumElts / 2;
	  return isUndefInRange(Mask, HalfElts, HalfElts);
	}

	/// Return true if Val falls within the half-open range [Low, Hi), i.e.
	/// Low is included and Hi is excluded.
	static bool isInRange(int Val, int Low, int Hi) {
	  if (Val < Low)
	    return false;
	  return Val < Hi;
	}

	/// Return true if the value of any element in Mask falls within the
	/// half-open range [Low, Hi).
	static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  return std::any_of(Mask.begin(), Mask.end(),
	                     [Low, Hi](int M) { return isInRange(M, Low, Hi); });
	}

	/// Return true if Val is the undef sentinel or if its value falls within the
	/// half-open range [Low, Hi).
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  return Val == SM_SentinelUndef || isInRange(Val, Low, Hi);
	}

	/// Return true if every element in Mask is either the undef sentinel or a
	/// value within the half-open range [Low, Hi). Returns true for an empty mask.
	static bool isUndefOrInRange(ArrayRef<int> Mask,
	                             int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    const int Elt = Mask[Idx];
	    // Reject the first element that is neither undef nor in range.
	    if (Elt != SM_SentinelUndef && !isInRange(Elt, Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is the undef sentinel, the zero sentinel, or a value
	/// within the half-open range [Low, Hi).
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
	}

	/// Return true if every element in Mask is the undef sentinel, the zero
	/// sentinel, or a value within the half-open range [Low, Hi). Returns true for
	/// an empty mask.
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    const int Elt = Mask[Idx];
	    // Reject the first element that is neither a sentinel nor in range.
	    if (!isUndefOrZero(Elt) && !isInRange(Elt, Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if the Size mask elements starting at index Pos each either
	/// are undef or match, position by position, the arithmetic sequence
	/// Low, Low + Step, ..., Low + (Size - 1) * Step.
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
	                                       unsigned Size, int Low, int Step = 1) {
	  int Expected = Low;
	  for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
	    if (!isUndefOrEqual(Mask[Idx], Expected))
	      return false;
	    // Advance to the value required at the next position.
	    Expected += Step;
	  }
	  return true;
	}

	/// Return true if the Size mask elements starting at index Pos each either
	/// are undef, are zero, or match, position by position, the sequence
	/// Low, Low + 1, ..., Low + Size - 1.
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Idx = Pos, End = Pos + Size; Idx != End; ++Idx) {
	    const int Elt = Mask[Idx];
	    if (!isUndefOrZero(Elt) && Elt != Expected)
	      return false;
	    // Advance to the value required at the next position.
	    ++Expected;
	  }
	  return true;
	}

	/// Return true if each of the Size mask elements starting at index Pos is
	/// either the undef or the zero sentinel. An empty range yields true.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  const unsigned End = Pos + Size;
	  for (unsigned Idx = Pos; Idx != End; ++Idx) {
	    if (isUndefOrZero(Mask[Idx]))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Helper function to test whether a shuffle mask could be
	/// simplified by widening the elements being shuffled.
	///
	/// Mask elements are examined in adjacent pairs (2*k, 2*k+1); each pair must
	/// be expressible as a single element of twice the width.
	///
	/// On success WidenedMask is overwritten with the Mask.size() / 2 element mask
	/// for the wider elements and true is returned. On failure false is returned
	/// and WidenedMask is left in an unspecified (partially written) state.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  // Any previous contents of WidenedMask are discarded here.
	  WidenedMask.assign(Mask.size() / 2, 0);
	  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
	    int M0 = Mask[i];
	    int M1 = Mask[i + 1];

	    // If both elements are undef, it's trivial.
	    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
	      WidenedMask[i / 2] = SM_SentinelUndef;
	      continue;
	    }

	    // Check for an undef mask and a mask value properly aligned to fit with
	    // a pair of values. If we find such a case, use the non-undef mask's value.
	    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
	      WidenedMask[i / 2] = M1 / 2;
	      continue;
	    }
	    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // When zeroing, we need to spread the zeroing across both lanes to widen.
	    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
	      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
	          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
	        WidenedMask[i / 2] = SM_SentinelZero;
	        continue;
	      }
	      // A zero lane paired with a real element cannot be widened.
	      return false;
	    }

	    // Finally check if the two mask values are adjacent and aligned with
	    // a pair.
	    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // Otherwise we can't safely widen the elements used in this shuffle.
	    return false;
	  }
	  assert(WidenedMask.size() == Mask.size() / 2 &&
	         "Incorrect size of mask after widening the elements!");

	  return true;
	}

	/// Variant of canWidenShuffleElements that first rewrites every non-undef
	/// mask element whose bit is set in Zeroable to the zero sentinel, and then
	/// tries to widen the resulting target mask.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    const APInt &Zeroable,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
	  const int NumElts = TargetMask.size();
	  for (int Idx = 0; Idx < NumElts; ++Idx) {
	    // Undef elements stay undef even when they are zeroable.
	    if (TargetMask[Idx] != SM_SentinelUndef && Zeroable[Idx])
	      TargetMask[Idx] = SM_SentinelZero;
	  }
	  return canWidenShuffleElements(TargetMask, WidenedMask);
	}

	/// Query-only variant of canWidenShuffleElements: reports whether Mask can be
	/// widened and discards the widened mask.
	static bool canWidenShuffleElements(ArrayRef<int> Mask) {
	  SmallVector<int, 32> Scratch;
	  const bool CanWiden = canWidenShuffleElements(Mask, Scratch);
	  return CanWiden;
	}

	/// Returns true if Elt is a constant zero or a floating point constant +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  return isNullConstant(Elt) || isNullFPConstant(Elt);
	}

	// Build a vector of type VT from the integer values in Values.
	// When IsMask is set, a negative value produces an UNDEF element instead of a
	// constant.
	// When i64 is not a legal type and VT has i64 elements, each element is
	// emitted as a pair of i32 elements (value, then 0 or UNDEF) and the result is
	// bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  const unsigned NumElts = VT.getVectorNumElements();

	  // Decide whether 64-bit elements have to be split into two 32-bit halves.
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool Split = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;
	  const MVT ConstVecVT = Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
	    const int Value = Values[Idx];

	    // Negative mask entries become UNDEF (both halves when splitting).
	    if (IsMask && Value < 0) {
	      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    Ops.push_back(DAG.getConstant(Value, dl, EltVT));
	    // The second half of a split element is always zero.
	    if (Split)
	      Ops.push_back(DAG.getConstant(0, dl, EltVT));
	  }

	  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	  return Split ? DAG.getBitcast(VT, ConstsNode) : ConstsNode;
	}

	/// Build a constant vector of type VT from per-element raw bits.
	///
	/// Bits holds one APInt per element of VT; bit i of Undefs marks element i as
	/// UNDEF. f32/f64 elements are created as floating-point constants from the
	/// raw bits. When i64 is not a legal type and VT has i64 elements, each
	/// element is emitted as two i32 constants (low 32 bits first). The result is
	/// bitcast to VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Bits.size() == Undefs.getBitWidth() &&
	         "Unequal constant and undef arrays");

	  const unsigned NumElts = VT.getVectorNumElements();

	  // Decide whether 64-bit elements have to be split into two 32-bit halves.
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool Split = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;
	  const MVT ConstVecVT = Split ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT EltVT = ConstVecVT.getVectorElementType();

	  SmallVector<SDValue, 32> Ops;
	  for (unsigned Idx = 0, NumBits = Bits.size(); Idx != NumBits; ++Idx) {
	    if (Undefs[Idx]) {
	      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	      continue;
	    }

	    const APInt &V = Bits[Idx];
	    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");

	    if (Split) {
	      // Low half first, then high half.
	      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
	      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
	      continue;
	    }
	    if (EltVT == MVT::f32) {
	      APFloat FV(APFloat::IEEEsingle(), V);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }
	    if (EltVT == MVT::f64) {
	      APFloat FV(APFloat::IEEEdouble(), V);
	      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	      continue;
	    }
	    Ops.push_back(DAG.getConstant(V, dl, EltVT));
	  }

	  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	  return DAG.getBitcast(VT, ConstsNode);
	}

	/// Returns a vector of specified type with all zero elements.
	///
	/// VT must be a 128-, 256- or 512-bit vector, or a vector of i1. The zero is
	/// built in a canonical type and then bitcast to VT.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
	  // type. This ensures they get CSE'd. But if the integer type is not
	  // available, use a floating-point +0.0 instead.
	  SDValue Vec;
	  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
	    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
	  } else if (VT.getVectorElementType() == MVT::i1) {
	    // Mask vectors are built directly in their own type.
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    Vec = DAG.getConstant(0, dl, VT);
	  } else {
	    unsigned Num32BitElts = VT.getSizeInBits() / 32;
	    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
	  }
	  return DAG.getBitcast(VT, Vec);
	}

	/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
	///
	/// IdxVal does not have to be chunk-aligned; it is rounded down to the first
	/// element of its chunk. A BUILD_VECTOR input is narrowed directly into a
	/// smaller BUILD_VECTOR; anything else becomes an EXTRACT_SUBVECTOR node.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  const EVT SrcVT = Vec.getValueType();
	  const EVT EltVT = SrcVT.getVectorElementType();

	  // The result keeps the element type; the element count shrinks by the
	  // same factor as the total width.
	  const unsigned WidthRatio = SrcVT.getSizeInBits()/vectorWidth;
	  const EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                        SrcVT.getVectorNumElements()/WidthRatio);

	  const unsigned ChunkElts = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(ChunkElts) && "Elements per chunk not power of 2");

	  // Index of the first element of the chunk we want. ChunkElts is a power
	  // of 2, so clearing the low bits rounds down to the chunk boundary.
	  const unsigned ChunkStart = IdxVal & ~(ChunkElts - 1);

	  // Generic case: emit an EXTRACT_SUBVECTOR at the aligned index.
	  if (Vec.getOpcode() != ISD::BUILD_VECTOR) {
	    SDValue VecIdx = DAG.getIntPtrConstant(ChunkStart, dl);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
	  }

	  // If the input is a buildvector just emit a smaller one.
	  return DAG.getBuildVector(ResultVT, dl,
	                            Vec->ops().slice(ChunkStart, ChunkElts));
	}

	/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
	/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
	/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
	/// instructions or a simple subregister reference. Idx is an index in the
	/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
	/// lowering EXTRACT_VECTOR_ELT operations easier.
	///
	/// Vec must be a 256-bit or 512-bit vector.
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert((Vec.getValueType().is256BitVector() ||
	          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
	  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
	}

	/// Generate a DAG that grabs the 256-bit chunk containing element IdxVal from
	/// a 512-bit vector. Vec must be a 512-bit vector.
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
	  return extractSubVector(Vec, IdxVal, DAG, dl, /*vectorWidth=*/256);
	}

	/// Insert the vectorWidth-bit vector Vec into Result at the chunk containing
	/// element IdxVal, producing an INSERT_SUBVECTOR node of Result's type.
	///
	/// Only vectorWidth values of 128 and 256 are supported. IdxVal does not have
	/// to be chunk-aligned; it is rounded down to the first element of its chunk.
	/// If Vec is undef, Result is returned unchanged.
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");
	  // Inserting UNDEF is Result
	  if (Vec.isUndef())
	    return Result;
	  EVT VT = Vec.getValueType();
	  EVT ElVT = VT.getVectorElementType();
	  EVT ResultVT = Result.getValueType();

	  // Insert the relevant vectorWidth bits.
	  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // This is the index of the first element of the vectorWidth-bit chunk
	  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
	  IdxVal &= ~(ElemsPerChunk - 1);

	  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
	}

	/// Generate a DAG that puts the 128-bit vector Vec into the wider vector
	/// Result. This sets things up to match an AVX VINSERTF128/VINSERTI128 or
	/// AVX-512 VINSERTF32x4/VINSERTI32x4 instruction, or a simple superregister
	/// reference. IdxVal is an element index inside the 128 bits being written;
	/// it need not be aligned to a 128-bit boundary, which makes lowering
	/// INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, /*vectorWidth=*/128);
	}

	/// Widen Vec to the larger vector type VT, which must have the same scalar
	/// type. Vec occupies the low elements of the result; the remaining elements
	/// are zero when ZeroNewElements is set and undef otherwise.
	static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
	                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                              const SDLoc &dl) {
	  assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
	         Vec.getValueType().getScalarType() == VT.getScalarType() &&
	         "Unsupported vector widening type");

	  // Base vector that supplies the new (upper) elements.
	  SDValue Base;
	  if (ZeroNewElements)
	    Base = getZeroVector(VT, Subtarget, DAG, dl);
	  else
	    Base = DAG.getUNDEF(VT);

	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Base, Vec, ZeroIdx);
	}

	/// Widen Vec to a vector of WideSizeInBits total bits with the same scalar
	/// type. WideSizeInBits must be larger than Vec and a multiple of its scalar
	/// size. New elements are zero when ZeroNewElements is set, undef otherwise.
	static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
	                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                              const SDLoc &dl, unsigned WideSizeInBits) {
	  assert(Vec.getValueSizeInBits() < WideSizeInBits &&
	         (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
	         "Unsupported vector widening type");

	  // Derive the wide vector type from the scalar type and the target width.
	  const MVT ScalarVT = Vec.getSimpleValueType().getScalarType();
	  const unsigned WideNumElts =
	      WideSizeInBits / Vec.getScalarValueSizeInBits();
	  const MVT WideVT = MVT::getVectorVT(ScalarVT, WideNumElts);

	  return widenSubVector(WideVT, Vec, ZeroNewElements, Subtarget, DAG, dl);
	}

	// Helper function to collect subvector ops that are concated together,
	// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
	// The subvectors in Ops are guaranteed to be the same type.
	//
	// Ops must be empty on entry. Returns true and fills Ops (in element order)
	// when N matches; returns false and leaves Ops empty otherwise.
	static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
	  assert(Ops.empty() && "Expected an empty ops vector");

	  if (N->getOpcode() == ISD::CONCAT_VECTORS) {
	    Ops.append(N->op_begin(), N->op_end());
	    return true;
	  }

	  if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
	      isa<ConstantSDNode>(N->getOperand(2))) {
	    SDValue Src = N->getOperand(0);
	    SDValue Sub = N->getOperand(1);
	    const APInt &Idx = N->getConstantOperandAPInt(2);
	    EVT VT = Src.getValueType();
	    EVT SubVT = Sub.getValueType();

	    // Match insert_subvector(insert_subvector(X, Lo, 0), Hi, NumElts / 2)
	    // where Lo and Hi share a type that is half the width of the result.
	    // TODO - Handle more general insert_subvector chains.
	    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
	        Idx == (VT.getVectorNumElements() / 2) &&
	        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
	        Src.getOperand(1).getValueType() == SubVT &&
	        isNullConstant(Src.getOperand(2))) {
	      Ops.push_back(Src.getOperand(1));
	      Ops.push_back(Sub);
	      return true;
	    }
	  }

	  return false;
	}

	// Helper for splitting operands of an operation to legal target size and
	// apply a function on each part.
	// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
	// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
	// deciding if/how to split Ops. Ops elements do not have to be of type VT.
	// The argument Builder is a function that will be applied on each split part:
	// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
	//
	// CheckBWI selects which subtarget query enables 512-bit parts: useBWIRegs()
	// when true, useAVX512Regs() when false. When no split is needed, Builder is
	// applied once to Ops; otherwise the per-part results are joined with
	// CONCAT_VECTORS into a value of type VT.
	template <typename F>
	SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
	                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
	                         F Builder, bool CheckBWI = true) {
	  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
	  unsigned NumSubs = 1;
	  if ((CheckBWI && Subtarget.useBWIRegs()) ||
	      (!CheckBWI && Subtarget.useAVX512Regs())) {
	    if (VT.getSizeInBits() > 512) {
	      NumSubs = VT.getSizeInBits() / 512;
	      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
	    }
	  } else if (Subtarget.hasAVX2()) {
	    if (VT.getSizeInBits() > 256) {
	      NumSubs = VT.getSizeInBits() / 256;
	      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
	    }
	  } else {
	    if (VT.getSizeInBits() > 128) {
	      NumSubs = VT.getSizeInBits() / 128;
	      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
	    }
	  }

	  // VT already fits in one part: no splitting required.
	  if (NumSubs == 1)
	    return Builder(DAG, DL, Ops);

	  SmallVector<SDValue, 4> Subs;
	  for (unsigned i = 0; i != NumSubs; ++i) {
	    // Slice part i out of every operand; each operand is split by its own
	    // type, so operands may differ in type from VT.
	    SmallVector<SDValue, 2> SubOps;
	    for (SDValue Op : Ops) {
	      EVT OpVT = Op.getValueType();
	      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
	      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
	      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
	    }
	    Subs.push_back(Builder(DAG, DL, SubOps));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// Op is an INSERT_SUBVECTOR-shaped node: operand 0 is the destination
	/// vector, operand 1 the subvector and operand 2 the insertion index.
	/// Returns an empty SDValue when the index is not a constant. Otherwise the
	/// insertion is expressed with KSHIFTL/KSHIFTR, OR and XOR on a (possibly
	/// wider) mask type, and the result is extracted back to Op's type.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue SubVec = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Inserting undef is a nop. We can just return the original vector.
	  if (SubVec.isUndef())
	    return Vec;

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  unsigned NumElems = OpVT.getVectorNumElements();

	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

	  // Extend to natively supported kshift.
	  MVT WideOpVT = OpVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
	    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

	  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
	  // if necessary.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    // May need to promote to a legal type.
	    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                     getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                     SubVec, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  MVT SubVecVT = SubVec.getSimpleValueType();
	  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

	  assert(IdxVal + SubVecNumElems <= NumElems &&
	         IdxVal % SubVecVT.getSizeInBits() == 0 &&
	         "Unexpected index value in INSERT_SUBVECTOR");

	  SDValue Undef = DAG.getUNDEF(WideOpVT);

	  if (IdxVal == 0) {
	    // Zero lower bits of the Vec
	    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
	                      ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    // Merge them together, SubVec should be zero extended.
	    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                         getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                         SubVec, ZeroIdx);
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // From here on IdxVal != 0. Place SubVec in the low bits of the wide type.
	  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                       Undef, SubVec, ZeroIdx);

	  if (Vec.isUndef()) {
	    assert(IdxVal != 0 && "Unexpected index");
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    assert(IdxVal != 0 && "Unexpected index");
	    // Shift counts below are relative to the wide type.
	    NumElems = WideOpVT.getVectorNumElements();
	    unsigned ShiftLeft = NumElems - SubVecNumElems;
	    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(ShiftLeft, dl, MVT::i8));
	    if (ShiftRight != 0)
	      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                           DAG.getConstant(ShiftRight, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  // Simple case when we put subvector in the upper part
	  if (IdxVal + SubVecNumElems == NumElems) {
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    if (SubVecNumElems * 2 == NumElems) {
	      // Special case, use legal zero extending insert_subvector. This allows
	      // isel to optimize when bits are known zero.
	      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                        Vec, ZeroIdx);
	    } else {
	      // Otherwise use explicit shifts to zero the bits.
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        Undef, Vec, ZeroIdx);
	      NumElems = WideOpVT.getVectorNumElements();
	      SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
	      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    }
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // Inserting into the middle is more complicated.

	  NumElems = WideOpVT.getVectorNumElements();

	  // Widen the vector if needed.
	  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	  // Move the current value of the bits to be replaced to the lsbs.
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	                   DAG.getConstant(IdxVal, dl, MVT::i8));
	  // Xor with the new bit.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
	  // Shift to MSB, filling bottom bits with 0.
	  unsigned ShiftLeft = NumElems - SubVecNumElems;
	  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftLeft, dl, MVT::i8));
	  // Shift to the final position, filling upper bits with 0.
	  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftRight, dl, MVT::i8));
	  // Xor with original vector leaving the new value.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
	  // Reduce to original width if needed.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	/// Build a vector of type VT by inserting V1 at element 0 and V2 at element
	/// NumElems / 2 of an undef vector. VectorWidth is forwarded to
	/// insertSubVector as the width in bits of each inserted subvector.
	static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
	                                unsigned NumElems, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned VectorWidth) {
	  SDValue Undef = DAG.getUNDEF(VT);
	  SDValue WithLower =
	      insertSubVector(Undef, V1, /*IdxVal=*/0, DAG, dl, VectorWidth);

	  const unsigned UpperIdx = NumElems / 2;
	  return insertSubVector(WithLower, V2, UpperIdx, DAG, dl, VectorWidth);
	}

	/// Returns a vector of specified type with all bits set.
	/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
	/// Then bitcast to their original type, ensuring they get CSE'd.
	///
	/// VT must be a 128-, 256- or 512-bit vector type.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  APInt Ones = APInt::getAllOnesValue(32);
	  unsigned NumElts = VT.getSizeInBits() / 32;
	  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
	  return DAG.getBitcast(VT, Vec);
	}

	// Map an any/zero/sign extend opcode to its *_EXTEND_VECTOR_INREG
	// counterpart. Opcodes that already are *_EXTEND_VECTOR_INREG map to
	// themselves; any other opcode is unreachable.
	static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
	  if (Opcode == ISD::ANY_EXTEND || Opcode == ISD::ANY_EXTEND_VECTOR_INREG)
	    return ISD::ANY_EXTEND_VECTOR_INREG;

	  if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
	    return ISD::ZERO_EXTEND_VECTOR_INREG;

	  if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
	    return ISD::SIGN_EXTEND_VECTOR_INREG;

	  llvm_unreachable("Unknown opcode");
	}

	/// Build an any/sign/zero extension of the vector In to the vector type VT.
	///
	/// Opcode must be ISD::ANY_EXTEND, ISD::SIGN_EXTEND or ISD::ZERO_EXTEND.
	/// Inputs wider than 128 bits are first narrowed to the low part that feeds
	/// the result. If the (possibly narrowed) input and VT then have different
	/// element counts, the matching *_EXTEND_VECTOR_INREG opcode is used.
	static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
	                              SDValue In, SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
	  assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
	          ISD::ZERO_EXTEND == Opcode) &&
	         "Unknown extension opcode");

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (InVT.getSizeInBits() > 128) {
	    assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
	           "Expected VTs to be the same size!");
	    unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    // Never extract less than 128 bits.
	    In = extractSubVector(In, 0, DAG, DL,
	                          std::max(128U, VT.getSizeInBits() / Scale));
	    InVT = In.getValueType();
	  }

	  if (VT.getVectorNumElements() != InVT.getVectorNumElements())
	    Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);

	  return DAG.getNode(Opcode, DL, VT, In);
	}

	/// Returns a vector_shuffle node for an unpackl operation.
	///
	/// The shuffle mask is produced by createUnpackShuffleMask for the low
	/// (Lo = true), two-input (Unary = false) form.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation.
	///
	/// The shuffle mask is produced by createUnpackShuffleMask for the high
	/// (Lo = false), two-input (Unary = false) form.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Return a vector_shuffle that places the low element of V2 at element Idx
	/// of a vector that is otherwise all zero (IsZero) or undef (!IsZero).
	/// The resulting shuffle mask looks like 4,1,2,3 (Idx=0) or 0,1,2,4 (Idx=3)
	/// for a four-element type.
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  const MVT VT = V2.getSimpleValueType();
	  const SDLoc DL(V2);

	  // First shuffle input: the zero or undef background vector.
	  SDValue V1;
	  if (IsZero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  else
	    V1 = DAG.getUNDEF(VT);

	  // Identity mask, except that position Idx selects the low element of V2
	  // (mask value NumElems).
	  const int NumElems = VT.getVectorNumElements();
	  SmallVector<int, 16> MaskVec;
	  for (int Elt = 0; Elt != NumElems; ++Elt)
	    MaskVec.push_back(Elt == Idx ? NumElems : Elt);

	  return DAG.getVectorShuffle(VT, DL, V1, V2, MaskVec);
	}

	/// Return the constant-pool constant read by Load, or nullptr.
	///
	/// nullptr is returned when Load is null, is not a normal load, or when its
	/// base pointer (after looking through an X86ISD::Wrapper/WrapperRIP) is not
	/// a plain constant-pool entry at offset 0.
	static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
	  if (!Load || !ISD::isNormalLoad(Load))
	    return nullptr;

	  // Look through the wrapper node around the address, if present.
	  SDValue Ptr = Load->getBasePtr();
	  if (Ptr->getOpcode() == X86ISD::Wrapper ||
	      Ptr->getOpcode() == X86ISD::WrapperRIP)
	    Ptr = Ptr->getOperand(0);

	  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
	  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
	    return nullptr;

	  return CNode->getConstVal();
	}

	/// Look through any bitcasts on Op and, if the underlying node is a load,
	/// return the constant-pool constant it reads; otherwise return nullptr.
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  SDValue Src = peekThroughBitcasts(Op);
	  auto *Load = dyn_cast<LoadSDNode>(Src);
	  return getTargetConstantFromNode(Load);
	}

	/// Return the constant-pool constant read by LD, or nullptr if LD does not
	/// read one. LD must not be null.
	const Constant *
	X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
	  assert(LD && "Unexpected null LoadSDNode");
	  const Constant *Cst = getTargetConstantFromNode(LD);
	  return Cst;
	}

	// Extract raw constant bits from constant pools.
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	APInt &UndefElts,
	SmallVectorImpl<APInt> &EltBits,
	bool AllowWholeUndefs = true,
	bool AllowPartialUndefs = true) {
	assert(EltBits.empty() && "Expected an empty EltBits vector");

	Op = peekThroughBitcasts(Op);

	EVT VT = Op.getValueType();
	unsigned SizeInBits = VT.getSizeInBits();
	assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	unsigned NumElts = SizeInBits / EltSizeInBits;

	// Bitcast a source array of element bits to the target size.
	auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	"Constant bit sizes don't match");

	// Don't split if we don't allow undef bits.
	bool AllowUndefs = AllowWholeUndefs \|\| AllowPartialUndefs;
	if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	return false;

	// If we're already the right size, don't bother bitcasting.
	if (NumSrcElts == NumElts) {
	UndefElts = UndefSrcElts;
	EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	return true;
	}

	// Extract all the undef/constant element data and pack into single bitsets.
	APInt UndefBits(SizeInBits, 0);
	APInt MaskBits(SizeInBits, 0);

	for (unsigned i = 0; i != NumSrcElts; ++i) {
	unsigned BitOffset = i * SrcEltSizeInBits;
	if (UndefSrcElts[i])
	UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	MaskBits.insertBits(SrcEltBits[i], BitOffset);
	}

	// Split the undef/constant single bitset data into the target elements.
	UndefElts = APInt(NumElts, 0);
	EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	for (unsigned i = 0; i != NumElts; ++i) {
	unsigned BitOffset = i * EltSizeInBits;
	APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	// Only treat an element as UNDEF if all bits are UNDEF.
	if (UndefEltBits.isAllOnesValue()) {
	if (!AllowWholeUndefs)
	return false;
	UndefElts.setBit(i);
	continue;
	}

	// If only some bits are UNDEF then treat them as zero (or bail if not
	// supported).
	if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	return false;

	EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
	}
	return true;
	};

	// Collect constant bits and insert into mask/undef bit masks.
	auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	unsigned UndefBitIndex) {
	if (!Cst)
	return false;
	if (isa<UndefValue>(Cst)) {
	Undefs.setBit(UndefBitIndex);
	return true;
	}
	if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	Mask = CInt->getValue();
	return true;
	}
	if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	Mask = CFP->getValueAPF().bitcastToAPInt();
	return true;
	}
	return false;
	};

	// Handle UNDEFs.
	if (Op.isUndef()) {
	APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
	SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract scalar constant bits.
	if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	APInt UndefSrcElts = APInt::getNullValue(1);
	SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
	return CastBitData(UndefSrcElts, SrcEltBits);
	}
	if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	APInt UndefSrcElts = APInt::getNullValue(1);
	APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
	SmallVector<APInt, 64> SrcEltBits(1, RawBits);
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from build vector.
	if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	const SDValue &Src = Op.getOperand(i);
	if (Src.isUndef()) {
	UndefSrcElts.setBit(i);
	continue;
	}
	auto *Cst = cast<ConstantSDNode>(Src);
	SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	}
	return CastBitData(UndefSrcElts, SrcEltBits);
	}
	if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
	unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	const SDValue &Src = Op.getOperand(i);
	if (Src.isUndef()) {
	UndefSrcElts.setBit(i);
	continue;
	}
	auto *Cst = cast<ConstantFPSDNode>(Src);
	APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
	SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
	}
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from constant pool vector.
	if (auto *Cst = getTargetConstantFromNode(Op)) {
	Type *CstTy = Cst->getType();
	unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
	if (!CstTy->isVectorTy() \|\| (CstSizeInBits % SizeInBits) != 0)
	return false;

	unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	for (unsigned i = 0; i != NumSrcElts; ++i)
	if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	UndefSrcElts, i))
	return false;

	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Extract constant bits from a broadcasted constant pool scalar.
	if (Op.getOpcode() == X86ISD::VBROADCAST &&
	EltSizeInBits <= VT.getScalarSizeInBits()) {
	if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
	unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
	if (UndefSrcElts[0])
	UndefSrcElts.setBits(0, NumSrcElts);
	SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	return CastBitData(UndefSrcElts, SrcEltBits);
	}
	}
	}

	// Extract constant bits from a subvector broadcast.
	if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
	SmallVector<APInt, 16> SubEltBits;
	if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	UndefElts, SubEltBits, AllowWholeUndefs,
	AllowPartialUndefs)) {
	UndefElts = APInt::getSplat(NumElts, UndefElts);
	while (EltBits.size() < NumElts)
	EltBits.append(SubEltBits.begin(), SubEltBits.end());
	return true;
	}
	}

	// Extract a rematerialized scalar constant insertion.
	if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	APInt UndefSrcElts(NumSrcElts, 0);
	SmallVector<APInt, 64> SrcEltBits;
	auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	return CastBitData(UndefSrcElts, SrcEltBits);
	}

	// Insert constant bits from a base and sub vector sources.
	if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
	isa<ConstantSDNode>(Op.getOperand(2))) {
	// TODO - support insert_subvector through bitcasts.
	if (EltSizeInBits != VT.getScalarSizeInBits())
	return false;

	APInt UndefSubElts;
	SmallVector<APInt, 32> EltSubBits;
	if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
	UndefSubElts, EltSubBits,
	AllowWholeUndefs, AllowPartialUndefs) &&
	getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	UndefElts, EltBits, AllowWholeUndefs,
	AllowPartialUndefs)) {
	unsigned BaseIdx = Op.getConstantOperandVal(2);
	UndefElts.insertBits(UndefSubElts, BaseIdx);
	for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
	EltBits[BaseIdx + i] = EltSubBits[i];
	return true;
	}
	}

	// Extract constant bits from a subvector's source.
	if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	isa<ConstantSDNode>(Op.getOperand(1))) {
	// TODO - support extract_subvector through bitcasts.
	if (EltSizeInBits != VT.getScalarSizeInBits())
	return false;

	if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	UndefElts, EltBits, AllowWholeUndefs,
	AllowPartialUndefs)) {
	EVT SrcVT = Op.getOperand(0).getValueType();
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	unsigned NumSubElts = VT.getVectorNumElements();
	unsigned BaseIdx = Op.getConstantOperandVal(1);
	UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
	if ((BaseIdx + NumSubElts) != NumSrcElts)
	EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
	if (BaseIdx != 0)
	EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
	return true;
	}
	}

	// Extract constant bits from shuffle node sources.
	if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
	// TODO - support shuffle through bitcasts.
	if (EltSizeInBits != VT.getScalarSizeInBits())
	return false;

	ArrayRef<int> Mask = SVN->getMask();
	if ((!AllowWholeUndefs \|\| !AllowPartialUndefs) &&
	llvm::any_of(Mask, [](int M) { return M < 0; }))
	return false;

	APInt UndefElts0, UndefElts1;
	SmallVector<APInt, 32> EltBits0, EltBits1;
	if (isAnyInRange(Mask, 0, NumElts) &&
	!getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
	UndefElts0, EltBits0, AllowWholeUndefs,
	AllowPartialUndefs))
	return false;
	if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
	!getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
	UndefElts1, EltBits1, AllowWholeUndefs,
	AllowPartialUndefs))
	return false;

	UndefElts = APInt::getNullValue(NumElts);
	for (int i = 0; i != (int)NumElts; ++i) {
	int M = Mask[i];
	if (M < 0) {
	UndefElts.setBit(i);
	EltBits.push_back(APInt::getNullValue(EltSizeInBits));
	} else if (M < (int)NumElts) {
	if (UndefElts0[M])
	UndefElts.setBit(i);
	EltBits.push_back(EltBits0[M]);
	} else {
	if (UndefElts1[M - NumElts])
	UndefElts.setBit(i);
	EltBits.push_back(EltBits1[M - NumElts]);
	}
	}
	return true;
	}

	return false;
	}

	/// Returns true when \p Op decodes to constant bits in which every defined
	/// (non-undef) element carries the same value; that value is written to
	/// \p SplatVal. Returns false, leaving \p SplatVal untouched, when the
	/// constant bits cannot be extracted, when two defined elements differ, or
	/// when no element is defined at all.
	static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
	  APInt UndefMask;
	  SmallVector<APInt, 16> Bits;
	  if (!getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
	                                     UndefMask, Bits, true, false))
	    return false;

	  // Remember the most recent defined element; each later defined element
	  // has to match it, so on success they are all equal.
	  int RefIdx = -1;
	  for (int Idx = 0, End = Bits.size(); Idx != End; ++Idx) {
	    if (UndefMask[Idx])
	      continue;
	    if (RefIdx >= 0 && Bits[Idx] != Bits[RefIdx])
	      return false;
	    RefIdx = Idx;
	  }

	  // Every element was undef (or there were none): no splat value to report.
	  if (RefIdx < 0)
	    return false;

	  SplatVal = Bits[RefIdx];
	  return true;
	}

	/// Decode the constant shuffle-mask operand \p MaskNode into raw integer
	/// indices of \p MaskEltSizeInBits bits each.
	///
	/// On success the decoded elements are appended to \p RawMask (existing
	/// contents are kept) and true is returned. \p UndefElts is passed through
	/// to getTargetConstantBitsFromNode, which is called with whole-element
	/// undefs allowed and partial undefs disallowed. Returns false, without
	/// touching \p RawMask, if the constant bits cannot be extracted.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask,
	                                        APInt &UndefElts) {
	  // Extract the raw target constant bits.
	  SmallVector<APInt, 64> EltBits;
	  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
	                                     EltBits, /* AllowWholeUndefs */ true,
	                                     /* AllowPartialUndefs */ false))
	    return false;

	  // Insert the extracted elements into the mask. Bind by const reference:
	  // the elements are only read, so copying each APInt is wasted work.
	  for (const APInt &Elt : EltBits)
	    RawMask.push_back(Elt.getZExtValue());

	  return true;
	}

	/// Build the shuffle mask equivalent to a PACKSS/PACKUS truncation of type
	/// \p VT and append it to \p Mask, which must be empty on entry.
	/// For each 128-bit lane the even-numbered elements of the first source are
	/// emitted, followed by the even-numbered elements of the second source.
	/// When \p Unary is set the second half refers back to the first source.
	/// Note: This ignores saturation, so inputs must be checked first.
	static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                  bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const unsigned TotalElts = VT.getVectorNumElements();
	  const unsigned LaneCount = VT.getSizeInBits() / 128;
	  const unsigned EltsPerLane = 128 / VT.getScalarSizeInBits();
	  // Indices into the second source start after all first-source elements,
	  // unless both halves come from the same input.
	  const unsigned SecondSrcBase = Unary ? 0 : TotalElts;

	  // Appends every second element index of one lane, starting at Base.
	  auto AppendEvenElts = [&Mask, EltsPerLane](unsigned Base) {
	    for (unsigned Elt = 0; Elt != EltsPerLane; Elt += 2)
	      Mask.push_back(Elt + Base);
	  };

	  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
	    const unsigned LaneBase = Lane * EltsPerLane;
	    AppendEvenElts(LaneBase);
	    AppendEvenElts(LaneBase + SecondSrcBase);
	  }
	}

	// Distribute the demanded elements of a PACKSS/PACKUS result between its two
	// operands. DemandedElts has one bit per result element; DemandedLHS and
	// DemandedRHS are overwritten with masks of half that width. Within each
	// 128-bit lane the low half of the result elements maps onto the LHS operand
	// and the high half onto the RHS operand.
	static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
	                                APInt &DemandedLHS, APInt &DemandedRHS) {
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int OuterElts = DemandedElts.getBitWidth();
	  const int InnerElts = OuterElts / 2;
	  const int OuterPerLane = OuterElts / LaneCount;
	  const int InnerPerLane = InnerElts / LaneCount;

	  DemandedLHS = APInt::getNullValue(InnerElts);
	  DemandedRHS = APInt::getNullValue(InnerElts);

	  // Walk lane by lane, translating result positions to operand positions.
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int OuterBase = Lane * OuterPerLane;
	    const int InnerBase = Lane * InnerPerLane;
	    for (int Elt = 0; Elt != InnerPerLane; ++Elt) {
	      const int LoIdx = OuterBase + Elt;
	      const int HiIdx = LoIdx + InnerPerLane;
	      const int OperandIdx = InnerBase + Elt;
	      if (DemandedElts[LoIdx])
	        DemandedLHS.setBit(OperandIdx);
	      if (DemandedElts[HiIdx])
	        DemandedRHS.setBit(OperandIdx);
	    }
	  }
	}

	// Distribute the demanded elements of a HADD/HSUB result between its two
	// operands. DemandedLHS and DemandedRHS are overwritten with masks of the
	// same width as DemandedElts. Each demanded result element requires a pair
	// of adjacent operand elements: within a 128-bit lane the low half of the
	// result draws its pairs from the LHS operand, the high half from the RHS.
	static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
	                                 APInt &DemandedLHS, APInt &DemandedRHS) {
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int EltCount = DemandedElts.getBitWidth();
	  const int PerLane = EltCount / LaneCount;
	  const int HalfLane = PerLane / 2;

	  DemandedLHS = APInt::getNullValue(EltCount);
	  DemandedRHS = APInt::getNullValue(EltCount);

	  for (int Idx = 0; Idx != EltCount; ++Idx) {
	    if (!DemandedElts[Idx])
	      continue;

	    const int InLane = Idx % PerLane;
	    const int LaneBase = (Idx / PerLane) * PerLane;
	    const bool UsesLHS = InLane < HalfLane;

	    // Select the operand and the position of the pair inside the lane.
	    APInt &Target = UsesLHS ? DemandedLHS : DemandedRHS;
	    const int PairIdx = UsesLHS ? InLane : InLane - HalfLane;
	    const int FirstBit = LaneBase + 2 * PairIdx;
	    Target.setBit(FirstBit + 0);
	    Target.setBit(FirstBit + 1);
	  }
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	///
	/// Returns false if the decoded mask is empty, if a variable mask operand
	/// could not be read as constant bits, or if \p AllowSentinelZero is false and
	/// the decoded mask contains SM_SentinelZero. Note that on some failing paths
	/// (e.g. VPERMV, VPERMV3, PALIGNR) \p Ops has already been appended to when
	/// false is returned. Opcodes not listed in the switch hit llvm_unreachable.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	SmallVectorImpl<SDValue> &Ops,
	SmallVectorImpl<int> &Mask, bool &IsUnary) {
	unsigned NumElems = VT.getVectorNumElements();
	unsigned MaskEltSize = VT.getScalarSizeInBits();
	// Scratch storage for opcodes whose mask is a (constant) vector operand.
	SmallVector<uint64_t, 32> RawMask;
	APInt RawUndefs;
	SDValue ImmN;

	assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	IsUnary = false;
	// Set when a two-operand shuffle is given the same node for both operands;
	// the mask is re-mapped onto the first input after decoding (see below).
	bool IsFakeUnary = false;
	switch (N->getOpcode()) {
	case X86ISD::BLENDI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	// The immediate control is the last operand.
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::SHUFP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodeSHUFPMask(NumElems, MaskEltSize,
	cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::INSERTPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::EXTRQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	// Only decodable when both the length and index operands are constants;
	// otherwise Mask stays empty and the empty-mask check below fails.
	if (isa<ConstantSDNode>(N->getOperand(1)) &&
	isa<ConstantSDNode>(N->getOperand(2))) {
	int BitLen = N->getConstantOperandVal(1);
	int BitIdx = N->getConstantOperandVal(2);
	DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
	IsUnary = true;
	}
	break;
	case X86ISD::INSERTQI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	// As for EXTRQI, a non-constant length/index leaves Mask empty.
	if (isa<ConstantSDNode>(N->getOperand(2)) &&
	isa<ConstantSDNode>(N->getOperand(3))) {
	int BitLen = N->getConstantOperandVal(2);
	int BitIdx = N->getConstantOperandVal(3);
	DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	}
	break;
	case X86ISD::UNPCKH:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::UNPCKL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVHLPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVHLPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVLHPS:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	DecodeMOVLHPSMask(NumElems, Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::PALIGNR:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// Note: the operands are recorded in swapped order (operand 1 first),
	// bypassing the default operand handling at the end of this function.
	Ops.push_back(N->getOperand(1));
	Ops.push_back(N->getOperand(0));
	break;
	case X86ISD::VSHLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = true;
	break;
	case X86ISD::VSRLDQ:
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFD:
	case X86ISD::VPERMILPI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSHUFMask(NumElems, MaskEltSize,
	cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFHW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = true;
	break;
	case X86ISD::PSHUFLW:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = true;
	break;
	case X86ISD::VZEXT_MOVL:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeZeroMoveLowMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::VBROADCAST: {
	SDValue N0 = N->getOperand(0);
	// See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
	// add the pre-extracted value to the Ops vector.
	if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N0.getOperand(0).getValueType() == VT &&
	N0.getConstantOperandVal(1) == 0)
	Ops.push_back(N0.getOperand(0));

	// We only decode broadcasts of same-sized vectors, unless the broadcast
	// came from an extract from the original width. If we found one, we
	// pushed it to the Ops vector above.
	if (N0.getValueType() == VT \|\| !Ops.empty()) {
	DecodeVectorBroadcast(NumElems, Mask);
	IsUnary = true;
	break;
	}
	return false;
	}
	case X86ISD::VPERMILPV: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// The mask is a vector operand; it must decode to constant bits.
	SDValue MaskNode = N->getOperand(1);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::PSHUFB: {
	assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	SDValue MaskNode = N->getOperand(1);
	// The mask is read as 8-bit elements.
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
	DecodePSHUFBMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMI:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	// IsUnary is left false here, so both operands are added to Ops below.
	DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
	break;
	case X86ISD::VPERM2X128:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
	Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::SHUF128:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	ImmN = N->getOperand(N->getNumOperands() - 1);
	decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
	cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	break;
	case X86ISD::MOVSLDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSLDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVSHDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVSHDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::MOVDDUP:
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	DecodeMOVDDUPMask(NumElems, Mask);
	IsUnary = true;
	break;
	case X86ISD::VPERMIL2: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	// Operand 2 is the vector mask, operand 3 the control immediate; both
	// must be constant for the shuffle to be decodable.
	SDValue MaskNode = N->getOperand(2);
	SDValue CtrlNode = N->getOperand(3);
	if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	unsigned CtrlImm = CtrlOp->getZExtValue();
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
	Mask);
	break;
	}
	}
	return false;
	}
	case X86ISD::VPPERM: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	SDValue MaskNode = N->getOperand(2);
	// The mask is read as 8-bit elements.
	if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
	DecodeVPPERMMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV: {
	assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	IsUnary = true;
	// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	// The source is pushed before the mask is decoded, so Ops is non-empty
	// even if this case returns false.
	Ops.push_back(N->getOperand(1));
	SDValue MaskNode = N->getOperand(0);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMVMask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	case X86ISD::VPERMV3: {
	assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
	// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	// Both sources are pushed before the mask is decoded, so Ops is non-empty
	// even if this case returns false.
	Ops.push_back(N->getOperand(0));
	Ops.push_back(N->getOperand(2));
	SDValue MaskNode = N->getOperand(1);
	if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
	RawUndefs)) {
	DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
	break;
	}
	return false;
	}
	default: llvm_unreachable("unknown target shuffle node");
	}

	// Empty mask indicates the decode failed.
	if (Mask.empty())
	return false;

	// Check if we're getting a shuffle mask with zero'd elements.
	if (!AllowSentinelZero)
	if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
	return false;

	// If we have a fake unary shuffle, the shuffle mask is spread across two
	// inputs that are actually the same node. Re-map the mask to always point
	// into the first input.
	if (IsFakeUnary)
	for (int &M : Mask)
	if (M >= (int)Mask.size())
	M -= Mask.size();

	// If we didn't already add operands in the opcode-specific code, default to
	// adding 1 or 2 operands starting at 0.
	// A fake unary shuffle still records both (identical) operands.
	if (Ops.empty()) {
	Ops.push_back(N->getOperand(0));
	if (!IsUnary \|\| IsFakeUnary)
	Ops.push_back(N->getOperand(1));
	}

	return true;
	}

	/// Decode the target shuffle \p N into \p Mask / \p Ops and then refine the
	/// mask using what is known about the shuffle inputs: elements that read
	/// from an undef input become SM_SentinelUndef, and elements that are known
	/// to be zero (not just zeroable) from their inputs become SM_SentinelZero.
	/// Returns true if the target shuffle mask was decoded.
	static bool setTargetShuffleZeroElements(SDValue N,
	                                         SmallVectorImpl<int> &Mask,
	                                         SmallVectorImpl<SDValue> &Ops) {
	  if (!isTargetShuffle(N.getOpcode()))
	    return false;

	  bool IsUnary;
	  MVT VT = N.getSimpleValueType();
	  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	    return false;

	  // Look through bitcasts on both inputs; a unary shuffle reuses input 0.
	  SDValue V1 = peekThroughBitcasts(Ops[0]);
	  SDValue V2 = peekThroughBitcasts(IsUnary ? Ops[0] : Ops[1]);

	  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
	         "Illegal split of shuffle value type");
	  const unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

	  // Extract known constant input data.
	  APInt UndefSrcElts[2];
	  SmallVector<APInt, 32> SrcEltBits[2];
	  bool IsSrcConstant[2];
	  IsSrcConstant[0] = getTargetConstantBitsFromNode(
	      V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false);
	  IsSrcConstant[1] = getTargetConstantBitsFromNode(
	      V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false);

	  const int Size = Mask.size();
	  for (int &MaskElt : Mask) {
	    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
	    if (MaskElt < 0)
	      continue;

	    // Determine shuffle input and the element index within that input.
	    const unsigned SrcIdx = MaskElt / Size;
	    const SDValue &Src = MaskElt < Size ? V1 : V2;
	    const int EltIdx = MaskElt % Size;

	    // We are referencing an UNDEF input.
	    if (Src.isUndef()) {
	      MaskElt = SM_SentinelUndef;
	      continue;
	    }

	    // SCALAR_TO_VECTOR - only the first element is defined, and the rest
	    // UNDEF.
	    // TODO: We currently only set UNDEF for integer types - floats use the
	    // same registers as vectors and many of the scalar folded loads rely on
	    // the SCALAR_TO_VECTOR pattern.
	    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	      const unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
	      if ((Size % NumSrcElts) == 0) {
	        const int Scale = Size / NumSrcElts;
	        const int ScaledIdx = EltIdx / Scale;
	        if (ScaledIdx != 0 && !VT.isFloatingPoint())
	          MaskElt = SM_SentinelUndef;
	        else if (ScaledIdx == 0 && X86::isZeroNode(Src.getOperand(0)))
	          MaskElt = SM_SentinelZero;
	        continue;
	      }
	    }

	    // Attempt to extract from the source's constant bits.
	    if (!IsSrcConstant[SrcIdx])
	      continue;
	    if (UndefSrcElts[SrcIdx][EltIdx])
	      MaskElt = SM_SentinelUndef;
	    else if (SrcEltBits[SrcIdx][EltIdx] == 0)
	      MaskElt = SM_SentinelZero;
	  }

	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Different mask size from vector size!");
	  return true;
	}

	// Forward declaration (for getFauxShuffleMask recursive check).
	// getFauxShuffleMask calls this function before its definition appears in
	// the file, so the prototype has to be visible here.
	static bool resolveTargetShuffleInputs(SDValue Op,
	SmallVectorImpl<SDValue> &Inputs,
	SmallVectorImpl<int> &Mask,
	SelectionDAG &DAG);

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
	SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops,
	SelectionDAG &DAG) {
	Mask.clear();
	Ops.clear();

	MVT VT = N.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumSizeInBits = VT.getSizeInBits();
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	if ((NumBitsPerElt % 8) != 0 \|\| (NumSizeInBits % 8) != 0)
	return false;
	assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");

	unsigned Opcode = N.getOpcode();
	switch (Opcode) {
	case ISD::VECTOR_SHUFFLE: {
	// Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
	ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
	if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
	Mask.append(ShuffleMask.begin(), ShuffleMask.end());
	Ops.push_back(N.getOperand(0));
	Ops.push_back(N.getOperand(1));
	return true;
	}
	return false;
	}
	case ISD::AND:
	case X86ISD::ANDNP: {
	// Attempt to decode as a per-byte mask.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	bool IsAndN = (X86ISD::ANDNP == Opcode);
	uint64_t ZeroMask = IsAndN ? 255 : 0;
	if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	return false;
	for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	if (UndefElts[i]) {
	Mask.push_back(SM_SentinelUndef);
	continue;
	}
	uint64_t ByteBits = EltBits[i].getZExtValue();
	if (ByteBits != 0 && ByteBits != 255)
	return false;
	Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	}
	Ops.push_back(IsAndN ? N1 : N0);
	return true;
	}
	case ISD::OR: {
	// Inspect each operand at the byte level. We can merge these into a
	// blend shuffle mask if for each byte at least one is masked out (zero).
	KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
	KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
	if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
	bool IsByteMask = true;
	unsigned NumSizeInBytes = NumSizeInBits / 8;
	unsigned NumBytesPerElt = NumBitsPerElt / 8;
	APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
	APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
	for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
	unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
	unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
	if (LHS == 255 && RHS == 0)
	SelectMask.setBit(i);
	else if (LHS == 255 && RHS == 255)
	ZeroMask.setBit(i);
	else if (!(LHS == 0 && RHS == 255))
	IsByteMask = false;
	}
	if (IsByteMask) {
	for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
	for (unsigned j = 0; j != NumBytesPerElt; ++j) {
	unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
	int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
	Mask.push_back(Idx);
	}
	}
	Ops.push_back(N.getOperand(0));
	Ops.push_back(N.getOperand(1));
	return true;
	}
	}

	// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
	// is a valid shuffle index.
	SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
	SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
	if (!N0.getValueType().isVector() \|\| !N1.getValueType().isVector())
	return false;
	SmallVector<int, 64> SrcMask0, SrcMask1;
	SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
	if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) \|\|
	!resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
	return false;
	int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
	SmallVector<int, 64> Mask0, Mask1;
	scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
	scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
	for (int i = 0; i != MaskSize; ++i) {
	if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
	Mask.push_back(SM_SentinelUndef);
	else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
	Mask.push_back(SM_SentinelZero);
	else if (Mask1[i] == SM_SentinelZero)
	Mask.push_back(Mask0[i]);
	else if (Mask0[i] == SM_SentinelZero)
	Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
	else
	return false;
	}
	for (SDValue &Op : SrcInputs0)
	Ops.push_back(Op);
	for (SDValue &Op : SrcInputs1)
	Ops.push_back(Op);
	return true;
	}
	case ISD::INSERT_SUBVECTOR: {
	SDValue Src = N.getOperand(0);
	SDValue Sub = N.getOperand(1);
	EVT SubVT = Sub.getValueType();
	unsigned NumSubElts = SubVT.getVectorNumElements();
	if (!isa<ConstantSDNode>(N.getOperand(2)) \|\|
	!N->isOnlyUserOf(Sub.getNode()))
	return false;
	uint64_t InsertIdx = N.getConstantOperandVal(2);
	// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
	if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	Sub.getOperand(0).getValueType() == VT &&
	isa<ConstantSDNode>(Sub.getOperand(1))) {
	uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
	for (int i = 0; i != (int)NumElts; ++i)
	Mask.push_back(i);
	for (int i = 0; i != (int)NumSubElts; ++i)
	Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
	Ops.push_back(Src);
	Ops.push_back(Sub.getOperand(0));
	return true;
	}
	// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
	SmallVector<int, 64> SubMask;
	SmallVector<SDValue, 2> SubInputs;
	if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
	SubMask, DAG))
	return false;
	if (SubMask.size() != NumSubElts) {
	assert(((SubMask.size() % NumSubElts) == 0 \|\|
	(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
	if ((NumSubElts % SubMask.size()) == 0) {
	int Scale = NumSubElts / SubMask.size();
	SmallVector<int,64> ScaledSubMask;
	scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
	SubMask = ScaledSubMask;
	} else {
	int Scale = SubMask.size() / NumSubElts;
	NumSubElts = SubMask.size();
	NumElts *= Scale;
	InsertIdx *= Scale;
	}
	}
	Ops.push_back(Src);
	for (SDValue &SubInput : SubInputs) {
	EVT SubSVT = SubInput.getValueType().getScalarType();
	EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
	NumSizeInBits / SubSVT.getSizeInBits());
	Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
	DAG.getUNDEF(AltVT), SubInput,
	DAG.getIntPtrConstant(0, SDLoc(N))));
	}
	for (int i = 0; i != (int)NumElts; ++i)
	Mask.push_back(i);
	for (int i = 0; i != (int)NumSubElts; ++i) {
	int M = SubMask[i];
	if (0 <= M) {
	int InputIdx = M / NumSubElts;
	M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
	}
	Mask[i + InsertIdx] = M;
	}
	return true;
	}
	case ISD::SCALAR_TO_VECTOR: {
	// Match against a scalar_to_vector of an extract from a vector,
	// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
	SDValue N0 = N.getOperand(0);
	SDValue SrcExtract;

	if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	N0.getOperand(0).getValueType() == VT) \|\|
	(N0.getOpcode() == X86ISD::PEXTRW &&
	N0.getOperand(0).getValueType() == MVT::v8i16) \|\|
	(N0.getOpcode() == X86ISD::PEXTRB &&
	N0.getOperand(0).getValueType() == MVT::v16i8)) {
	SrcExtract = N0;
	}

	if (!SrcExtract \|\| !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	return false;

	SDValue SrcVec = SrcExtract.getOperand(0);
	EVT SrcVT = SrcVec.getValueType();
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

	unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	if (NumSrcElts <= SrcIdx)
	return false;

	Ops.push_back(SrcVec);
	Mask.push_back(SrcIdx);
	Mask.append(NumZeros, SM_SentinelZero);
	Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
	return true;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: {
	SDValue InVec = N.getOperand(0);
	SDValue InScl = N.getOperand(1);
	SDValue InIndex = N.getOperand(2);
	if (!isa<ConstantSDNode>(InIndex) \|\|
	cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
	return false;
	uint64_t InIdx = N.getConstantOperandVal(2);

	// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
	if (X86::isZeroNode(InScl)) {
	Ops.push_back(InVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
	return true;
	}

	// Attempt to recognise a PINSR(PEXTR) shuffle pattern.
	// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
	unsigned ExOp =
	(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
	if (InScl.getOpcode() != ExOp)
	return false;

	SDValue ExVec = InScl.getOperand(0);
	SDValue ExIndex = InScl.getOperand(1);
	if (!isa<ConstantSDNode>(ExIndex) \|\|
	cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
	return false;
	uint64_t ExIdx = InScl.getConstantOperandVal(1);

	Ops.push_back(InVec);
	Ops.push_back(ExVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
	return true;
	}
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
	N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
	"Unexpected input value type");

	APInt EltsLHS, EltsRHS;
	getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

	// If we know input saturation won't happen we can treat this
	// as a truncation shuffle.
	if (Opcode == X86ISD::PACKSS) {
	if ((!N0.isUndef() &&
	DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) \|\|
	(!N1.isUndef() &&
	DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
	return false;
	} else {
	APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
	if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) \|\|
	(!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
	return false;
	}

	bool IsUnary = (N0 == N1);

	Ops.push_back(N0);
	if (!IsUnary)
	Ops.push_back(N1);

	createPackShuffleMask(VT, Mask, IsUnary);
	return true;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	uint64_t ShiftVal = N.getConstantOperandVal(1);
	// Out of range bit shifts are guaranteed to be zero.
	if (NumBitsPerElt <= ShiftVal) {
	Mask.append(NumElts, SM_SentinelZero);
	return true;
	}

	// We can only decode 'whole byte' bit shifts as shuffles.
	if ((ShiftVal % 8) != 0)
	break;

	uint64_t ByteShift = ShiftVal / 8;
	unsigned NumBytes = NumSizeInBits / 8;
	unsigned NumBytesPerElt = NumBitsPerElt / 8;
	Ops.push_back(N.getOperand(0));

	// Clear mask to all zeros and insert the shifted byte indices.
	Mask.append(NumBytes, SM_SentinelZero);

	if (X86ISD::VSHLI == Opcode) {
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j] = i + j - ByteShift;
	} else {
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j - ByteShift] = i + j;
	}
	return true;
	}
	case X86ISD::VBROADCAST: {
	SDValue Src = N.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	if (!SrcVT.isVector())
	return false;

	if (NumSizeInBits != SrcVT.getSizeInBits()) {
	assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
	"Illegal broadcast type");
	SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
	NumSizeInBits / SrcVT.getScalarSizeInBits());
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
	DAG.getUNDEF(SrcVT), Src,
	DAG.getIntPtrConstant(0, SDLoc(N)));
	}

	Ops.push_back(Src);
	Mask.append(NumElts, 0);
	return true;
	}
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::ANY_EXTEND_VECTOR_INREG: {
	SDValue Src = N.getOperand(0);
	EVT SrcVT = Src.getValueType();

	// Extended source must be a simple vector.
	if (!SrcVT.isSimple() \|\| (SrcVT.getSizeInBits() % 128) != 0 \|\|
	(SrcVT.getScalarSizeInBits() % 8) != 0)
	return false;

	unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
	bool IsAnyExtend =
	(ISD::ANY_EXTEND == Opcode \|\| ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
	DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
	Mask);

	if (NumSizeInBits != SrcVT.getSizeInBits()) {
	assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
	"Illegal zero-extension type");
	SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
	NumSizeInBits / NumSrcBitsPerElt);
	Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
	DAG.getUNDEF(SrcVT), Src,
	DAG.getIntPtrConstant(0, SDLoc(N)));
	}

	Ops.push_back(Src);
	return true;
	}
	}

	return false;
	}

	/// Compacts a decoded shuffle: inputs that no mask element selects are
	/// dropped, inputs that occur more than once are folded into their first
	/// occurrence, and the mask indices are renumbered to match the new input
	/// list. Mask elements that select an UNDEF input become SM_SentinelUndef.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int MaskWidth = Mask.size();
	  SmallVector<SDValue, 16> KeptInputs;

	  for (const SDValue &Input : Inputs) {
	    // Mask indices in [WindowBegin, WindowEnd) currently select this input.
	    // The window is based on the number of inputs kept so far because the
	    // mask has already been renumbered for every input handled before.
	    const int WindowBegin = KeptInputs.size() * MaskWidth;
	    const int WindowEnd = WindowBegin + MaskWidth;
	    auto SelectsInput = [WindowBegin, WindowEnd](int M) {
	      return (WindowBegin <= M) && (M < WindowEnd);
	    };

	    // References to an UNDEF input are simply undef elements.
	    if (Input.isUndef())
	      for (int &M : Mask)
	        if (SelectsInput(M))
	          M = SM_SentinelUndef;

	    // Nothing selects this input: drop it and slide every later index down.
	    if (none_of(Mask, SelectsInput)) {
	      for (int &M : Mask)
	        if (M >= WindowBegin)
	          M -= MaskWidth;
	      continue;
	    }

	    // Look for the first already-kept input identical to this one.
	    int DuplicateOf = -1;
	    for (int j = 0, NumKept = KeptInputs.size();
	         j != NumKept && DuplicateOf < 0; ++j)
	      if (KeptInputs[j] == Input)
	        DuplicateOf = j;

	    // A genuinely new input keeps its window as-is.
	    if (DuplicateOf < 0) {
	      KeptInputs.push_back(Input);
	      continue;
	    }

	    // Repeated input: redirect its indices to the earlier copy and slide
	    // every later index down by one input width.
	    for (int &M : Mask) {
	      if (M < WindowBegin)
	        continue;
	      if (M < WindowEnd)
	        M = (M - WindowBegin) + (DuplicateOf * MaskWidth);
	      else
	        M -= MaskWidth;
	    }
	  }

	  Inputs = KeptInputs;
	}

	/// Decodes \p Op into a shuffle mask and its source inputs.
	/// setTargetShuffleZeroElements is tried first; if it fails,
	/// getFauxShuffleMask is tried with every element of \p Op demanded. On
	/// success the inputs and mask are passed through
	/// resolveTargetShuffleInputsAndMask so that unused and repeated inputs are
	/// removed (which may leave a unary shuffle).
	/// Returns true if the target shuffle mask was decoded.
	static bool resolveTargetShuffleInputs(SDValue Op,
	                                       SmallVectorImpl<SDValue> &Inputs,
	                                       SmallVectorImpl<int> &Mask,
	                                       SelectionDAG &DAG) {
	  unsigned EltCount = Op.getValueType().getVectorNumElements();
	  APInt AllElts = APInt::getAllOnesValue(EltCount);

	  // The faux decode is only attempted when the real decode fails.
	  const bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs) ||
	                       getFauxShuffleMask(Op, AllElts, Mask, Inputs, DAG);
	  if (!Decoded)
	    return false;

	  resolveTargetShuffleInputsAndMask(Inputs, Mask);
	  return true;
	}

	/// Finds the scalar that ends up in element \p Index of the vector produced
	/// by \p N, looking through generic and target shuffles, constant-index
	/// insert/extract_subvector nodes and element-count-preserving bitcasts.
	/// \p Depth is the current recursion depth; the search stops at depth 6.
	/// Returns an empty SDValue when the scalar cannot be determined.
	static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
	                                   unsigned Depth) {
	  // Limit search depth.
	  if (Depth == 6)
	    return SDValue();

	  SDValue Cur(N, 0);
	  EVT CurVT = Cur.getValueType();
	  const unsigned Opc = Cur.getOpcode();

	  // ISD::VECTOR_SHUFFLE: follow the mask into the selected operand.
	  if (const auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) {
	    const int MaskElt = Shuf->getMaskElt(Index);
	    if (MaskElt < 0)
	      return DAG.getUNDEF(CurVT.getVectorElementType());

	    const unsigned Width = CurVT.getVectorNumElements();
	    const unsigned OperandNo = (MaskElt < (int)Width) ? 0 : 1;
	    SDValue Picked = Shuf->getOperand(OperandNo);
	    return getShuffleScalarElt(Picked.getNode(), MaskElt % Width, DAG,
	                               Depth + 1);
	  }

	  // Target specific shuffles: decode the mask, then follow it.
	  if (isTargetShuffle(Opc)) {
	    MVT ShufVT = Cur.getSimpleValueType();
	    MVT ShufEltVT = ShufVT.getVectorElementType();
	    const int Width = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> DecodedMask;
	    SmallVector<SDValue, 16> DecodedOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(N, ShufVT, true, DecodedOps, DecodedMask,
	                              IsUnary))
	      return SDValue();

	    const int MaskElt = DecodedMask[Index];
	    if (MaskElt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufEltVT);
	    if (MaskElt == SM_SentinelZero) {
	      if (ShufEltVT.isInteger())
	        return DAG.getConstant(0, SDLoc(N), ShufEltVT);
	      return DAG.getConstantFP(+0.0, SDLoc(N), ShufEltVT);
	    }

	    assert(0 <= MaskElt && MaskElt < (2 * Width) &&
	           "Shuffle index out of range");
	    SDValue Picked = (MaskElt < Width) ? DecodedOps[0] : DecodedOps[1];
	    return getShuffleScalarElt(Picked.getNode(), MaskElt % Width, DAG,
	                               Depth + 1);
	  }

	  // insert_subvector with a constant index: the element comes either from
	  // the inserted subvector or from the base vector.
	  if (Opc == ISD::INSERT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(2))) {
	    SDValue Base = N->getOperand(0);
	    SDValue Sub = N->getOperand(1);
	    EVT SubVT = Sub.getValueType();
	    const unsigned SubWidth = SubVT.getVectorNumElements();
	    const uint64_t InsertAt = N->getConstantOperandVal(2);

	    const bool InsideSub = InsertAt <= Index && Index < (InsertAt + SubWidth);
	    if (!InsideSub)
	      return getShuffleScalarElt(Base.getNode(), Index, DAG, Depth + 1);
	    return getShuffleScalarElt(Sub.getNode(), Index - InsertAt, DAG,
	                               Depth + 1);
	  }

	  // extract_subvector with a constant index: offset into the source vector.
	  if (Opc == ISD::EXTRACT_SUBVECTOR && isa<ConstantSDNode>(N->getOperand(1))) {
	    SDValue Wide = N->getOperand(0);
	    const uint64_t ExtractAt = N->getConstantOperandVal(1);
	    return getShuffleScalarElt(Wide.getNode(), Index + ExtractAt, DAG,
	                               Depth + 1);
	  }

	  // Look through a bitcast, but only when source and result have the same
	  // number of elements so that Index still names the same lane.
	  if (Opc == ISD::BITCAST) {
	    Cur = Cur.getOperand(0);
	    EVT SrcVT = Cur.getValueType();
	    const unsigned Width = CurVT.getVectorNumElements();

	    const bool SameShape =
	        SrcVT.isVector() && SrcVT.getVectorNumElements() == Width;
	    if (!SameShape)
	      return SDValue();
	  }

	  // Actual nodes that may contain scalar elements.
	  switch (Cur.getOpcode()) {
	  case ISD::SCALAR_TO_VECTOR:
	    // Only lane 0 is defined by scalar_to_vector.
	    if (Index == 0)
	      return Cur.getOperand(0);
	    return DAG.getUNDEF(CurVT.getVectorElementType());
	  case ISD::BUILD_VECTOR:
	    return Cur.getOperand(Index);
	  default:
	    return SDValue();
	  }
	}

	// Lower a build vector as a chain of INSERT_VECTOR_ELT nodes (intended for
	// PINSRB/PINSRW/PINSRD), one per element whose bit is set in NonZeros.
	// NumNonZero is not used here. Returns an empty SDValue if NonZeros selects
	// no element.
	static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
	                                        unsigned NumNonZero, unsigned NumZero,
	                                        SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  const unsigned NumElts = VT.getVectorNumElements();
	  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
	         "Illegal vector insertion");

	  SDLoc dl(Op);
	  SDValue Result;
	  bool Started = false;

	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    // Skip elements that are not marked non-zero.
	    if (((NonZeros >> Idx) & 1u) == 0)
	      continue;

	    if (!Started) {
	      Started = true;
	      // With no zero elements and the first insertion at index 0, seed the
	      // result with SCALAR_TO_VECTOR. Otherwise start from a zero vector to
	      // break any register dependency.
	      const bool SeedFromScalar = !NumZero && Idx == 0;
	      if (SeedFromScalar) {
	        assert(0 == Idx && "Expected insertion into zero-index");
	        SDValue Wide = DAG.getAnyExtOrTrunc(Op.getOperand(Idx), dl, MVT::i32);
	        Wide = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Wide);
	        Result = DAG.getBitcast(VT, Wide);
	        continue;
	      }
	      Result = getZeroVector(VT, Subtarget, DAG, dl);
	    }

	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                         Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, dl));
	  }

	  return Result;
	}

	/// Custom lower build_vector of v16i8.
	/// With SSE4.1 every non-zero byte is inserted directly through
	/// LowerBuildVectorAsInsert. Without it, adjacent byte pairs are merged into
	/// one 16-bit value and inserted into a v8i16 vector; that path is refused
	/// (empty SDValue) when more than 8 bytes are non-zero.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // SSE4.1 - use PINSRB to insert each byte directly.
	  if (Subtarget.hasSSE41())
	    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                    Subtarget);

	  if (NumNonZero > 8)
	    return SDValue();

	  SDLoc dl(Op);
	  SDValue V;

	  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	  for (unsigned Word = 0; Word != 8; ++Word) {
	    const unsigned LoByte = 2 * Word;
	    const unsigned HiByte = LoByte + 1;
	    const bool HasLo = ((NonZeros >> LoByte) & 1u) != 0;
	    const bool HasHi = ((NonZeros >> HiByte) & 1u) != 0;
	    if (!(HasLo || HasHi))
	      continue;

	    // FIXME: Investigate combining the first 4 bytes as a i32 instead.
	    SDValue Packed;
	    if (HasLo) {
	      // Zero-extend when the upper bits can end up in the result.
	      Packed = (NumZero || HasHi)
	                   ? DAG.getZExtOrTrunc(Op.getOperand(LoByte), dl, MVT::i32)
	                   : DAG.getAnyExtOrTrunc(Op.getOperand(LoByte), dl, MVT::i32);
	    }

	    if (HasHi) {
	      SDValue HiElt = Op.getOperand(HiByte);
	      HiElt = (Word == 0 && NumZero)
	                  ? DAG.getZExtOrTrunc(HiElt, dl, MVT::i32)
	                  : DAG.getAnyExtOrTrunc(HiElt, dl, MVT::i32);
	      // Move the odd byte into bits 15:8 of the pair.
	      HiElt = DAG.getNode(ISD::SHL, dl, MVT::i32, HiElt,
	                          DAG.getConstant(8, dl, MVT::i8));
	      Packed = HasLo ? DAG.getNode(ISD::OR, dl, MVT::i32, HiElt, Packed)
	                     : HiElt;
	    }

	    // If our first insertion is not the first index then insert into zero
	    // vector to break any register dependency else use SCALAR_TO_VECTOR.
	    if (!V) {
	      if (Word == 0) {
	        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Packed);
	        V = DAG.getBitcast(MVT::v8i16, V);
	        continue;
	      }
	      V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	    }

	    SDValue Narrow = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Packed);
	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Narrow,
	                    DAG.getIntPtrConstant(Word, dl));
	  }

	  return DAG.getBitcast(MVT::v16i8, V);
	}

	/// Custom lower build_vector of v8i16.
	/// Without SSE4.1 the lowering is refused (empty SDValue) when more than 4
	/// elements are non-zero; otherwise it is delegated to
	/// LowerBuildVectorAsInsert.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  const bool TooManyInserts = NumNonZero > 4 && !Subtarget.hasSSE41();
	  if (TooManyInserts)
	    return SDValue();

	  // Use PINSRW to insert each 16-bit element directly.
	  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                  Subtarget);
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Three strategies are tried in order:
	///  1. A splat of a pair of distinct elements (<a, b, a, b>) is emitted as
	///     MOVDDUP when the subtarget has SSE3 and no XOP.
	///  2. A build_vector whose non-zeroable elements are all "element i of the
	///     same source vector, placed at position i" is emitted as a shuffle of
	///     that vector with a zero (or undef) vector.
	///  3. With SSE4.1, a build_vector that matches (2) except for one element is
	///     emitted as INSERTPS.
	/// Returns an empty SDValue when none of these applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// If this is a splat of a pair of elements, use MOVDDUP (unless the target
	// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
	// Because we're creating a less complicated build vector here, we may enable
	// further folding of the MOVDDUP via shuffle transforms.
	if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
	Op.getOperand(0) == Op.getOperand(2) &&
	Op.getOperand(1) == Op.getOperand(3) &&
	Op.getOperand(0) != Op.getOperand(1)) {
	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	// Create a new build vector with the first 2 elements followed by undef
	// padding, bitcast to v2f64, duplicate, and bitcast back.
	SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
	DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
	SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
	SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
	return DAG.getBitcast(VT, Dup);
	}

	// Find all zeroable elements.
	// Undefs marks elements that are undef; Zeroable marks elements that are
	// either undef or a zero node.
	std::bitset<4> Zeroable, Undefs;
	for (int i = 0; i < 4; ++i) {
	SDValue Elt = Op.getOperand(i);
	Undefs[i] = Elt.isUndef();
	Zeroable[i] = (Elt.isUndef() \|\| X86::isZeroNode(Elt));
	}
	assert(Zeroable.size() - Zeroable.count() > 1 &&
	"We expect at least two non-zero elements!");

	// We only know how to deal with build_vector nodes where elements are either
	// zeroable or extract_vector_elt with constant index.
	// FirstNonZero/FirstNonZeroIdx record the first non-zeroable element and its
	// position; FirstNonZeroIdx is only assigned together with FirstNonZero.
	SDValue FirstNonZero;
	unsigned FirstNonZeroIdx;
	for (unsigned i = 0; i < 4; ++i) {
	if (Zeroable[i])
	continue;
	SDValue Elt = Op.getOperand(i);
	if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Elt.getOperand(1)))
	return SDValue();
	// Make sure that this node is extracting from a 128-bit vector.
	MVT VT = Elt.getOperand(0).getSimpleValueType();
	if (!VT.is128BitVector())
	return SDValue();
	if (!FirstNonZero.getNode()) {
	FirstNonZero = Elt;
	FirstNonZeroIdx = i;
	}
	}

	assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	// V1 is the vector the first non-zeroable element is extracted from; VT is
	// that vector's type (not necessarily the type of Op).
	SDValue V1 = FirstNonZero.getOperand(0);
	MVT VT = V1.getSimpleValueType();

	// See if this build_vector can be lowered as a blend with zero.
	// The loop stops early at the first element that is not "V1[EltIdx]"; Elt,
	// EltIdx and EltMaskIdx then describe that element and are reused by the
	// INSERTPS path below.
	SDValue Elt;
	unsigned EltMaskIdx, EltIdx;
	int Mask[4];
	for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	if (Zeroable[EltIdx]) {
	// The zero vector will be on the right hand side.
	Mask[EltIdx] = EltIdx+4;
	continue;
	}

	Elt = Op->getOperand(EltIdx);
	// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	EltMaskIdx = Elt.getConstantOperandVal(1);
	if (Elt.getOperand(0) != V1 \|\| EltMaskIdx != EltIdx)
	break;
	Mask[EltIdx] = EltIdx;
	}

	// The loop ran to completion, so every element is zeroable or V1[EltIdx].
	if (EltIdx == 4) {
	// Let the shuffle legalizer deal with blend operations.
	// If every zeroable element is actually undef, an undef second operand is
	// enough; otherwise a real zero vector is needed.
	SDValue VZeroOrUndef = (Zeroable == Undefs)
	? DAG.getUNDEF(VT)
	: getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	// NOTE(review): VT was read from V1 above and V1 has not changed since, so
	// this comparison is not expected to be true.
	if (V1.getSimpleValueType() != VT)
	V1 = DAG.getBitcast(VT, V1);
	return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
	}

	// See if we can lower this build_vector to a INSERTPS.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// V2 is the source vector of the element that broke the blend pattern. If
	// that element is the first non-zeroable one, V1 is cleared here and chosen
	// from the remaining elements in the loop below.
	SDValue V2 = Elt.getOperand(0);
	if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	V1 = SDValue();

	// Every remaining non-zeroable element must be "V1[i]" placed at position i.
	bool CanFold = true;
	for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	if (Zeroable[i])
	continue;

	SDValue Current = Op->getOperand(i);
	SDValue SrcVector = Current->getOperand(0);
	if (!V1.getNode())
	V1 = SrcVector;
	CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
	}

	if (!CanFold)
	return SDValue();

	assert(V1.getNode() && "Expected at least two non-zero elements!");
	if (V1.getSimpleValueType() != MVT::v4f32)
	V1 = DAG.getBitcast(MVT::v4f32, V1);
	if (V2.getSimpleValueType() != MVT::v4f32)
	V2 = DAG.getBitcast(MVT::v4f32, V2);

	// Ok, we can emit an INSERTPS instruction.
	unsigned ZMask = Zeroable.to_ulong();

	// Immediate layout as composed here: the index of the element taken from V2
	// is shifted to bit 6, the destination position to bit 4, and the zeroable
	// bits occupy the low four bits.
	unsigned InsertPSMask = EltMaskIdx << 6 \| EltIdx << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	SDLoc DL(Op);
	SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	DAG.getIntPtrConstant(InsertPSMask, DL));
	return DAG.getBitcast(VT, Result);
	}

	/// Return a whole-register logical byte shift of \p SrcOp.
	/// The source is bitcast to v16i8, shifted by NumBits / 8 bytes with
	/// X86ISD::VSHLDQ (left) or X86ISD::VSRLDQ (right), and bitcast back to
	/// \p VT. \p VT must be a 128-bit vector and \p NumBits a multiple of 8.
	/// \p TLI is not used.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");

	  const MVT ByteVT = MVT::v16i8;
	  unsigned ShiftOpc;
	  if (isLeft)
	    ShiftOpc = X86ISD::VSHLDQ;
	  else
	    ShiftOpc = X86ISD::VSRLDQ;

	  SDValue Bytes = DAG.getBitcast(ByteVT, SrcOp);
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
	  SDValue ByteCount = DAG.getConstant(NumBits / 8, dl, MVT::i8);
	  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVT, Bytes, ByteCount);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Tries to turn a splat of a scalar i32/f32 load from a stack slot into a
	/// full vector load of type \p VT-sized lanes followed by a splat shuffle.
	/// Only normal, non-volatile loads whose address is a frame index, or a
	/// frame index plus a constant offset, are handled. Returns an empty SDValue
	/// otherwise.
	/// Note: the alignment of a non-fixed stack object may be raised before the
	/// offset checks run, so it can be raised even when the function then bails.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	                                      SelectionDAG &DAG) {
	  // Check if the scalar load can be widened into a vector load. And if
	  // the address is "base + cst" see if the cst can be "absorbed" into
	  // the shuffle mask.
	  auto *LD = dyn_cast<LoadSDNode>(SrcOp);
	  if (!LD)
	    return SDValue();

	  SDValue BasePtr = LD->getBasePtr();
	  const bool IsPlainLoad = ISD::isNormalLoad(LD) && !LD->isVolatile();
	  if (!IsPlainLoad)
	    return SDValue();

	  EVT ScalarVT = LD->getValueType(0);
	  if (!(ScalarVT == MVT::i32 || ScalarVT == MVT::f32))
	    return SDValue();

	  // Split the address into a frame index and a constant byte offset.
	  int FrameIdx = -1;
	  int64_t ByteOffset = 0;
	  if (auto *FINode = dyn_cast<FrameIndexSDNode>(BasePtr)) {
	    FrameIdx = FINode->getIndex();
	  } else if (DAG.isBaseWithConstantOffset(BasePtr) &&
	             isa<FrameIndexSDNode>(BasePtr.getOperand(0))) {
	    FrameIdx = cast<FrameIndexSDNode>(BasePtr.getOperand(0))->getIndex();
	    ByteOffset = BasePtr.getConstantOperandVal(1);
	    BasePtr = BasePtr.getOperand(0);
	  } else {
	    return SDValue();
	  }

	  // FIXME: 256-bit vector instructions don't require a strict alignment,
	  // improve this code to support it better.
	  const unsigned RequiredAlign = VT.getSizeInBits() / 8;
	  SDValue Chain = LD->getChain();

	  // Make sure the stack object alignment is at least 16 or 32.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  if (DAG.InferPtrAlignment(BasePtr) < RequiredAlign) {
	    // Can't change the alignment. FIXME: It's possible to compute
	    // the exact stack offset and reference FI + adjust offset instead.
	    // If someone really cares about this. That's the way to implement it.
	    if (MFI.isFixedObjectIndex(FrameIdx))
	      return SDValue();
	    MFI.setObjectAlignment(FrameIdx, RequiredAlign);
	  }

	  // (Offset % 16 or 32) must be multiple of 4. Then address is then
	  // Ptr + (Offset & ~15).
	  if (ByteOffset < 0)
	    return SDValue();
	  if ((ByteOffset % RequiredAlign) & 3)
	    return SDValue();

	  const int64_t AlignedOffset = ByteOffset & ~int64_t(RequiredAlign - 1);
	  if (AlignedOffset) {
	    SDLoc DL(BasePtr);
	    BasePtr = DAG.getNode(
	        ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
	        DAG.getConstant(AlignedOffset, DL, BasePtr.getValueType()));
	  }

	  // Lane of the widened load that holds the original scalar (4-byte lanes).
	  const int SplatLane = (ByteOffset - AlignedOffset) >> 2;
	  const unsigned LaneCount = VT.getVectorNumElements();

	  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, LaneCount);
	  SDValue WideLoad =
	      DAG.getLoad(WideVT, dl, Chain, BasePtr,
	                  LD->getPointerInfo().getWithOffset(AlignedOffset));

	  SmallVector<int, 8> SplatMask(LaneCount, SplatLane);
	  return DAG.getVectorShuffle(WideVT, dl, WideLoad, DAG.getUNDEF(WideVT),
	                              SplatMask);
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	///
	/// Each element (looked at through bitcasts) must be undef, zero, or a
	/// non-extending load. The strategies tried, in order, are: a plain wide
	/// load (optionally followed by a shuffle that clears zero elements), a
	/// half-width load when the upper half is undef, X86ISD::VZEXT_LOAD, and a
	/// broadcast of a repeated load pattern. The function recurses for the
	/// half-width and broadcast cases.
	///
	/// \param VT               Type of the vector being built.
	/// \param Elts             The initializing elements, one per result element.
	/// \param DL               Debug location used for the new nodes.
	/// \param Subtarget        Queried for AVX/AVX2 availability.
	/// \param isAfterLegalize  When true, a wide load is only created if
	///                         ISD::LOAD is legal for VT, and the zero-clearing
	///                         shuffle is not created.
	/// \returns The replacement value, or an empty SDValue if no strategy applies.
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	bool isAfterLegalize) {
	unsigned NumElems = Elts.size();

	// Classification of the elements: exactly one of the three masks gets a bit
	// set for every element (checked by the assert after the loop).
	int LastLoadedElt = -1;
	APInt LoadMask = APInt::getNullValue(NumElems);
	APInt ZeroMask = APInt::getNullValue(NumElems);
	APInt UndefMask = APInt::getNullValue(NumElems);

	// Loads[i] is the load node for element i, or null for zero/undef elements.
	SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);

	// For each element in the initializer, see if we've found a load, zero or an
	// undef.
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (!Elt.getNode())
	return SDValue();
	if (Elt.isUndef()) {
	UndefMask.setBit(i);
	continue;
	}
	if (X86::isZeroNode(Elt) \|\| ISD::isBuildVectorAllZeros(Elt.getNode())) {
	ZeroMask.setBit(i);
	continue;
	}

	// Each loaded element must be the correct fractional portion of the
	// requested vector load.
	if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
	return SDValue();

	if (!ISD::isNON_EXTLoad(Elt.getNode()))
	return SDValue();

	Loads[i] = cast<LoadSDNode>(Elt);
	LoadMask.setBit(i);
	LastLoadedElt = i;
	}
	assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
	LoadMask.countPopulation()) == NumElems &&
	"Incomplete element masks");

	// Handle Special Cases - all undef or undef/zero.
	if (UndefMask.countPopulation() == NumElems)
	return DAG.getUNDEF(VT);

	// FIXME: Should we return this as a BUILD_VECTOR instead?
	if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
	return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);

	// From here on at least one element is a load. The first loaded element
	// provides the base load and the element size used for all size math below.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	int FirstLoadedElt = LoadMask.countTrailingZeros();
	SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	EVT EltBaseVT = EltBase.getValueType();
	assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
	"Register/Memory size mismatch");
	LoadSDNode *LDBase = Loads[FirstLoadedElt];
	assert(LDBase && "Did not find base load for merging consecutive loads");
	unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
	unsigned BaseSizeInBytes = BaseSizeInBits / 8;
	// Number of bits spanned from the first to the last loaded element.
	int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
	assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

	// Consecutive loads can contain UNDEFS but not ZERO elements.
	// Consecutive loads with UNDEFs and ZEROs elements require
	// an additional shuffle stage to clear the ZERO elements.
	bool IsConsecutiveLoad = true;
	bool IsConsecutiveLoadWithZeros = true;
	for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	if (LoadMask[i]) {
	if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
	i - FirstLoadedElt)) {
	IsConsecutiveLoad = false;
	IsConsecutiveLoadWithZeros = false;
	break;
	}
	} else if (ZeroMask[i]) {
	IsConsecutiveLoad = false;
	}
	}

	// Helper: create a load of type VT at LDBase's address, reusing its chain,
	// pointer info, alignment and memory-operand flags, and call
	// makeEquivalentMemoryOrdering for every original load against the new one.
	auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
	auto MMOFlags = LDBase->getMemOperand()->getFlags();
	assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
	"Cannot merge volatile loads.");
	SDValue NewLd =
	DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
	for (auto *LD : Loads)
	if (LD)
	DAG.makeEquivalentMemoryOrdering(LD, NewLd);
	return NewLd;
	};

	// Check if the base load is entirely dereferenceable.
	bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
	VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

	// LOAD - all consecutive load/undefs (must start/end with a load or be
	// entirely dereferenceable). If we have found an entire vector of loads and
	// undefs, then return a large load of the entire vector width starting at the
	// base pointer. If the vector contains zeros, then attempt to shuffle those
	// elements.
	if (FirstLoadedElt == 0 &&
	(LastLoadedElt == (int)(NumElems - 1) \|\| IsDereferenceable) &&
	(IsConsecutiveLoad \|\| IsConsecutiveLoadWithZeros)) {
	if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	return SDValue();

	// Don't create 256-bit non-temporal aligned loads without AVX2 as these
	// will lower to regular temporal loads and use the cache.
	if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	VT.is256BitVector() && !Subtarget.hasInt256())
	return SDValue();

	// A single element that is a load: reuse it as-is, just retyped.
	if (NumElems == 1)
	return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

	// No zero elements: the wide load alone is the result.
	if (!ZeroMask)
	return CreateLoad(VT, LDBase);

	// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	// vector and a zero vector to clear out the zero elements.
	if (!isAfterLegalize && VT.isVector()) {
	// Zero elements pick from the second (zero) operand, loaded elements from
	// the first; undef elements stay -1.
	SmallVector<int, 4> ClearMask(NumElems, -1);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (ZeroMask[i])
	ClearMask[i] = i + NumElems;
	else if (LoadMask[i])
	ClearMask[i] = i;
	}
	SDValue V = CreateLoad(VT, LDBase);
	SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	: DAG.getConstantFP(0.0, DL, VT);
	return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	}
	}

	// If the upper half of a ymm/zmm load is undef then just load the lower half.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	unsigned HalfNumElems = NumElems / 2;
	if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
	EVT HalfVT =
	EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
	// Recurse on the lower half of the elements only.
	SDValue HalfLD =
	EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
	DAG, Subtarget, isAfterLegalize);
	if (HalfLD)
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
	HalfLD, DAG.getIntPtrConstant(0, DL));
	}
	}

	// VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	(LoadSizeInBits == 32 \|\| LoadSizeInBits == 64) &&
	((VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()))) {
	// The loaded span becomes one scalar of LoadSizeInBits bits; VecVT is the
	// vector of such scalars with the same total width as VT.
	MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
	: MVT::getIntegerVT(LoadSizeInBits);
	MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
	if (TLI.isTypeLegal(VecVT)) {
	SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	SDValue ResNode =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
	LDBase->getPointerInfo(),
	LDBase->getAlignment(),
	MachineMemOperand::MOLoad);
	for (auto *LD : Loads)
	if (LD)
	DAG.makeEquivalentMemoryOrdering(LD, ResNode);
	return DAG.getBitcast(VT, ResNode);
	}
	}

	// BROADCAST - match the smallest possible repetition pattern, load that
	// scalar/subvector element and then broadcast to the entire vector.
	if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
	(VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector())) {
	// Try repetition lengths of 1, 2, 4, ... elements, smallest first.
	for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
	unsigned RepeatSize = SubElems * BaseSizeInBits;
	unsigned ScalarSize = std::min(RepeatSize, 64u);
	if (!Subtarget.hasAVX2() && ScalarSize < 32)
	continue;

	// Collect one representative load per position of the repetition and
	// check that every other load at that position is the same node.
	bool Match = true;
	SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
	for (unsigned i = 0; i != NumElems && Match; ++i) {
	if (!LoadMask[i])
	continue;
	SDValue Elt = peekThroughBitcasts(Elts[i]);
	if (RepeatedLoads[i % SubElems].isUndef())
	RepeatedLoads[i % SubElems] = Elt;
	else
	Match &= (RepeatedLoads[i % SubElems] == Elt);
	}

	// We must have loads at both ends of the repetition.
	Match &= !RepeatedLoads.front().isUndef();
	Match &= !RepeatedLoads.back().isUndef();
	if (!Match)
	continue;

	// RepeatVT is the type of one repetition: a scalar of ScalarSize bits, or
	// a vector of such scalars when the repetition is wider than ScalarSize.
	EVT RepeatVT =
	VT.isInteger() && (RepeatSize != 64 \|\| TLI.isTypeLegal(MVT::i64))
	? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
	: EVT::getFloatingPointVT(ScalarSize);
	if (RepeatSize > ScalarSize)
	RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
	RepeatSize / ScalarSize);
	EVT BroadcastVT =
	EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
	VT.getSizeInBits() / ScalarSize);
	if (TLI.isTypeLegal(BroadcastVT)) {
	// Recurse to merge the repeated elements into a single load, then
	// broadcast it (subvector broadcast when it is wider than one scalar).
	if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
	RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
	unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
	: X86ISD::VBROADCAST;
	SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
	return DAG.getBitcast(VT, Broadcast);
	}
	}
	}
	}

	return SDValue();
	}

	// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
	// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
	// are consecutive, non-overlapping, and in the right order.
	// Every result element of N is resolved to its scalar source with
	// getShuffleScalarElt; if any element cannot be resolved the combine is
	// abandoned, otherwise the scalars are handed to EltsFromConsecutiveLoads.
	static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
	                                         SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget,
	                                         bool isAfterLegalize) {
	  const unsigned NumElts = VT.getVectorNumElements();
	  SmallVector<SDValue, 64> Scalars;

	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    SDValue Scalar = getShuffleScalarElt(N, Idx, DAG, 0);
	    if (!Scalar)
	      return SDValue();
	    Scalars.push_back(Scalar);
	  }

	  assert(Scalars.size() == VT.getVectorNumElements());
	  return EltsFromConsecutiveLoads(VT, Scalars, DL, DAG, Subtarget,
	                                  isAfterLegalize);
	}

	/// Builds an IR constant vector from the low \p SplatBitSize bits of
	/// \p SplatValue, sliced into elements of VT's scalar width (element 0 from
	/// the lowest bits). Floating point element types must be 32 or 64 bits
	/// wide; all other types produce integer constants.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;
	  const bool IsFP = VT.isFloatingPoint();

	  SmallVector<Constant *, 32> Elements;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    if (!IsFP) {
	      Elements.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    if (EltBits == 32) {
	      Elements.push_back(
	          ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }

	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Elements.push_back(
	        ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }

	  return ConstantVector::get(ArrayRef<Constant *>(Elements));
	}

	/// Returns true if a use of \p N is a target shuffle (reached directly or
	/// through bitcasts), or if \p N has a single use. A use as the index
	/// operand of VPERMV/VPERMV3 returns false. The verdict is taken from the
	/// first use that settles it; the remaining uses are not inspected.
	static bool isFoldableUseOfShuffle(SDNode *N) {
	  for (SDNode *User : N->uses()) {
	    const unsigned UserOpc = User->getOpcode();

	    // VPERMV/VPERMV3 shuffles can never fold their index operands.
	    const bool IsVPermVIndex =
	        UserOpc == X86ISD::VPERMV && User->getOperand(0).getNode() == N;
	    const bool IsVPermV3Index =
	        UserOpc == X86ISD::VPERMV3 && User->getOperand(1).getNode() == N;
	    if (IsVPermVIndex || IsVPermV3Index)
	      return false;

	    if (isTargetShuffle(UserOpc))
	      return true;

	    // Ignore bitcasts: the answer is whatever holds for the bitcast's users.
	    if (UserOpc == ISD::BITCAST)
	      return isFoldableUseOfShuffle(User);

	    if (N->hasOneUse())
	      return true;
	  }
	  return false;
	}

	// Check if the given build vector is a splat of a zero extended value, i.e.
	// operand 0 repeated every Delta operands with only zero/undef operands in
	// between, where Delta is a power of two greater than one.
	// If so, return the value that is extended (operand 0); otherwise return an
	// empty SDValue and leave the output parameters untouched.
	// For example: (a,0,0,0,a,0,0,0,a,0,0,0,a,0,0,0) returns a.
	// NumElt - returns the number of zero extended identical values.
	// EltType - returns the integer type of the value including the zero extend.
	static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
	                                   unsigned &NumElt, MVT &EltType) {
	  const SDValue Splatted = Op->getOperand(0);
	  const unsigned OperandCount = Op->getNumOperands();
	  auto IsPadding = [](SDValue V) { return V.isUndef() || isNullConstant(V); };

	  // Distance to the next occurrence of the splatted value; everything in
	  // between must be padding.
	  unsigned Stride = OperandCount;
	  for (unsigned Idx = 1; Idx < OperandCount; ++Idx) {
	    SDValue Cur = Op->getOperand(Idx);
	    if (Cur == Splatted) {
	      Stride = Idx;
	      break;
	    }
	    if (!IsPadding(Cur))
	      return SDValue();
	  }
	  if (Stride == 1 || !isPowerOf2_32(Stride))
	    return SDValue();

	  // The rest of the vector must repeat the same pattern.
	  for (unsigned Idx = Stride; Idx < OperandCount; ++Idx) {
	    SDValue Cur = Op->getOperand(Idx);
	    const bool Matches =
	        (Idx % Stride == 0) ? (Cur == Splatted) : IsPadding(Cur);
	    if (!Matches)
	      return SDValue();
	  }

	  const unsigned NarrowBits = Op->getSimpleValueType(0).getScalarSizeInBits();
	  EltType = MVT::getIntegerVT(NarrowBits * Stride);
	  NumElt = OperandCount / Stride;
	  return Splatted;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	/// a. A single scalar load, or a constant.
	/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// The VBROADCAST node is returned when a pattern is found,
	/// or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// VBROADCAST requires AVX.
	// TODO: Splats could be generated for non-AVX CPUs using SSE
	// instructions, but there's less potential gain for only 128-bit vectors.
	if (!Subtarget.hasAVX())
	return SDValue();

	MVT VT = BVOp->getSimpleValueType(0);
	SDLoc dl(BVOp);

	assert((VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()) &&
	"Unsupported vector type for broadcast.");

	BitVector UndefElements;
	SDValue Ld = BVOp->getSplatValue(&UndefElements);

	// Attempt to use VBROADCASTM
	// From this pattern:
	// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
	// b. t1 = (build_vector t0 t0)
	//
	// Create (VBROADCASTM v2i1 X)
	if (Subtarget.hasCDI() && (VT.is512BitVector() \|\| Subtarget.hasVLX())) {
	MVT EltType = VT.getScalarType();
	unsigned NumElts = VT.getVectorNumElements();
	SDValue BOperand;
	SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
	if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) \|\|
	(Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
	Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
	if (ZeroExtended)
	BOperand = ZeroExtended.getOperand(0);
	else
	BOperand = Ld.getOperand(0).getOperand(0);
	MVT MaskVT = BOperand.getSimpleValueType();
	if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) \|\| // for broadcastmb2q
	(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
	SDValue Brdcst =
	DAG.getNode(X86ISD::VBROADCASTM, dl,
	MVT::getVectorVT(EltType, NumElts), BOperand);
	return DAG.getBitcast(VT, Brdcst);
	}
	}
	}

	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumUndefElts = UndefElements.count();
	if (!Ld \|\| (NumElts - NumUndefElts) <= 1) {
	APInt SplatValue, Undef;
	unsigned SplatBitSize;
	bool HasUndef;
	// Check if this is a repeated constant pattern suitable for broadcasting.
	if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	SplatBitSize > VT.getScalarSizeInBits() &&
	SplatBitSize < VT.getSizeInBits()) {
	// Avoid replacing with broadcast when it's a use of a shuffle
	// instruction to preserve the present custom lowering of shuffles.
	if (isFoldableUseOfShuffle(BVOp))
	return SDValue();
	// replace BUILD_VECTOR with broadcast of the repeated constants.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	LLVMContext *Ctx = DAG.getContext();
	MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	if (Subtarget.hasAVX()) {
	if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
	!(SplatBitSize == 64 && Subtarget.is32Bit())) {
	// Splatted value can fit in one INTEGER constant in constant pool.
	// Load the constant and broadcast it.
	MVT CVT = MVT::getIntegerVT(SplatBitSize);
	Type ScalarTy = Type::getIntNTy(Ctx, SplatBitSize);
	Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	SDValue CP = DAG.getConstantPool(C, PVT);
	unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	Ld = DAG.getLoad(
	CVT, dl, DAG.getEntryNode(), CP,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);
	SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	MVT::getVectorVT(CVT, Repeat), Ld);
	return DAG.getBitcast(VT, Brdcst);
	} else if (SplatBitSize == 32 \|\| SplatBitSize == 64) {
	// Splatted value can fit in one FLOAT constant in constant pool.
	// Load the constant and broadcast it.
	// AVX have support for 32 and 64 bit broadcast for floats only.
	// No 64bit integer in 32bit subtarget.
	MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
	// Lower the splat via APFloat directly, to avoid any conversion.
	Constant *C =
	SplatBitSize == 32
	? ConstantFP::get(*Ctx,
	APFloat(APFloat::IEEEsingle(), SplatValue))
	: ConstantFP::get(*Ctx,
	APFloat(APFloat::IEEEdouble(), SplatValue));
	SDValue CP = DAG.getConstantPool(C, PVT);
	unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	Ld = DAG.getLoad(
	CVT, dl, DAG.getEntryNode(), CP,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);
	SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	MVT::getVectorVT(CVT, Repeat), Ld);
	return DAG.getBitcast(VT, Brdcst);
	} else if (SplatBitSize > 64) {
	// Load the vector of constants and broadcast it.
	MVT CVT = VT.getScalarType();
	Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	*Ctx);
	SDValue VCP = DAG.getConstantPool(VecC, PVT);
	unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
	Ld = DAG.getLoad(
	MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);
	SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	return DAG.getBitcast(VT, Brdcst);
	}
	}
	}

	// If we are moving a scalar into a vector (Ld must be set and all elements
	// but 1 are undef) and that operation is not obviously supported by
	// vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
	// That's better than general shuffling and may eliminate a load to GPR and
	// move from scalar to vector register.
	if (!Ld \|\| NumElts - NumUndefElts != 1)
	return SDValue();
	unsigned ScalarSize = Ld.getValueSizeInBits();
	if (!(UndefElements[0] \|\| (ScalarSize != 32 && ScalarSize != 64)))
	return SDValue();
	}

	bool ConstSplatVal =
	(Ld.getOpcode() == ISD::Constant \|\| Ld.getOpcode() == ISD::ConstantFP);

	// Make sure that all of the users of a non-constant load are from the
	// BUILD_VECTOR node.
	if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
	return SDValue();

	unsigned ScalarSize = Ld.getValueSizeInBits();
	bool IsGE256 = (VT.getSizeInBits() >= 256);

	// When optimizing for size, generate up to 5 extra bytes for a broadcast
	// instruction to save 8 or more bytes of constant pool data.
	// TODO: If multiple splats are generated to load the same constant,
	// it may be detrimental to overall size. There needs to be a way to detect
	// that condition to know if this is truly a size win.
	bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();

	// Handle broadcasting a single constant scalar from the constant pool
	// into a vector.
	// On Sandybridge (no AVX2), it is still better to load a constant vector
	// from the constant pool and not to broadcast it from a scalar.
	// But override that restriction when optimizing for size.
	// TODO: Check if splatting is recommended for other AVX-capable CPUs.
	if (ConstSplatVal && (Subtarget.hasAVX2() \|\| OptForSize)) {
	EVT CVT = Ld.getValueType();
	assert(!CVT.isVector() && "Must not broadcast a vector type");

	// Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	// For size optimization, also splat v2f64 and v2i64, and for size opt
	// with AVX2, also splat i8 and i16.
	// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	if (ScalarSize == 32 \|\| (IsGE256 && ScalarSize == 64) \|\|
	(OptForSize && (ScalarSize == 64 \|\| Subtarget.hasAVX2()))) {
	const Constant *C = nullptr;
	if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	C = CI->getConstantIntValue();
	else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	C = CF->getConstantFPValue();

	assert(C && "Invalid constant type");

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue CP =
	DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	Ld = DAG.getLoad(
	CVT, dl, DAG.getEntryNode(), CP,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);

	return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	}
	}

	bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	// Handle AVX2 in-register broadcasts.
	if (!IsLoad && Subtarget.hasInt256() &&
	(ScalarSize == 32 \|\| (IsGE256 && ScalarSize == 64)))
	return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	// The scalar source must be a normal load.
	if (!IsLoad)
	return SDValue();

	if (ScalarSize == 32 \|\| (IsGE256 && ScalarSize == 64) \|\|
	(Subtarget.hasVLX() && ScalarSize == 64))
	return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	// The integer check is needed for the 64-bit into 128-bit so it doesn't match
	// double since there is no vbroadcastsd xmm
	if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
	if (ScalarSize == 8 \|\| ScalarSize == 16 \|\| ScalarSize == 64)
	return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	}

	// Unsupported broadcast.
	return SDValue();
	}

	/// Look through a shuffle feeding an EXTRACT_VECTOR_ELT that has a constant
	/// index, to find the vector and lane the element really comes from.
	///
	/// \p ExtractedFromVec is the vector operand of the extract and \p ExtIdx its
	/// (constant) index operand. When the vector is a shuffle whose mask entry for
	/// that index is undef or selects a lane of the shuffle's first operand,
	/// \p ExtractedFromVec is replaced by that first operand and the mask entry is
	/// returned. In every other case \p ExtractedFromVec is left untouched and the
	/// original index is returned.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int OrigIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return OrigIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue FirstInput = Shuffle->getOperand(0);
	  MVT FirstInputVT = FirstInput.getSimpleValueType();
	  assert(FirstInputVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  const int MaskElt = Shuffle->getMaskElt(OrigIdx);
	  if (!isUndefOrInRange(MaskElt, 0, FirstInputVT.getVectorNumElements()))
	    return OrigIdx;

	  ExtractedFromVec = FirstInput;
	  return MaskElt;
	}

	/// Lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs (with
	/// constant indices) taken from at most two vectors of the result type into a
	/// vector shuffle of those vectors, followed by INSERT_VECTOR_ELT nodes for
	/// the few remaining operands.
	///
	/// \returns the replacement value, or SDValue() when the pattern does not
	/// apply.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();

	  // Nothing to do if insert_vec_elt is not supported for this type.
	  if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(
	          ISD::INSERT_VECTOR_ELT, VT))
	    return SDValue();

	  SDLoc DL(Op);
	  const unsigned NumElems = Op.getNumOperands();

	  SDValue Src0;
	  SDValue Src1;
	  SmallVector<unsigned, 4> PendingInserts;
	  SmallVector<int, 8> ShuffleMask(NumElems, -1);

	  for (unsigned EltNo = 0; EltNo != NumElems; ++EltNo) {
	    SDValue Elt = Op.getOperand(EltNo);
	    const unsigned EltOpc = Elt.getOpcode();

	    // Undef operands keep their -1 mask entry.
	    if (EltOpc == ISD::UNDEF)
	      continue;

	    if (EltOpc != ISD::EXTRACT_VECTOR_ELT) {
	      // The limit is tested before the element is recorded, so up to two
	      // operands may be queued for insertion; a third one aborts.
	      if (PendingInserts.size() > 1)
	        return SDValue();

	      PendingInserts.push_back(EltNo);
	      continue;
	    }

	    SDValue SrcVec = Elt.getOperand(0);
	    SDValue SrcIdx = Elt.getOperand(1);

	    // Only constant extract indices can be turned into a shuffle mask.
	    if (!isa<ConstantSDNode>(SrcIdx))
	      return SDValue();
	    const int Lane = getUnderlyingExtractedFromVec(SrcVec, SrcIdx);

	    // The source must have exactly the type of the vector being built.
	    if (SrcVec.getValueType() != VT)
	      return SDValue();

	    // Bind the source to the first or second shuffle input.
	    if (!Src0.getNode())
	      Src0 = SrcVec;
	    if (SrcVec == Src0) {
	      ShuffleMask[EltNo] = Lane;
	      continue;
	    }

	    if (!Src1.getNode())
	      Src1 = SrcVec;
	    // A third distinct source vector cannot be expressed by one shuffle.
	    if (SrcVec != Src1)
	      return SDValue();
	    ShuffleMask[EltNo] = Lane + NumElems;
	  }

	  // No extract was seen at all: there is nothing to shuffle.
	  if (!Src0.getNode())
	    return SDValue();

	  if (!Src1.getNode())
	    Src1 = DAG.getUNDEF(VT);
	  SDValue Result = DAG.getVectorShuffle(VT, DL, Src0, Src1, ShuffleMask);

	  // Patch in the operands that did not come from an extract.
	  for (unsigned EltNo : PendingInserts)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
	                         Op.getOperand(EltNo),
	                         DAG.getIntPtrConstant(EltNo, DL));

	  return Result;
	}

	/// Pack a constant BUILD_VECTOR of i1 elements into one integer constant.
	///
	/// Bit N of the result is the low bit of element N; undef elements contribute
	/// a zero bit. The constant's integer type is as wide as the vector, but never
	/// narrower than 8 bits.
	static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         Op.getScalarValueSizeInBits() == 1 &&
	         "Can not convert non-constant vector");

	  uint64_t PackedBits = 0;
	  const unsigned NumBits = Op.getNumOperands();
	  for (unsigned BitNo = 0; BitNo != NumBits; ++BitNo) {
	    SDValue Elt = Op.getOperand(BitNo);
	    if (Elt.isUndef())
	      continue;
	    const uint64_t LowBit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
	    PackedBits |= LowBit << BitNo;
	  }

	  SDLoc dl(Op);
	  const int Width = std::max(static_cast<int>(Op.getValueSizeInBits()), 8);
	  return DAG.getConstant(PackedBits, dl, MVT::getIntegerVT(Width));
	}
	// Lower a BUILD_VECTOR of i1 elements (a mask vector).
	//
	//  * All-zeros / all-ones vectors are returned unchanged.
	//  * Fully constant vectors are packed into an integer constant and bitcast to
	//    the mask type (v64i1 on a non-64-bit target is first split into two
	//    v32i1 halves that are lowered separately and concatenated).
	//  * A splat of one non-constant value becomes a select between the constants
	//    1 and 0 of the mask type.
	//  * Anything else starts from the packed constant bits (or zero / undef) and
	//    inserts the non-constant elements one at a time.
	static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  const MVT VT = Op.getSimpleValueType();
	  assert((VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected type in LowerBUILD_VECTORvXi1!");

	  SDLoc dl(Op);
	  SDNode *BVNode = Op.getNode();

	  if (ISD::isBuildVectorAllZeros(BVNode) || ISD::isBuildVectorAllOnes(BVNode))
	    return Op;

	  // Reinterpret \p Bits as the mask type. When the sizes differ, the value is
	  // bitcast to v8i1 and the subvector of type VT at index 0 is extracted.
	  auto ReinterpretAsMask = [&](SDValue Bits) -> SDValue {
	    if (Bits.getValueSizeInBits() == VT.getSizeInBits())
	      return DAG.getBitcast(VT, Bits);
	    SDValue Wide = DAG.getBitcast(MVT::v8i1, Bits);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Wide,
	                       DAG.getIntPtrConstant(0, dl));
	  };

	  if (ISD::isBuildVectorOfConstantSDNodes(BVNode)) {
	    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	      // Split the pieces.
	      SDValue LoHalf =
	          DAG.getBuildVector(MVT::v32i1, dl, BVNode->ops().slice(0, 32));
	      SDValue HiHalf =
	          DAG.getBuildVector(MVT::v32i1, dl, BVNode->ops().slice(32, 32));
	      // We have to manually lower both halves so getNode doesn't try to
	      // reassemble the build_vector.
	      LoHalf = LowerBUILD_VECTORvXi1(LoHalf, DAG, Subtarget);
	      HiHalf = LowerBUILD_VECTORvXi1(HiHalf, DAG, Subtarget);
	      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	    }
	    return ReinterpretAsMask(ConvertI1VectorToInteger(Op, DAG));
	  }

	  // The vector has one or more non-constant elements. Collect the constant
	  // bits, remember where the variable elements are, and detect a splat.
	  uint64_t ConstBits = 0;
	  SmallVector<unsigned, 16> VariableElts;
	  bool AllSame = true;
	  bool SawConstant = false;
	  int FirstDefined = -1;
	  for (unsigned Idx = 0, NumOps = Op.getNumOperands(); Idx != NumOps; ++Idx) {
	    SDValue Elt = Op.getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    if (auto *CstElt = dyn_cast<ConstantSDNode>(Elt)) {
	      ConstBits |= (CstElt->getZExtValue() & 0x1) << Idx;
	      SawConstant = true;
	    } else {
	      VariableElts.push_back(Idx);
	    }

	    if (FirstDefined < 0)
	      FirstDefined = Idx;
	    else if (Elt != Op.getOperand(FirstDefined))
	      AllSame = false;
	  }

	  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
	  if (AllSame)
	    return DAG.getSelect(dl, VT, Op.getOperand(FirstDefined),
	                         DAG.getConstant(1, dl, VT),
	                         DAG.getConstant(0, dl, VT));

	  // Build the starting vector from the constant part, then insert the
	  // variable elements one by one.
	  SDValue Base;
	  if (ConstBits) {
	    MVT BitsVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
	    Base = DAG.getConstant(ConstBits, dl, BitsVT);
	  } else if (SawConstant) {
	    Base = DAG.getConstant(0, dl, VT);
	  } else {
	    Base = DAG.getUNDEF(VT);
	  }

	  SDValue Result = ReinterpretAsMask(Base);
	  for (unsigned EltNo : VariableElts)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                         Op.getOperand(EltNo),
	                         DAG.getIntPtrConstant(EltNo, dl));
	  return Result;
	}

	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
	/// may not match the layout of an x86 256-bit horizontal instruction.
	/// In other words, if this returns true, then some extraction/insertion will
	/// be required to produce a valid horizontal instruction.
	///
	/// Parameter \p Opcode defines the kind of horizontal operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// \p V0 and \p V1 are outputs. Both are first reset to UNDEF of the
	/// build_vector type; \p V0 then receives the vector the first half of the
	/// analyzed elements extract from, and \p V1 the vector used by the second
	/// half. An output stays UNDEF when every element of its half is undef.
	/// The outputs may have been modified even when false is returned.
	///
	/// TODO: This function was originally used to match both real and fake partial
	/// horizontal operations, but the index-matching logic is incorrect for that.
	/// See the corrected implementation in isHopBuildVector(). Can we reduce this
	/// code because it is only used for partial h-op matching now?
	static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);
	assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// Only ADD and FADD may have their two extracts in either order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Index the first extract of the current element is expected to use. It
	// advances by 2 per element and restarts at BaseIdx at the midpoint of the
	// analyzed range, where the source switches from V0 to V1.
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	// The element must be the requested binop and have a single use.
	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

	// Record the source vector for this half the first time a non-undef element
	// is seen; it must have the same type as the build_vector.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	// Entering the second half: extract indices start over.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	// Both extracts must read consecutive lanes of the vector recorded for this
	// half, starting at the expected index.
	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// Build a 256-bit horizontal add/sub out of two 128-bit horizontal
	/// operations joined by a concat_vectors.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// \p V0 and \p V1 must be 256-bit vectors of the same type. Each is split
	/// into its lower and upper 128-bit halves, which feed two horizontal binops
	/// of kind \p X86Opcode.
	///
	/// \p Mode selects how the halves are paired.
	/// When \p Mode is set:
	///   LO = HOP V0_LO, V0_HI
	///   HI = HOP V1_LO, V1_HI
	/// Otherwise:
	///   LO = HOP V0_LO, V1_LO
	///   HI = HOP V0_HI, V1_HI
	///
	/// If \p isUndefLO is set the lower 128 bits of the result are left UNDEF; if
	/// \p isUndefHI is set the upper 128 bits are left UNDEF. A half is also left
	/// UNDEF when its inputs are undef (see the checks in the body).
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  const MVT VT = V0.getSimpleValueType();
	  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  const unsigned HalfElts = VT.getVectorNumElements() / 2;
	  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
	  SDValue V0_HI = extract128BitVector(V0, HalfElts, DAG, DL);
	  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
	  SDValue V1_HI = extract128BitVector(V1, HalfElts, DAG, DL);
	  const MVT HalfVT = V0_LO.getSimpleValueType();

	  // Both halves start out undefined and are only replaced by a horizontal
	  // binop when the result is not expected to be UNDEF.
	  SDValue LO = DAG.getUNDEF(HalfVT);
	  SDValue HI = DAG.getUNDEF(HalfVT);

	  if (!Mode) {
	    // Pair matching halves of the two inputs; skip a half when both of its
	    // inputs are undef.
	    const bool LoInputsUndef = V0_LO->isUndef() && V1_LO->isUndef();
	    if (!isUndefLO && !LoInputsUndef)
	      LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, V1_LO);

	    const bool HiInputsUndef = V0_HI->isUndef() && V1_HI->isUndef();
	    if (!isUndefHI && !HiInputsUndef)
	      HI = DAG.getNode(X86Opcode, DL, HalfVT, V0_HI, V1_HI);
	  } else {
	    // Pair the two halves of the same input; skip an input that is undef.
	    if (!(isUndefLO || V0->isUndef()))
	      LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, V0_HI);
	    if (!(isUndefHI || V1->isUndef()))
	      HI = DAG.getNode(X86Opcode, DL, HalfVT, V1_LO, V1_HI);
	  }

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB/SUBADD operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
	/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
	/// \p Opnd0 and \p Opnd1.
	///
	/// \p NumExtracts is reset to 0 and then counts the non-undef elements of
	/// \p BV that matched the pattern. \p IsSubAdd is written only on success and
	/// is true when the even-numbered elements are FADDs. Always returns false
	/// without SSE3 or for a non-floating-point vector type.
	static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1,
	unsigned &NumExtracts,
	bool &IsSubAdd) {

	MVT VT = BV->getSimpleValueType(0);
	if (!Subtarget.hasSSE3() \|\| !VT.isFloatingPoint())
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	// UNDEF marks "source vector not seen yet" for both inputs.
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	NumExtracts = 0;

	// Odd-numbered elements in the input build vector are obtained from
	// adding/subtracting two floating-point elements.
	// Even-numbered elements in the input build vector are obtained from
	// subtracting/adding two floating-point elements.
	// Opc[0] / Opc[1] remember the opcode seen for even / odd elements
	// (0 = none seen yet).
	unsigned Opc[2] = {0, 0};
	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF)
	continue;

	// Early exit if we found an unexpected opcode.
	if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Both extracts must read the lane that is being built.
	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	if (I0 != i)
	return false;

	// We found a valid add/sub node, make sure its the same opcode as previous
	// elements for this parity.
	if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
	return false;
	Opc[i % 2] = Opcode;

	// Update InVec0 and InVec1.
	// The first matched element fixes the two source vectors; each must have
	// the same type as the build_vector.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	if (Opcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Increment the number of extractions done.
	++NumExtracts;
	}

	// Ensure we have found an opcode for both parities and that they are
	// different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
	// inputs are undef.
	if (!Opc[0] \|\| !Opc[1] \|\| Opc[0] == Opc[1] \|\|
	InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	// Even elements added (and therefore odd elements subtracted) is the SUBADD
	// form; the opposite arrangement is ADDSUB.
	IsSubAdd = Opc[0] == ISD::FADD;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Decide whether a multiply can be folded into an idiom that has already
	/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1), yielding
	/// FMADDSUB/FMSUBADD(x, y, \p Opnd1).
	///
	/// If (and only if) true is returned, the three FMADDSUB/FMSUBADD operands
	/// are written to \p Opnd0, \p Opnd1 and \p Opnd2: the two factors of the
	/// multiply, followed by the original \p Opnd1.
	///
	/// This is called before the recognized idiom is replaced by an actual ADDSUB
	/// node, so the multiply in \p Opnd0 is still used by the individual add/sub
	/// nodes; \p ExpectedUses is the exact number of uses it must have.
	/// For example, this function may be called for the following IR:
	///    %AB = fmul fast <2 x double> %A, %B
	///    %Sub = fsub fast <2 x double> %AB, %C
	///    %Add = fadd fast <2 x double> %AB, %C
	///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	///                            <2 x i32> <i32 0, i32 3>
	/// There is a def for %Addsub here, which potentially can be replaced by
	/// X86ISD::ADDSUB operation:
	///    %Addsub = X86ISD::ADDSUB %AB, %C
	/// and such ADDSUB can further be replaced with FMADDSUB:
	///    %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The check runs before the ADDSUB replacement because that replacement is
	/// sometimes illegal: e.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG,
	                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
	                                 unsigned ExpectedUses) {
	  // The first operand has to be a multiply with exactly the expected number
	  // of uses, and the target needs some flavour of FMA.
	  if (Opnd0.getOpcode() != ISD::FMUL)
	    return false;
	  if (!Opnd0->hasNUsesOfValue(ExpectedUses, 0))
	    return false;
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // FIXME: These checks must match the similar ones in
	  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
	  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
	  // or MUL + ADDSUB to FMADDSUB.
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const bool FusionAllowed =
	      Options.UnsafeFPMath || Options.AllowFPOpFusion == FPOpFusion::Fast;
	  if (!FusionAllowed)
	    return false;

	  // Rotate the operands: (A*B, C) -> (A, B, C).
	  Opnd2 = Opnd1;
	  Opnd1 = Opnd0.getOperand(1);
	  Opnd0 = Opnd0.getOperand(0);

	  return true;
	}

	/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
	/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
	/// X86ISD::FMSUBADD node respectively.
	///
	/// \returns the new node, or SDValue() when \p BV does not match or the
	/// matched operation cannot be emitted.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDValue LHS, RHS;
	  unsigned NumExtracts;
	  bool IsSubAdd;
	  const bool Matched = isAddSubOrSubAdd(BV, Subtarget, DAG, LHS, RHS,
	                                        NumExtracts, IsSubAdd);
	  if (!Matched)
	    return SDValue();

	  MVT VT = BV->getSimpleValueType(0);
	  SDLoc DL(BV);

	  // Prefer the fused form: on success LHS/RHS become the multiply's factors
	  // and Addend the value that is added/subtracted.
	  SDValue Addend;
	  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, LHS, RHS, Addend, NumExtracts)) {
	    if (IsSubAdd)
	      return DAG.getNode(X86ISD::FMSUBADD, DL, VT, LHS, RHS, Addend);
	    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, LHS, RHS, Addend);
	  }

	  // Without fusion only ADDSUB is supported, and not for 512-bit types: there
	  // are no known X86 targets with 512-bit ADDSUB instructions! The 512-bit
	  // ADDSUB idiom is recognized only as a step towards FMADDSUB.
	  if (IsSubAdd || VT.is512BitVector())
	    return SDValue();

	  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
	}

	/// Check whether \p BV computes exactly what an x86 horizontal add/sub
	/// instruction would compute.
	///
	/// Every non-undef element must be the same binop (ADD, SUB, FADD or FSUB)
	/// with a single use, applied to two constant-index extracts of adjacent
	/// lanes of one source vector. On success \p HOpcode is the matching
	/// X86ISD::HADD/HSUB/FHADD/FHSUB opcode and \p V0 / \p V1 are the source
	/// vectors (UNDEF of the build_vector type if the corresponding elements are
	/// all undef). The outputs are overwritten even when false is returned.
	/// Note that the types of \p V0 / \p V1 are not compared against the type of
	/// \p BV here.
	static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
	unsigned &HOpcode, SDValue &V0, SDValue &V1) {
	// Initialize outputs to known values.
	MVT VT = BV->getSimpleValueType(0);
	HOpcode = ISD::DELETED_NODE;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
	// half of the result is calculated independently from the 128-bit halves of
	// the inputs, so that makes the index-checking logic below more complicated.
	unsigned NumElts = VT.getVectorNumElements();
	unsigned GenericOpcode = ISD::DELETED_NODE;
	unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
	unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
	unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
	// i walks the 128-bit chunks of the result, j the elements inside a chunk.
	for (unsigned i = 0; i != Num128BitChunks; ++i) {
	for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
	// Ignore undef elements.
	SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
	if (Op.isUndef())
	continue;

	// If there's an opcode mismatch, we're done.
	if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
	return false;

	// Initialize horizontal opcode.
	// The first non-undef element decides which operation is being matched.
	if (HOpcode == ISD::DELETED_NODE) {
	GenericOpcode = Op.getOpcode();
	switch (GenericOpcode) {
	case ISD::ADD: HOpcode = X86ISD::HADD; break;
	case ISD::SUB: HOpcode = X86ISD::HSUB; break;
	case ISD::FADD: HOpcode = X86ISD::FHADD; break;
	case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
	default: return false;
	}
	}

	// Both operands must be constant-index extracts from the same vector, and
	// the binop itself must have no other user.
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op0.getOperand(0) != Op1.getOperand(0) \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\| !Op.hasOneUse())
	return false;

	// The source vector is chosen based on which 64-bit half of the
	// destination vector is being calculated.
	if (j < NumEltsIn64Bits) {
	if (V0.isUndef())
	V0 = Op0.getOperand(0);
	} else {
	if (V1.isUndef())
	V1 = Op0.getOperand(0);
	}

	// Every element of the same 64-bit half must use the same source vector.
	SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
	if (SourceVec != Op0.getOperand(0))
	return false;

	// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
	// ExpectedIndex is the lower lane of the pair: the chunk's base lane plus
	// twice the element's position within its 64-bit half.
	unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
	unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
	unsigned ExpectedIndex = i * NumEltsIn128Bits +
	(j % NumEltsIn64Bits) * 2;
	if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
	continue;

	// If this is not a commutative op, this does not match.
	if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
	return false;

	// Addition is commutative, so try swapping the extract indexes.
	// op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
	if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
	continue;

	// Extract indexes do not match horizontal requirement.
	return false;
	}
	}
	// We matched. Opcode and operands are returned by reference as arguments.
	return true;
	}

	/// Create the horizontal-op node \p HOpcode for build_vector \p BV from the
	/// source vectors \p V0 and \p V1.
	///
	/// Sources that are wider or narrower than the build_vector are first brought
	/// to its width. For a 256-bit build_vector whose upper-half elements are all
	/// undef, the operation is performed on the low 128 bits only and the result
	/// is inserted into an undef 256-bit vector.
	static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
	                                    SelectionDAG &DAG, unsigned HOpcode,
	                                    SDValue V0, SDValue V1) {
	  const MVT VT = BV->getSimpleValueType(0);
	  const unsigned Width = VT.getSizeInBits();
	  SDLoc DL(BV);

	  // If an input vector is not the same size as the build vector,
	  // extract/insert the low bits to the correct size.
	  // This is free (examples: zmm --> xmm, xmm --> ymm).
	  auto FitToWidth = [&](SDValue V) -> SDValue {
	    const unsigned Size = V.getValueSizeInBits();
	    if (Size > Width)
	      return extractSubVector(V, 0, DAG, DL, Width);
	    if (Size < Width)
	      return insertSubVector(DAG.getUNDEF(VT), V, 0, DAG, DL, Width);
	    return V;
	  };
	  V0 = FitToWidth(V0);
	  V1 = FitToWidth(V1);

	  // The upper half is needed only if one of its elements is not undef.
	  const unsigned NumElts = VT.getVectorNumElements();
	  const unsigned HalfNumElts = NumElts / 2;
	  bool UpperHalfDemanded = false;
	  for (unsigned i = HalfNumElts; i != NumElts && !UpperHalfDemanded; ++i)
	    UpperHalfDemanded = !BV->getOperand(i).isUndef();

	  // If we don't need the upper xmm, then perform as a xmm hop.
	  if (VT.is256BitVector() && !UpperHalfDemanded) {
	    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
	    V0 = extractSubVector(V0, 0, DAG, DL, 128);
	    V1 = extractSubVector(V1, 0, DAG, DL, 128);
	    SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
	    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
	  }

	  return DAG.getNode(HOpcode, DL, VT, V0, V1);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	///
	/// The natively supported vector types are tried first through
	/// isHopBuildVector. For 256-bit vectors on AVX targets a second attempt is
	/// made by matching each half (or the whole vector) with
	/// isHorizontalBinOpPart and expanding via ExpandHorizontalBinOp.
	///
	/// \returns the lowered node, or an empty SDValue if fewer than two operands
	/// are non-undef or no horizontal pattern was matched.
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  // We need at least 2 non-undef elements to make this worthwhile by default.
	  unsigned NumNonUndefs =
	      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
	  if (NumNonUndefs < 2)
	    return SDValue();

	  // There are 4 sets of horizontal math operations distinguished by type:
	  // int/FP at 128-bit/256-bit. Each type was introduced with a different
	  // subtarget feature. Try to match those "native" patterns first.
	  MVT VT = BV->getSimpleValueType(0);
	  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
	      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
	      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
	      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
	    unsigned HOpcode;
	    SDValue V0, V1;
	    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
	      return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
	  }

	  // Try harder to match 256-bit ops by using extract/concat.
	  if (!Subtarget.hasAVX() || !VT.is256BitVector())
	    return SDValue();

	  // Count the number of UNDEF operands in each half of the input
	  // build_vector.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned Half = NumElts / 2;
	  unsigned NumUndefsLO = 0;
	  unsigned NumUndefsHI = 0;
	  for (unsigned i = 0, e = Half; i != e; ++i)
	    if (BV->getOperand(i)->isUndef())
	      NumUndefsLO++;

	  for (unsigned i = Half, e = NumElts; i != e; ++i)
	    if (BV->getOperand(i)->isUndef())
	      NumUndefsHI++;

	  SDLoc DL(BV);
	  SDValue InVec0, InVec1;
	  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
	    SDValue InVec2, InVec3;
	    unsigned X86Opcode;
	    bool CanFold = true;

	    // Match the low and high halves separately. The sources of the two
	    // halves must agree unless one of them is undef.
	    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	        isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
	                              InVec3) &&
	        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
	        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
	      X86Opcode = X86ISD::HADD;
	    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
	                                   InVec1) &&
	             isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
	                                   InVec3) &&
	             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
	             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
	      X86Opcode = X86ISD::HSUB;
	    else
	      CanFold = false;

	    if (CanFold) {
	      // Do not try to expand this build_vector into a pair of horizontal
	      // add/sub if we can emit a pair of scalar add/sub.
	      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
	        return SDValue();

	      // Convert this build_vector into a pair of horizontal binops followed by
	      // a concat vector. We must adjust the outputs from the partial horizontal
	      // matching calls above to account for undefined vector halves.
	      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
	      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
	      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
	      bool isUndefLO = NumUndefsLO == Half;
	      bool isUndefHI = NumUndefsHI == Half;
	      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
	                                   isUndefHI);
	    }
	  }

	  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
	      VT == MVT::v16i16) {
	    // Match the whole vector against each supported scalar opcode in turn.
	    unsigned X86Opcode;
	    if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	      X86Opcode = X86ISD::HADD;
	    else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::HSUB;
	    else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::FHADD;
	    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
	                                   InVec1))
	      X86Opcode = X86ISD::FHSUB;
	    else
	      return SDValue();

	    // Don't try to expand this build_vector into a pair of horizontal add/sub
	    // if we can simply emit a pair of scalar add/sub.
	    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
	      return SDValue();

	    // Convert this build_vector into two horizontal add/sub followed by
	    // a concat vector.
	    bool isUndefLO = NumUndefsLO == Half;
	    bool isUndefHI = NumUndefsHI == Half;
	    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	                                 isUndefLO, isUndefHI);
	  }

	  return SDValue();
	}

	/// If every source element of a BUILD_VECTOR applies the same bit operation
	/// (AND/OR/XOR or a shift) with a constant right-hand operand, rebuild it as
	/// that operation applied to two BUILD_VECTORs: one of the left-hand
	/// operands and one of the constants.
	/// NOTE: It's not in our interest to start making a general purpose
	/// vectorizer from this, but enough scalar bit operations are created from
	/// the later legalization + scalarization stages to need basic support.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  MVT VT = Op->getSimpleValueType(0);

	  // Check that all elements have the same opcode.
	  // TODO: Should we allow UNDEFS and if so how many?
	  unsigned Opcode = Op->getOperand(0).getOpcode();
	  if (any_of(Op->op_values(),
	             [Opcode](SDValue Elt) { return Elt.getOpcode() != Opcode; }))
	    return SDValue();

	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  bool IsShift =
	      Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA;
	  if (!IsShift) {
	    if (Opcode != ISD::AND && Opcode != ISD::XOR && Opcode != ISD::OR)
	      return SDValue();

	    // Don't do this if the buildvector is a splat - we'd replace one
	    // constant with an entire vector.
	    if (Op->getSplatValue())
	      return SDValue();

	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
	      return SDValue();
	  }

	  // Split every element into its left-hand value and its constant operand.
	  SmallVector<SDValue, 4> LHSElts, RHSElts;
	  for (SDValue Elt : Op->ops()) {
	    SDValue Var = Elt.getOperand(0);
	    SDValue Imm = Elt.getOperand(1);

	    // We expect the canonicalized RHS operand to be the constant.
	    if (!isa<ConstantSDNode>(Imm))
	      return SDValue();

	    // Only shift amounts may differ in width from the element type; extend
	    // or truncate them to match.
	    if (Imm.getValueSizeInBits() != VT.getScalarSizeInBits()) {
	      if (!IsShift)
	        return SDValue();
	      Imm = DAG.getZExtOrTrunc(Imm, DL, VT.getScalarType());
	    }

	    LHSElts.push_back(Var);
	    RHSElts.push_back(Imm);
	  }

	  // Limit to shifts by uniform immediates.
	  // TODO: Only accept vXi8/vXi64 special cases?
	  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
	  if (IsShift) {
	    SDValue FirstAmt = RHSElts[0];
	    if (any_of(RHSElts, [&](SDValue Amt) { return FirstAmt != Amt; }))
	      return SDValue();
	  }

	  SDValue VarVec = DAG.getBuildVector(VT, DL, LHSElts);
	  SDValue ImmVec = DAG.getBuildVector(VT, DL, RHSElts);
	  return DAG.getNode(Opcode, DL, VT, VarVec, ImmVec);
	}

	/// Create a vector constant without a load. SSE/AVX provide the bare minimum
	/// functionality to do this, so it's all zeros, all ones, or some derivation
	/// that is cheap to calculate.
	///
	/// \returns \p Op unchanged when it already has the canonical type, a
	/// canonical zero/ones vector otherwise, or an empty SDValue if \p Op is
	/// neither an all-zeros nor (with SSE2) an all-ones build vector.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Vectors containing all zeros can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
	    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
	    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
	    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
	      return Op;

	    return getZeroVector(VT, Subtarget, DAG, DL);
	  }

	  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
	  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
	  // vpcmpeqd on 256-bit vectors.
	  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
	    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
	        (VT == MVT::v8i32 && Subtarget.hasInt256()))
	      return Op;

	    return getOnesVector(VT, DAG, DL);
	  }

	  return SDValue();
	}

	/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
	/// from a vector of source values and a vector of extraction indices.
	/// The vectors might be manipulated to match the type of the permute op.
	///
	/// \param VT          Result type of the permute.
	/// \param SrcVec      Vector the elements are taken from; it may be wider or
	///                    narrower than \p VT.
	/// \param IndicesVec  Vector of element indices; it must have at least as
	///                    many elements as \p VT.
	/// \returns the permute node, or an empty SDValue if no suitable lowering
	/// exists for \p VT on this subtarget.
	static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
	                                     SDLoc &DL, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  MVT ShuffleVT = VT;
	  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned SizeInBits = VT.getSizeInBits();

	  // Adjust IndicesVec to match VT size.
	  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
	         "Illegal variable permute mask size");
	  if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
	    IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
	                                  NumElts * VT.getScalarSizeInBits());
	  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

	  // Handle SrcVec that don't match VT type.
	  if (SrcVec.getValueSizeInBits() != SizeInBits) {
	    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
	      // Handle larger SrcVec by treating it as a larger permute.
	      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
	      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
	      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
	      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
	                                  Subtarget, DAG, SDLoc(IndicesVec));
	      return extractSubVector(
	          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
	          DAG, DL, SizeInBits);
	    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
	      // Widen smaller SrcVec to match VT.
	      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
	    } else
	      return SDValue();
	  }

	  // Rewrite indices of wide elements into indices of elements that are
	  // Scale times narrower.
	  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
	    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
	    EVT SrcVT = Idx.getValueType();
	    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
	    uint64_t IndexScale = 0;
	    uint64_t IndexOffset = 0;

	    // If we're scaling a smaller permute op, then we need to repeat the
	    // indices, scaling and offsetting them as well.
	    // e.g. v4i32 -> v16i8 (Scale = 4)
	    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
	    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
	    for (uint64_t i = 0; i != Scale; ++i) {
	      IndexScale |= Scale << (i * NumDstBits);
	      IndexOffset |= i << (i * NumDstBits);
	    }

	    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
	                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
	    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
	                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
	    return Idx;
	  };

	  // Pick the permute opcode (and the type to perform it in) for VT. Cases
	  // that need a custom sequence build and return it directly. Opcode stays 0
	  // if the subtarget has no suitable instruction.
	  unsigned Opcode = 0;
	  switch (VT.SimpleTy) {
	  default:
	    break;
	  case MVT::v16i8:
	    if (Subtarget.hasSSSE3())
	      Opcode = X86ISD::PSHUFB;
	    break;
	  case MVT::v8i16:
	    if (Subtarget.hasVLX() && Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasSSSE3()) {
	      Opcode = X86ISD::PSHUFB;
	      ShuffleVT = MVT::v16i8;
	    }
	    break;
	  case MVT::v4f32:
	  case MVT::v4i32:
	    if (Subtarget.hasAVX()) {
	      Opcode = X86ISD::VPERMILPV;
	      ShuffleVT = MVT::v4f32;
	    } else if (Subtarget.hasSSSE3()) {
	      Opcode = X86ISD::PSHUFB;
	      ShuffleVT = MVT::v16i8;
	    }
	    break;
	  case MVT::v2f64:
	  case MVT::v2i64:
	    if (Subtarget.hasAVX()) {
	      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
	      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
	      Opcode = X86ISD::VPERMILPV;
	      ShuffleVT = MVT::v2f64;
	    } else if (Subtarget.hasSSE41()) {
	      // SSE41 can compare v2i64 - select between indices 0 and 1.
	      return DAG.getSelectCC(
	          DL, IndicesVec,
	          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
	          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
	          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
	          ISD::CondCode::SETEQ);
	    }
	    break;
	  case MVT::v32i8:
	    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasXOP()) {
	      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
	      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
	      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
	      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
	      return DAG.getNode(
	          ISD::CONCAT_VECTORS, DL, VT,
	          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
	          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
	    } else if (Subtarget.hasAVX()) {
	      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
	      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
	      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
	      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
	      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                              ArrayRef<SDValue> Ops) {
	        // Permute Lo and Hi and then select based on index range.
	        // This works as SHUFB uses bits[3:0] to permute elements and we don't
	        // care about the bit[7] as its just an index vector.
	        SDValue Idx = Ops[2];
	        EVT VT = Idx.getValueType();
	        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
	                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
	                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
	                               ISD::CondCode::SETGT);
	      };
	      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
	      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
	                              PSHUFBBuilder);
	    }
	    break;
	  case MVT::v16i16:
	    if (Subtarget.hasVLX() && Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasAVX()) {
	      // Scale to v32i8 and perform as v32i8.
	      IndicesVec = ScaleIndices(IndicesVec, 2);
	      return DAG.getBitcast(
	          VT, createVariablePermute(
	                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
	                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
	    }
	    break;
	  case MVT::v8f32:
	  case MVT::v8i32:
	    if (Subtarget.hasAVX2())
	      Opcode = X86ISD::VPERMV;
	    else if (Subtarget.hasAVX()) {
	      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
	      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
	                                          {0, 1, 2, 3, 0, 1, 2, 3});
	      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
	                                          {4, 5, 6, 7, 4, 5, 6, 7});
	      if (Subtarget.hasXOP())
	        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
	                                              LoLo, HiHi, IndicesVec,
	                                              DAG.getConstant(0, DL, MVT::i8)));
	      // Permute Lo and Hi and then select based on index range.
	      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
	      SDValue Res = DAG.getSelectCC(
	          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
	          ISD::CondCode::SETGT);
	      return DAG.getBitcast(VT, Res);
	    }
	    break;
	  case MVT::v4i64:
	  case MVT::v4f64:
	    if (Subtarget.hasAVX512()) {
	      if (!Subtarget.hasVLX()) {
	        // Without VLX, widen to 8 elements, permute at that width and keep
	        // the low 256 bits of the result.
	        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
	        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
	                                SDLoc(SrcVec));
	        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
	                                    DAG, SDLoc(IndicesVec));
	        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
	                                            DAG, Subtarget);
	        return extract256BitVector(Res, 0, DAG, DL);
	      }
	      Opcode = X86ISD::VPERMV;
	    } else if (Subtarget.hasAVX()) {
	      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
	      SDValue LoLo =
	          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
	      SDValue HiHi =
	          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
	      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
	      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
	      if (Subtarget.hasXOP())
	        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
	                                              LoLo, HiHi, IndicesVec,
	                                              DAG.getConstant(0, DL, MVT::i8)));
	      // Permute Lo and Hi and then select based on index range.
	      // This works as VPERMILPD only uses index bit[1] to permute elements.
	      SDValue Res = DAG.getSelectCC(
	          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
	          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
	          ISD::CondCode::SETGT);
	      return DAG.getBitcast(VT, Res);
	    }
	    break;
	  case MVT::v64i8:
	    if (Subtarget.hasVBMI())
	      Opcode = X86ISD::VPERMV;
	    break;
	  case MVT::v32i16:
	    if (Subtarget.hasBWI())
	      Opcode = X86ISD::VPERMV;
	    break;
	  case MVT::v16f32:
	  case MVT::v16i32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	    if (Subtarget.hasAVX512())
	      Opcode = X86ISD::VPERMV;
	    break;
	  }
	  if (!Opcode)
	    return SDValue();

	  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
	         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
	         "Illegal variable permute shuffle type");

	  // If the permute is performed on narrower elements than VT, rescale the
	  // indices to address those narrower elements.
	  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
	  if (Scale > 1)
	    IndicesVec = ScaleIndices(IndicesVec, Scale);

	  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
	  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

	  // VPERMV takes the index vector as its first operand; the other opcodes
	  // take the source vector first.
	  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
	  SDValue Res = Opcode == X86ISD::VPERMV
	                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
	                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
	  return DAG.getBitcast(VT, Res);
	}

	// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
	// reasoned to be a permutation of a vector by indices in a non-constant vector.
	// (build_vector (extract_elt V, (extract_elt I, 0)),
	// (extract_elt V, (extract_elt I, 1)),
	// ...
	// ->
	// (vpermv I, V)
	//
	// Returns an empty SDValue if any operand does not have that form, or if
	// createVariablePermute cannot produce a permute for the resulting type.
	//
	// TODO: Handle undefs
	// TODO: Utilize pshufb and zero mask blending to support more efficient
	// construction of vectors with constant-0 elements.
	static SDValue
	LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDValue SrcVec, IndicesVec;
	  // Check for a match of the permute source vector and permute index elements.
	  // This is done by checking that the i-th build_vector operand is of the form:
	  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
	  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
	    SDValue Op = V.getOperand(Idx);
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract encountered in V, set the source vector,
	    // otherwise verify the extract is from the previously defined source
	    // vector.
	    if (!SrcVec)
	      SrcVec = Op.getOperand(0);
	    else if (SrcVec != Op.getOperand(0))
	      return SDValue();
	    SDValue ExtractedIndex = Op->getOperand(1);
	    // Peek through extends.
	    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
	        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
	      ExtractedIndex = ExtractedIndex.getOperand(0);
	    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract from the index vector candidate, set the
	    // indices vector, otherwise verify the extract is from the previously
	    // defined indices vector.
	    if (!IndicesVec)
	      IndicesVec = ExtractedIndex.getOperand(0);
	    else if (IndicesVec != ExtractedIndex.getOperand(0))
	      return SDValue();

	    // The index for result element Idx must be read from constant position
	    // Idx of the indices vector.
	    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
	    if (!PermIdx || PermIdx->getZExtValue() != Idx)
	      return SDValue();
	  }

	  SDLoc DL(V);
	  MVT VT = V.getSimpleValueType();
	  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
	}

	/// Custom lowering of BUILD_VECTOR.
	///
	/// Tries a sequence of progressively more generic strategies: predicate
	/// vectors, cheap constants, add-sub / horizontal ops, broadcasts, bit ops,
	/// single-variable insertion into a constant vector, single non-zero
	/// element forms, splats, variable permutes, consecutive loads, splitting of
	/// wide vectors, and finally chains of inserts or unpack shuffles.
	///
	/// \returns the lowered node, or an empty SDValue where the code below
	/// leaves the build vector to the legalizer or to a constant-pool load.
	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = Op.getNumOperands();

	  // Generate vectors for predicate vectors.
	  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

	  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	    return VectorConstant;

	  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	    return AddSub;
	  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	    return HorizontalOp;
	  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	    return Broadcast;
	  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
	    return BitOp;

	  unsigned EVTBits = EltVT.getSizeInBits();

	  // Classify the operands: count zero / non-zero / constant elements, record
	  // the positions of non-zero elements in the NonZeros bitmask and collect
	  // the set of distinct non-undef values.
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  uint64_t NonZeros = 0;
	  bool IsAllConstants = true;
	  SmallSet<SDValue, 8> Values;
	  unsigned NumConstants = NumElems;
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue Elt = Op.getOperand(i);
	    if (Elt.isUndef())
	      continue;
	    Values.insert(Elt);
	    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
	      IsAllConstants = false;
	      NumConstants--;
	    }
	    if (X86::isZeroNode(Elt))
	      NumZero++;
	    else {
	      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
	      NonZeros |= ((uint64_t)1 << i);
	      NumNonZero++;
	    }
	  }

	  // All undef vector. Return an UNDEF. All zero vectors were handled above.
	  if (NumNonZero == 0)
	    return DAG.getUNDEF(VT);

	  // If we are inserting one variable into a vector of non-zero constants, try
	  // to avoid loading each constant element as a scalar. Load the constants as a
	  // vector and then insert the variable scalar element. If insertion is not
	  // supported, fall back to a shuffle to get the scalar blended with the
	  // constants. Insertion into a zero vector is handled as a special-case
	  // somewhere below here.
	  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
	      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
	       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
	    // Create an all-constant vector. The variable element in the old
	    // build vector is replaced by undef in the constant vector. Save the
	    // variable scalar element and its index for use in the insertelement.
	    LLVMContext &Context = *DAG.getContext();
	    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
	    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
	    SDValue VarElt;
	    SDValue InsIndex;
	    for (unsigned i = 0; i != NumElems; ++i) {
	      SDValue Elt = Op.getOperand(i);
	      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
	        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
	      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
	        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
	      else if (!Elt.isUndef()) {
	        assert(!VarElt.getNode() && !InsIndex.getNode() &&
	               "Expected one variable element in this vector");
	        VarElt = Elt;
	        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
	      }
	    }
	    Constant *CV = ConstantVector::get(ConstVecOps);
	    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

	    // The constants we just created may not be legal (eg, floating point). We
	    // must lower the vector right here because we can not guarantee that we'll
	    // legalize it before loading it. This is also why we could not just create
	    // a new build vector here. If the build vector contains illegal constants,
	    // it could get split back up into a series of insert elements.
	    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
	    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
	    MachineFunction &MF = DAG.getMachineFunction();
	    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
	    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
	    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
	    unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
	    if (InsertC < NumEltsInLow128Bits)
	      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);

	    // There's no good way to insert into the high elements of a >128-bit
	    // vector, so use shuffles to avoid an extract/insert sequence.
	    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
	    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
	    SmallVector<int, 8> ShuffleMask;
	    unsigned NumElts = VT.getVectorNumElements();
	    for (unsigned i = 0; i != NumElts; ++i)
	      ShuffleMask.push_back(i == InsertC ? NumElts : i);
	    SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
	    return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
	  }

	  // Special case for single non-zero, non-undef, element.
	  if (NumNonZero == 1) {
	    unsigned Idx = countTrailingZeros(NonZeros);
	    SDValue Item = Op.getOperand(Idx);

	    // If we have a constant or non-constant insertion into the low element of
	    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	    // the rest of the elements. This will be matched as movd/movq/movss/movsd
	    // depending on what the source datatype is.
	    if (Idx == 0) {
	      if (NumZero == 0)
	        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
	          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
	        assert((VT.is128BitVector() || VT.is256BitVector() ||
	                VT.is512BitVector()) &&
	               "Expected an SSE value type!");
	        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	      }

	      // We can't directly insert an i8 or i16 into a vector, so zero extend
	      // it to i32 first.
	      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
	        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	        if (VT.getSizeInBits() >= 256) {
	          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	          if (Subtarget.hasAVX()) {
	            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	          } else {
	            // Without AVX, we need to extend to a 128-bit vector and then
	            // insert into the 256-bit vector.
	            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
	            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
	          }
	        } else {
	          assert(VT.is128BitVector() && "Expected an SSE value type!");
	          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	        }
	        return DAG.getBitcast(VT, Item);
	      }
	    }

	    // Is it a vector logical left shift?
	    if (NumElems == 2 && Idx == 1 &&
	        X86::isZeroNode(Op.getOperand(0)) &&
	        !X86::isZeroNode(Op.getOperand(1))) {
	      unsigned NumBits = VT.getSizeInBits();
	      return getVShift(true, VT,
	                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	                                   VT, Op.getOperand(1)),
	                       NumBits/2, DAG, *this, dl);
	    }

	    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	      return SDValue();

	    // Otherwise, if this is a vector with i32 or f32 elements, and the element
	    // is a non-constant being inserted into an element other than the low one,
	    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	    // movd/movss) to move this into the low element, then shuffle it into
	    // place.
	    if (EVTBits == 32) {
	      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	    }
	  }

	  // Splat is obviously ok. Let legalizer expand it to a shuffle.
	  if (Values.size() == 1) {
	    if (EVTBits == 32) {
	      // Instead of a shuffle like this:
	      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	      // Check if it's possible to issue this instead.
	      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
	      unsigned Idx = countTrailingZeros(NonZeros);
	      SDValue Item = Op.getOperand(Idx);
	      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	    }
	    return SDValue();
	  }

	  // A vector full of immediates; various special cases are already
	  // handled, so this is best done with a single constant-pool load.
	  if (IsAllConstants)
	    return SDValue();

	  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
	    return V;

	  // See if we can use a vector load to get all of the elements.
	  {
	    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	    if (SDValue LD =
	            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	      return LD;
	  }

	  // If this is a splat of pairs of 32-bit elements, we can use a narrower
	  // build_vector and broadcast it.
	  // TODO: We could probably generalize this more.
	  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
	    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
	                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
	    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
	      // Make sure all the even/odd operands match.
	      for (unsigned i = 2; i != NumElems; ++i)
	        if (Ops[i % 2] != Op.getOperand(i))
	          return false;
	      return true;
	    };
	    if (CanSplat(Op, NumElems, Ops)) {
	      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
	      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
	      // Create a new build vector and cast to v2i64/v2f64.
	      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
	                                     DAG.getBuildVector(NarrowVT, dl, Ops));
	      // Broadcast from v2i64/v2f64 and cast to final VT.
	      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
	      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
	                                            NewBV));
	    }
	  }

	  // For AVX-length vectors, build the individual 128-bit pieces and use
	  // shuffles to put them in place.
	  if (VT.getSizeInBits() > 128) {
	    MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);

	    // Build both the lower and upper subvector.
	    SDValue Lower =
	        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
	    SDValue Upper = DAG.getBuildVector(
	        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

	    // Recreate the wider vector with the lower and upper part.
	    return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
	                            VT.getSizeInBits() / 2);
	  }

	  // Let legalizer expand 2-wide build_vectors.
	  if (EVTBits == 64) {
	    if (NumNonZero == 1) {
	      // One half is zero or undef.
	      unsigned Idx = countTrailingZeros(NonZeros);
	      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	                               Op.getOperand(Idx));
	      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	    }
	    return SDValue();
	  }

	  // If element VT is < 32 bits, convert it to inserts into a zero vector.
	  if (EVTBits == 8 && NumElems == 16)
	    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	                                          DAG, Subtarget))
	      return V;

	  if (EVTBits == 16 && NumElems == 8)
	    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	                                          DAG, Subtarget))
	      return V;

	  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	  if (EVTBits == 32 && NumElems == 4)
	    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	      return V;

	  // If element VT is == 32 bits, turn it into a number of shuffles.
	  if (NumElems == 4 && NumZero > 0) {
	    SmallVector<SDValue, 8> Ops(NumElems);
	    for (unsigned i = 0; i < 4; ++i) {
	      bool isZero = !(NonZeros & (1ULL << i));
	      if (isZero)
	        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	      else
	        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	    }

	    // Combine each pair of adjacent elements (2*i, 2*i+1) into Ops[i],
	    // choosing the shuffle from the pair's two NonZeros bits.
	    for (unsigned i = 0; i < 2; ++i) {
	      switch ((NonZeros >> (i*2)) & 0x3) {
	        default: llvm_unreachable("Unexpected NonZero count");
	        case 0:
	          Ops[i] = Ops[i*2];  // Must be a zero vector.
	          break;
	        case 1:
	          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
	          break;
	        case 2:
	          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
	          break;
	        case 3:
	          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
	          break;
	      }
	    }

	    // A pair with only its second element non-zero (bit pattern 2) has its
	    // two mask entries swapped in the final shuffle.
	    bool Reverse1 = (NonZeros & 0x3) == 2;
	    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	    int MaskVec[] = {
	      Reverse1 ? 1 : 0,
	      Reverse1 ? 0 : 1,
	      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
	      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
	    };
	    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	  }

	  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

	  // Check for a build vector from mostly shuffle plus few inserting.
	  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	    return Sh;

	  // For SSE 4.1, use insertps to put the high elements into the low element.
	  if (Subtarget.hasSSE41()) {
	    SDValue Result;
	    if (!Op.getOperand(0).isUndef())
	      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	    else
	      Result = DAG.getUNDEF(VT);

	    for (unsigned i = 1; i < NumElems; ++i) {
	      if (Op.getOperand(i).isUndef()) continue;
	      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	    }
	    return Result;
	  }

	  // Otherwise, expand into a number of unpckl*, start by extending each of
	  // our (non-undef) elements to the full vector width with the element in the
	  // bottom slot of the vector (which generates no code for SSE).
	  SmallVector<SDValue, 8> Ops(NumElems);
	  for (unsigned i = 0; i < NumElems; ++i) {
	    if (!Op.getOperand(i).isUndef())
	      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	    else
	      Ops[i] = DAG.getUNDEF(VT);
	  }

	  // Next, we iteratively mix elements, e.g. for v4f32:
	  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
	  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	    // Generate scaled UNPCKL shuffle mask.
	    SmallVector<int, 16> Mask;
	    for(unsigned i = 0; i != Scale; ++i)
	      Mask.push_back(i);
	    for (unsigned i = 0; i != Scale; ++i)
	      Mask.push_back(NumElems+i);
	    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	    // Merge adjacent pairs, halving the number of live entries in Ops.
	    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
	  }
	  return Ops[0];
	}

	/// Lower a 256-/512-bit CONCAT_VECTORS node.
	///
	/// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
	/// from two other 128-bit ones.
	///
	/// Each operand is classified as undef, all-zeros, or "non-zero". If more than
	/// two operands are non-zero, the node is rebuilt as a concat of two half-width
	/// CONCAT_VECTORS nodes. Otherwise the non-zero operands are inserted, one
	/// INSERT_SUBVECTOR each, into a zero vector (when at least one operand was
	/// all-zeros) or an undef vector.
	// TODO: Detect subvector broadcast here instead of DAG combine?
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  assert((ResVT.is256BitVector() ||
	          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

	  unsigned NumOperands = Op.getNumOperands();
	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  // Bit i is set when operand i is neither undef nor an all-zeros build vector.
	  unsigned NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	      ++NumZero;
	    else {
	      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	      NonZeros |= 1u << i;
	      ++NumNonZero;
	    }
	  }

	  // If we have more than 2 non-zeros, build each half separately.
	  if (NumNonZero > 2) {
	    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                  ResVT.getVectorNumElements()/2);
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands/2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands/2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  // Otherwise, build it up through insert_subvectors.
	  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	                        : DAG.getUNDEF(ResVT);

	  MVT SubVT = Op.getOperand(0).getSimpleValueType();
	  unsigned NumSubElems = SubVT.getVectorNumElements();
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    // Undef and zero operands are already represented by the base vector.
	    if ((NonZeros & (1u << i)) == 0)
	      continue;

	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
	                      Op.getOperand(i),
	                      DAG.getIntPtrConstant(i * NumSubElems, dl));
	  }

	  return Vec;
	}

	/// Lower a CONCAT_VECTORS node whose result has i1 elements (LowerCONCAT_VECTORS
	/// dispatches here for vXi1 types).
	///
	/// Strategy, in order:
	///  - With at most one non-zero operand, insert that operand (if any) into a
	///    zero vector (when some operand was all-zeros) or an undef vector.
	///  - With more than two operands, split into two half-width CONCAT_VECTORS
	///    and concatenate those.
	///  - With exactly two non-zero operands: return the node unchanged when the
	///    result has 16 or more elements, otherwise emit two INSERT_SUBVECTORs.
	// TODO: Merge this with LowerAVXCONCAT_VECTORS?
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG & DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  unsigned NumOperands = Op.getNumOperands();

	  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
	         "Unexpected number of operands in CONCAT_VECTORS");

	  unsigned NumZero = 0;
	  unsigned NumNonZero = 0;
	  // Bit i is set when operand i is neither undef nor an all-zeros build vector.
	  uint64_t NonZeros = 0;
	  for (unsigned i = 0; i != NumOperands; ++i) {
	    SDValue SubVec = Op.getOperand(i);
	    if (SubVec.isUndef())
	      continue;
	    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	      ++NumZero;
	    else {
	      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	      NonZeros |= (uint64_t)1 << i;
	      ++NumNonZero;
	    }
	  }


	  // If there are zero or one non-zeros we can handle this very simply.
	  if (NumNonZero <= 1) {
	    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	                          : DAG.getUNDEF(ResVT);
	    if (!NumNonZero)
	      return Vec;
	    // The lowest set bit identifies the single non-zero operand.
	    unsigned Idx = countTrailingZeros(NonZeros);
	    SDValue SubVec = Op.getOperand(Idx);
	    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
	                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
	  }

	  if (NumOperands > 2) {
	    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                  ResVT.getVectorNumElements()/2);
	    ArrayRef<SDUse> Ops = Op->ops();
	    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(0, NumOperands/2));
	    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	                             Ops.slice(NumOperands/2));
	    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	  }

	  assert(NumNonZero == 2 && "Simple cases not handled?");

	  if (ResVT.getVectorNumElements() >= 16)
	    return Op; // The operation is legal with KUNPCK

	  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
	                            DAG.getUNDEF(ResVT), Op.getOperand(0),
	                            DAG.getIntPtrConstant(0, dl));
	  unsigned NumElems = ResVT.getVectorNumElements();
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
	                     DAG.getIntPtrConstant(NumElems/2, dl));
	}

	/// Entry point for lowering CONCAT_VECTORS.
	///
	/// Results with i1 elements are handled by LowerCONCAT_VECTORSvXi1; all other
	/// results must be 256-bit (two operands) or 512-bit (two or four operands)
	/// and are handled by LowerAVXCONCAT_VECTORS.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

	  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
	         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
	                                  Op.getNumOperands() == 4)));

	  // AVX can use the vinsertf128 instruction to create 256-bit vectors
	  // from two other 128-bit ones.

	  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
	  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// Report whether a single-input shuffle mask leaves every element in place.
	///
	/// The mask is expected to be the single-input kind used by the X86 shuffle
	/// instructions, not a fully general ShuffleVectorSDNode mask. An entry counts
	/// as a no-op when it is undef (negative) or equal to its own position; the
	/// mask is a no-op when every entry is.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  const int NumElts = Mask.size();
	  for (int i = 0; i != NumElts; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    const int Elt = Mask[i];
	    // Undef entries never force a shuffle.
	    if (Elt < 0)
	      continue;
	    // A defined entry that moves data means real work is required.
	    if (Elt != i)
	      return false;
	  }
	  return true;
	}

	/// Report whether any defined mask entry moves an element from one 128-bit
	/// lane into a different one.
	///
	/// X86 splits its shuffles into in-lane and cross-lane operations, so this
	/// property is tested routinely. Indices are reduced modulo the mask size
	/// before the lane comparison, so entries referring to the second input are
	/// judged by their position within that input.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  int NumElts = Mask.size();
	  for (int Dst = 0; Dst != NumElts; ++Dst) {
	    int Src = Mask[Dst];
	    if (Src < 0)
	      continue; // Undef entries cannot cross anything.
	    int SrcLane = (Src % NumElts) / EltsPerLane;
	    int DstLane = Dst / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// Test whether a shuffle mask is equivalent within each sub-lane.
	///
	/// This checks a shuffle mask to see if it is performing the same
	/// lane-relative shuffle in each sub-lane. This trivially implies
	/// that it is also not lane-crossing. It may however involve a blend from the
	/// same lane of a second vector.
	///
	/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
	/// non-trivial to compute in the face of undef lanes. The representation is
	/// suitable for use with existing 128-bit shuffles as entries from the second
	/// vector have been remapped to [LaneSize, 2*LaneSize).
	///
	/// \param LaneSizeInBits width in bits of the sub-lane to test.
	/// \param VT vector type; only its scalar size is used.
	/// \param Mask shuffle mask whose entries are undef (negative) or >= 0.
	/// \param RepeatedMask output, reset to LaneSize undef (-1) entries on entry;
	///        its contents are only meaningful when the function returns true.
	/// \returns true if every lane performs the same lane-relative shuffle.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, -1);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
	    if (Mask[i] < 0)
	      continue;
	    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
	                                : Mask[i] % LaneSize + LaneSize;
	    if (RepeatedMask[i % LaneSize] < 0)
	      // This is the first non-undef entry in this slot of a lane.
	      RepeatedMask[i % LaneSize] = LocalM;
	    else if (RepeatedMask[i % LaneSize] != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Test whether a shuffle mask repeats the same lane-relative pattern in every
	/// 128-bit lane; the per-lane pattern is written to \p RepeatedMask.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 128;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Convenience overload for callers that only need the yes/no answer: the
	/// repeated per-lane mask is computed into a scratch buffer and discarded.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  SmallVector<int, 32> Scratch;
	  const unsigned LaneBits = 128;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, Scratch);
	}

	/// Test whether a shuffle mask repeats the same lane-relative pattern in every
	/// 256-bit lane; the per-lane pattern is written to \p RepeatedMask.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 256;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Test whether a target shuffle mask is equivalent within each sub-lane.
	/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
	///
	/// \param LaneSizeInBits width in bits of the sub-lane to test.
	/// \param VT vector type; only its scalar size is used.
	/// \param Mask target shuffle mask; entries are SM_SentinelUndef,
	///        SM_SentinelZero, or >= 0.
	/// \param RepeatedMask output, reset to LaneSize SM_SentinelUndef entries on
	///        entry; its contents are only meaningful when the function returns
	///        true.
	/// \returns true if every lane performs the same lane-relative shuffle. A slot
	///          that is zero in one lane must be zero or undef in all others.
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
	    if (Mask[i] == SM_SentinelUndef)
	      continue;
	    if (Mask[i] == SM_SentinelZero) {
	      // A zero entry is only compatible with a slot that is still undef or
	      // already zero.
	      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
	        return false;
	      RepeatedMask[i % LaneSize] = SM_SentinelZero;
	      continue;
	    }
	    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
	      // This entry crosses lanes, so there is no way to model this shuffle.
	      return false;

	    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
	    // Adjust second vector indices to start at LaneSize instead of Size.
	    int LocalM =
	        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
	    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
	      // This is the first non-undef entry in this slot of a lane.
	      RepeatedMask[i % LaneSize] = LocalM;
	    else if (RepeatedMask[i % LaneSize] != LocalM)
	      // Found a mismatch with the repeated mask.
	      return false;
	  }
	  return true;
	}

	/// Checks whether a shuffle mask is equivalent to an explicit list of
	/// arguments.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// It returns true if the mask is exactly as wide as the argument list, and
	/// each element of the mask is either -1 (signifying undef) or the value given
	/// in the argument.
	///
	/// A mismatching index is still accepted when both the mask index and the
	/// expected index select from inputs that are BUILD_VECTOR nodes and the two
	/// selected build-vector operands are the same value.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  int Size = Mask.size();

	  // If the values are build vectors, we can look through them to find
	  // equivalent inputs that make the shuffles equivalent.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
	      // Indices below Size select from V1, the rest from V2.
	      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
	      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
	      if (!MaskBV || !ExpectedBV ||
	          MaskBV->getOperand(Mask[i] % Size) !=
	              ExpectedBV->getOperand(ExpectedMask[i] % Size))
	        return false;
	    }
	  }

	  return true;
	}

	/// Compare a target shuffle mask against an explicit expected pattern.
	///
	/// Both masks must have the same width. A SM_SentinelUndef (-1) entry in
	/// \p Mask matches whatever \p ExpectedMask holds at that position. Every
	/// other entry must be equal in both masks; SM_SentinelZero is the only other
	/// negative value tolerated in \p Mask, and it too must be matched exactly.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask) {
	  int Size = Mask.size();
	  if (Size != (int)ExpectedMask.size())
	    return false;
	  assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
	         "Illegal target shuffle mask");

	  for (int Idx = 0; Idx != Size; ++Idx) {
	    const int Elt = Mask[Idx];

	    // Undef is a wildcard.
	    if (Elt == SM_SentinelUndef)
	      continue;

	    // Reject unknown negative sentinels, then require an exact match.
	    const bool IsUnknownSentinel = Elt < 0 && Elt != SM_SentinelZero;
	    if (IsUnknownSentinel || Elt != ExpectedMask[Idx])
	      return false;
	  }

	  return true;
	}

	// Combine a general DAG shuffle mask with a per-element zeroable bit mask to
	// form a target shuffle mask: undef entries stay SM_SentinelUndef, entries
	// flagged in Zeroable become SM_SentinelZero, and all others keep their index.
	static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
	                                                    const APInt &Zeroable) {
	  int NumElts = Mask.size();
	  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

	  // Start from all-undef and overwrite only the defined positions.
	  SmallVector<int, 64> Result(NumElts, SM_SentinelUndef);
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    int M = Mask[Idx];
	    if (M != SM_SentinelUndef) {
	      assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
	      Result[Idx] = (Zeroable[Idx] ? SM_SentinelZero : M);
	    }
	  }
	  return Result;
	}

	// Attempt to create a shuffle mask from a VSELECT condition mask.
	//
	// Succeeds only when Cond is a build vector of constants. On success Mask has
	// one entry per condition element: index i (first operand) where the condition
	// element is a non-null constant, and i + Size (second operand) where it is
	// null or undef. Returns false, leaving Mask untouched, otherwise.
	static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
	                                         SDValue Cond) {
	  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return false;

	  unsigned Size = Cond.getValueType().getVectorNumElements();
	  Mask.resize(Size, SM_SentinelUndef);

	  for (int i = 0; i != (int)Size; ++i) {
	    SDValue CondElt = Cond.getOperand(i);
	    Mask[i] = i;
	    // Arbitrarily choose from the 2nd operand if the select condition element
	    // is undef.
	    // TODO: Can we do better by matching patterns such as even/odd?
	    if (CondElt.isUndef() || isNullConstant(CondElt))
	      Mask[i] += Size;
	  }

	  return true;
	}

	// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
	// instructions.
	//
	// Only v8i32 and v8f32 are considered. The mask is compared against the
	// binary low and high unpack masks generated for v8i16.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  if (VT != MVT::v8i32 && VT != MVT::v8f32)
	    return false;

	  SmallVector<int, 8> Unpcklwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
	                          /* Unary = */ false);
	  SmallVector<int, 8> Unpckhwd;
	  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
	                          /* Unary = */ false);
	  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
	                         isTargetShuffleEquivalent(Mask, Unpckhwd));
	  return IsUnpackwdMask;
	}

	/// Report whether \p Mask, or its commuted form, matches any unpack pattern
	/// (low or high, unary or binary) of a 128-bit vector that has Mask.size()
	/// integer elements.
	static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
	  // Create 128-bit vector type based on mask size.
	  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
	  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

	  // We can't assume a canonical shuffle mask, so try the commuted version too.
	  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(CommutedMask);

	  // Match any of unary/binary or low/high.
	  // Bit 1 of i is passed as the Lo flag and bit 0 as the Unary flag.
	  for (unsigned i = 0; i != 4; ++i) {
	    SmallVector<int, 16> UnpackMask;
	    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
	    if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
	        isTargetShuffleEquivalent(CommutedMask, UnpackMask))
	      return true;
	  }
	  return false;
	}

	/// Report whether the lower and upper halves of a shuffle mask are
	/// entry-for-entry identical (any splat mask qualifies, for instance). The
	/// comparison is on raw mask values, so an entry that is undef in only one
	/// half makes the halves differ.
	static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
	  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
	  const unsigned Half = Mask.size() / 2;
	  ArrayRef<int> Lower = Mask.slice(0, Half);
	  ArrayRef<int> Upper = Mask.slice(Half, Half);
	  for (unsigned Idx = 0; Idx != Half; ++Idx)
	    if (Lower[Idx] != Upper[Idx])
	      return false;
	  return true;
	}

	/// Get a 4-lane 8-bit shuffle immediate for a mask.
	///
	/// This helper function produces an 8-bit shuffle immediate corresponding to
	/// the ubiquitous shuffle encoding scheme used in x86 instructions for
	/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
	/// example.
	///
	/// Lane i occupies bits [2*i, 2*i+1] of the result. An undef (negative) entry
	/// is encoded as its own lane index i.
	///
	/// NB: We rely heavily on "undef" masks preserving the input lane.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

	  unsigned Imm = 0;
	  for (int i = 0; i != 4; ++i) {
	    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of bound mask element!");
	    Imm |= (Mask[i] < 0 ? i : Mask[i]) << (2 * i);
	  }
	  return Imm;
	}

	/// Wrap the 4-lane shuffle immediate computed by getV4X86ShuffleImm for
	/// \p Mask in an i8 constant node.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm8 = getV4X86ShuffleImm(Mask);
	  return DAG.getConstant(Imm8, DL, MVT::i8);
	}

	/// Compute whether each element of a shuffle is zeroable.
	///
	/// A "zeroable" vector shuffle element is one which can be lowered to zero.
	/// Either it is an undef element in the shuffle mask, the element of the input
	/// referenced is undef, or the element of the input referenced is known to be
	/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
	/// as many lanes with this technique as possible to simplify the remaining
	/// shuffle.
	///
	/// Both inputs are examined after peeking through bitcasts, so the
	/// BUILD_VECTOR that is inspected may have a different element count than the
	/// mask; both the wider-element and narrower-element cases are handled.
	///
	/// \returns an APInt with one bit per mask element; bit i is set when result
	///          element i is zeroable.
	static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
	                                            SDValue V1, SDValue V2) {
	  APInt Zeroable(Mask.size(), 0);
	  V1 = peekThroughBitcasts(V1);
	  V2 = peekThroughBitcasts(V2);

	  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	  int VectorSizeInBits = V1.getValueSizeInBits();
	  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
	  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    int M = Mask[i];
	    // Handle the easy cases.
	    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
	      Zeroable.setBit(i);
	      continue;
	    }

	    // Determine shuffle input and normalize the mask.
	    SDValue V = M < Size ? V1 : V2;
	    M %= Size;

	    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	    if (V.getOpcode() != ISD::BUILD_VECTOR)
	      continue;

	    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
	    // the (larger) source element must be UNDEF/ZERO.
	    if ((Size % V.getNumOperands()) == 0) {
	      int Scale = Size / V->getNumOperands();
	      SDValue Op = V.getOperand(M / Scale);
	      if (Op.isUndef() || X86::isZeroNode(Op))
	        Zeroable.setBit(i);
	      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
	        // Extract the ScalarSizeInBits-wide slice of the constant that this
	        // mask element refers to and test it for zero.
	        APInt Val = Cst->getAPIntValue();
	        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	        Val = Val.getLoBits(ScalarSizeInBits);
	        if (Val == 0)
	          Zeroable.setBit(i);
	      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	        // Same slice test on the bit pattern of a floating-point constant.
	        APInt Val = Cst->getValueAPF().bitcastToAPInt();
	        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	        Val = Val.getLoBits(ScalarSizeInBits);
	        if (Val == 0)
	          Zeroable.setBit(i);
	      }
	      continue;
	    }

	    // If the BUILD_VECTOR has more elements then all the (smaller) source
	    // elements must be UNDEF or ZERO.
	    if ((V.getNumOperands() % Size) == 0) {
	      int Scale = V->getNumOperands() / Size;
	      bool AllZeroable = true;
	      for (int j = 0; j < Scale; ++j) {
	        SDValue Op = V.getOperand((M * Scale) + j);
	        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
	      }
	      if (AllZeroable)
	        Zeroable.setBit(i);
	      continue;
	    }
	  }

	  return Zeroable;
	}

	// Check that the shuffle result has the shape
	//   0* a[0] 0* a[1] ... 0* a[n], n >= 0,
	// i.e. runs of zeroable elements interleaved with source elements a[] taken in
	// ascending, consecutive order. Each bit of Zeroable corresponds to one Mask
	// element, as computed by computeZeroableShuffleElements.
	//
	// The first non-zeroable mask element fixes the starting index: 0 when that
	// element is 0, otherwise the element count of VectorType (element 0 of the
	// second input). IsZeroSideLeft is set to true in the latter case and false in
	// the former; it is left untouched if every element is zeroable. Every later
	// non-zeroable element must be exactly one greater than the previous one.
	//
	// Returns false as soon as an undef mask element or an out-of-order element is
	// found, and true otherwise.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  // Next source index we expect to see; negative until the first
	  // non-zeroable element has been found.
	  int Expected = -1;
	  const int NumMaskElts = Mask.size();
	  for (int i = 0; i != NumMaskElts; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    // Undef entries are not accepted here.
	    if (Mask[i] < 0)
	      return false;
	    // Zeroable positions place no constraint on the ordering.
	    if (Zeroable[i])
	      continue;
	    // The first non-zeroable element decides which input supplies the data.
	    if (Expected < 0) {
	      if (Mask[i] != 0)
	        Expected = VectorType.getVectorNumElements();
	      else
	        Expected = 0;
	      IsZeroSideLeft = Expected != 0;
	    }
	    // Bail out when the non-zeroable elements stop being consecutive.
	    if (Mask[i] != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// Builds a per-byte control vector: undef mask elements give undef control
	/// bytes, zeroable elements give 0x80 (sign bit set), and every other element
	/// gives the lane-relative source byte index.
	///
	/// \returns the PSHUFB result bitcast back to \p VT, or an empty SDValue when
	///          the non-zeroable elements need both inputs or cross a 128-bit
	///          lane.
	static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	                                      ArrayRef<int> Mask, SDValue V1,
	                                      SDValue V2, const APInt &Zeroable,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  int Size = Mask.size();
	  int LaneSize = 128 / VT.getScalarSizeInBits();
	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
	         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
	         (Subtarget.hasBWI() && VT.is512BitVector()));

	  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	  // Sign bit set in i8 mask means zero element.
	  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	  // The single input that all non-zeroable elements come from.
	  SDValue V;
	  for (int i = 0; i < NumBytes; ++i) {
	    int M = Mask[i / NumEltBytes];
	    if (M < 0) {
	      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }
	    if (Zeroable[i / NumEltBytes]) {
	      PSHUFBMask[i] = ZeroMask;
	      continue;
	    }

	    // We can only use a single input of V1 or V2.
	    SDValue SrcV = (M >= Size ? V2 : V1);
	    if (V && V != SrcV)
	      return SDValue();
	    V = SrcV;
	    M %= Size;

	    // PSHUFB can't cross lanes, ensure this doesn't happen.
	    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
	      return SDValue();

	    // Convert the element index into a byte index within the lane.
	    M = M % LaneSize;
	    M = M * NumEltBytes + (i % NumEltBytes);
	    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	  }
	  assert(V && "Failed to find a source input");

	  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	  return DAG.getBitcast(
	      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration of a file-local helper; it is called by
	// lowerShuffleToEXPAND below to turn an integer mask constant into a value of
	// the requested vXi1 mask type. The definition lives elsewhere in this file.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	// X86 has dedicated shuffle that can be lowered to VEXPAND
	//
	// Applies when the non-zeroable mask elements are consecutive elements of one
	// input starting at its element 0 (see isNonZeroElementsInOrder). The expand
	// mask is the complement of Zeroable: a set bit marks a result position that
	// receives the next source element, and the remaining positions are taken
	// from a zero vector. Returns an empty SDValue when the pattern does not match.
	static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
	                                    const APInt &Zeroable,
	                                    ArrayRef<int> Mask, SDValue &V1,
	                                    SDValue &V2, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  bool IsLeftZeroSide = true;
	  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
	                                IsLeftZeroSide))
	    return SDValue();
	  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	  // The integer mask constant is at least 8 bits wide.
	  MVT IntegerType =
	      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
	  unsigned NumElts = VT.getVectorNumElements();
	  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
	         "Unexpected number of vector elements");
	  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
	                              Subtarget, DAG, DL);
	  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
	  // IsLeftZeroSide is true when the data elements come from the second input.
	  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
	  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
	}

	/// Try to match a target shuffle mask against the UNPCKL/UNPCKH patterns.
	///
	/// Attempts, in order: a direct low/high unpack match; for unary shuffles, an
	/// unpack of the input with a zero vector; for binary shuffles, a match
	/// against the commuted low/high unpack masks.
	///
	/// \param V1,V2 in/out operands. On success they are rewritten to the operands
	///        the unpack node should use (possibly undef, zero, duplicated, or
	///        swapped).
	/// \param UnpackOpcode set to X86ISD::UNPCKL or X86ISD::UNPCKH on success.
	/// \param IsUnary whether the shuffle reads only one input.
	/// \returns true if a match was found.
	static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	                                        unsigned &UnpackOpcode, bool IsUnary,
	                                        ArrayRef<int> TargetMask,
	                                        const SDLoc &DL, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  int NumElts = VT.getVectorNumElements();

	  // Classify the even (1) and odd (2) result positions: all undef, or all
	  // undef-or-zero.
	  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	  for (int i = 0; i != NumElts; i += 2) {
	    int M1 = TargetMask[i + 0];
	    int M2 = TargetMask[i + 1];
	    Undef1 &= (SM_SentinelUndef == M1);
	    Undef2 &= (SM_SentinelUndef == M2);
	    Zero1 &= isUndefOrZero(M1);
	    Zero2 &= isUndefOrZero(M2);
	  }
	  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
	         "Zeroable shuffle detected");

	  // Attempt to match the target mask against the unpack lo/hi mask patterns.
	  SmallVector<int, 64> Unpckl, Unpckh;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	    UnpackOpcode = X86ISD::UNPCKL;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	    UnpackOpcode = X86ISD::UNPCKH;
	    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	    return true;
	  }

	  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	  if (IsUnary && (Zero1 || Zero2)) {
	    // Don't bother if we can blend instead.
	    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
	        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	      return false;

	    bool MatchLo = true, MatchHi = true;
	    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
	      int M = TargetMask[i];

	      // Ignore if the input is known to be zero or the index is undef.
	      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
	          (M == SM_SentinelUndef))
	        continue;

	      MatchLo &= (M == Unpckl[i]);
	      MatchHi &= (M == Unpckh[i]);
	    }

	    if (MatchLo || MatchHi) {
	      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	      // V2 must be assigned first: it may need the original V1.
	      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	      return true;
	    }
	  }

	  // If a binary shuffle, commute and try again.
	  if (!IsUnary) {
	    ShuffleVectorSDNode::commuteMask(Unpckl);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	      UnpackOpcode = X86ISD::UNPCKL;
	      std::swap(V1, V2);
	      return true;
	    }

	    ShuffleVectorSDNode::commuteMask(Unpckh);
	    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	      UnpackOpcode = X86ISD::UNPCKH;
	      std::swap(V1, V2);
	      return true;
	    }
	  }

	  return false;
	}

	// X86 has dedicated unpack instructions that can handle specific blend
	// operations: UNPCKH and UNPCKL.
	//
	// Tries the binary low and high unpack masks with the operands as given, then
	// the commuted masks with the operands swapped. Returns the matching
	// UNPCKL/UNPCKH node, or an empty SDValue if none of the four patterns match.
	static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
	                                     SelectionDAG &DAG) {
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	/// Check whether \p Mask has the shape of a truncation with stride \p Delta.
	///
	/// The first Size / Delta entries must be undef or step through the truncated
	/// input by \p Delta (e.g. <0, 2, 4, 6, ...>, or <8, 10, 12, 14, ...> when the
	/// operands were swapped), and none of the remaining entries may reference
	/// that input.
	static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
	                                      int Delta) {
	  const int NumElts = (int)Mask.size();
	  const int NumTruncElts = NumElts / Delta;
	  // First mask index that belongs to the truncated input.
	  const int TruncBase = SwappedOps ? NumElts : 0;

	  ArrayRef<int> Tail = Mask.slice(NumTruncElts, NumElts - NumTruncElts);

	  // Leading part strides through the truncated input, and the tail must not
	  // refer to the truncated input's elements.
	  return isSequentialOrUndefInRange(Mask, 0, NumTruncElts, TruncBase, Delta) &&
	         !isAnyInRange(Tail, TruncBase, TruncBase + NumElts);
	}

	// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
	//
	// Example input DAG:
	//
	// t0: ch = EntryToken
	// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
	// t25: v4i32 = truncate t2
	// t41: v8i16 = bitcast t25
	// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
	// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
	// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
	// t18: v2i64 = bitcast t51
	//
	// Without avx512vl, this is lowered to:
	//
	// vpmovqd %zmm0, %ymm0
	// vpshufb {{.*#+}} xmm0 =
	// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
	//
	// With avx512vl a single vpmovdw instruction is enough.
	//
	// Returns an X86ISD::VTRUNC of the pre-truncation source on success and an
	// empty SDValue otherwise.
	static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
	                                     MVT VT, SDValue V1, SDValue V2,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // Only v16i8 / v8i16 results with a full-width mask are handled.
	  const bool IsHandledVT = VT == MVT::v16i8 || VT == MVT::v8i16;
	  if (!IsHandledVT || Mask.size() != VT.getVectorNumElements())
	    return SDValue();

	  // One operand has to be an all-zeros build vector; canonicalize it to V2
	  // and remember whether that required a swap.
	  const bool SwappedOps = !ISD::isBuildVectorAllZeros(V2.getNode());
	  if (SwappedOps) {
	    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
	      return SDValue();
	    std::swap(V1, V2);
	  }

	  // The other operand must look like one of:
	  //
	  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
	  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
	  //
	  // or similar.
	  if (V1.getOpcode() != ISD::BITCAST)
	    return SDValue();
	  SDValue Trunc = V1.getOperand(0);
	  if (Trunc.getOpcode() != ISD::TRUNCATE)
	    return SDValue();

	  SDValue Src = Trunc.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();

	  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
	  // are only available with avx512vl.
	  const bool HasTruncForSrc = SrcVT.is512BitVector() || Subtarget.hasVLX();
	  if (!HasTruncForSrc)
	    return SDValue();

	  // Down Convert Word to Byte is only available with avx512bw. The case with
	  // 256-bit output doesn't contain a shuffle and is therefore not handled here.
	  const bool IsWordToByte =
	      SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8;
	  if (IsWordToByte && !Subtarget.hasBWI())
	    return SDValue();

	  // The first half/quarter of the mask should refer to every second/fourth
	  // element of the vector truncated and bitcasted.
	  const bool MatchesTrunc = matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) ||
	                            matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4);
	  if (!MatchesTrunc)
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
	}

	// X86 has dedicated pack instructions that can handle specific truncation
	// operations: PACKSS and PACKUS.
	//
	// Tests TargetMask against the binary and then the unary pack shuffle mask.
	// For a matching mask the inputs are viewed as vectors of double-width
	// integers (PackVT) and one of two conditions must hold for each non-undef
	// input:
	//  - PACKUS: the upper half of every wide element is known zero (only tried
	//    with SSE4.1 or when the wide element type is i16);
	//  - PACKSS: every wide element has more than BitSize sign bits.
	//
	// On success V1/V2 are replaced by the inputs bitcast to PackVT, SrcVT is set
	// to PackVT, PackOpcode to the chosen opcode, and true is returned.
	static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
	                                       SDValue &V2, unsigned &PackOpcode,
	                                       ArrayRef<int> TargetMask,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned BitSize = VT.getScalarSizeInBits();
	  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
	  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

	  auto MatchPACK = [&](SDValue N1, SDValue N2) {
	    SDValue VV1 = DAG.getBitcast(PackVT, N1);
	    SDValue VV2 = DAG.getBitcast(PackVT, N2);
	    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
	      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
	      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
	          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
	        V1 = VV1;
	        V2 = VV2;
	        SrcVT = PackVT;
	        PackOpcode = X86ISD::PACKUS;
	        return true;
	      }
	    }
	    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
	        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
	      V1 = VV1;
	      V2 = VV2;
	      SrcVT = PackVT;
	      PackOpcode = X86ISD::PACKSS;
	      return true;
	    }
	    return false;
	  };

	  // Try binary shuffle.
	  SmallVector<int, 32> BinaryMask;
	  createPackShuffleMask(VT, BinaryMask, false);
	  if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
	    if (MatchPACK(V1, V2))
	      return true;

	  // Try unary shuffle.
	  SmallVector<int, 32> UnaryMask;
	  createPackShuffleMask(VT, UnaryMask, true);
	  if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
	    if (MatchPACK(V1, V1))
	      return true;

	  return false;
	}

	/// Lower a shuffle to a single pack node when matchVectorShuffleWithPACK
	/// accepts the mask; returns an empty SDValue otherwise.
	static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
	                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT PackVT;
	  unsigned PackOpcode;
	  const bool IsPack = matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode,
	                                                 Mask, DAG, Subtarget);
	  if (!IsPack)
	    return SDValue();

	  // The matcher has already rewritten V1/V2; bitcast them to the pack type.
	  return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
	                     DAG.getBitcast(PackVT, V2));
	}

	/// Try to lower a shuffle as an AND with a constant mask.
	///
	/// This applies when every element is either zeroable or passes through the
	/// same position of one single input, so the result is that input with some
	/// elements cleared. Returns an empty SDValue otherwise.
	static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                     SDValue V2, ArrayRef<int> Mask,
	                                     const APInt &Zeroable,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  MVT EltVT = VT.getVectorElementType();
	  MVT MaskVT = VT;
	  // Build the mask constants as f64 when i64 isn't legal.
	  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
	    EltVT = MVT::f64;
	    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
	  }

	  // Pick the "keep" and "clear" constants. For floating point masks the AND
	  // itself is performed on the integer vector type of the same element width.
	  SDValue Zero, AllOnes;
	  MVT LogicVT = VT;
	  const bool IsFPMask = EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (IsFPMask) {
	    Zero = DAG.getConstantFP(0.0, DL, EltVT);
	    AllOnes = DAG.getConstantFP(
	        APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
	    LogicVT =
	        MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
	  } else {
	    Zero = DAG.getConstant(0, DL, EltVT);
	    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
	  }

	  // Start from an all-zero mask and switch on the elements that survive.
	  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
	  SDValue V;
	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    if (Zeroable[Idx])
	      continue;
	    const int M = Mask[Idx];
	    if (M % NumElts != Idx)
	      return SDValue(); // Not a blend.

	    SDValue Src = M < NumElts ? V1 : V2;
	    if (V && V != Src)
	      return SDValue(); // Can only let one input through the mask.
	    V = Src;

	    VMaskOps[Idx] = AllOnes;
	  }
	  if (!V)
	    return SDValue(); // No non-zeroable elements!

	  SDValue VMask =
	      DAG.getBitcast(LogicVT, DAG.getBuildVector(MaskVT, DL, VMaskOps));
	  SDValue Input = DAG.getBitcast(LogicVT, V);
	  SDValue Masked = DAG.getNode(ISD::AND, DL, LogicVT, Input, VMask);
	  return DAG.getBitcast(VT, Masked);
	}

	/// Try to lower a blend as (V1 & M) | (~M & V2) with a constant mask M.
	///
	/// This is a fallback for when first class blend instructions are
	/// unavailable. It only handles integer vectors (asserted), and returns an
	/// empty SDValue if any defined mask element moves to a different position.
	static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                      SDValue V2, ArrayRef<int> Mask,
	                                      SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");
	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  // One mask element per result element: all-ones selects V1, zero selects
	  // V2. Undefined elements end up selecting V1.
	  const int NumElts = Mask.size();
	  SmallVector<SDValue, 16> SelectV1;
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    const bool Moved = M >= 0 && M != Idx && M != Idx + NumElts;
	    if (Moved)
	      return SDValue(); // Shuffled input!
	    SelectV1.push_back(M < NumElts ? AllOnes : Zero);
	  }

	  SDValue V1Mask = DAG.getBuildVector(VT, DL, SelectV1);
	  SDValue KeptV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
	  SDValue KeptV2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
	  return DAG.getNode(ISD::OR, DL, VT, KeptV1, KeptV2);
	}

	// Forward declaration so that lowerShuffleAsBlend below can call this helper
	// before its definition appears.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// Check whether TargetMask is a blend of V1 and V2, i.e. every element stays
	/// in its own position and only the choice of input varies.
	///
	/// On success bit i of BlendMask is set when element i comes from V2. Elements
	/// marked SM_SentinelZero are accepted only if one of the inputs is undef or
	/// an all-zeros build vector; the matching ForceV1Zero/ForceV2Zero flag is
	/// then set and that TargetMask entry is rewritten to a plain blend index.
	/// Returns false as soon as an element cannot be expressed this way.
	static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
	                                      MutableArrayRef<int> TargetMask,
	                                      bool &ForceV1Zero, bool &ForceV2Zero,
	                                      uint64_t &BlendMask) {
	  const bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  const bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false;
	  ForceV2Zero = false;
	  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Build the binary blend mask. If an input is zero then any zero element
	  // can be taken from it.
	  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
	  const int NumElts = TargetMask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = TargetMask[Idx];

	    // Undefined elements and elements already in place in V1 need no bit.
	    if (M == SM_SentinelUndef || M == Idx)
	      continue;

	    // In-place element of V2.
	    if (M == Idx + NumElts) {
	      BlendMask |= 1ull << Idx;
	      continue;
	    }

	    // Anything else must be a zero element that one of the inputs supplies.
	    if (M != SM_SentinelZero)
	      return false;

	    if (V1IsZeroOrUndef) {
	      ForceV1Zero = true;
	      TargetMask[Idx] = Idx;
	    } else if (V2IsZeroOrUndef) {
	      ForceV2Zero = true;
	      BlendMask |= 1ull << Idx;
	      TargetMask[Idx] = Idx + NumElts;
	    } else {
	      return false;
	    }
	  }
	  return true;
	}

	/// Widen a blend mask: each of the low \p Size bits of \p BlendMask is
	/// replicated into \p Scale consecutive bits of the result, so bit i of the
	/// input controls bits [i * Scale, (i + 1) * Scale) of the output.
	static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
	                                            int Scale) {
	  uint64_t Widened = 0;
	  for (int Bit = 0; Bit != Size; ++Bit) {
	    const bool IsSet = ((BlendMask >> Bit) & 1) != 0;
	    if (!IsSet)
	      continue;
	    // A run of Scale one-bits, moved to this element's slot.
	    const uint64_t Run = (1ull << Scale) - 1;
	    Widened |= Run << (Bit * Scale);
	  }
	  return Widened;
	}

	/// Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// Returns an empty SDValue when the mask (combined with \p Zeroable) is not a
	/// blend according to matchVectorShuffleAsBlend.
	static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Original,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Work on a private copy of the mask: matchVectorShuffleAsBlend may rewrite
	// zero elements into plain blend indices.
	SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
	BlendMask))
	return SDValue();

	// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	if (ForceV1Zero)
	V1 = getZeroVector(VT, Subtarget, DAG, DL);
	if (ForceV2Zero)
	V2 = getZeroVector(VT, Subtarget, DAG, DL);

	// Dispatch on the vector type. The immediate-blend cases fall through from
	// wider to narrower types so that each type's feature asserts accumulate.
	switch (VT.SimpleTy) {
	case MVT::v4i64:
	case MVT::v8i32:
	assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	LLVM_FALLTHROUGH;
	case MVT::v4f64:
	case MVT::v8f32:
	assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
	LLVM_FALLTHROUGH;
	case MVT::v2f64:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v4i32:
	case MVT::v8i16:
	assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
	// One blend-mask bit per element; emitted as an i8 immediate.
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8));
	case MVT::v16i16: {
	assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	// Rebuild the 8-bit immediate from the per-lane repeated mask.
	BlendMask = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	BlendMask \|= 1ull << i;
	return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8));
	}
	// Use PBLENDW for lower/upper lanes and then blend lanes.
	// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
	// merge to VSELECT where useful.
	uint64_t LoMask = BlendMask & 0xFF;
	uint64_t HiMask = (BlendMask >> 8) & 0xFF;
	// Only taken when one of the two per-lane masks is all-zeros or all-ones.
	if (LoMask == 0 \|\| LoMask == 255 \|\| HiMask == 0 \|\| HiMask == 255) {
	SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	DAG.getConstant(LoMask, DL, MVT::i8));
	SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	DAG.getConstant(HiMask, DL, MVT::i8));
	// Take elements 0-7 from Lo and elements 8-15 from Hi.
	return DAG.getVectorShuffle(
	MVT::v16i16, DL, Lo, Hi,
	{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
	}
	// Otherwise fall through to the byte-blend lowering below.
	LLVM_FALLTHROUGH;
	}
	case MVT::v32i8:
	assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
	LLVM_FALLTHROUGH;
	case MVT::v16i8: {
	assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

	// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return Masked;

	// With BWI and VLX use the same masked-move lowering as the 512-bit cases
	// further down.
	if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	MVT IntegerType =
	MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	}

	// Scale the blend by the number of bytes per element.
	int Scale = VT.getScalarSizeInBits() / 8;

	// This form of blend is always done on bytes. Compute the byte vector
	// type.
	MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	// x86 allows load folding with blendvb from the 2nd source operand. But
	// we are still using LLVM select here (see comment below), so that's V1.
	// If V2 can be load-folded and V1 cannot be load-folded, then commute to
	// allow that load-folding possibility.
	if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
	ShuffleVectorSDNode::commuteMask(Mask);
	std::swap(V1, V2);
	}

	// Compute the VSELECT mask. Note that VSELECT is really confusing in the
	// mix of LLVM's code generator and the x86 backend. We tell the code
	// generator that boolean values in the elements of an x86 vector register
	// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	// mapping a select to operand #1, and 'false' mapping to operand #2. The
	// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	// of the element (the remaining are ignored) and 0 in that high bit would
	// mean operand #1 while 1 in the high bit would mean operand #2. So while
	// the LLVM model for boolean values in vector elements gets the relevant
	// bit set, it is set backwards and over constrained relative to x86's
	// actual model.
	SmallVector<SDValue, 32> VSELECTMask;
	// Emit Scale mask bytes per element: undef for negative mask entries, -1
	// to select V1 and 0 to select V2.
	for (int i = 0, Size = Mask.size(); i < Size; ++i)
	for (int j = 0; j < Scale; ++j)
	VSELECTMask.push_back(
	Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	MVT::i8));

	V1 = DAG.getBitcast(BlendVT, V1);
	V2 = DAG.getBitcast(BlendVT, V2);
	return DAG.getBitcast(
	VT,
	DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	V1, V2));
	}
	case MVT::v16f32:
	case MVT::v8f64:
	case MVT::v8i64:
	case MVT::v16i32:
	case MVT::v32i16:
	case MVT::v64i8: {
	// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
	bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
	if (!OptForSize) {
	if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return Masked;
	}

	// Otherwise load an immediate into a GPR, cast to k-register, and use a
	// masked move.
	// The mask constant is an integer with one bit per element, at least 8 bits
	// wide.
	MVT IntegerType =
	MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	}
	default:
	llvm_unreachable("Not a supported integer vector type!");
	}
	}

	/// Try to lower a two-input shuffle as a blend of the inputs followed by a
	/// single-input permutation of the blended vector.
	///
	/// This works when, for each element position (mask index modulo the vector
	/// size), every mask entry that refers to that position names the same input,
	/// so one blend can collect all the elements the permutation needs.
	///
	/// With \p ImmBlends set and 8-bit elements, the blend mask must additionally
	/// satisfy canWidenShuffleElements. Returns an empty SDValue on failure.
	static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             SelectionDAG &DAG,
	                                             bool ImmBlends = false) {
	  // The blend mask is built up while checking that a blend is viable at all.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    assert(M < NumElts * 2 && "Shuffle input is out of bounds.");

	    // Claim this position of the blend for the input M refers to, or verify
	    // that an earlier element already claimed it for the same input.
	    const int Pos = M % NumElts;
	    int &Claimed = BlendMask[Pos];
	    if (Claimed < 0)
	      Claimed = M;
	    else if (Claimed != M)
	      return SDValue(); // Can't blend in the needed input!

	    PermuteMask[Idx] = Pos;
	  }

	  // If only immediate blends, then bail if the blend mask can't be widened to
	  // i16.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const bool NeedsWidening = ImmBlends && EltBits == 8;
	  if (NeedsWidening && !canWidenShuffleElements(BlendMask))
	    return SDValue();

	  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
	}

	/// Try to lower as an unpack of elements from two inputs followed by
	/// a single-input permutation.
	///
	/// This matches the pattern where we can unpack elements from two inputs and
	/// then reduce the shuffle to a single-input (wider) permutation.
	///
	/// Returns an empty SDValue when the mask cannot be expressed this way.
	static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	int NumElts = Mask.size();
	// Lanes are 128 bits wide.
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;
	int NumHalfLaneElts = NumLaneElts / 2;

	// MatchLo/MatchHi stay true while every defined element seen so far refers to
	// the low/high half of its own lane. Ops[0] and Ops[1] record which input
	// feeds the even and odd result positions respectively.
	bool MatchLo = true, MatchHi = true;
	SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

	// Determine UNPCKL/UNPCKH type and operand order.
	for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
	for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
	int M = Mask[Lane + Elt];
	if (M < 0)
	continue;

	// All even positions must read one input and all odd positions one input;
	// bail out on the first conflict.
	SDValue &Op = Ops[Elt & 1];
	if (M < NumElts && (Op.isUndef() \|\| Op == V1))
	Op = V1;
	else if (NumElts <= M && (Op.isUndef() \|\| Op == V2))
	Op = V2;
	else
	return SDValue();

	int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
	MatchLo &= isUndefOrInRange(M, Lo, Mid) \|\|
	isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
	MatchHi &= isUndefOrInRange(M, Mid, Hi) \|\|
	isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
	if (!MatchLo && !MatchHi)
	return SDValue();
	}
	}
	assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

	// Now check that each pair of elts come from the same unpack pair
	// and set the permute mask based on each pair.
	// TODO - Investigate cases where we permute individual elements.
	SmallVector<int, 32> PermuteMask(NumElts, -1);
	for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
	for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
	int M0 = Mask[Lane + Elt + 0];
	int M1 = Mask[Lane + Elt + 1];
	if (0 <= M0 && 0 <= M1 &&
	(M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
	return SDValue();
	// In the unpacked vector the pair with half-lane index k sits at
	// positions 2*k and 2*k+1 of the lane.
	if (0 <= M0)
	PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
	if (0 <= M1)
	PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
	}
	}

	unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
	return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
	}

	/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
	/// permuting the elements of the result in place.
	///
	/// Returns an empty SDValue when the subtarget lacks the required feature for
	/// the vector width, when the mask crosses 128-bit lanes, or when the element
	/// ranges used from the two inputs overlap.
	static SDValue lowerShuffleAsByteRotateAndPermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	// Feature gate per vector width: SSSE3 for 128-bit, AVX2 for 256-bit, BWI for
	// 512-bit.
	if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) \|\|
	(VT.is256BitVector() && !Subtarget.hasAVX2()) \|\|
	(VT.is512BitVector() && !Subtarget.hasBWI()))
	return SDValue();

	// We don't currently support lane crossing permutes.
	if (is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Scale is the element size in bytes; it converts an element rotation into
	// the byte immediate used below.
	int Scale = VT.getScalarSizeInBits() / 8;
	int NumLanes = VT.getSizeInBits() / 128;
	int NumElts = VT.getVectorNumElements();
	int NumEltsPerLane = NumElts / NumLanes;

	// Determine range of mask elts.
	// Range1/Range2 hold the (min, max) in-lane index referenced from V1/V2.
	// Blend1/Blend2 stay true while every element taken from V1/V2 is already in
	// its own position.
	bool Blend1 = true;
	bool Blend2 = true;
	std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
	std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
	for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
	for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
	int M = Mask[Lane + Elt];
	if (M < 0)
	continue;
	if (M < NumElts) {
	Blend1 &= (M == (Lane + Elt));
	assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
	M = M % NumEltsPerLane;
	Range1.first = std::min(Range1.first, M);
	Range1.second = std::max(Range1.second, M);
	} else {
	M -= NumElts;
	Blend2 &= (M == (Lane + Elt));
	assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
	M = M % NumEltsPerLane;
	Range2.first = std::min(Range2.first, M);
	Range2.second = std::max(Range2.second, M);
	}
	}
	}

	// Bail if we don't need both elements.
	// TODO - it might be worth doing this for unary shuffles if the permute
	// can be widened.
	// NOTE(review): a range that was never updated keeps (INT_MAX, INT_MIN),
	// which satisfies both comparisons below, so this test does not by itself
	// reject a mask that references only one input - confirm whether callers
	// guarantee that both inputs are used.
	if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) \|\|
	!(0 <= Range2.first && Range2.second < NumEltsPerLane))
	return SDValue();

	// For vectors wider than 128 bits, give up if either input is only blended
	// in place.
	if (VT.getSizeInBits() > 128 && (Blend1 \|\| Blend2))
	return SDValue();

	// Rotate the 2 ops so we can access both ranges, then permute the result.
	// RotAmt is the rotation in elements; Ofs is added to V1 indices and
	// subtracted from V2 indices before they are reduced modulo the lane size.
	auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
	MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	SDValue Rotate = DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
	DAG.getBitcast(ByteVT, Lo),
	DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
	SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
	for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
	for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
	int M = Mask[Lane + Elt];
	if (M < 0)
	continue;
	if (M < NumElts)
	PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
	else
	PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
	}
	}
	return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
	};

	// Check if the ranges are small enough to rotate from either direction.
	// The two used ranges must not overlap: whichever starts higher supplies the
	// low operand of the rotate.
	if (Range2.second < Range1.first)
	return RotateAndPermute(V1, V2, Range1.first, 0);
	if (Range1.second < Range2.first)
	return RotateAndPermute(V2, V1, Range2.first, NumElts);
	return SDValue();
	}

	/// Generic fallback that splits a two-input shuffle into independent
	/// single-input permutes of V1 and V2 followed by a blend of the results.
	///
	/// When both inputs would need a real permute, cheaper combined strategies
	/// (blend+permute, unpack+permute, byte-rotate+permute) are tried first, in
	/// that order, before falling back to the generic decomposition.
	static SDValue lowerShuffleAsDecomposedShuffleBlend(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  // Per-input masks that move each element to its final position, plus the
	  // mask that blends the two permuted inputs together.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);

	  const int NumElts = Mask.size();
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;
	    if (M < NumElts) {
	      V1Mask[Idx] = M;
	      BlendMask[Idx] = Idx;
	    } else {
	      V2Mask[Idx] = M - NumElts;
	      BlendMask[Idx] = Idx + NumElts;
	    }
	  }

	  // Shuffling the inputs separately is preferred because the shuffle may fold
	  // with a load or give some other benefit. But when neither input permute is
	  // a no-op that costs two shuffles, so a 2-input pre-shuffle followed by a
	  // single permute is the better strategy if one matches.
	  const bool BothInputsPermuted =
	      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
	  if (BothInputsPermuted) {
	    // Only prefer immediate blends to unpack/rotate.
	    if (SDValue ImmBlendPerm =
	            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
	      return ImmBlendPerm;
	    if (SDValue UnpackPerm =
	            lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
	      return UnpackPerm;
	    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
	            DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return RotatePerm;
	    // Unpack/rotate failed - try again with variable blends.
	    if (SDValue VarBlendPerm =
	            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
	      return VarBlendPerm;
	  }

	  SDValue PermutedV1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  SDValue PermutedV2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, PermutedV1, PermutedV2, BlendMask);
	}

	/// Try to lower a vector shuffle as a rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// Returns the rotation amount in elements, or -1 if \p Mask is not a rotation.
	/// On success \p V1 and \p V2 are overwritten with the low and high source
	/// vectors of the rotation (the same vector twice if only one was referenced).
	/// On a -1 return \p V1 and \p V2 are left unchanged. Callers in this file
	/// treat any non-positive result as failure.
	static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
	int NumElts = Mask.size();

	// We need to detect various ways of spelling a rotation:
	// [11, 12, 13, 14, 15, 0, 1, 2]
	// [-1, 12, 13, 14, -1, -1, 1, -1]
	// [-1, -1, -1, -1, -1, -1, 1, 2]
	// [ 3, 4, 5, 6, 7, 8, 9, 10]
	// [-1, 4, 5, 6, -1, -1, 9, -1]
	// [-1, 4, 5, 6, -1, -1, -1, -1]
	// Rotation == 0 means "not determined yet".
	int Rotation = 0;
	SDValue Lo, Hi;
	for (int i = 0; i < NumElts; ++i) {
	int M = Mask[i];
	assert((M == SM_SentinelUndef \|\| (0 <= M && M < (2*NumElts))) &&
	"Unexpected mask index.");
	if (M < 0)
	continue;

	// Determine where a rotated vector would have started.
	int StartIdx = i - (M % NumElts);
	if (StartIdx == 0)
	// The identity rotation isn't interesting, stop.
	return -1;

	// If we found the tail of a vector the rotation must be the missing
	// front. If we found the head of a vector, it must be how much of the
	// head.
	int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

	if (Rotation == 0)
	Rotation = CandidateRotation;
	else if (Rotation != CandidateRotation)
	// The rotations don't match, so we can't match this mask.
	return -1;

	// Compute which value this mask is pointing at.
	SDValue MaskV = M < NumElts ? V1 : V2;

	// Compute which of the two target values this index should be assigned
	// to. This reflects whether the high elements are remaining or the low
	// elements are remaining.
	SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

	// Either set up this value if we've not encountered it before, or check
	// that it remains consistent.
	if (!TargetV)
	TargetV = MaskV;
	else if (TargetV != MaskV)
	// This may be a rotation, but it pulls from the inputs in some
	// unsupported interleaving.
	return -1;
	}

	// Check that we successfully analyzed the mask, and normalize the results.
	assert(Rotation != 0 && "Failed to locate a viable rotation!");
	assert((Lo \|\| Hi) && "Failed to find a rotated input vector!");
	// If only one side was seen, use it for both operands.
	if (!Lo)
	Lo = Hi;
	else if (!Hi)
	Hi = Lo;

	V1 = Lo;
	V2 = Hi;

	return Rotation;
	}

	/// Try to match a vector shuffle as a byte rotation.
	///
	/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
	/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
	/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine only
	/// checks whether the mask is valid to lower in that form, not whether doing
	/// so is profitable. It matches shuffle vectors that look like:
	///
	/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns the rotation in bytes, or -1 if the mask does not match. V1 and V2
	/// are updated by matchShuffleAsRotate when it finds a rotation.
	static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                    ArrayRef<int> Mask) {
	  // Don't accept any shuffles with zero elements.
	  for (int M : Mask)
	    if (M == SM_SentinelZero)
	      return -1;

	  // PALIGNR works on 128-bit lanes, so every lane must repeat the same mask.
	  SmallVector<int, 16> LaneMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return -1;

	  const int EltRotation = matchShuffleAsRotate(V1, V2, LaneMask);
	  if (EltRotation <= 0)
	    return -1;

	  // PALIGNR rotates bytes, so convert the element rotation using the number
	  // of bytes each element occupies in a 16-byte lane.
	  const int EltsPerLane = LaneMask.size();
	  const int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// Lower a shuffle that matchShuffleAsByteRotate accepts, using PALIGNR when
	/// SSSE3 is available and a byte-shift/byte-shift/OR sequence otherwise.
	/// Returns an empty SDValue if the mask is not a byte rotation.
	static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
	                                        SDValue V2, ArrayRef<int> Mask,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  SDValue Lo = V1, Hi = V2;
	  const int RotBytes = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
	  if (RotBytes <= 0)
	    return SDValue();

	  // Both lowerings operate on an i8 vector of the same total width.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  Lo = DAG.getBitcast(ByteVT, Lo);
	  Hi = DAG.getBitcast(ByteVT, Hi);

	  if (!Subtarget.hasSSSE3()) {
	    // Default SSE2 implementation: shift the two inputs in opposite
	    // directions and OR the pieces together.
	    assert(VT.is128BitVector() &&
	           "Rotate-based lowering only supports 128-bit lowering!");
	    assert(Mask.size() <= 16 &&
	           "Can shuffle at most 16 bytes in a 128-bit vector!");
	    assert(ByteVT == MVT::v16i8 &&
	           "SSE2 rotate lowering only needed for v16i8!");

	    const int LoByteShift = 16 - RotBytes;
	    const int HiByteShift = RotBytes;

	    SDValue LoPart = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
	                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
	    SDValue HiPart = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
	                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
	    SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoPart, HiPart);
	    return DAG.getBitcast(VT, Merged);
	  }

	  // SSSE3 targets can use the palignr instruction.
	  assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
	         "512-bit PALIGNR requires BWI instructions");
	  SDValue Rotated = DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
	                                DAG.getConstant(RotBytes, DL, MVT::i8));
	  return DAG.getBitcast(VT, Rotated);
	}

	/// Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
	/// rotation of the concatenation of two vectors; this routine tries to
	/// generically lower a vector shuffle through such a pattern.
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns an empty SDValue if the mask is not a rotation.
	static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
	                                    SDValue V2, ArrayRef<int> Mask,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // 128/256-bit vectors are only supported with VLX.
	  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  // The matcher rewrites its operands into the low/high rotate sources.
	  SDValue LoSrc = V1;
	  SDValue HiSrc = V2;
	  const int EltRotation = matchShuffleAsRotate(LoSrc, HiSrc, Mask);
	  if (EltRotation <= 0)
	    return SDValue();

	  SDValue Amount = DAG.getConstant(EltRotation, DL, MVT::i8);
	  return DAG.getNode(X86ISD::VALIGN, DL, VT, LoSrc, HiSrc, Amount);
	}

	/// Try to lower a vector shuffle as a byte shift sequence.
	///
	/// Handles 128-bit shuffles whose result is a run of sequential elements from
	/// a single input with zeroable elements at one or both ends, using two or
	/// three whole-vector byte shifts. Returns an empty SDValue if the mask does
	/// not have that shape, or if zeros are needed at both ends and SSSE3 is
	/// available.
	static SDValue lowerVectorShuffleAsByteShiftMask(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
	assert(VT.is128BitVector() && "Only 128-bit vectors supported");

	// We need a shuffle that has zeros at one/both ends and a sequential
	// shuffle from one source within.
	unsigned ZeroLo = Zeroable.countTrailingOnes();
	unsigned ZeroHi = Zeroable.countLeadingOnes();
	if (!ZeroLo && !ZeroHi)
	return SDValue();

	// Len is the number of elements between the zeroable ends.
	// NOTE(review): if every element were zeroable, ZeroLo would presumably
	// equal NumElts and Mask[ZeroLo] below would index past the end - confirm
	// that callers never pass a fully zeroable mask.
	unsigned NumElts = Mask.size();
	unsigned Len = NumElts - (ZeroLo + ZeroHi);
	if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
	return SDValue();

	// Scale is the element size in bytes.
	unsigned Scale = VT.getScalarSizeInBits() / 8;
	ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
	// The inner run must come entirely from V1 or entirely from V2.
	if (!isUndefOrInRange(StubMask, 0, NumElts) &&
	!isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
	return SDValue();

	SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
	Res = DAG.getBitcast(MVT::v16i8, Res);

	// Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
	// inner sequential set of elements, possibly offset:
	// 01234567 --> zzzzzz01 --> 1zzzzzzz
	// 01234567 --> 4567zzzz --> zzzzz456
	// 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
	if (ZeroLo == 0) {
	// Zeros only at the high end: shift left, then right.
	unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
	Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * Shift, DL, MVT::i8));
	Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
	} else if (ZeroHi == 0) {
	// Zeros only at the low end: shift right, then left.
	unsigned Shift = Mask[ZeroLo] % NumElts;
	Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * Shift, DL, MVT::i8));
	Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
	} else if (!Subtarget.hasSSSE3()) {
	// If we don't have PSHUFB then it's worth avoiding an AND constant mask
	// by performing 3 byte shifts. Shuffle combining can kick in above that.
	// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
	unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
	Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * Shift, DL, MVT::i8));
	Shift += Mask[ZeroLo] % NumElts;
	Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * Shift, DL, MVT::i8));
	Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
	DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
	} else
	return SDValue();

	return DAG.getBitcast(VT, Res);
	}

	/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSLL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// \p MaskOffset is added to the expected source element index when comparing
	/// against \p Mask. Returns the shift amount (in bits for element shifts, in
	/// bytes for byte shifts), or -1 if no shift matches. \p ShiftVT and \p Opcode
	/// are only written when a candidate's element sequence matches.
	static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	unsigned ScalarSizeInBits, ArrayRef<int> Mask,
	int MaskOffset, const APInt &Zeroable,
	const X86Subtarget &Subtarget) {
	int Size = Mask.size();
	unsigned SizeInBits = Size * ScalarSizeInBits;

	// Checks that, in every group of Scale elements, the Shift elements that a
	// shift would fill with zeros are zeroable (the first Shift elements for a
	// left shift, the last Shift elements for a right shift).
	auto CheckZeros = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i < Size; i += Scale)
	for (int j = 0; j < Shift; ++j)
	if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
	return false;

	return true;
	};

	// Checks that the remaining elements of every group are sequential (or
	// undef) from the expected source position; on success sets Opcode and
	// ShiftVT and returns the shift amount, otherwise returns -1.
	auto MatchShift = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i != Size; i += Scale) {
	unsigned Pos = Left ? i + Shift : i;
	unsigned Low = Left ? i : i + Shift;
	unsigned Len = Scale - Shift;
	if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	return -1;
	}

	// Groups wider than 64 bits are handled as whole-vector byte shifts, whose
	// amount is expressed in bytes rather than bits.
	int ShiftEltBits = ScalarSizeInBits * Scale;
	bool ByteShift = ShiftEltBits > 64;
	Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
	: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
	int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

	// Normalize the scale for byte shifts to still produce an i64 element
	// type.
	Scale = ByteShift ? Scale / 2 : Scale;

	// We need to round trip through the appropriate type for the shift.
	MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
	ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
	: MVT::getVectorVT(ShiftSVT, Size / Scale);
	return (int)ShiftAmt;
	};

	// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
	// keep doubling the size of the integer elements up to that. We can
	// then shift the elements of the integer vector by whole multiples of
	// their width within the elements of the larger integer vector. Test each
	// multiple to see if we can find a match with the moved element indices
	// and that the shifted in elements are all zeroable.
	// For 512-bit vectors without BWI the 128-bit group size is not tried.
	unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
	for (int Shift = 1; Shift != Scale; ++Shift)
	for (bool Left : {true, false})
	if (CheckZeros(Shift, Scale, Left)) {
	int ShiftAmt = MatchShift(Shift, Scale, Left);
	if (0 < ShiftAmt)
	return ShiftAmt;
	}

	// no match
	return -1;
	}

	/// Try to lower a shuffle as a single vector shift of one of its inputs.
	///
	/// The mask is matched against V1 first and against V2 only when V1 fails.
	/// On a match the selected input is bitcast to the shift type chosen by
	/// matchShuffleAsShift, shifted by the matched immediate, and bitcast back
	/// to \p VT. Returns an empty SDValue if neither input matches.
	static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                   SDValue V2, ArrayRef<int> Mask,
	                                   const APInt &Zeroable,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  const int NumMaskElts = Mask.size();
	  assert(NumMaskElts == (int)VT.getVectorNumElements() &&
	         "Unexpected mask size");

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  MVT ShiftVT;
	  unsigned Opcode;

	  // A mask offset of 0 matches elements taken from V1.
	  SDValue Src = V1;
	  int Amount = matchShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                   /*MaskOffset=*/0, Zeroable, Subtarget);

	  // Fall back to V2, whose elements are numbered from NumMaskElts upwards.
	  if (Amount < 0) {
	    Src = V2;
	    Amount = matchShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                 /*MaskOffset=*/NumMaskElts, Zeroable,
	                                 Subtarget);
	    if (Amount < 0)
	      return SDValue();
	  }

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");

	  // Round trip through the shift type: cast, shift, cast back.
	  SDValue Cast = DAG.getBitcast(ShiftVT, Src);
	  SDValue ShiftImm = DAG.getConstant(Amount, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Cast, ShiftImm);
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
	// Remainder of lower half result is zero and upper half is all undef.
	//
	// Returns true when Mask has that shape. On success V1 is overwritten with
	// the matched source vector and BitLen/BitIdx receive the extraction length
	// and start position in bits, each masked to 6 bits.
	static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                ArrayRef<int> Mask, uint64_t &BitLen,
	                                uint64_t &BitIdx, const APInt &Zeroable) {
	  const int NumElts = Mask.size();
	  const int HalfElts = NumElts / 2;
	  assert(NumElts == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // Nothing may be demanded from the upper half of the result.
	  if (!isUndefUpperHalf(Mask))
	    return false;

	  // Trim trailing zeroable elements off the lower half; whatever is left is
	  // the run that has to be extracted.
	  int Len = HalfElts;
	  while (Len > 0 && Zeroable[Len - 1])
	    --Len;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // The first Len elements must read consecutive elements of one source.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;

	    SDValue &V = (M < NumElts ? V1 : V2);
	    M %= NumElts;

	    // The run cannot start before element 0 of the source ...
	    if (i > M)
	      return false;
	    // ... and every referenced element must sit in the lower half.
	    if (M >= HalfElts)
	      return false;

	    // Once a start index is known, later elements must agree with it.
	    if (Idx >= 0) {
	      if (Src != V)
	        return false;
	      if (Idx != M - i)
	        return false;
	    }
	    Src = V;
	    Idx = M - i;
	  }

	  if (!Src)
	    return false;
	  if (Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfElts && "Illegal extraction mask");
	  const uint64_t EltBits = VT.getScalarSizeInBits();
	  BitLen = (Len * EltBits) & 0x3f;
	  BitIdx = (Idx * EltBits) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// Returns true on the first (Idx, Len) pair that matches. On success V1 is
	// overwritten with the base vector A (left empty when every base element of
	// the lower half is undef), V2 with the inserted vector B, and BitLen/BitIdx
	// with Len/Idx scaled to bits and masked to 6 bits. On failure the outputs
	// are left untouched.
	static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	ArrayRef<int> Mask, uint64_t &BitLen,
	uint64_t &BitIdx) {
	int Size = Mask.size();
	int HalfSize = Size / 2;
	assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	// Upper half must be undefined.
	if (!isUndefUpperHalf(Mask))
	return false;

	// Try every insertion point in the lower half.
	for (int Idx = 0; Idx != HalfSize; ++Idx) {
	// Source of the non-inserted elements; stays empty while only undef
	// elements have been seen.
	SDValue Base;

	// Attempt to match first source from mask before insertion point.
	if (isUndefInRange(Mask, 0, Idx)) {
	/* EMPTY */
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	Base = V1;
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	Base = V2;
	} else {
	continue;
	}

	// Extend the extraction length looking to match both the insertion of
	// the second source and the remaining elements of the first.
	for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	// Vector supplying the inserted run for this candidate length.
	SDValue Insert;
	int Len = Hi - Idx;

	// Match insertion.
	if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	Insert = V1;
	} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	Insert = V2;
	} else {
	continue;
	}

	// Match the remaining elements of the lower half.
	// These must come from the same vector that was chosen before the
	// insertion point, unless none was chosen yet.
	if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	/* EMPTY */
	} else if ((!Base \|\| (Base == V1)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	Base = V1;
	} else if ((!Base \|\| (Base == V2)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	Size + Hi)) {
	Base = V2;
	} else {
	continue;
	}

	// Only the low 6 bits of each value are kept (so a value of 64 wraps
	// to 0).
	BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	V1 = Base;
	V2 = Insert;
	return true;
	}
	}

	return false;
	}

	/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// The EXTRQ pattern is tried first, then INSERTQ. Both matchers may rewrite
	/// \p V1 / \p V2 (taken by value here) to the operands the node should use.
	/// Returns an empty SDValue when neither pattern matches.
	static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                     SDValue V2, ArrayRef<int> Mask,
	                                     const APInt &Zeroable, SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
	    SDValue LenImm = DAG.getConstant(BitLen, DL, MVT::i8);
	    SDValue IdxImm = DAG.getConstant(BitIdx, DL, MVT::i8);
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, LenImm, IdxImm);
	  }

	  if (!matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
	    return SDValue();

	  // The matcher leaves an operand empty when it is entirely undef.
	  SDValue Base = V1 ? V1 : DAG.getUNDEF(VT);
	  SDValue Insert = V2 ? V2 : DAG.getUNDEF(VT);
	  SDValue LenImm = DAG.getConstant(BitLen, DL, MVT::i8);
	  SDValue IdxImm = DAG.getConstant(BitIdx, DL, MVT::i8);
	  return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, LenImm, IdxImm);
	}

	/// Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and
	/// can start from an offset element index in the input; to avoid excess
	/// shuffling the offset must either be in the bottom lane or at the start
	/// of a higher lane. All extended elements must be from the same lane.
	///
	/// \p Scale is the extension factor (each source element widens to
	/// Scale * EltBits bits), \p Offset is the index of the first source element
	/// in \p InputV, and \p AnyExt says the extended bits may be left undefined
	/// instead of zero. May return an empty SDValue (see the SSE4.1 path below).
	static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(Scale > 1 && "Need a scale to extend.");
	int EltBits = VT.getScalarSizeInBits();
	int NumElements = VT.getVectorNumElements();
	// Lanes are 128 bits wide; OffsetLane is the lane holding the first
	// extended source element.
	int NumEltsPerLane = 128 / EltBits;
	int OffsetLane = Offset / NumEltsPerLane;
	assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&
	"Only 8, 16, and 32 bit elements can be extended.");
	assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	assert(0 <= Offset && "Extension offset must be positive.");
	assert((Offset < NumEltsPerLane \|\| Offset % NumEltsPerLane == 0) &&
	"Extension offset must be in the first lane or start an upper lane.");

	// Check that an index is in same lane as the base offset.
	auto SafeOffset = [&](int Idx) {
	return OffsetLane == (Idx / NumEltsPerLane);
	};

	// Shift along an input so that the offset base moves to the first element.
	// Source elements that would come from a different lane are left undef.
	auto ShuffleOffset = [&](SDValue V) {
	if (!Offset)
	return V;

	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = 0; i * Scale < NumElements; ++i) {
	int SrcIdx = i + Offset;
	ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	}
	return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	};

	// Found a valid zext mask! Try various lowering strategies based on the
	// input type and available ISA extensions.
	// TODO: Add AnyExt support.
	if (Subtarget.hasSSE41()) {
	// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	// PUNPCK will catch this in a later shuffle match.
	if (Offset && Scale == 2 && VT.is128BitVector())
	return SDValue();
	// Always a zero extension here, even when AnyExt is set.
	MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	NumElements / Scale);
	InputV = ShuffleOffset(InputV);
	InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
	return DAG.getBitcast(VT, InputV);
	}

	// Everything below handles pre-SSE4.1 subtargets.
	assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	// For any extends we can cheat for larger element sizes and use shuffle
	// instructions that can fold with a load and/or copy.
	if (AnyExt && EltBits == 32) {
	// Place the two source dwords in result dwords 0 and 2; the rest is undef.
	int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	-1};
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}
	if (AnyExt && EltBits == 16 && Scale > 2) {
	// First move the dwords containing the source words into dwords 0 and 2,
	// then fix up the word position with a word shuffle.
	int PSHUFDMask[4] = {Offset / 2, -1,
	SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	int PSHUFWMask[4] = {1, -1, -1, -1};
	unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	return DAG.getBitcast(
	VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	DAG.getBitcast(MVT::v8i16, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	}

	// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	// to 64-bits.
	if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	assert(VT.is128BitVector() && "Unexpected vector width!");

	// Extract EltBits bits starting at the bit position of element Offset.
	int LoIdx = Offset * EltBits;
	SDValue Lo = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(LoIdx, DL, MVT::i8)));

	// A single extract suffices when the upper half of the result is unused
	// or the next source element is in a different lane.
	if (isUndefUpperHalf(Mask) \|\| !SafeOffset(Offset + 1))
	return DAG.getBitcast(VT, Lo);

	// Otherwise extract the next element too and combine both 64-bit results.
	int HiIdx = (Offset + 1) * EltBits;
	SDValue Hi = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(HiIdx, DL, MVT::i8)));
	return DAG.getBitcast(VT,
	DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	}

	// If this would require more than 2 unpack instructions to expand, use
	// pshufb when available. We can only use more than 2 unpack instructions
	// when zero extending i8 elements which also makes it easier to use pshufb.
	if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	assert(NumElements == 16 && "Unexpected byte vector width!");
	SDValue PSHUFBMask[16];
	// Every Scale-th byte selects the next source byte; all other bytes (and
	// source bytes outside the offset lane) get selector 0x80.
	// NOTE(review): presumably 0x80 makes PSHUFB write zero - confirm against
	// the ISA reference.
	for (int i = 0; i < 16; ++i) {
	int Idx = Offset + (i / Scale);
	PSHUFBMask[i] = DAG.getConstant(
	(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
	}
	InputV = DAG.getBitcast(MVT::v16i8, InputV);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	}

	// If we are extending from an offset, ensure we start on a boundary that
	// we can unpack from.
	int AlignToUnpack = Offset % (NumElements / Scale);
	if (AlignToUnpack) {
	// Shift the input down by AlignToUnpack elements and rebase Offset to
	// match.
	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = AlignToUnpack; i < NumElements; ++i)
	ShMask[i - AlignToUnpack] = i;
	InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	Offset -= AlignToUnpack;
	}

	// Otherwise emit a sequence of unpacks.
	// Each round unpacks the input with zero (or undef for an any-extend),
	// doubling the element width and halving the remaining scale.
	do {
	unsigned UnpackLoHi = X86ISD::UNPCKL;
	// Sources in the upper half use the high unpack; Offset is then rebased
	// relative to that half.
	if (Offset >= (NumElements / 2)) {
	UnpackLoHi = X86ISD::UNPCKH;
	Offset -= (NumElements / 2);
	}

	MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	: getZeroVector(InputVT, Subtarget, DAG, DL);
	InputV = DAG.getBitcast(InputVT, InputV);
	InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	Scale /= 2;
	EltBits *= 2;
	NumElements /= 2;
	} while (Scale > 1);
	return DAG.getBitcast(VT, InputV);
	}

	/// Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// Returns the lowered value, or an empty SDValue if the mask matches neither
	/// an extension at any scale nor (for 128-bit vectors) a zero-extended copy
	/// of the low 64 bits.
	static SDValue lowerShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	auto Lower = [&](int Scale) -> SDValue {
	// InputV: the single source vector feeding the extension.
	// AnyExt: stays true while no extended position has a defined mask entry.
	// Offset: index in InputV of the element that lands in result element 0.
	// Matches: number of defined base elements seen.
	SDValue InputV;
	bool AnyExt = true;
	int Offset = 0;
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();

	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}

	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	M = M % NumElements;
	if (!InputV) {
	// The first defined base element fixes both the input and the offset.
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.

	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();

	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();

	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}

	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();

	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();

	return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
	InputV, Mask, Subtarget, DAG);
	};

	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;

	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	// The loop stops before the scale would drop to 1.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}

	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();

	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	auto CanZExtLowHalf = [&]() {
	// Every element of the upper half must be zeroable ...
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	// ... and the lower half must be an in-order copy of the lower half of
	// V1 or of V2.
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};

	if (SDValue V = CanZExtLowHalf()) {
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}

	// No viable ext lowering found.
	return SDValue();
	}

	/// Try to get a scalar value for a specific element of a vector.
	///
	/// Looks through bitcasts to a BUILD_VECTOR (any \p Idx) or SCALAR_TO_VECTOR
	/// (\p Idx 0 only) node and returns the matching operand bitcast to the
	/// element type of \p V. Returns an empty SDValue when no such scalar is
	/// available or when its size differs from the element size.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  // Capture the requested types before stripping bitcasts.
	  MVT VecVT = V.getSimpleValueType();
	  MVT ScalarVT = VecVT.getVectorElementType();
	  V = peekThroughBitcasts(V);

	  // If the bitcasts changed the element size, element Idx of the source is
	  // not equivalent to element Idx of the original value.
	  MVT SrcVT = V.getSimpleValueType();
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getScalarSizeInBits() != VecVT.getScalarSizeInBits())
	    return SDValue();

	  const unsigned Opc = V.getOpcode();
	  bool HasScalarOperand = Opc == ISD::BUILD_VECTOR;
	  if (!HasScalarOperand)
	    HasScalarOperand = Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR;
	  if (!HasScalarOperand)
	    return SDValue();

	  // Ensure the scalar operand is the same size as the destination.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue S = V.getOperand(Idx);
	  if (ScalarVT.getSizeInBits() != S.getSimpleValueType().getSizeInBits())
	    return SDValue();
	  return DAG.getBitcast(ScalarVT, S);
	}

	/// Helper to test for a load that can be folded with x86 shuffles.
	///
	/// This is particularly important because the set of instructions varies
	/// significantly based on whether the operand is a load or not.
	///
	/// Bitcasts are looked through; the underlying node must satisfy
	/// ISD::isNON_EXTLoad.
	static bool isShuffleFoldableLoad(SDValue V) {
	  return ISD::isNON_EXTLoad(peekThroughBitcasts(V).getNode());
	}

	/// Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// The inserted element is the first mask entry that refers to \p V2.
	/// Returns the lowered value, or an empty SDValue if the pattern does not
	/// apply.
	static SDValue lowerShuffleAsElementInsertion(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT ExtVT = VT;
	MVT EltVT = VT.getVectorElementType();

	// Position in Mask of the first element taken from V2.
	// NOTE(review): the code below indexes Mask[V2Index] unconditionally, so
	// callers presumably guarantee that such an element exists - confirm.
	int V2Index =
	find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	Mask.begin();
	// True when every result element other than V2Index is zeroable.
	bool IsV1Zeroable = true;
	for (int i = 0, Size = Mask.size(); i < Size; ++i)
	if (i != V2Index && !Zeroable[i]) {
	IsV1Zeroable = false;
	break;
	}

	// Check for a single input from a SCALAR_TO_VECTOR node.
	// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	// all the smarts here sunk into that routine. However, the current
	// lowering of BUILD_VECTOR makes that nearly impossible until the old
	// vector shuffle lowering is dead.
	// Mask[V2Index] - Mask.size() is the index of the element within V2.
	SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	DAG);
	if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	// We need to zext the scalar if it is smaller than an i32.
	V2S = DAG.getBitcast(EltVT, V2S);
	if (EltVT == MVT::i8 \|\| EltVT == MVT::i16) {
	// Using zext to expand a narrow element won't work for non-zero
	// insertions.
	if (!IsV1Zeroable)
	return SDValue();

	// Zero-extend directly to i32.
	ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	}
	// Rebuild V2 so that the scalar sits in element 0 of ExtVT.
	V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	} else if (Mask[V2Index] != (int)Mask.size() \|\| EltVT == MVT::i8 \|\|
	EltVT == MVT::i16) {
	// Either not inserting from the low element of the input or the input
	// element size is too small to use VZEXT_MOVL to clear the high bits.
	return SDValue();
	}

	if (!IsV1Zeroable) {
	// If V1 can't be treated as a zero vector we have fewer options to lower
	// this. We can't support integer vectors or non-zero targets cheaply, and
	// the V1 elements can't be permuted in any way.
	assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	if (!VT.isFloatingPoint() \|\| V2Index != 0)
	return SDValue();
	// Ignoring the inserted element, the mask must leave V1 in place.
	SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	V1Mask[V2Index] = -1;
	if (!isNoopShuffleMask(V1Mask))
	return SDValue();
	if (!VT.is128BitVector())
	return SDValue();

	// Otherwise, use MOVSD or MOVSS.
	assert((EltVT == MVT::f32 \|\| EltVT == MVT::f64) &&
	"Only two types of floating point element types to handle!");
	return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	ExtVT, V1, V2);
	}

	// This lowering only works for the low element with floating point vectors.
	if (VT.isFloatingPoint() && V2Index != 0)
	return SDValue();

	// From here on V1 is treated as all zeros: keep the low element of V2 and
	// go back to VT if the element was widened above.
	V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	if (ExtVT != VT)
	V2 = DAG.getBitcast(VT, V2);

	if (V2Index != 0) {
	// If we have 4 or fewer lanes we can cheaply shuffle the element into
	// the desired position. Otherwise it is more efficient to do a vector
	// shift left. We know that we can do a vector shift left because all
	// the inputs are zero.
	if (VT.isFloatingPoint() \|\| VT.getVectorNumElements() <= 4) {
	// Result element V2Index reads element 0 of V2; every other result
	// element reads element 1.
	SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	V2Shuffle[V2Index] = 0;
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	} else {
	// Byte shift left by the byte offset of element V2Index.
	V2 = DAG.getBitcast(MVT::v16i8, V2);
	V2 = DAG.getNode(
	X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
	V2 = DAG.getBitcast(VT, V2);
	}
	}
	return V2;
	}

	/// Try to lower broadcast of a single - truncated - integer element,
	/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
	///
	/// \p BroadcastIdx is expressed in elements of \p VT. The matching operand of
	/// \p V0 is shifted right when the requested element is not its lowest part,
	/// truncated to the element type of \p VT, and broadcast. Returns an empty
	/// SDValue if \p V0 does not have wider integer elements or is not one of the
	/// supported nodes.
	///
	/// This assumes we have AVX2.
	static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
	                                            int BroadcastIdx,
	                                            const X86Subtarget &Subtarget,
	                                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");

	  EVT DstEltVT = VT.getVectorElementType();
	  EVT SrcVT = V0.getValueType();

	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
	  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");

	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  if (!SrcEltVT.isInteger())
	    return SDValue();

	  const unsigned DstEltBits = DstEltVT.getSizeInBits();
	  const unsigned SrcEltBits = SrcEltVT.getSizeInBits();

	  // This is only a truncation if the original element type is larger.
	  if (SrcEltBits <= DstEltBits)
	    return SDValue();

	  assert(((SrcEltBits % DstEltBits) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");

	  // Each source element covers Ratio destination elements.
	  const unsigned Ratio = SrcEltBits / DstEltBits;
	  const unsigned SrcIdx = BroadcastIdx / Ratio;

	  // Only BUILD_VECTOR, or element 0 of SCALAR_TO_VECTOR, exposes the scalar.
	  const unsigned SrcOpc = V0.getOpcode();
	  const bool IsBuildVector = SrcOpc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      SrcOpc == ISD::SCALAR_TO_VECTOR && SrcIdx == 0;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();

	  SDValue Scalar = V0.getOperand(SrcIdx);

	  // If we're extracting non-least-significant bits, shift so we can truncate.
	  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
	  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
	  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
	  const int SubIdx = BroadcastIdx % Ratio;
	  if (SubIdx != 0) {
	    SDValue ShiftAmt = DAG.getConstant(SubIdx * DstEltBits, DL, MVT::i8);
	    Scalar =
	        DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, ShiftAmt);
	  }

	  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Scalar);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Truncated);
	}

	/// Test whether this can be lowered with a single SHUFPS instruction.
	///
	/// This is used to disable more specialized lowerings when the shufps lowering
	/// will happen to be efficient.
	///
	/// \p Mask must have exactly four entries, each in the range [-1, 8).
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // This routine only handles 128-bit shufps.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  for (int M : Mask) {
	    (void)M;
	    assert(M >= -1 && M < 8 && "Out of bound mask element!");
	  }

	  // True when both elements are defined and come from different inputs
	  // (indices 0-3 refer to the first input, 4-7 to the second).
	  auto MixesInputs = [](int A, int B) {
	    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
	  };

	  // To lower with a single SHUFPS we need to have the low half and high half
	  // each requiring a single input.
	  return !MixesInputs(Mask[0], Mask[1]) && !MixesInputs(Mask[2], Mask[3]);
	}

	/// If we are extracting two 128-bit halves of a vector and shuffling the
	/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
	/// multi-shuffle lowering.
	///
	/// \p N0 and \p N1 are the two shuffle operands and \p Mask the shuffle mask
	/// over them. Returns the low-half extract of a wide shuffle of the common
	/// source, or an empty SDValue if the operands do not match the pattern or a
	/// simple narrow shuffle is preferable.
	static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
	SDValue N1, ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	EVT VT = N0.getValueType();
	assert((VT.is128BitVector() &&
	(VT.getScalarSizeInBits() == 32 \|\| VT.getScalarSizeInBits() == 64)) &&
	"VPERM* family of shuffles requires 32-bit or 64-bit elements");

	// Check that both sources are extracts of the same source vector.
	// Both extracts must also have a single use.
	if (!N0.hasOneUse() \|\| !N1.hasOneUse() \|\|
	N0.getOpcode() != ISD::EXTRACT_SUBVECTOR \|\|
	N1.getOpcode() != ISD::EXTRACT_SUBVECTOR \|\|
	N0.getOperand(0) != N1.getOperand(0))
	return SDValue();

	// The common source must be 256 bits wide and both extract indices must be
	// constants.
	SDValue WideVec = N0.getOperand(0);
	EVT WideVT = WideVec.getValueType();
	if (!WideVT.is256BitVector() \|\| !isa<ConstantSDNode>(N0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(N1.getOperand(1)))
	return SDValue();

	// Match extracts of each half of the wide source vector. Commute the shuffle
	// if the extract of the low half is N1.
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
	const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
	const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
	if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
	ShuffleVectorSDNode::commuteMask(NewMask);
	else if (ExtIndex0 != 0 \|\| ExtIndex1 != NumElts)
	return SDValue();

	// Final bailout: if the mask is simple, we are better off using an extract
	// and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
	// because that avoids a constant load from memory.
	if (NumElts == 4 &&
	(isSingleSHUFPSMask(NewMask) \|\| is128BitUnpackShuffleMask(NewMask)))
	return SDValue();

	// Extend the shuffle mask with undef elements.
	// The upper half of the wide result is never read by the final extract.
	NewMask.append(NumElts, -1);

	// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
	SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
	NewMask);
	// This is free: ymm -> xmm.
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
	DAG.getIntPtrConstant(0, DL));
	}

	/// Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// Returns the broadcast bitcast to \p VT, or an empty SDValue if the
	/// subtarget lacks a suitable instruction, \p Mask is not a broadcast, or the
	/// broadcast source cannot be used.
	static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Feature filter: v2f64 needs SSE3, other floating point types need AVX,
	// and integer types need AVX2.
	if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) \|\|
	(Subtarget.hasAVX() && VT.isFloatingPoint()) \|\|
	(Subtarget.hasAVX2() && VT.isInteger())))
	return SDValue();

	// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	// we can only broadcast from a register with AVX2.
	unsigned NumElts = Mask.size();
	unsigned NumEltBits = VT.getScalarSizeInBits();
	unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
	? X86ISD::MOVDDUP
	: X86ISD::VBROADCAST;
	bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) \|\| Subtarget.hasAVX2();

	// Check that the mask is a broadcast.
	// Try each element index i and accept the first for which Mask is
	// equivalent to the all-i mask.
	int BroadcastIdx = -1;
	for (int i = 0; i != (int)NumElts; ++i) {
	SmallVector<int, 8> BroadcastMask(NumElts, i);
	if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
	BroadcastIdx = i;
	break;
	}
	}

	if (BroadcastIdx < 0)
	return SDValue();
	assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	"a sorted mask where the broadcast "
	"comes from V1.");

	// Go up the chain of (vector) values to find a scalar load that we can
	// combine with the broadcast.
	// BitOffset tracks the position, in bits, of the broadcast element within
	// the current V as wrappers are peeled off.
	int BitOffset = BroadcastIdx * NumEltBits;
	SDValue V = V1;
	for (;;) {
	switch (V.getOpcode()) {
	case ISD::BITCAST: {
	V = V.getOperand(0);
	continue;
	}
	case ISD::CONCAT_VECTORS: {
	// Step into the operand that contains the bit offset and make the offset
	// relative to it.
	int OpBitWidth = V.getOperand(0).getValueSizeInBits();
	int OpIdx = BitOffset / OpBitWidth;
	V = V.getOperand(OpIdx);
	BitOffset %= OpBitWidth;
	continue;
	}
	case ISD::INSERT_SUBVECTOR: {
	SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
	// A non-constant insertion index ends the walk.
	if (!ConstantIdx)
	break;

	// Follow the inserted subvector if it covers the bit offset, otherwise
	// keep looking in the outer vector.
	int EltBitWidth = VOuter.getScalarValueSizeInBits();
	int Idx = (int)ConstantIdx->getZExtValue();
	int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
	int BeginOffset = Idx * EltBitWidth;
	int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
	if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
	BitOffset -= BeginOffset;
	V = VInner;
	} else {
	V = VOuter;
	}
	continue;
	}
	}
	// Any other opcode, or a break out of the switch, ends the walk.
	break;
	}
	assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
	// Re-express the broadcast index relative to the value we ended up at.
	BroadcastIdx = BitOffset / NumEltBits;

	// Do we need to bitcast the source to retrieve the original broadcast index?
	bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

	// Check if this is a broadcast of a scalar. We special case lowering
	// for scalars so that we can more effectively fold with loads.
	// If the original value has a larger element type than the shuffle, the
	// broadcast element is in essence truncated. Make that explicit to ease
	// folding.
	if (BitCastSrc && VT.isInteger())
	if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
	DL, VT, V, BroadcastIdx, Subtarget, DAG))
	return TruncBroadcast;

	// Type of the broadcast node itself; may be switched to an f64 vector
	// below on 32-bit targets.
	MVT BroadcastVT = VT;

	// Also check the simpler case, where we can directly reuse the scalar.
	if (!BitCastSrc &&
	((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) \|\|
	(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
	V = V.getOperand(BroadcastIdx);

	// If we can't broadcast from a register, check that the input is a load.
	if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	return SDValue();
	} else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
	// 32-bit targets need to load i64 as a f64 and then bitcast the result.
	if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
	BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
	Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
	? X86ISD::MOVDDUP
	: Opcode;
	}

	// If we are broadcasting a load that is only used by the shuffle
	// then we can reduce the vector load to the broadcasted scalar load.
	LoadSDNode *Ld = cast<LoadSDNode>(V);
	SDValue BaseAddr = Ld->getOperand(1);
	EVT SVT = BroadcastVT.getScalarType();
	// Byte offset of the broadcast element from the original load address.
	unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
	SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	DAG.getMachineFunction().getMachineMemOperand(
	Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	DAG.makeEquivalentMemoryOrdering(Ld, V);
	} else if (!BroadcastFromReg) {
	// We can't broadcast from a vector register.
	return SDValue();
	} else if (BitOffset != 0) {
	// We can only broadcast from the zero-element of a vector register,
	// but it can be advantageous to broadcast from the zero-element of a
	// subvector.
	if (!VT.is256BitVector() && !VT.is512BitVector())
	return SDValue();

	// VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64)
	return SDValue();

	// Only broadcast the zero-element of a 128-bit subvector.
	if ((BitOffset % 128) != 0)
	return SDValue();

	assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
	"Unexpected bit-offset");
	assert((V.getValueSizeInBits() == 256 \|\| V.getValueSizeInBits() == 512) &&
	"Unexpected vector size");
	// Extract the 128-bit subvector that starts at the broadcast element.
	unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
	V = extract128BitVector(V, ExtractIdx, DAG, DL);
	}

	// MOVDDUP is given a vector operand, so wrap a scalar source in a v2f64.
	if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	DAG.getBitcast(MVT::f64, V));

	// Bitcast back to the same scalar type as BroadcastVT.
	if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
	assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
	"Unexpected vector element size");
	MVT ExtVT;
	if (V.getValueType().isVector()) {
	unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
	ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
	} else {
	ExtVT = BroadcastVT.getScalarType();
	}
	V = DAG.getBitcast(ExtVT, V);
	}

	// 32-bit targets need to load i64 as a f64 and then bitcast the result.
	if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
	V = DAG.getBitcast(MVT::f64, V);
	unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
	BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
	}

	// We only support broadcasting from 128-bit vectors to minimize the
	// number of patterns we need to deal with in isel. So extract down to
	// 128-bits, removing as many bitcasts as possible.
	if (V.getValueSizeInBits() > 128) {
	MVT ExtVT = V.getSimpleValueType().getScalarType();
	ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
	V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
	V = DAG.getBitcast(ExtVT, V);
	}

	// Emit the broadcast in BroadcastVT and cast the result back to VT.
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	// Check for whether we can use INSERTPS to perform the shuffle. We only use
	// INSERTPS when the V1 elements are already in the correct locations
	// because otherwise we can just always use two SHUFPS instructions which
	// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
	// perform INSERTPS if a single V1 element is out of place and all V2
	// elements are zeroable.
	//
	// Mask is a 4-element mask over the concatenation of the two inputs: indices
	// below 4 select from the first input, indices 4 and above from the second.
	// Zeroable is indexed by result element; a set bit means that element is
	// treated as zero (per the note in the loop below, this includes undefs).
	//
	// Returns true on a match, in which case V1, V2 and InsertPSMask are
	// overwritten with the operands and immediate to use. The immediate is
	// laid out as: bits [7:6] source element index, bits [5:4] destination
	// element index, bits [3:0] zero mask. Returns false on no match, in which
	// case V1, V2 and InsertPSMask are left untouched.
	static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	unsigned &InsertPSMask,
	const APInt &Zeroable,
	ArrayRef<int> Mask, SelectionDAG &DAG) {
	assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// Attempt to match INSERTPS with one element from VA or VB being
	// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
	// are updated.
	// VA and VB are taken by value so that a failed attempt cannot disturb the
	// caller's operands; the outputs are only written right before returning
	// true.
	auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
	ArrayRef<int> CandidateMask) {
	unsigned ZMask = 0;
	// Destination slot of the single element that needs inserting, tracked
	// separately depending on which input it comes from (-1 means none yet).
	int VADstIndex = -1;
	int VBDstIndex = -1;
	bool VAUsedInPlace = false;

	for (int i = 0; i < 4; ++i) {
	// Synthesize a zero mask from the zeroable elements (includes undefs).
	if (Zeroable[i]) {
	ZMask \|= 1 << i;
	continue;
	}

	// Flag if we use any VA inputs in place.
	if (i == CandidateMask[i]) {
	VAUsedInPlace = true;
	continue;
	}

	// We can only insert a single non-zeroable element.
	if (VADstIndex >= 0 \|\| VBDstIndex >= 0)
	return false;

	if (CandidateMask[i] < 4) {
	// VA input out of place for insertion.
	VADstIndex = i;
	} else {
	// VB input for insertion.
	VBDstIndex = i;
	}
	}

	// Don't bother if we have no (non-zeroable) element for insertion.
	if (VADstIndex < 0 && VBDstIndex < 0)
	return false;

	// Determine element insertion src/dst indices. The src index is from the
	// start of the inserted vector, not the start of the concatenated vector.
	unsigned VBSrcIndex = 0;
	if (VADstIndex >= 0) {
	// If we have a VA input out of place, we use VA as the V2 element
	// insertion and don't use the original V2 at all.
	VBSrcIndex = CandidateMask[VADstIndex];
	VBDstIndex = VADstIndex;
	VB = VA;
	} else {
	// Rebase the concatenated index (4..7) to an index within VB (0..3).
	VBSrcIndex = CandidateMask[VBDstIndex] - 4;
	}

	// If no V1 inputs are used in place, then the result is created only from
	// the zero mask and the V2 insertion - so remove V1 dependency.
	if (!VAUsedInPlace)
	VA = DAG.getUNDEF(MVT::v4f32);

	// Update V1, V2 and InsertPSMask accordingly.
	V1 = VA;
	V2 = VB;

	// Insert the V2 element into the desired position.
	InsertPSMask = VBSrcIndex << 6 \| VBDstIndex << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	return true;
	};

	// First try the operands in the order given.
	if (matchAsInsertPS(V1, V2, Mask))
	return true;

	// Commute and try again.
	// Zeroable is indexed by result element, so the lambda reuses it unchanged
	// for the commuted attempt.
	SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	ShuffleVectorSDNode::commuteMask(CommutedMask);
	if (matchAsInsertPS(V2, V1, CommutedMask))
	return true;

	return false;
	}

	/// Lower a v4f32 shuffle to a single X86ISD::INSERTPS node when the mask
	/// matches the pattern recognised by matchShuffleAsInsertPS.
	///
	/// V1 and V2 are local copies; the matcher may rewrite them (and fills in the
	/// 8-bit immediate) before they are used as the node operands. Returns an
	/// empty SDValue when the pattern does not match.
	static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
	                                      ArrayRef<int> Mask, const APInt &Zeroable,
	                                      SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

	  // Ask the matcher for the operands and immediate; it only writes them on
	  // success.
	  unsigned Imm;
	  const bool Matched =
	      matchShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
	  if (Matched) {
	    // Emit the insertion using the (possibly rewritten) operands.
	    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
	  }

	  return SDValue();
	}

	/// Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// Two strategies are attempted in order: (1) shuffle each input on its own
	/// and then unpack them, trying unpack element widths from 64 bits down to
	/// the element width of VT; (2) if every used element comes from the same
	/// half of its input, unpack first and then permute the unpacked result.
	/// Returns an empty SDValue if neither applies. The Subtarget parameter is
	/// not referenced in the body.
	static SDValue lowerShuffleAsPermuteAndUnpack(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");

	int Size = Mask.size();

	// Count how many mask entries read from the low / high half of either
	// input (M % Size folds V2 indices onto the same 0..Size-1 range as V1).
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	// No explicit M >= 0 test is needed here: a negative M yields a negative
	// remainder, which can never be >= Size / 2.
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

	// Unpack whichever half supplies more of the inputs (low on a tie).
	bool UnpackLo = NumLoInputs >= NumHiInputs;

	// Try to express the shuffle as per-input shuffles feeding one UNPCK that
	// operates on ScalarSize-bit elements, each covering Scale elements of VT.
	// Returns an empty SDValue if the mask does not fit. V1 and V2 are captured
	// by reference and are only overwritten on the path that builds and returns
	// the unpack.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);

	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;

	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();

	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	// Even unpack slots come from V1 and odd ones from V2; slot pair
	// UnpackIdx / 2 reads position (UnpackIdx / 2) * Scale + i % Scale of
	// the chosen input, offset by Size / 2 when unpacking the high half.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}

	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();

	// Shuffle the inputs into place.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);

	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};

	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;

	// If we're shuffling with a zero vector then we're better off not doing
	// VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
	if (ISD::isBuildVectorAllZeros(V1.getNode()) \|\|
	ISD::isBuildVectorAllZeros(V2.getNode()))
	return SDValue();

	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// Index of the first element of the half that all inputs come from.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.

	// Translate each original index into its position in the unpacked vector:
	// element k of the selected half sits at 2 * k when it came from V1 and at
	// 2 * k + 1 when it came from V2.
	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}

	return SDValue();
	}

	/// Handle lowering of 2-lane 64-bit floating point shuffles.
	///
	/// This is the basis function for the 2-lane 64-bit shuffles as we have full
	/// support for floating point shuffles but not integer shuffles. These
	/// instructions will incur a domain crossing penalty on some chips though so
	/// it is better to avoid lowering through this for integer vectors where
	/// possible.
	///
	/// Mask has two entries: 0-1 select from V1, 2-3 select from V2. A sequence
	/// of specialised lowerings is tried in order and the first one that succeeds
	/// is returned; the final fallback is a plain X86ISD::SHUFP node, so this
	/// function always produces a value.
	static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. Simulate this by using the
	// single input as both of the "inputs" to this instruction.
	// Bit i of the immediate is set when result element i takes element 1
	// (rather than element 0) of its source operand.
	unsigned SHUFPDMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1);

	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use the VPERMILPI node (presumably selected as
	// VPERMILPD for this v2f64 type - TODO confirm) which will allow folding
	// a load into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	// Without AVX use SHUFP with V1 as both operands, substituting undef for
	// an operand whose corresponding mask entry is undef.
	return DAG.getNode(
	X86ISD::SHUFP, DL, MVT::v2f64,
	Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}
	// From here on both inputs are in use; the asserts record the mask shape
	// the remaining code relies on.
	assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	if (Subtarget.hasAVX2())
	if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	return Extract;

	// When loading a scalar and then shuffling it into a vector we can often do
	// the insertion cheaply.
	if (SDValue Insertion = lowerShuffleAsElementInsertion(
	DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR with 2 retargets each index at the other input (0<->2, 1<->3), which
	// matches swapping V1 and V2 in the call below.
	int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
	Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
	if (SDValue Insertion = lowerShuffleAsElementInsertion(
	DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;

	// Try to use one of the special instruction patterns to handle two common
	// blend patterns if a zero-blend above didn't work.
	// Both patterns keep V2's element 1 in the upper lane and take the lower
	// lane from V1; this path is only used when that V1 element is available as
	// a scalar.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) \|\|
	isShuffleEquivalent(V1, V2, Mask, {1, 3}))
	if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
	// We can either use a special instruction to load over the low double or
	// to move just the low double.
	return DAG.getNode(
	X86ISD::MOVSD, DL, MVT::v2f64, V2,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
	return V;

	// General two-input fallback: bit 0 picks the V1 element for the low lane,
	// bit 1 picks the V2 element (rebased by subtracting 2) for the high lane.
	unsigned SHUFPDMask = (Mask[0] == 1) \| (((Mask[1] - 2) == 1) << 1);
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	/// Lower a 2-element, 64-bit integer vector shuffle.
	///
	/// Integer-domain lowerings are attempted first so that domain crossing
	/// penalties are kept to a minimum. When none of them applies and SSE4.1
	/// blends are unavailable, the shuffle is re-expressed as a v2f64 shuffle
	/// through bitcasts. Mask entries 0-1 refer to V1 and 2-3 to V2.
	static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	  if (V2.isUndef()) {
	    // Single-input case: see whether one element can simply be broadcast.
	    SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, Mask,
	                                            Subtarget, DAG);
	    if (Splat)
	      return Splat;

	    // Otherwise shuffle through PSHUFD on the v4i32 view of the input. Each
	    // i64 index expands to a pair of consecutive i32 indices; a negative
	    // (undef) entry is clamped to element 0.
	    const int LoDWord = std::max(Mask[0], 0) * 2;
	    const int HiDWord = std::max(Mask[1], 0) * 2;
	    int DWordMask[4] = {LoDWord, LoDWord + 1, HiDWord, HiDWord + 1};

	    SDValue AsV4I32 = DAG.getBitcast(MVT::v4i32, V1);
	    SDValue Imm = getV4X86ShuffleImm8ForMask(DWordMask, DL, DAG);
	    SDValue Shuffled =
	        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, AsV4I32, Imm);
	    return DAG.getBitcast(MVT::v2i64, Shuffled);
	  }

	  // Two-input case: the asserts below state the mask shape relied on here.
	  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	  if (Subtarget.hasAVX2()) {
	    if (SDValue Perm = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Perm;
	  }

	  // Shift-based lowering.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Element insertion, which is often cheap when a scalar was loaded and
	  // then shuffled into a vector.
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // The same insertion with the operands swapped. XOR-ing an index with 2
	  // retargets it at the other input, which is all a v2 mask needs.
	  int SwappedMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	  if (SDValue Inserted = lowerShuffleAsElementInsertion(
	          DL, MVT::v2i64, V2, V1, SwappedMask, Zeroable, Subtarget, DAG))
	    return Inserted;

	  // Every blend-related path below must be guarded by this one predicate.
	  const bool HasBlend = Subtarget.hasSSE41();
	  if (HasBlend) {
	    if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
	                                              Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  // Masks that are exactly an unpack pattern.
	  if (SDValue Unpacked =
	          lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Rotation-based lowerings. Before SSSE3, shuffles/unpacks are the more
	  // profitable choice, so these are skipped there.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Rotated = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
	                                                 Subtarget, DAG))
	        return Rotated;
	    }

	    if (SDValue Rotated = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2,
	                                                   Mask, Subtarget, DAG))
	      return Rotated;
	  }

	  // With blend support, decompose into permutes plus a blend; that is
	  // expected to beat crossing into the floating point domain.
	  if (HasBlend)
	    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Last resort: perform the shuffle as v2f64 (SHUFPD). This likely costs a
	  // couple of stall cycles for integer vectors on Nehalem and older chips,
	  // but the alternatives are slower still and newer chips are unaffected.
	  SDValue FPV1 = DAG.getBitcast(MVT::v2f64, V1);
	  SDValue FPV2 = DAG.getBitcast(MVT::v2f64, V2);
	  SDValue FPShuffle = DAG.getVectorShuffle(MVT::v2f64, DL, FPV1, FPV2, Mask);
	  return DAG.getBitcast(MVT::v2i64, FPShuffle);
	}

	/// Lower a vector shuffle using the SHUFPS instruction.
	///
	/// This helper unconditionally produces an X86ISD::SHUFP based lowering; it
	/// does not judge whether that is the best choice. The first four entries of
	/// Mask are used: indices below 4 select from V1, indices 4 and above from
	/// V2. Depending on how the V2 elements are distributed, an extra SHUFP may
	/// be emitted first to gather the needed elements into one register.
	static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	                                      ArrayRef<int> Mask, SDValue V1,
	                                      SDValue V2, SelectionDAG &DAG) {
	  auto IsV2Elt = [](int Idx) { return Idx >= 4; };

	  // Operands and immediate mask of the final SHUFP; start from the inputs as
	  // given and patch them up below.
	  SDValue LoSrc = V1;
	  SDValue HiSrc = V2;
	  int FinalMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

	  switch (count_if(Mask, IsV2Elt)) {
	  case 1: {
	    const int V2Pos = find_if(Mask, IsV2Elt) - Mask.begin();
	    // Flipping the low bit gives the neighbouring slot in the same half.
	    const int NeighborPos = V2Pos ^ 1;
	    const bool V2InLowHalf = V2Pos < 2;

	    if (Mask[NeighborPos] < 0) {
	      // The neighbour is undef, so the whole half can come from V2. This is
	      // only expected in the high half because the shuffle is commuted
	      // otherwise; the low-half case is still handled by swapping operands.
	      if (V2InLowHalf)
	        std::swap(LoSrc, HiSrc);
	      FinalMask[V2Pos] -= 4;
	      break;
	    }

	    // The V2 element sits next to a V1 element. Gather both into a single
	    // register first: the V2 element lands in slot 0, the V1 element in
	    // slot 2.
	    int GatherMask[4] = {Mask[V2Pos] - 4, 0, Mask[NeighborPos], 0};
	    SDValue Gathered =
	        DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
	                    getV4X86ShuffleImm8ForMask(GatherMask, DL, DAG));

	    // Use the gathered register for the half that holds the V2 element and
	    // V1 for the other half.
	    if (V2InLowHalf) {
	      LoSrc = Gathered;
	      HiSrc = V1;
	    } else {
	      HiSrc = Gathered;
	    }
	    FinalMask[NeighborPos] = 2; // The V1 element is in slot 2 of Gathered.
	    FinalMask[V2Pos] = 0;       // The V2 element is in slot 0 of Gathered.
	    break;
	  }

	  case 2: {
	    const bool LowHalfFromV1 = Mask[0] < 4 && Mask[1] < 4;
	    const bool HighHalfFromV1 = Mask[2] < 4 && Mask[3] < 4;

	    if (LowHalfFromV1) {
	      // Easy case: V1 supplies the low half and V2 the high half.
	      FinalMask[2] -= 4;
	      FinalMask[3] -= 4;
	    } else if (HighHalfFromV1) {
	      // Reversed case, handled here because callers that detect a SHUFPS
	      // pattern cannot always commute the shuffle beforehand.
	      FinalMask[0] -= 4;
	      FinalMask[1] -= 4;
	      LoSrc = V2;
	      HiSrc = V1;
	    } else {
	      // Both halves mix V1 and V2. Gather the two V1 elements into slots
	      // 0-1 and the two V2 elements into slots 2-3 of one register, then
	      // place them with a second shuffle of that register.
	      const bool Elt0FromV1 = Mask[0] < 4;
	      const bool Elt2FromV1 = Mask[2] < 4;
	      int GatherMask[4] = {Elt0FromV1 ? Mask[0] : Mask[1],
	                           Elt2FromV1 ? Mask[2] : Mask[3],
	                           (Elt0FromV1 ? Mask[1] : Mask[0]) - 4,
	                           (Elt2FromV1 ? Mask[3] : Mask[2]) - 4};
	      SDValue Gathered =
	          DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	                      getV4X86ShuffleImm8ForMask(GatherMask, DL, DAG));

	      // The final shuffle reads the gathered register through both operands.
	      LoSrc = Gathered;
	      HiSrc = Gathered;
	      FinalMask[0] = Elt0FromV1 ? 0 : 2;
	      FinalMask[1] = Elt0FromV1 ? 2 : 0;
	      FinalMask[2] = Elt2FromV1 ? 1 : 3;
	      FinalMask[3] = Elt2FromV1 ? 3 : 1;
	    }
	    break;
	  }

	  default:
	    // Any other count of V2 elements: use the operands and mask unchanged.
	    break;
	  }

	  return DAG.getNode(X86ISD::SHUFP, DL, VT, LoSrc, HiSrc,
	                     getV4X86ShuffleImm8ForMask(FinalMask, DL, DAG));
	}

	/// Lower a 4-element, 32-bit floating point vector shuffle.
	///
	/// Only floating point domain nodes are emitted here, which avoids domain
	/// crossing penalties; the final SHUFPS-based fallback means a value is
	/// always returned. Mask entries 0-3 refer to V1 and 4-7 to V2.
	static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  const int NumFromV2 = count_if(Mask, [](int Idx) { return Idx >= 4; });

	  if (NumFromV2 == 0) {
	    // Only V1 is referenced. Start with a single-element broadcast.
	    SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, Mask,
	                                            Subtarget, DAG);
	    if (Splat)
	      return Splat;

	    // SSE3 even/odd element duplication.
	    if (Subtarget.hasSSE3()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
	      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
	        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
	    }

	    // With AVX, VPERMILPS is preferred since a load can be folded into it.
	    if (Subtarget.hasAVX()) {
	      SDValue Imm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, Imm);
	    }

	    // MOVLHPS/MOVHLPS can express these unary patterns. That only matters
	    // for SSE1: with SSE2 such shuffles are widened to v2f64 and do not
	    // reach this point.
	    if (!Subtarget.hasSSE2()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
	        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
	      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
	        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
	    }

	    // Plain SHUFPS with V1 supplied as both operands.
	    SDValue Imm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, Imm);
	  }

	  if (Subtarget.hasAVX2()) {
	    if (SDValue Perm = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Perm;
	  }

	  // Single-element blends have special lowerings, but more complex ones are
	  // handled further down when both this and BLENDPS fail. So only take this
	  // path for the fast case where the V2 element lands in result element 0.
	  if (NumFromV2 == 1 && Mask[0] >= 4) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
	                                              Zeroable, Subtarget, DAG))
	      return Blended;

	    // INSERTPS, when it completes the shuffle efficiently.
	    if (SDValue Inserted =
	            lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
	      return Inserted;

	    // Blend followed by a permute, only when one SHUFPS would not suffice.
	    if (!isSingleSHUFPSMask(Mask)) {
	      if (SDValue BlendThenPerm = lowerShuffleAsBlendAndPermute(
	              DL, MVT::v4f32, V1, V2, Mask, DAG))
	        return BlendThenPerm;
	    }
	  }

	  // Low/high half moves. As above, these are only relevant for SSE1 because
	  // SSE2 widens such shuffles to v2f64 before they get here.
	  if (!Subtarget.hasSSE2()) {
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
	      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
	    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
	      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
	  }

	  // Masks that are exactly an unpack pattern.
	  if (SDValue Unpacked =
	          lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Nothing more specific matched: use the general SHUFPS strategy.
	  return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
	}

	/// Lower a 4-element i32 vector shuffle.
	///
	/// Integer-domain shuffles are preferred where they apply; blends of two
	/// inputs that match nothing else are performed as a v4f32 shuffle through
	/// bitcasts. Mask entries 0-3 refer to V1 and 4-7 to V2.
	static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // A zero/any-extend is strictly faster than any alternative and often
	  // lets a memory operand be folded in, so it is tried before anything else.
	  SDValue Extended = lowerShuffleAsZeroOrAnyExtend(
	      DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG);
	  if (Extended)
	    return Extended;

	  const int NumFromV2 = count_if(Mask, [](int Idx) { return Idx >= 4; });

	  if (NumFromV2 == 0) {
	    // Only V1 is referenced. Start with a single-element broadcast.
	    SDValue Splat = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, Mask,
	                                            Subtarget, DAG);
	    if (Splat)
	      return Splat;

	    // Use PSHUFD, available from SSE2 onward. When the mask is equivalent to
	    // an unpack pattern it is canonicalised to that pattern, but PSHUFD is
	    // still emitted rather than UNPCK: using UNPCK would prevent folding a
	    // load into the instruction or making a copy.
	    const int UnpackLoMask[] = {0, 0, 1, 1};
	    const int UnpackHiMask[] = {2, 2, 3, 3};
	    ArrayRef<int> PshufdMask = Mask;
	    if (isShuffleEquivalent(V1, V2, Mask, UnpackLoMask))
	      PshufdMask = UnpackLoMask;
	    else if (isShuffleEquivalent(V1, V2, Mask, UnpackHiMask))
	      PshufdMask = UnpackHiMask;

	    SDValue Imm = getV4X86ShuffleImm8ForMask(PshufdMask, DL, DAG);
	    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, Imm);
	  }

	  if (Subtarget.hasAVX2()) {
	    if (SDValue Perm = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
	      return Perm;
	  }

	  // Shift-based lowering.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
	                                            Zeroable, Subtarget, DAG))
	    return Shifted;

	  // Special lowerings for blends that bring in a single V2 element.
	  if (NumFromV2 == 1) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Every blend-related path below must be guarded by this one predicate.
	  const bool HasBlend = Subtarget.hasSSE41();
	  if (HasBlend) {
	    if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
	                                              Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  if (SDValue BitMasked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
	                                                Zeroable, Subtarget, DAG))
	    return BitMasked;

	  // Masks that are exactly an unpack pattern.
	  if (SDValue Unpacked =
	          lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Rotation-based lowerings. Before SSSE3, shuffles/unpacks are the more
	  // profitable choice, so these are skipped there.
	  if (Subtarget.hasSSSE3()) {
	    if (Subtarget.hasVLX()) {
	      if (SDValue Rotated = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
	                                                 Subtarget, DAG))
	        return Rotated;
	    }

	    if (SDValue Rotated = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2,
	                                                   Mask, Subtarget, DAG))
	      return Rotated;
	  }

	  // A single SHUFPS is assumed to beat any multi-instruction sequence, even
	  // on CPUs with a domain penalty (a later pass can fix those up). So the
	  // alternatives below are only considered when one SHUFPS is not enough.
	  if (!isSingleSHUFPSMask(Mask)) {
	    // With blend support, decompose into permutes plus a blend; that is
	    // expected to be faster than crossing domains.
	    if (HasBlend)
	      return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
	                                                  Mask, Subtarget, DAG);

	    // Otherwise try permuting the inputs so that they feed an unpack.
	    if (SDValue Unpacked = lowerShuffleAsPermuteAndUnpack(
	            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	      return Unpacked;
	  }

	  // Fall back to SHUFPS, which can blend from two vectors. Since the final
	  // instruction is SHUFPS anyway, the inputs are built with SHUFPS as well;
	  // this sidesteps the domain shift penalty that PSHUFD would incur on
	  // Nehalem and older chips and makes no difference on newer ones.
	  SDValue FPV1 = DAG.getBitcast(MVT::v4f32, V1);
	  SDValue FPV2 = DAG.getBitcast(MVT::v4f32, V2);
	  SDValue FPShuffle = DAG.getVectorShuffle(MVT::v4f32, DL, FPV1, FPV2, Mask);
	  return DAG.getBitcast(MVT::v4i32, FPShuffle);
	}

	/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	static SDValue lowerV8I16GeneralSingleInputShuffle(
	const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	// Attempt to directly match PSHUFLW or PSHUFHW.
	if (isUndefOrInRange(LoMask, 0, 4) &&
	isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
	}
	if (isUndefOrInRange(HiMask, 4, 8) &&
	isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	for (int i = 0; i != 4; ++i)
	HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
	return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
	}

	SmallVector<int, 4> LoInputs;
	copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	array_pod_sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	array_pod_sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
	int NumHToL = LoInputs.size() - NumLToL;
	int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
	int NumHToH = HiInputs.size() - NumLToH;
	MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	// If we are shuffling values from one half - check how many different DWORD
	// pairs we need to create. If only 1 or 2 then we can perform this as a
	// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
	auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
	ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
	V = DAG.getNode(ShufWOp, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	V = DAG.getBitcast(PSHUFDVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	};

	if ((NumHToL + NumHToH) == 0 \|\| (NumLToL + NumLToH) == 0) {
	int PSHUFDMask[4] = { -1, -1, -1, -1 };
	SmallVector<std::pair<int, int>, 4> DWordPairs;
	int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

	// Collect the different DWORD pairs.
	for (int DWord = 0; DWord != 4; ++DWord) {
	int M0 = Mask[2 * DWord + 0];
	int M1 = Mask[2 * DWord + 1];
	M0 = (M0 >= 0 ? M0 % 4 : M0);
	M1 = (M1 >= 0 ? M1 % 4 : M1);
	if (M0 < 0 && M1 < 0)
	continue;

	bool Match = false;
	for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
	auto &DWordPair = DWordPairs[j];
	if ((M0 < 0 \|\| isUndefOrEqual(DWordPair.first, M0)) &&
	(M1 < 0 \|\| isUndefOrEqual(DWordPair.second, M1))) {
	DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
	DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
	PSHUFDMask[DWord] = DOffset + j;
	Match = true;
	break;
	}
	}
	if (!Match) {
	PSHUFDMask[DWord] = DOffset + DWordPairs.size();
	DWordPairs.push_back(std::make_pair(M0, M1));
	}
	}

	if (DWordPairs.size() <= 2) {
	DWordPairs.resize(2, std::make_pair(-1, -1));
	int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
	DWordPairs[1].first, DWordPairs[1].second};
	if ((NumHToL + NumHToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
	if ((NumLToL + NumLToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
	}
	}

	// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	// such inputs we can swap two of the dwords across the half mark and end up
	// with <=2 inputs to each half in each half. Once there, we can fall through
	// to the generic code below. For example:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	//
	// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	// and an existing 2-into-2 on the other half. In this case we may have to
	// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	// Fortunately, we don't have to handle anything but a 2-into-2 pattern
	// because any other situation (including a 3-into-1 or 1-into-3 in the other
	// half than the one we target for fixing) will be fixed when we re-enter this
	// path. We will also combine away any sequence of PSHUFD instructions that
	// result into a single instruction. Here is an example of the tricky case:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	//
	// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	//
	// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	//
	// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	//
	// The result is fine to be handled by the generic logic.
	auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	int AOffset, int BOffset) {
	assert((AToAInputs.size() == 3 \|\| AToAInputs.size() == 1) &&
	"Must call this with A having 3 or 1 inputs from the A half.");
	assert((BToAInputs.size() == 1 \|\| BToAInputs.size() == 3) &&
	"Must call this with B having 1 or 3 inputs from the B half.");
	assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	"Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	bool ThreeAInputs = AToAInputs.size() == 3;

	// Compute the index of dword with only one word among the three inputs in
	// a half by taking the sum of the half with three inputs and subtracting
	// the sum of the actual three inputs. The difference is the remaining
	// slot.
	int ADWord = 0, BDWord = 0;
	int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	int TripleNonInputIdx =
	TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	TripleDWord = TripleNonInputIdx / 2;

	// We use xor with one to compute the adjacent DWord to whichever one the
	// OneInput is in.
	OneInputDWord = (OneInput / 2) ^ 1;

	// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	// and BToA inputs. If there is also such a problem with the BToB and AToB
	// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	// is essential that we don't create a 3<-1 as then we might oscillate.
	if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	// Compute how many inputs will be flipped by swapping these DWords. We
	// need
	// to balance this to ensure we don't form a 3-1 shuffle in the other
	// half.
	int NumFlippedAToBInputs =
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	int NumFlippedBToBInputs =
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	if ((NumFlippedAToBInputs == 1 &&
	(NumFlippedBToBInputs == 0 \|\| NumFlippedBToBInputs == 2)) \|\|
	(NumFlippedBToBInputs == 1 &&
	(NumFlippedAToBInputs == 0 \|\| NumFlippedAToBInputs == 2))) {
	// We choose whether to fix the A half or B half based on whether that
	// half has zero flipped inputs. At zero, we may not be able to fix it
	// with that half. We also bias towards fixing the B half because that
	// will more commonly be the high half, and we have to bias one way.
	auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	ArrayRef<int> Inputs) {
	int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	// Determine whether the free index is in the flipped dword or the
	// unflipped dword based on where the pinned index is. We use this bit
	// in an xor to conditionally select the adjacent dword.
	int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	if (IsFixIdxInput == IsFixFreeIdxInput)
	FixFreeIdx += 1;
	IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	assert(IsFixIdxInput != IsFixFreeIdxInput &&
	"We need to be changing the number of flipped inputs!");
	int PSHUFHalfMask[] = {0, 1, 2, 3};
	std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	V = DAG.getNode(
	FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	for (int &M : Mask)
	if (M >= 0 && M == FixIdx)
	M = FixFreeIdx;
	else if (M >= 0 && M == FixFreeIdx)
	M = FixIdx;
	};
	if (NumFlippedBToBInputs != 0) {
	int BPinnedIdx =
	BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	} else {
	assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	}
	}
	}

	int PSHUFDMask[] = {0, 1, 2, 3};
	PSHUFDMask[ADWord] = BDWord;
	PSHUFDMask[BDWord] = ADWord;
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// Adjust the mask to match the new locations of A and B.
	for (int &M : Mask)
	if (M >= 0 && M/2 == ADWord)
	M = 2 * BDWord + M % 2;
	else if (M >= 0 && M/2 == BDWord)
	M = 2 * ADWord + M % 2;

	// Recurse back into this routine to re-compute state now that this isn't
	// a 3 and 1 problem.
	return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
	};
	if ((NumLToL == 3 && NumHToL == 1) \|\| (NumLToL == 1 && NumHToL == 3))
	return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	if ((NumHToH == 3 && NumLToH == 1) \|\| (NumHToH == 1 && NumLToH == 3))
	return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	// At this point there are at most two inputs to the low and high halves from
	// each half. That means the inputs can always be grouped into dwords and
	// those dwords can then be moved to the correct half with a dword shuffle.
	// We use at most one low and one high word shuffle to collect these paired
	// inputs into dwords, and finally a dword shuffle to place them.
	int PSHUFLMask[4] = {-1, -1, -1, -1};
	int PSHUFHMask[4] = {-1, -1, -1, -1};
	int PSHUFDMask[4] = {-1, -1, -1, -1};

	// First fix the masks for all the inputs that are staying in their
	// original halves. This will then dictate the targets of the cross-half
	// shuffles.
	auto fixInPlaceInputs =
	[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	MutableArrayRef<int> SourceHalfMask,
	MutableArrayRef<int> HalfMask, int HalfOffset) {
	if (InPlaceInputs.empty())
	return;
	if (InPlaceInputs.size() == 1) {
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	return;
	}
	if (IncomingInputs.empty()) {
	// Just fix all of the in place inputs.
	for (int Input : InPlaceInputs) {
	SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	PSHUFDMask[Input / 2] = Input / 2;
	}
	return;
	}

	assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	// Put the second input next to the first so that they are packed into
	// a dword. We find the adjacent index by toggling the low bit.
	int AdjIndex = InPlaceInputs[0] ^ 1;
	SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	};
	fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	// Now gather the cross-half inputs and place them into a free dword of
	// their target half.
	// FIXME: This operation could almost certainly be simplified dramatically to
	// look more like the 3-1 fixing operation.
	auto moveInputsToRightHalf = [&PSHUFDMask](
	MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	int DestOffset) {
	auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	};
	auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	int Word) {
	int LowWord = Word & ~1;
	int HighWord = Word \| 1;
	return isWordClobbered(SourceHalfMask, LowWord) \|\|
	isWordClobbered(SourceHalfMask, HighWord);
	};

	if (IncomingInputs.empty())
	return;

	if (ExistingInputs.empty()) {
	// Map any dwords with inputs from them into the right half.
	for (int Input : IncomingInputs) {
	// If the source half mask maps over the inputs, turn those into
	// swaps and use the swapped lane.
	if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	Input - SourceOffset;
	// We have to swap the uses in our half mask in one sweep.
	for (int &M : HalfMask)
	if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	M = Input;
	else if (M == Input)
	M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	} else {
	assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	Input - SourceOffset &&
	"Previous placement doesn't match!");
	}
	// Note that this correctly re-maps both when we do a swap and when
	// we observe the other side of the swap above. We rely on that to
	// avoid swapping the members of the input list directly.
	Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	}

	// Map the input's dword into the correct half.
	if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	else
	assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	Input / 2 &&
	"Previous placement doesn't match!");
	}

	// And just directly shift any other-half mask elements to be same-half
	// as we will have mirrored the dword containing the element into the
	// same position within that half.
	for (int &M : HalfMask)
	if (M >= SourceOffset && M < SourceOffset + 4) {
	M = M - SourceOffset + DestOffset;
	assert(M >= 0 && "This should never wrap below zero!");
	}
	return;
	}

	// Ensure we have the input in a viable dword of its current half. This
	// is particularly tricky because the original position may be clobbered
	// by inputs being moved and staying in that half.
	if (IncomingInputs.size() == 1) {
	if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	SourceOffset;
	SourceHalfMask[InputFixed - SourceOffset] =
	IncomingInputs[0] - SourceOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	InputFixed);
	IncomingInputs[0] = InputFixed;
	}
	} else if (IncomingInputs.size() == 2) {
	if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 \|\|
	isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	// We have two non-adjacent or clobbered inputs we need to extract from
	// the source half. To do this, we need to map them into some adjacent
	// dword slot in the source mask.
	int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	IncomingInputs[1] - SourceOffset};

	// If there is a free slot in the source half mask adjacent to one of
	// the inputs, place the other input in it. We use (Index XOR 1) to
	// compute an adjacent index.
	if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	InputsFixed[1] = InputsFixed[0] ^ 1;
	} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	InputsFixed[0] = InputsFixed[1] ^ 1;
	} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	// The two inputs are in the same DWord but it is clobbered and the
	// adjacent DWord isn't used at all. Move both inputs to the free
	// slot.
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
	InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
	InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
	} else {
	// The only way we hit this point is if there is no clobbering
	// (because there are no off-half inputs to this half) and there is no
	// free slot adjacent to one of the inputs. In this case, we have to
	// swap an input with a non-input.
	for (int i = 0; i < 4; ++i)
	assert((SourceHalfMask[i] < 0 \|\| SourceHalfMask[i] == i) &&
	"We can't handle any clobbers here!");
	assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	"Cannot have adjacent inputs here!");

	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	// We also have to update the final source mask in this case because
	// it may need to undo the above swap.
	for (int &M : FinalSourceHalfMask)
	if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	M = InputsFixed[1] + SourceOffset;
	else if (M == InputsFixed[1] + SourceOffset)
	M = (InputsFixed[0] ^ 1) + SourceOffset;

	InputsFixed[1] = InputsFixed[0] ^ 1;
	}

	// Point everything at the fixed inputs.
	for (int &M : HalfMask)
	if (M == IncomingInputs[0])
	M = InputsFixed[0] + SourceOffset;
	else if (M == IncomingInputs[1])
	M = InputsFixed[1] + SourceOffset;

	IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	}
	} else {
	llvm_unreachable("Unhandled input size!");
	}

	// Now hoist the DWord down to the right half.
	int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	for (int &M : HalfMask)
	for (int Input : IncomingInputs)
	if (M == Input)
	M = FreeDWord * 2 + Input % 2;
	};
	moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	/SourceOffset/ 4, /DestOffset/ 0);
	moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	/SourceOffset/ 0, /DestOffset/ 4);

	// Now enact all the shuffles we've computed to move the inputs into their
	// target half.
	if (!isNoopShuffleMask(PSHUFLMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFHMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFDMask))
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// At this point, each half should contain all its inputs, and we can then
	// just shuffle them into their final position.
	assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	"Failed to lift all the high half inputs to the low mask!");
	assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	"Failed to lift all the low half inputs to the high mask!");

	// Do a half shuffle for the low mask.
	if (!isNoopShuffleMask(LoMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	// Do a half shuffle with the high mask after shifting its values down.
	for (int &M : HiMask)
	if (M >= 0)
	M -= 4;
	if (!isNoopShuffleMask(HiMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	return V;
	}

	/// Lower a shuffle as one PSHUFB per input, OR-ing the two results together
	/// only when both inputs actually contribute bytes.
	///
	/// For every output byte exactly one input receives a real source index; the
	/// other input receives the zeroing control byte (0x80) so that the final OR
	/// acts as a blend. Elements flagged in \p Zeroable receive the zeroing
	/// control for both inputs, and undef mask elements leave both control bytes
	/// undef.
	///
	/// \p V1InUse and \p V2InUse are out-parameters: each is set to true iff the
	/// corresponding input was given at least one non-zeroing control byte.
	///
	/// The mask must not cross 128-bit lanes (asserted).
	static SDValue lowerShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
	  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
	         "Lane crossing shuffle masks not supported");

	  // Control byte used for "write zero here".
	  const int ZeroMask = 0x80;

	  // Each mask element expands to Scale consecutive bytes.
	  const int NumBytes = VT.getSizeInBits() / 8;
	  const int Size = Mask.size();
	  const int Scale = NumBytes / Size;

	  // Start with fully undef control vectors and fill in the defined bytes.
	  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
	  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
	  V1InUse = false;
	  V2InUse = false;

	  for (int Byte = 0; Byte != NumBytes; ++Byte) {
	    const int Elt = Byte / Scale;
	    const int M = Mask[Elt];
	    if (M < 0)
	      continue;

	    // Default both controls to "zero"; at most one of them is then pointed
	    // at a real source byte, and none when the element is zeroable.
	    int V1Idx = ZeroMask;
	    int V2Idx = ZeroMask;
	    if (!Zeroable[Elt]) {
	      const int ByteInElt = Byte % Scale;
	      if (M < Size)
	        V1Idx = M * Scale + ByteInElt;
	      else
	        V2Idx = (M - Size) * Scale + ByteInElt;
	    }

	    V1Mask[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
	    V2Mask[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
	    if (V1Idx != ZeroMask)
	      V1InUse = true;
	    if (V2Idx != ZeroMask)
	      V2InUse = true;
	  }

	  // Only emit a PSHUFB for an input that is actually referenced.
	  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
	  if (V1InUse) {
	    SDValue Src = DAG.getBitcast(ShufVT, V1);
	    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, Src,
	                     DAG.getBuildVector(ShufVT, DL, V1Mask));
	  }
	  if (V2InUse) {
	    SDValue Src = DAG.getBitcast(ShufVT, V2);
	    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, Src,
	                     DAG.getBuildVector(ShufVT, DL, V2Mask));
	  }

	  // With a single live input its shuffle is the answer (V2 is the fallback
	  // when neither is live); with two live inputs OR them to blend.
	  SDValue Result = V1InUse ? V1 : V2;
	  if (V1InUse && V2InUse)
	    Result = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);

	  // Cast the result back to the requested type.
	  return DAG.getBitcast(VT, Result);
	}

	/// Lower an 8-lane i16 shuffle of \p V1 and \p V2.
	///
	/// A fixed, ordered list of lowering strategies is attempted and the first
	/// one that yields a node is returned. Masks that reference no V2 element
	/// take a dedicated single-input path that ends in
	/// lowerV8I16GeneralSingleInputShuffle. Two-input masks run through the
	/// shift, SSE4A, element-insertion, blend, bit-mask, unpack, pack,
	/// byte-rotate, bit-blend, byte-shift-mask and permute-and-unpack attempts,
	/// then fall back to a PSHUFB pair (SSSE3 without SSE4.1) or to a decomposed
	/// shuffle + blend.
	static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // The vector type handled by every strategy below.
	  const MVT VT = MVT::v8i16;

	  // A zero/any-extend lowering takes priority over everything else.
	  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Ext;

	  // Mask values >= 8 select lanes of V2.
	  const int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

	  if (NumV2Inputs == 0) {
	    // Single-input shuffle: broadcast of one element.
	    if (SDValue Res =
	            lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return Res;

	    // Shift instructions (V1 is supplied for both operands here).
	    if (SDValue Res = lowerShuffleAsShift(DL, VT, V1, V1, Mask, Zeroable,
	                                          Subtarget, DAG))
	      return Res;

	    // Masks that match an unpack pattern.
	    if (SDValue Res = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Res;

	    // Masks that match a pack pattern.
	    if (SDValue Res =
	            lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	      return Res;

	    // Byte rotation (again with V1 as both operands).
	    if (SDValue Res =
	            lowerShuffleAsByteRotate(DL, VT, V1, V1, Mask, Subtarget, DAG))
	      return Res;

	    // The general single-input routine edits its mask in place, so hand it
	    // a private copy.
	    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
	    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V1, MutableMask,
	                                               Subtarget, DAG);
	  }

	  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	         "All single-input shuffles should be canonicalized to be V1-input "
	         "shuffles.");

	  // Two-input shuffle: shift instructions.
	  if (SDValue Res = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                        Subtarget, DAG))
	    return Res;

	  // SSE4A extraction / insertion, when the subtarget has it.
	  if (Subtarget.hasSSE4A()) {
	    if (SDValue Res =
	            lowerShuffleWithSSE4A(DL, VT, V1, V2, Mask, Zeroable, DAG))
	      return Res;
	  }

	  // A single element taken from V2 may be lowered as an element insertion.
	  if (NumV2Inputs == 1) {
	    if (SDValue Res = lowerShuffleAsElementInsertion(DL, VT, V1, V2, Mask,
	                                                     Zeroable, Subtarget, DAG))
	      return Res;
	  }

	  // Every blend-related decision below must use this one predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Res = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                          Subtarget, DAG))
	      return Res;
	  }

	  if (SDValue Res = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                          Subtarget, DAG))
	    return Res;

	  // Masks that match an unpack pattern.
	  if (SDValue Res = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Res;

	  // Masks that match a pack pattern.
	  if (SDValue Res = lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Res;

	  // Byte rotation.
	  if (SDValue Res =
	          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Res;

	  if (SDValue Res = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	    return Res;

	  // Byte shifts used as a mask.
	  if (SDValue Res = lowerVectorShuffleAsByteShiftMask(DL, VT, V1, V2, Mask,
	                                                      Zeroable, Subtarget, DAG))
	    return Res;

	  // Permute the inputs so that an unpack instruction finishes the job.
	  if (SDValue Res = lowerShuffleAsPermuteAndUnpack(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG))
	    return Res;

	  // Without a direct blend but with PSHUFB available, a PSHUFB pair both
	  // shuffles and prepares the OR-based blend. The in-use flags are filled in
	  // by the callee and are not needed here.
	  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	    bool V1InUse, V2InUse;
	    return lowerShuffleAsBlendOfPSHUFBs(DL, VT, V1, V2, Mask, Zeroable, DAG,
	                                        V1InUse, V2InUse);
	  }

	  // Last resort: decompose into single-input permutes followed by a blend.
	  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
	                                              DAG);
	}

	/// Determine whether a shuffle mask selects every 2^N-th element, so that it
	/// can be lowered as a compaction that repeatedly drops elements, and if so
	/// compute N.
	///
	/// Example shuffle masks that are accepted:
	///
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
	/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
	/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
	/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
	///
	/// Any lane may be undef (negative); undef lanes match every pattern.
	///
	/// Only N <= 3 is supported.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \param Mask the shuffle mask; its size must make the index modulus a power
	///        of two (asserted).
	/// \param IsSingleInput when true indices wrap at Mask.size(), otherwise at
	///        twice that.
	/// \returns the smallest N in [1, 3] consistent with every defined lane, or
	/// zero if no such N exists.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  // Indices are taken modulo the total number of addressable source
	  // elements: one input's worth, or two.
	  const int NumSources = IsSingleInput ? 1 : 2;
	  const int ShuffleModulus = Mask.size() * NumSources;
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");

	  // Because the modulus is a power of two, reduction is a bitwise AND.
	  const uint64_t WrapMask = (uint64_t)ShuffleModulus - 1;

	  // ViableForN[S] tracks stride 2^(S+1). All three candidates are followed
	  // at once because undef lanes can leave the stride ambiguous.
	  bool ViableForN[3] = {true, true, true};
	  const unsigned NumStrides = array_lengthof(ViableForN);

	  for (int Lane = 0, NumLanes = Mask.size(); Lane < NumLanes; ++Lane) {
	    const int Elt = Mask[Lane];

	    // Undef lanes are optimistically assumed to fit the pattern.
	    if (Elt < 0)
	      continue;

	    bool AnyStillViable = false;
	    for (unsigned S = 0; S != NumStrides; ++S) {
	      if (!ViableForN[S])
	        continue;

	      // For stride 2^N the lane must read element (Lane * 2^N) mod modulus.
	      const uint64_t N = S + 1;
	      const uint64_t Expected = ((uint64_t)Lane << N) & WrapMask;
	      if ((uint64_t)Elt != Expected) {
	        ViableForN[S] = false;
	        continue;
	      }
	      AnyStillViable = true;
	    }

	    // Nothing left to rule out once every candidate has failed.
	    if (!AnyStillViable)
	      break;
	  }

	  // Pick the first (smallest) surviving stride exponent; zero means none.
	  unsigned Idx = 0;
	  while (Idx != NumStrides && !ViableForN[Idx])
	    ++Idx;
	  if (Idx == NumStrides)
	    return 0;
	  return Idx + 1;
	}

	/// Lower a shuffle as a variable permute node.
	///
	/// The shuffle mask is materialized as a constant vector whose integer
	/// elements have the same bit width and count as \p VT. When \p V2 is undef a
	/// one-source VPERMV node (operands: mask, V1) is built; otherwise a
	/// two-source VPERMV3 node (operands: V1, mask, V2) is built.
	static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                     ArrayRef<int> Mask, SDValue V1,
	                                     SDValue V2, SelectionDAG &DAG) {
	  // Integer vector type that mirrors VT's element width and element count.
	  MVT IndexEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
	  MVT IndexVecVT = MVT::getVectorVT(IndexEltVT, VT.getVectorNumElements());
	  SDValue Indices = getConstVector(Mask, IndexVecVT, DAG, DL, true);

	  // Note the differing operand order of the two node kinds.
	  return V2.isUndef()
	             ? DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1)
	             : DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);
	}

	/// Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	array_pod_sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	array_pod_sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, V1, V1);

	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Try to use byte shift instructions to mask.
	if (SDValue V = lowerVectorShuffleAsByteShiftMask(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3()) {
	bool V1InUse = false;
	bool V2InUse = false;

	SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Unpack;

	// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);

	// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
	// PALIGNR will be cheaper than the second PSHUFB+OR.
	if (SDValue V = lowerShuffleAsByteRotateAndPermute(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return V;
	}

	return PSHUFB;
	}

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Blend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	bool IsSingleInput = V2.isUndef();
	if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// We use the mask type to pick which bytes are preserved based on how many
	// elements are dropped.
	MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
	SDValue ByteClearMask = DAG.getBitcast(
	MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
	V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

	// Now pack things back together.
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}

	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
	Subtarget, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// Entry point for lowering 128-bit x86 vector shuffles.
	///
	/// Looks at the simple value type of the shuffle and forwards every operand,
	/// unchanged, to the lowering routine dedicated to that type. Any type other
	/// than the six handled below is treated as unreachable.
	static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  MVT VT, SDValue V1, SDValue V2,
	                                  const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const auto Ty = VT.SimpleTy;

	  // 64-bit element types.
	  if (Ty == MVT::v2i64)
	    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v2f64)
	    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // 32-bit element types.
	  if (Ty == MVT::v4i32)
	    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v4f32)
	    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // 16-bit and 8-bit integer element types.
	  if (Ty == MVT::v8i16)
	    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  if (Ty == MVT::v16i8)
	    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// Generic routine to split vector shuffle into half-sized shuffles.
	///
	/// This routine just extracts two subvectors, shuffles them independently, and
	/// then concatenates them back together. This should work effectively with all
	/// AVX vector shuffle types.
	///
	/// \param DL   Debug location attached to every node created here.
	/// \param VT   Type of the shuffle; asserted to be at least 256 bits wide.
	/// \param V1   First shuffle input; asserted to have type \p VT.
	/// \param V2   Second shuffle input; asserted to have type \p VT.
	/// \param Mask Shuffle mask. Entries >= the element count of \p VT select
	///             from \p V2, other non-negative entries select from \p V1, and
	///             negative entries are left undefined in the result.
	/// \param DAG  DAG in which the new nodes are created.
	/// \returns A CONCAT_VECTORS node joining the two half-width results.
	static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(VT.getSizeInBits() >= 256 &&
	"Only for 256-bit or wider vector shuffles!");
	assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	// Each half of the result is described by the matching half of the mask.
	ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
	ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

	int NumElements = VT.getVectorNumElements();
	int SplitNumElements = NumElements / 2;
	MVT ScalarVT = VT.getVectorElementType();
	MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

	// Rather than splitting build-vectors, just build two narrower build
	// vectors. This helps shuffling with splats and zeros.
	// Returns the (low, high) halves of V, both bitcast to SplitVT.
	auto SplitVector = [&](SDValue V) {
	// Work on the value underneath any bitcasts; the halves are computed in
	// that value's own type and only bitcast to SplitVT at the end.
	V = peekThroughBitcasts(V);

	MVT OrigVT = V.getSimpleValueType();
	int OrigNumElements = OrigVT.getVectorNumElements();
	int OrigSplitNumElements = OrigNumElements / 2;
	MVT OrigScalarVT = OrigVT.getVectorElementType();
	MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

	SDValue LoV, HiV;

	auto *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV) {
	// Not a build vector: extract the low and high subvectors directly.
	LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
	DAG.getIntPtrConstant(0, DL));
	HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
	DAG.getIntPtrConstant(OrigSplitNumElements, DL));
	} else {

	// Build vector: distribute its operands over two narrower build vectors.
	SmallVector<SDValue, 16> LoOps, HiOps;
	for (int i = 0; i < OrigSplitNumElements; ++i) {
	LoOps.push_back(BV->getOperand(i));
	HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
	}
	LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
	HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
	}
	return std::make_pair(DAG.getBitcast(SplitVT, LoV),
	DAG.getBitcast(SplitVT, HiV));
	};

	SDValue LoV1, HiV1, LoV2, HiV2;
	std::tie(LoV1, HiV1) = SplitVector(V1);
	std::tie(LoV2, HiV2) = SplitVector(V2);

	// Now create two 4-way blends of these half-width vectors.
	// HalfBlend builds one half of the result from the four input halves
	// (LoV1, HiV1, LoV2, HiV2) according to HalfMask.
	auto HalfBlend = [&](ArrayRef<int> HalfMask) {
	bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
	// V1BlendMask / V2BlendMask select within (LoV1, HiV1) / (LoV2, HiV2);
	// BlendMask then picks, per element, the V1-side or the V2-side result.
	SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
	for (int i = 0; i < SplitNumElements; ++i) {
	int M = HalfMask[i];
	if (M >= NumElements) {
	// Element comes from V2; record which of its halves is referenced.
	if (M >= NumElements + SplitNumElements)
	UseHiV2 = true;
	else
	UseLoV2 = true;
	V2BlendMask[i] = M - NumElements;
	BlendMask[i] = SplitNumElements + i;
	} else if (M >= 0) {
	// Element comes from V1; record which of its halves is referenced.
	if (M >= SplitNumElements)
	UseHiV1 = true;
	else
	UseLoV1 = true;
	V1BlendMask[i] = M;
	BlendMask[i] = i;
	}
	}

	// Because the lowering happens after all combining takes place, we need to
	// manually combine these blend masks as much as possible so that we create
	// a minimal number of high-level vector shuffle nodes.

	// First try just blending the halves of V1 or V2.
	if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
	return DAG.getUNDEF(SplitVT);
	if (!UseLoV2 && !UseHiV2)
	return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	if (!UseLoV1 && !UseHiV1)
	return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

	// Both inputs contribute: shuffle each side only if both of its halves are
	// used, otherwise fold the single half straight into the final blend mask.
	SDValue V1Blend, V2Blend;
	if (UseLoV1 && UseHiV1) {
	V1Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	} else {
	// We only use half of V1 so map the usage down into the final blend mask.
	V1Blend = UseLoV1 ? LoV1 : HiV1;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
	BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
	}
	if (UseLoV2 && UseHiV2) {
	V2Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
	} else {
	// We only use half of V2 so map the usage down into the final blend mask.
	// Second-operand indices of the final shuffle start at SplitNumElements,
	// so low-half indices are shifted up and high-half indices are kept.
	V2Blend = UseLoV2 ? LoV2 : HiV2;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= SplitNumElements)
	BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
	}
	return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
	};
	SDValue Lo = HalfBlend(LoMask);
	SDValue Hi = HalfBlend(HiMask);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// Choose between splitting a shuffle into halves and decomposing it into
	/// single-input shuffles followed by a blend.
	///
	/// Intended as a fallback for two-input shuffles spanning more than one
	/// 128-bit lane. The decomposed-blend lowering is used when each input
	/// contributes at most one distinct element (a blend of two broadcasts);
	/// the split lowering is used when each input is read from at most one
	/// 128-bit lane; everything else falls back to the decomposed blend.
	///
	/// \p V2 must not be undef: single-input shuffles could otherwise recurse
	/// back into this routine.
	static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                          SDValue V2, ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  const int NumMaskElts = Mask.size();

	  // A broadcast of one element per input followed by a blend is preferred,
	  // since broadcasts can often fold with memory operands. The check passes
	  // when all defined entries from the same input name the same element.
	  auto IsBlendOfTwoBroadcasts = [&]() -> bool {
	    int BroadcastIdx[2] = {-1, -1}; // [0] for V1, [1] for V2.
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      const int Input = M >= NumMaskElts ? 1 : 0;
	      const int Elt = Input ? M - NumMaskElts : M;
	      if (BroadcastIdx[Input] < 0)
	        BroadcastIdx[Input] = Elt;
	      else if (BroadcastIdx[Input] != Elt)
	        return false;
	    }
	    return true;
	  };
	  if (IsBlendOfTwoBroadcasts())
	    return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Record, for each input, which of its 128-bit lanes the mask reads from.
	  // When each input is touched in at most one lane, splitting decomposes to
	  // unusually few instructions.
	  const int NumLanes = VT.getSizeInBits() / 128;
	  const int EltsPerLane = NumMaskElts / NumLanes;
	  SmallBitVector LanesRead[2];
	  LanesRead[0].resize(NumLanes, false);
	  LanesRead[1].resize(NumLanes, false);
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    LanesRead[M / NumMaskElts][(M % NumMaskElts) / EltsPerLane] = true;
	  }
	  const bool V1InOneLane = LanesRead[0].count() <= 1;
	  if (V1InOneLane && LanesRead[1].count() <= 1)
	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);

	  // Everything else: decomposed single-input shuffles plus a blend. This
	  // relies on those single-input shuffles not ending up back here.
	  return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
	                                              DAG);
	}

	/// Lower a lane-crossing shuffle as a whole-lane permutation followed by a
	/// permutation inside each 128-bit lane.
	///
	/// Succeeds only when every destination lane draws all of its defined
	/// elements from one source lane. Returns an empty SDValue otherwise, and
	/// also when at most one lane needs an in-lane permute and no such lane is
	/// fed from a source lane other than lane 0 of either input.
	///
	/// This is mainly useful when the in-lane permutes differ between lanes.
	///
	/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
	/// we should investigate merging them.
	static SDValue lowerShuffleAsLanePermuteAndPermute(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  const int EltCount = VT.getVectorNumElements();
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int LaneElts = EltCount / LaneCount;

	  // LaneSource[D] is the source lane feeding destination lane D.
	  // InLanePerm is the mask applied after the lanes have been moved.
	  SmallVector<int, 4> LaneSource(LaneCount, SM_SentinelUndef);
	  SmallVector<int, 16> InLanePerm(EltCount, SM_SentinelUndef);

	  for (int Idx = 0; Idx != EltCount; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    // A destination lane may only ever see a single source lane.
	    const int From = M / LaneElts;
	    const int To = Idx / LaneElts;
	    if (!isUndefOrEqual(LaneSource[To], From))
	      return SDValue();
	    LaneSource[To] = From;

	    InLanePerm[Idx] = To * LaneElts + M % LaneElts;
	  }

	  // Expand the per-lane sources into a full element mask. Every element of
	  // a used lane is set explicitly to avoid undef propagation.
	  SmallVector<int, 16> CrossLaneMask(EltCount, SM_SentinelUndef);
	  for (int To = 0; To != LaneCount; ++To) {
	    const int From = LaneSource[To];
	    if (From < 0)
	      continue;
	    for (int Off = 0; Off != LaneElts; ++Off)
	      CrossLaneMask[To * LaneElts + Off] = From * LaneElts + Off;
	  }

	  // Don't bother if only one lane needs an in-lane permute and that lane is
	  // fed from lane 0 of one of the inputs; the rest are identity.
	  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
	  int IdentityLanes = 0;
	  bool OnlyLowestLaneShuffled = true;
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Base = Lane * LaneElts;
	    if (isSequentialOrUndefInRange(InLanePerm, Base, LaneElts, Base)) {
	      ++IdentityLanes;
	      continue;
	    }
	    if (LaneSource[Lane] != 0 && LaneSource[Lane] != LaneCount)
	      OnlyLowestLaneShuffled = false;
	  }
	  if (OnlyShuffleLowestLaneGuard(OnlyLowestLaneShuffled, IdentityLanes,
	                                 LaneCount))
	    return SDValue();

	  SDValue MovedLanes = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
	  return DAG.getVectorShuffle(VT, DL, MovedLanes, DAG.getUNDEF(VT),
	                              InLanePerm);
	}

	/// Lower a vector shuffle crossing multiple 128-bit lanes as
	/// a permutation and blend of those lanes.
	///
	/// This essentially blends the out-of-lane inputs to each lane into the lane
	/// from a permuted copy of the vector. This lowering strategy results in four
	/// instructions in the worst case for a single-input cross lane shuffle which
	/// is lower than any other fully general cross-lane shuffle strategy I'm aware
	/// of. Special cases for each particular shuffle pattern should be handled
	/// prior to trying this lowering.
	///
	/// \param VT   Shuffle type; asserted to be a 256-bit vector.
	/// \param V1   First input. It is the only input used by the flip-and-blend
	///             path at the end of the routine.
	/// \param V2   Second input. It is only forwarded to splitAndLowerShuffle;
	///             the flip-and-blend path asserts that it is undef.
	/// \param Mask Shuffle mask. Entries are reduced modulo the mask size before
	///             being mapped to a 128-bit lane, so entries that select from
	///             \p V2 are classified by their lane within \p V2.
	/// \returns Either the result of splitAndLowerShuffle (when that is judged
	///          cheaper) or a shuffle of \p V1 with a lane-swapped copy of it.
	static SDValue lowerShuffleAsLanePermuteAndBlend(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  int Size = Mask.size();
	  int LaneSize = Size / 2;

	  // If there are only inputs from one 128-bit lane, splitting will in fact be
	  // less expensive. The flags track whether the given lane contains an element
	  // that crosses to another lane.
	  if (!Subtarget.hasAVX2()) {
	    bool LaneCrossing[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
	        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
	    if (!LaneCrossing[0] || !LaneCrossing[1])
	      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	  } else {
	    bool LaneUsed[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0)
	        // Reduce modulo Size first: a mask entry that selects from V2 is
	        // >= Size and would otherwise index past the end of LaneUsed.
	        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
	    if (!LaneUsed[0] || !LaneUsed[1])
	      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Build the mask for the final blend of V1 (first operand) with its
	  // lane-swapped copy (second operand, hence the "+ Size"). In-lane elements
	  // keep their index; lane-crossing elements are read from the swapped copy
	  // at the same in-lane offset, inside the destination lane.
	  SmallVector<int, 32> FlippedBlendMask(Size);
	  for (int i = 0; i < Size; ++i)
	    FlippedBlendMask[i] =
	        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
	                                ? Mask[i]
	                                : Mask[i] % LaneSize +
	                                      (i / LaneSize) * LaneSize + Size);

	  // Flip the vector, and blend the results which should now be in-lane.
	  // The flip is done on a 4 x 64-bit view so that {2, 3, 0, 1} swaps the two
	  // 128-bit halves regardless of the element type of VT.
	  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
	  SDValue Flipped = DAG.getBitcast(PVT, V1);
	  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
	                                 { 2, 3, 0, 1 });
	  Flipped = DAG.getBitcast(VT, Flipped);
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
	}

	/// Handle lowering 2-lane 128-bit shuffles.
	///
	/// Tries, in order: an insert of the low half of \p V1 into a zero vector, a
	/// blend, a single 128-bit subvector insert, SHUF128 (with VLX), and finally
	/// a VPERM2X128 node whose immediate is built from the widened mask.
	///
	/// \param Mask     Shuffle mask; it must be widenable by
	///                 canWidenShuffleElements or the routine gives up.
	/// \param Zeroable Bit set of zeroable elements. NOTE(review): presumably
	///                 one bit per element of \p Mask, as the 0x3 / 0xc tests
	///                 below suggest - confirm against the callers.
	/// \returns The lowered node, or an empty SDValue when AVX2 is available and
	///          \p V2 is undef, or when the mask cannot be widened.
	static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
	if (Subtarget.hasAVX2() && V2.isUndef())
	return SDValue();

	// WidenedMask receives the widened form of Mask; only its entries [0] and
	// [1] are read below.
	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
	return SDValue();

	// True when both low (bits 0-1) / both high (bits 2-3) Zeroable bits are set.
	bool IsLowZero = (Zeroable & 0x3) == 0x3;
	bool IsHighZero = (Zeroable & 0xc) == 0xc;

	// Try to use an insert into a zero vector.
	if (WidenedMask[0] == 0 && IsHighZero) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	getZeroVector(VT, Subtarget, DAG, DL), LoV,
	DAG.getIntPtrConstant(0, DL));
	}

	// TODO: If minimizing size and one of the inputs is a zero vector and the
	// the zero vector has only one use, we could use a VPERM2X128 to save the
	// instruction bytes needed to explicitly generate the zero vector.

	// Blends are faster and handle all the non-lane-crossing cases.
	if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	Subtarget, DAG))
	return Blend;

	// If either input operand is a zero vector, use VPERM2X128 because its mask
	// allows us to replace the zero input with an implicit zero.
	// The insert and SHUF128 strategies below are therefore only tried when
	// neither half of the result is zeroable.
	if (!IsLowZero && !IsHighZero) {
	// Check for patterns which can be matched with a single insert of a 128-bit
	// subvector.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

	// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	// this will likely become vinsertf128 which can't fold a 256-bit memop.
	if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	// Insert the low 128 bits of V1 (or of V2) into the upper half of V1.
	SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
	DAG.getIntPtrConstant(2, DL));
	}
	}

	// Try to use SHUF128 if possible.
	if (Subtarget.hasVLX()) {
	// Only when the low result half comes from V1 and the high half from V2.
	if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
	unsigned PermMask = ((WidenedMask[0] % 2) << 0) \|
	((WidenedMask[1] % 2) << 1);
	return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}
	}
	}

	// Otherwise form a 128-bit permutation. After accounting for undefs,
	// convert the 64-bit shuffle mask selection values into 128-bit
	// selection bits by dividing the indexes by 2 and shifting into positions
	// defined by a vperm2*128 instruction's immediate control byte.

	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	assert((WidenedMask[0] >= 0 \|\| IsLowZero) &&
	(WidenedMask[1] >= 0 \|\| IsHighZero) && "Undef half?");

	// A zeroable half sets the corresponding zero bit instead of a selector.
	unsigned PermMask = 0;
	PermMask \|= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
	PermMask \|= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

	// Check the immediate mask and replace unused sources with undef.
	// Masking with 0x0a / 0xa0 keeps each half's zero bit and the selector bit
	// that distinguishes V1 (clear) from V2 (set).
	if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
	V1 = DAG.getUNDEF(VT);
	if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
	V2 = DAG.getUNDEF(VT);

	return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// Lower a vector shuffle by first fixing the 128-bit lanes and then
	/// shuffling each lane.
	///
	/// This attempts to create a repeated lane shuffle where each lane uses one
	/// or two of the lanes of the inputs. The lanes of the input vectors are
	/// shuffled in one or two independent shuffles to get the lanes into the
	/// position needed by the final shuffle.
	///
	/// \param V2   Second input; asserted not to be undef.
	/// \param Mask Shuffle mask. Masks that already repeat per 128-bit lane are
	///             rejected up front.
	/// \returns The final shuffle of the two lane-permuted vectors, or an empty
	///          SDValue when a lane needs more than two source lanes, when the
	///          lanes cannot agree on one repeated in-lane mask, when a lane has
	///          no source at all, or when either lane permute turns out to be
	///          the original shuffle again.
	static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	if (is128BitLaneRepeatedShuffleMask(VT, Mask))
	return SDValue();

	int Size = Mask.size();
	int NumLanes = VT.getSizeInBits() / 128;
	int LaneSize = 128 / VT.getScalarSizeInBits();
	// RepeatMask is the in-lane mask shared by all lanes; entries >= Size refer
	// to the second lane-permuted vector. LaneSrcs[L] holds the source lanes
	// that feed the first and second lane-permuted vector for result lane L.
	SmallVector<int, 16> RepeatMask(LaneSize, -1);
	SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

	// First pass will try to fill in the RepeatMask from lanes that need two
	// sources.
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Srcs[2] = { -1, -1 };
	SmallVector<int, 16> InLaneMask(LaneSize, -1);
	for (int i = 0; i != LaneSize; ++i) {
	int M = Mask[(Lane * LaneSize) + i];
	if (M < 0)
	continue;
	// Determine which of the possible input lanes (NumLanes from each source)
	// this element comes from. Assign that as one of the sources for this
	// lane. We can assign up to 2 sources for this lane. If we run out
	// sources we can't do anything.
	int LaneSrc = M / LaneSize;
	int Src;
	if (Srcs[0] < 0 \|\| Srcs[0] == LaneSrc)
	Src = 0;
	else if (Srcs[1] < 0 \|\| Srcs[1] == LaneSrc)
	Src = 1;
	else
	return SDValue();

	Srcs[Src] = LaneSrc;
	InLaneMask[i] = (M % LaneSize) + Src * Size;
	}

	// If this lane has two sources, see if it fits with the repeat mask so far.
	// Lanes with zero or one source are left for the second pass.
	if (Srcs[1] < 0)
	continue;

	LaneSrcs[Lane][0] = Srcs[0];
	LaneSrcs[Lane][1] = Srcs[1];

	// Two masks match when every position defined in both holds the same value.
	auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
	assert(M1.size() == M2.size() && "Unexpected mask size");
	for (int i = 0, e = M1.size(); i != e; ++i)
	if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
	return false;
	return true;
	};

	// Copies the defined entries of Mask into MergedMask; callers check
	// MatchMasks first, so existing entries are never contradicted.
	auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
	assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
	for (int i = 0, e = MergedMask.size(); i != e; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue;
	assert((MergedMask[i] < 0 \|\| MergedMask[i] == M) &&
	"Unexpected mask element");
	MergedMask[i] = M;
	}
	};

	if (MatchMasks(InLaneMask, RepeatMask)) {
	// Merge this lane mask into the final repeat mask.
	MergeMasks(InLaneMask, RepeatMask);
	continue;
	}

	// Didn't find a match. Swap the operands and try again.
	std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
	ShuffleVectorSDNode::commuteMask(InLaneMask);

	if (MatchMasks(InLaneMask, RepeatMask)) {
	// Merge this lane mask into the final repeat mask.
	MergeMasks(InLaneMask, RepeatMask);
	continue;
	}

	// Couldn't find a match with the operands in either order.
	return SDValue();
	}

	// Now handle any lanes with only one source.
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	// If this lane has already been processed, skip it.
	if (LaneSrcs[Lane][0] >= 0)
	continue;

	for (int i = 0; i != LaneSize; ++i) {
	int M = Mask[(Lane * LaneSize) + i];
	if (M < 0)
	continue;

	// If RepeatMask isn't defined yet we can define it ourself.
	if (RepeatMask[i] < 0)
	RepeatMask[i] = M % LaneSize;

	// The repeat mask entry decides which lane-permuted vector this position
	// reads from, and therefore which LaneSrcs slot this lane's source fills.
	if (RepeatMask[i] < Size) {
	if (RepeatMask[i] != M % LaneSize)
	return SDValue();
	LaneSrcs[Lane][0] = M / LaneSize;
	} else {
	if (RepeatMask[i] != ((M % LaneSize) + Size))
	return SDValue();
	LaneSrcs[Lane][1] = M / LaneSize;
	}
	}

	// A lane that ended up with no source at all cannot be handled here.
	if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
	return SDValue();
	}

	// Build the first lane-permuted vector: result lane L is a whole copy of
	// source lane LaneSrcs[L][0], or undef when that slot is unset.
	SmallVector<int, 16> NewMask(Size, -1);
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Src = LaneSrcs[Lane][0];
	for (int i = 0; i != LaneSize; ++i) {
	int M = -1;
	if (Src >= 0)
	M = Src * LaneSize + i;
	NewMask[Lane * LaneSize + i] = M;
	}
	}
	SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	// Ensure we didn't get back the shuffle we started with.
	// FIXME: This is a hack to make up for some splat handling code in
	// getVectorShuffle.
	if (isa<ShuffleVectorSDNode>(NewV1) &&
	cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
	return SDValue();

	// Build the second lane-permuted vector the same way from LaneSrcs[L][1].
	for (int Lane = 0; Lane != NumLanes; ++Lane) {
	int Src = LaneSrcs[Lane][1];
	for (int i = 0; i != LaneSize; ++i) {
	int M = -1;
	if (Src >= 0)
	M = Src * LaneSize + i;
	NewMask[Lane * LaneSize + i] = M;
	}
	}
	SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	// Ensure we didn't get back the shuffle we started with.
	// FIXME: This is a hack to make up for some splat handling code in
	// getVectorShuffle.
	if (isa<ShuffleVectorSDNode>(NewV2) &&
	cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
	return SDValue();

	// Expand the repeated in-lane mask to full width by offsetting each defined
	// entry by the start of its lane.
	for (int i = 0; i != Size; ++i) {
	NewMask[i] = RepeatMask[i % LaneSize];
	if (NewMask[i] < 0)
	continue;

	NewMask[i] += (i / LaneSize) * LaneSize;
	}
	return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
	}

	/// Try to narrow a shuffle whose result has exactly one undefined half.
	///
	/// Succeeds when all upper-half or all lower-half elements of the result
	/// (but not both) are undefined and the defined half reads from at most two
	/// of the four operand halves. On success \p HalfMask holds a half-width
	/// mask whose indices address the two selected halves, and \p HalfIdx1 /
	/// \p HalfIdx2 identify those halves (0 = lower V1, 1 = upper V1,
	/// 2 = lower V2, 3 = upper V2, -1 = unused).
	///
	/// On failure the output arguments may have been partially written.
	static bool
	getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
	                   int &HalfIdx1, int &HalfIdx2) {
	  assert((Mask.size() == HalfMask.size() * 2) &&
	         "Expected input mask to be twice as long as output");

	  // Narrowing requires exactly one undefined half of the result.
	  const bool LowerIsUndef = isUndefLowerHalf(Mask);
	  const bool UpperIsUndef = isUndefUpperHalf(Mask);
	  if (LowerIsUndef == UpperIsUndef)
	    return false;

	  const unsigned HalfElts = HalfMask.size();
	  // Start of the defined half within the full-width mask.
	  const unsigned FirstDefined = LowerIsUndef ? HalfElts : 0;
	  HalfIdx1 = -1;
	  HalfIdx2 = -1;
	  for (unsigned Slot = 0; Slot != HalfElts; ++Slot) {
	    const int M = Mask[Slot + FirstDefined];
	    if (M < 0) {
	      // Undef / sentinel entries are passed through untouched.
	      HalfMask[Slot] = M;
	      continue;
	    }

	    // Which of the four operand halves is read, and at which offset in it.
	    const int SrcHalf = M / HalfElts;
	    const int EltInHalf = M % HalfElts;

	    // Up to two distinct halves may be referenced; the first one seen
	    // becomes operand 1 of the narrowed shuffle and the second operand 2.
	    if (HalfIdx1 < 0 || HalfIdx1 == SrcHalf) {
	      HalfMask[Slot] = EltInHalf;
	      HalfIdx1 = SrcHalf;
	    } else if (HalfIdx2 < 0 || HalfIdx2 == SrcHalf) {
	      HalfMask[Slot] = EltInHalf + HalfElts;
	      HalfIdx2 = SrcHalf;
	    } else {
	      // A third half would be needed.
	      return false;
	    }
	  }

	  return true;
	}

	/// Materialize the narrowed shuffle described by getHalfShuffleMask().
	///
	/// Extracts the operand halves named by \p HalfIdx1 and \p HalfIdx2 (a
	/// negative index yields undef), shuffles them with \p HalfMask at half
	/// width, and inserts the result into an undef full-width vector: into the
	/// upper half when \p UndefLower is set, into the lower half otherwise.
	static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
	                                     ArrayRef<int> HalfMask, int HalfIdx1,
	                                     int HalfIdx2, bool UndefLower,
	                                     SelectionDAG &DAG) {
	  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
	  assert(V1.getValueType().isSimple() && "Expecting only simple types");

	  const MVT FullVT = V1.getSimpleValueType();
	  const unsigned FullElts = FullVT.getVectorNumElements();
	  const unsigned HalfElts = FullElts / 2;
	  const MVT NarrowVT = MVT::getVectorVT(FullVT.getVectorElementType(), HalfElts);

	  // Indices 0/1 address the lower/upper half of V1, 2/3 those of V2.
	  auto ExtractHalf = [&](int Which) -> SDValue {
	    if (Which < 0)
	      return DAG.getUNDEF(NarrowVT);
	    SDValue Src = Which < 2 ? V1 : V2;
	    const unsigned FirstElt = (Which % 2) * HalfElts;
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, Src,
	                       DAG.getIntPtrConstant(FirstElt, DL));
	  };

	  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
	  SDValue FirstHalf = ExtractHalf(HalfIdx1);
	  SDValue SecondHalf = ExtractHalf(HalfIdx2);
	  SDValue Narrow =
	      DAG.getVectorShuffle(NarrowVT, DL, FirstHalf, SecondHalf, HalfMask);

	  unsigned InsertAt = 0;
	  if (UndefLower)
	    InsertAt = HalfElts;
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FullVT, DAG.getUNDEF(FullVT),
	                     Narrow, DAG.getIntPtrConstant(InsertAt, DL));
	}

	/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
	///
	/// Whole-subvector moves are emitted directly as extract + insert. Otherwise
	/// getHalfShuffleMask() is asked for a half-width mask and the shuffle is split
	/// via getShuffleHalfVectors() only when that is expected to beat a full-width
	/// lowering on this subtarget. An empty SDValue is returned whenever the caller
	/// should try something else.
	static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Mask,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  assert((VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected 256-bit or 512-bit vector");

	  const bool UndefLower = isUndefLowerHalf(Mask);
	  if (!UndefLower && !isUndefUpperHalf(Mask))
	    return SDValue();

	  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
	         "Completely undef shuffle mask should have been simplified already");

	  const unsigned HalfElts = VT.getVectorNumElements() / 2;
	  const MVT NarrowVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);

	  if (UndefLower) {
	    // Lower half is undef and upper half is whole lower subvector.
	    // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
	    if (isSequentialOrUndefInRange(Mask, HalfElts, HalfElts, 0)) {
	      SDValue LoSub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, V1,
	                                  DAG.getIntPtrConstant(0, DL));
	      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
	                         LoSub, DAG.getIntPtrConstant(HalfElts, DL));
	    }
	  } else {
	    // Upper half is undef and lower half is whole upper subvector.
	    // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
	    if (isSequentialOrUndefInRange(Mask, 0, HalfElts, HalfElts)) {
	      SDValue HiSub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, V1,
	                                  DAG.getIntPtrConstant(HalfElts, DL));
	      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
	                         HiSub, DAG.getIntPtrConstant(0, DL));
	    }
	  }

	  int HalfIdx1, HalfIdx2;
	  SmallVector<int, 8> HalfMask(HalfElts);
	  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
	    return SDValue();

	  assert(HalfMask.size() == HalfElts && "Unexpected shuffle mask length");

	  // Only shuffle the halves of the inputs when useful. Even half indices name
	  // a lower half, odd ones an upper half.
	  auto IsLowerHalf = [](int Idx) { return Idx == 0 || Idx == 2; };
	  auto IsUpperHalf = [](int Idx) { return Idx == 1 || Idx == 3; };
	  unsigned NumLowerHalves = IsLowerHalf(HalfIdx1) + IsLowerHalf(HalfIdx2);
	  unsigned NumUpperHalves = IsUpperHalf(HalfIdx1) + IsUpperHalf(HalfIdx2);
	  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

	  // Emit the split (half-width) form of the shuffle.
	  auto SplitIntoHalves = [&]() {
	    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
	                                 UndefLower, DAG);
	  };

	  // Determine the larger pattern of undef/halves, then decide if it's worth
	  // splitting the shuffle based on subtarget capabilities and types.
	  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();

	  if (UndefLower) {
	    // uuuuXXXX: an insert to high half is required if we split this.
	    // NumUpperHalves != 0: don't bother with extract, shuffle, and then
	    // insert.
	    if (NumUpperHalves != 0)
	      return SDValue();

	    // AVX2 has efficient 64-bit element cross-lane shuffles.
	    // TODO: Refine to account for unary shuffle, splat, and other masks?
	    if (Subtarget.hasAVX2() && EltWidth == 64)
	      return SDValue();
	    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
	    if (Subtarget.hasAVX512() && VT.is512BitVector())
	      return SDValue();
	    // Narrow shuffle + insert is better than the wide alternative.
	    return SplitIntoHalves();
	  }

	  // XXXXuuuu: no insert is needed.
	  switch (NumUpperHalves) {
	  case 0:
	    // Always extract lowers when setting lower - these are all free subreg
	    // ops.
	    return SplitIntoHalves();

	  case 1:
	    // AVX2 has efficient 32/64-bit element cross-lane shuffles.
	    if (Subtarget.hasAVX2()) {
	      // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
	      if (EltWidth == 32 && NumLowerHalves && NarrowVT.is128BitVector() &&
	          !is128BitUnpackShuffleMask(HalfMask) &&
	          (!isSingleSHUFPSMask(HalfMask) ||
	           Subtarget.hasFastVariableShuffle()))
	        return SDValue();
	      // If this is a unary shuffle (assume that the 2nd operand is
	      // canonicalized to undef), then we can use vpermpd. Otherwise, we
	      // are better off extracting the upper half of 1 operand and using a
	      // narrow shuffle.
	      if (EltWidth == 64 && V2.isUndef())
	        return SDValue();
	    }
	    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
	    if (Subtarget.hasAVX512() && VT.is512BitVector())
	      return SDValue();
	    // Extract + narrow shuffle is better than the wide alternative.
	    return SplitIntoHalves();

	  default:
	    // Don't extract both uppers, instead shuffle and then extract.
	    assert(NumUpperHalves == 2 && "Half vector count went wrong");
	    return SDValue();
	  }
	}

	/// Report whether every element the mask takes from the given input (0 or 1)
	/// already sits in the slot it is needed in.
	///
	/// Negative (sentinel) mask entries and entries that reference the other input
	/// are ignored. Returns false as soon as one element of \p Input would have to
	/// move to a different position.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
	  const int NumElts = Mask.size();
	  for (int Slot = 0; Slot < NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    // Skip sentinel entries.
	    if (M < 0)
	      continue;
	    // Skip elements sourced from the other operand.
	    if (M / NumElts != Input)
	      continue;
	    // The element comes from Input; it must already be in this slot.
	    if (M % NumElts != Slot)
	      return false;
	  }

	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// The result is always a pair of generic vector shuffles: a first shuffle of
	/// V1/V2 that puts elements in place inside their source (sub-)lanes, then a
	/// single-input shuffle that moves whole (sub-)lanes (or broadcasts the lowest
	/// elements). Returns an empty SDValue when the mask does not fit the pattern.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	// Vector geometry expressed in 128-bit lanes.
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	for (unsigned BroadcastSize : {16, 32, 64}) {
	// A broadcast unit must span more than one scalar element.
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject elements that are not in lane 0 of their input.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a second, different source for the same repeated slot.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast.
	// Every group of NumBroadcastElts result elements reads elements
	// 0..NumBroadcastElts-1 of RepeatShuf.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	// RepeatedSubLaneMasks holds one candidate mask per sub-lane position within
	// a lane; only the first SubLaneScale entries are consulted below.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Keep the offset within the lane; indices >= NumElts (second shuffle
	// operand) keep their NumElts bias.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Two masks are compatible when every position defined in both agrees.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector.
	// Sub-lanes above TopSrcSubLane are left undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	// Rebase the lane-local index onto the lane this sub-lane belongs to.
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	// Destination sub-lanes with no source stay undef.
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Check whether \p Mask has the shape of a (V)SHUFPD on 64-bit elements and,
	/// if so, compute its immediate.
	///
	/// Result slot i must read one of a fixed pair of adjacent source elements:
	///   Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
	///   Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
	/// The same pattern with the two operands exchanged is accepted as well, in
	/// which case \p V1 and \p V2 are swapped before returning true. Bit i of
	/// \p ShuffleImm is the low bit of mask element i; undef elements contribute
	/// zero. \p ShuffleImm is reset on entry and is only meaningful when the
	/// function returns true.
	static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	                                   unsigned &ShuffleImm, ArrayRef<int> Mask) {
	  const int EltCount = VT.getVectorNumElements();
	  assert(VT.getScalarSizeInBits() == 64 &&
	         (EltCount == 2 || EltCount == 4 || EltCount == 8) &&
	         "Unexpected data type for VSHUFPD");

	  ShuffleImm = 0;
	  bool MatchesDirect = true;
	  bool MatchesSwapped = true;
	  for (int Slot = 0; Slot < EltCount; ++Slot) {
	    const int M = Mask[Slot];
	    if (M == SM_SentinelUndef)
	      continue;
	    // Any other sentinel cannot be expressed.
	    if (M < 0)
	      return false;

	    // First element of the pair this slot may read, for the operands as
	    // given and for the operands exchanged.
	    const int DirectBase = (Slot & 6) + EltCount * (Slot & 1);
	    const int SwappedBase = (Slot & 0xe) + EltCount * ((Slot & 1) ^ 1);
	    auto InPair = [M](int Base) { return M >= Base && M <= Base + 1; };

	    if (!InPair(DirectBase))
	      MatchesDirect = false;
	    if (!InPair(SwappedBase))
	      MatchesSwapped = false;
	    ShuffleImm |= (M % 2) << Slot;
	  }

	  if (MatchesDirect)
	    return true;
	  if (!MatchesSwapped)
	    return false;

	  std::swap(V1, V2);
	  return true;
	}

	/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when
	/// matchShuffleWithSHUFPD() accepts the mask (possibly after exchanging the
	/// operands). Returns an empty SDValue otherwise.
	static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
	                                      ArrayRef<int> Mask, SDValue V1,
	                                      SDValue V2, SelectionDAG &DAG) {
	  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
	         "Unexpected data type for VSHUFPD");

	  // V1/V2 are local copies here, so the matcher is free to swap them.
	  unsigned Imm8 = 0;
	  const bool Matched = matchShuffleWithSHUFPD(VT, V1, V2, Imm8, Mask);
	  if (Matched)
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	                       DAG.getConstant(Imm8, DL, MVT::i8));

	  return SDValue();
	}

	/// Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The strategies below are attempted in order and the first one that yields
	/// a node is returned, so their relative order is significant.
	static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
	                                     Subtarget, DAG))
	    return V;

	  if (V2.isUndef()) {
	    // Check for being able to broadcast a single element.
	    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
	                                                    Mask, Subtarget, DAG))
	      return Broadcast;

	    // Use low duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	      // Non-half-crossing single input shuffles can be lowered with an
	      // interleaved permutation. One immediate bit per result element, set
	      // when that element reads the odd element (1 or 3) of its own lane.
	      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
	                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	    }

	    // With AVX2 we have direct support for this permutation.
	    if (Subtarget.hasAVX2())
	      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	    // Try to create an in-lane repeating shuffle mask and then shuffle the
	    // results into the target lanes.
	    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	      return V;

	    // Try to permute the lanes and then use a per-lane permute.
	    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
	                                                        Mask, DAG, Subtarget))
	      return V;

	    // Otherwise, fall back.
	    return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
	                                             Subtarget);
	  }

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	    return V;

	  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	                                          Zeroable, Subtarget, DAG))
	    return Blend;

	  // Check if the blend happens to exactly fit that of SHUFPD.
	  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
	    return Op;

	  // If we have one input in place, then we can permute the other input and
	  // blend the result.
	  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
	    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Try to create an in-lane repeating shuffle mask and then shuffle the
	  // results into the target lanes.
	  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	    return V;

	  // Try to simplify this by merging 128-bit lanes to enable a lane-based
	  // shuffle. Masks with an input already in place never reach this point
	  // (they returned through the decomposed blend above), so no in-place check
	  // is needed before trying this pattern.
	  if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	    return V;

	  // If we have VLX support, we can use VEXPAND.
	  if (Subtarget.hasVLX())
	    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
	                                         DAG, Subtarget))
	      return V;

	  // If we have AVX2 then we always want to lower with a blend because at v4 we
	  // can fully permute the elements.
	  if (Subtarget.hasAVX2())
	    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Otherwise fall back on generic lowering.
	  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
	                                    Subtarget, DAG);
	}

	/// Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	///
	/// The strategies below are attempted in order and the first one that yields
	/// a node is returned, so their relative order is significant.
	static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
	                                     Subtarget, DAG))
	    return V;

	  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	                                          Zeroable, Subtarget, DAG))
	    return Blend;

	  // Check for being able to broadcast a single element.
	  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
	                                                  Subtarget, DAG))
	    return Broadcast;

	  if (V2.isUndef()) {
	    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
	    // can use lower latency instructions that will operate on both lanes.
	    SmallVector<int, 2> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	      // Widen the 2-element i64 lane mask to a 4-element i32 mask so it can
	      // be encoded as a PSHUFD immediate on the v8i32 view of V1.
	      SmallVector<int, 4> PSHUFDMask;
	      scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
	      return DAG.getBitcast(
	          MVT::v4i64,
	          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	                      DAG.getBitcast(MVT::v8i32, V1),
	                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	    }

	    // AVX2 provides a direct instruction for permuting a single input across
	    // lanes.
	    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	  }

	  // Try to use shift instructions.
	  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	                                          Zeroable, Subtarget, DAG))
	    return Shift;

	  // If we have VLX support, we can use VALIGN or VEXPAND.
	  if (Subtarget.hasVLX()) {
	    if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
	                                              Subtarget, DAG))
	      return Rotate;

	    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
	                                         DAG, Subtarget))
	      return V;
	  }

	  // Try to use PALIGNR.
	  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
	                                                Subtarget, DAG))
	    return Rotate;

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	    return V;

	  // If we have one input in place, then we can permute the other input and
	  // blend the result.
	  if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
	    return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
	                                                Subtarget, DAG);

	  // Try to create an in-lane repeating shuffle mask and then shuffle the
	  // results into the target lanes.
	  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	    return V;

	  // Try to simplify this by merging 128-bit lanes to enable a lane-based
	  // shuffle. Masks with an input already in place never reach this point
	  // (they returned through the decomposed blend above), so no in-place check
	  // is needed before trying this pattern.
	  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	    return Result;

	  // Otherwise fall back on generic blend lowering.
	  return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
	                                              Subtarget, DAG);
	}

	/// Handle lowering of 8-lane 32-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The strategies below are attempted in order and the first one that yields
	/// a node is returned, so their relative order is significant.
	static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
	Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 &&
	"Repeated masks must be half the mask width!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

	// A single-input lane-repeated shuffle is emitted as a VPERMILPI node whose
	// immediate encodes the 4-element repeated mask.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
	// have already handled any direct blends.
	return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have a single input shuffle with different shuffle patterns in the
	// two 128-bit lanes use the variable mask to VPERMILPS.
	if (V2.isUndef()) {
	// The mask is materialized as a v8i32 constant vector operand. It is built
	// up front even though the final fallback below does not use it.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

	// The mask crosses lanes: with AVX2 emit a VPERMV node (note the mask is
	// the first operand here).
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

	// Otherwise, fall back.
	return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
	DAG, Subtarget);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code using vpunpcklwd and
	// vpunpckhwd instrs than vblend.
	if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
	if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
	Subtarget, DAG))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v8 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
	Subtarget, DAG);

	// Otherwise fall back on generic lowering.
	return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Handle lowering of 8-lane 32-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v8i32 shuffling.
	///
	/// The strategies below are attempted in order and the first one that yields
	/// a node is returned, so their relative order is significant.
	static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return ZExt;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code than vblend by using
	// vpunpcklwd and vpunpckhwd instrs.
	if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
	!Subtarget.hasAVX512())
	if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
	Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
	Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the two 128-bit
	// lanes.
	// The result is remembered because the SHUFPS fallback further down reuses
	// both the flag and RepeatedMask.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	// A single-input lane-repeated shuffle is emitted as a PSHUFD node whose
	// immediate encodes the 4-element repeated mask.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or EXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
	DAG, Subtarget))
	return V;
	}

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If the shuffle patterns aren't repeated but it is a single input, directly
	// generate a cross-lane VPERMD instruction.
	if (V2.isUndef()) {
	// The mask is materialized as a v8i32 constant vector and passed as the
	// first operand of the VPERMV node.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
	}

	// Assume that a single SHUFPS is faster than an alternative sequence of
	// multiple instructions (even if the CPU has a domain penalty).
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	// Shuffle on the v8f32 view of both inputs, then cast the result back.
	SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
	SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v8i32, ShufPS);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Handle lowering of 16-lane 16-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v16i16 shuffling.
	///
	/// The strategies below are attempted in order and the first one that yields
	/// a node is returned, so their relative order is significant.
	static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
	Subtarget, DAG))
	return Broadcast;

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return V;

	if (V2.isUndef()) {
	// There are no generalized cross-lane shuffle operations available on i16
	// element types.
	if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
	if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
	DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
	return V;

	return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
	DAG, Subtarget);
	}

	// The mask stays within lanes from here on. If it is not lane-repeated,
	// control falls through to the two-input strategies below.
	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	// As this is a single-input shuffle, the repeated mask should be
	// a strictly valid v8i16 mask that we can pass through to the v8i16
	// lowering to handle even the v16 case.
	return lowerV8I16GeneralSingleInputShuffle(
	DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
	}
	}

	// Try a PSHUFB-based lowering.
	if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
	Zeroable, Subtarget, DAG))
	return PSHUFB;

	// AVX512BWVL can lower to VPERMW.
	// This is an unconditional return: nothing below runs when both features
	// are available.
	if (Subtarget.hasBWI() && Subtarget.hasVLX())
	return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Try to permute the lanes and then use a per-lane permute.
	if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
	DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
	return V;

	// Otherwise fall back on generic lowering.
	return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Handle lowering of 32-lane 8-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v32i8 shuffling.
	static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable, SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return ZExt;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
	Subtarget, DAG))
	return Broadcast;

	if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
	Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
	return V;

	// There are no generalized cross-lane shuffle operations available on i8
	// element types.
	if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
	if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
	DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
	return V;

	return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
	Subtarget);
	}

	if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
	Zeroable, Subtarget, DAG))
	return PSHUFB;

	// AVX512VBMIVL can lower to VPERMB.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
	DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Try to permute the lanes and then use a per-lane permute.
	if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
	DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
	return V;

	// Otherwise fall back on generic lowering.
	return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
	Subtarget, DAG);
	}

	/// Entry point for lowering 256-bit x86 vector shuffles.
	///
	/// A few type-independent special cases are tried first (element insertion,
	/// an undef half). Integer types without AVX2 are then either split into
	/// 128-bit pieces or re-issued in the floating point domain. Everything else
	/// is dispatched to the per-type lowering routine selected by \p VT.
	static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
	                                  SDValue V1, SDValue V2, const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const int NumElts = VT.getVectorNumElements();

	  // When exactly one mask entry reads V2 and it is the entry for element
	  // zero, try to express the shuffle as a cheap insertion into V1.
	  const int NumV2Elements =
	      count_if(Mask, [NumElts](int M) { return M >= NumElts; });
	  if (NumV2Elements == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases where the lower or upper half of the result is UNDEF.
	  if (SDValue Half =
	          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Half;

	  // AVX1 versus AVX2 is a hard cut-over, so it is checked once here rather
	  // than in every per-type routine. AVX1 has essentially no way to
	  // manipulate a 256-bit vector of integers, so such shuffles either move to
	  // the floating point domain or get split.
	  if (VT.isInteger() && !Subtarget.hasAVX2()) {
	    const int ElementBits = VT.getScalarSizeInBits();

	    if (ElementBits >= 32) {
	      // A floating point type of the same width exists: bitcast the
	      // operands, shuffle there, and bitcast the result back.
	      MVT FpVT =
	          MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), NumElts);
	      V1 = DAG.getBitcast(FpVT, V1);
	      V2 = DAG.getBitcast(FpVT, V2);
	      return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
	    }

	    // No floating point type available. Use bit operations for
	    // masking/blending when they apply, otherwise decompose into 128-bit
	    // vectors.
	    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                               Subtarget, DAG))
	      return Masked;
	    if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
	      return BitBlend;
	    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// Try to lower a 512-bit shuffle of 64-bit elements as a shuffle of whole
	/// 128-bit lanes.
	///
	/// \p Mask is first widened to four 128-bit lane selectors. The routine
	/// then tries, in order: an insert of the low part of V1 into a zero vector,
	/// an insert of a 256-bit subvector into the upper half of V1, an insert of
	/// the lowest 128 bits of V2 into V1, and finally an X86ISD::SHUF128 node.
	///
	/// \returns the lowered node, or an empty SDValue when the mask cannot be
	/// widened or when the two lanes of one result half would have to come from
	/// different inputs.
	static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  assert(VT.getScalarSizeInBits() == 64 &&
	         "Unexpected element type size for 128bit shuffle.");

	  // To handle 256 bit vector requires VLX and most probably
	  // function lowerV2X128VectorShuffle() is better solution.
	  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
	  SmallVector<int, 4> WidenedMask;
	  if (!canWidenShuffleElements(Mask, WidenedMask))
	    return SDValue();

	  // Everything below indexes WidenedMask[0..3], so validate its size before
	  // the first access rather than after.
	  assert(WidenedMask.size() == 4 && "Shuffle widening mismatch");

	  // Try to use an insert into a zero vector: lane 0 stays in place, the
	  // upper four elements are zeroable, and either lane 1 stays in place or
	  // elements 2-3 are zeroable as well.
	  const bool Elts2And3Zeroable = (Zeroable & 0x0c) == 0x0c;
	  if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
	      (WidenedMask[1] == 1 || Elts2And3Zeroable)) {
	    unsigned NumElts = Elts2And3Zeroable ? 2 : 4;
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
	    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	                              DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // Check for patterns which can be matched with a single insert of a 256-bit
	  // subvector: the low half of V1 stays in place and the high half is the
	  // low half of either V1 or V2.
	  bool OnlyUsesV1 =
	      isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
	  if (OnlyUsesV1 ||
	      isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	    SDValue SubVec =
	        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
	                    DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
	                       DAG.getIntPtrConstant(4, DL));
	  }

	  // See if this is an insertion of the lower 128-bits of V2 into V1.
	  bool IsInsert = true;
	  int V2Index = -1;
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    // Make sure all V1 subvectors are in place.
	    if (WidenedMask[i] < 4) {
	      if (WidenedMask[i] != i) {
	        IsInsert = false;
	        break;
	      }
	    } else {
	      // Make sure we only have a single V2 index and it's the lowest
	      // 128-bits.
	      if (V2Index >= 0 || WidenedMask[i] != 4) {
	        IsInsert = false;
	        break;
	      }
	      V2Index = i;
	    }
	  }
	  if (IsInsert && V2Index >= 0) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	                                 DAG.getIntPtrConstant(0, DL));
	    // Each 128-bit lane holds two 64-bit elements, hence the index scaling.
	    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	  }

	  // Try to lower to vshuf64x2/vshuf32x4.
	  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	  unsigned PermMask = 0;
	  // Ensure both lanes of each result half come from the same Op.
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
	    unsigned OpIndex = i / 2;
	    if (Ops[OpIndex].isUndef())
	      Ops[OpIndex] = Op;
	    else if (Ops[OpIndex] != Op)
	      return SDValue();

	    // Convert the 128-bit shuffle mask selection values into 128-bit selection
	    // bits defined by a vshuf64x2 instruction's immediate control byte.
	    PermMask |= (WidenedMask[i] % 4) << (i * 2);
	  }

	  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	                     DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Unary shuffles (V2 undef) are tried first as MOVDDUP, as an in-lane
	/// VPERMILPI, or as a VPERMI when the mask repeats per 256-bit lane. After
	/// that, the general strategies are tried in order, ending with a PERMV
	/// lowering that is returned unconditionally.
	static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // Use low duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	      // Non-half-crossing single input shuffles can be lowered with an
	      // interleaved permutation. Bit i of the immediate is set when result
	      // element i takes the odd (upper) element of its own 128-bit lane,
	      // i.e. when Mask[i] == (i | 1).
	      unsigned VPERMILPMask = 0;
	      for (int i = 0; i != 8; ++i)
	        if (Mask[i] == (i | 1))
	          VPERMILPMask |= 1u << i;
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	    }

	    SmallVector<int, 4> RepeatedMask;
	    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	  }

	  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
	                                           V2, Subtarget, DAG))
	    return Shuf128;

	  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Unpck;

	  // Check if the blend happens to exactly fit that of SHUFPD.
	  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Op;

	  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
	                                       DAG, Subtarget))
	    return V;

	  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	                                          Zeroable, Subtarget, DAG))
	    return Blend;

	  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// Lower a shuffle of two v16f32 values (16 lanes of 32-bit floats).
	///
	/// Masks that repeat in every 128-bit lane get the cheap per-lane forms;
	/// otherwise the routine tries a variable in-lane permute and an expand
	/// before returning a PERMV lowering unconditionally.
	static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const MVT VT = MVT::v16f32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  // A mask that repeats in each 128-bit lane opens up many more efficient
	  // lowerings.
	  SmallVector<int, 4> LaneMask;
	  if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask)) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");

	    // Even/odd duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, LaneMask, {0, 0, 2, 2}))
	      return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	    if (isShuffleEquivalent(V1, V2, LaneMask, {1, 1, 3, 3}))
	      return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);

	    // Unary case: one immediate-controlled in-lane permute.
	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

	    // Dedicated unpack instructions for masks that match their pattern.
	    if (SDValue Unpck = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpck;

	    if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                              Subtarget, DAG))
	      return Blended;

	    // Otherwise, fall back to a SHUFPS sequence.
	    return lowerShuffleWithSHUFPS(DL, VT, LaneMask, V1, V2, DAG);
	  }

	  // A single-input shuffle whose pattern differs per 128-bit lane but never
	  // crosses lanes can use the variable-mask VPERMILPS.
	  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(VT, Mask)) {
	    SDValue PermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
	    return DAG.getNode(X86ISD::VPERMILPV, DL, VT, V1, PermMask);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
	                                              DAG, Subtarget))
	    return Expanded;

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a shuffle of two v8i64 values (8 lanes of 64-bit integers).
	///
	/// Unary shuffles that repeat per 128-bit or per 256-bit lane are handled
	/// up front; the remaining strategies are tried in order, ending with a
	/// PERMV lowering that is returned unconditionally.
	static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  const MVT VT = MVT::v8i64;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // A shuffle mirrored across the 128-bit lanes can use lower latency
	    // instructions that operate on all four lanes at once: rescale the
	    // two-entry i64 lane mask to four i32 entries and emit a PSHUFD on the
	    // v16i32 view of V1.
	    SmallVector<int, 2> Lane128Mask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, Lane128Mask)) {
	      SmallVector<int, 4> PSHUFDMask;
	      scaleShuffleMask<int>(2, Lane128Mask, PSHUFDMask);
	      SDValue V1AsI32 = DAG.getBitcast(MVT::v16i32, V1);
	      SDValue Imm = getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG);
	      SDValue Shuffled =
	          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1AsI32, Imm);
	      return DAG.getBitcast(VT, Shuffled);
	    }

	    // Otherwise a mask repeated per 256-bit lane maps onto VPERMI.
	    SmallVector<int, 4> Lane256Mask;
	    if (is256BitLaneRepeatedShuffleMask(VT, Mask, Lane256Mask))
	      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(Lane256Mask, DL, DAG));
	  }

	  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, VT, Mask, Zeroable, V1, V2,
	                                           Subtarget, DAG))
	    return Shuf128;

	  // Shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned =
	          lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Aligned;

	  // PALIGNR.
	  if (SDValue ByteRotated =
	          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return ByteRotated;

	  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpck;

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
	                                              DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Blended;

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a shuffle of two v16i32 values (16 lanes of 32-bit integers).
	///
	/// Strategies are tried in the order written; the final PERMV lowering is
	/// returned unconditionally.
	static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const MVT VT = MVT::v16i32;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  // A zero/any-extend is preferred over every alternative and frequently
	  // lets a memory operand fold into the shuffle, so it goes first.
	  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Ext;

	  // A mask repeated in each 128-bit lane allows more efficient instructions
	  // that mirror the shuffle across all four lanes.
	  SmallVector<int, 4> LaneMask;
	  const bool RepeatsPerLane =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask);
	  if (RepeatsPerLane) {
	    assert(LaneMask.size() == 4 && "Unexpected repeated mask size!");
	    if (V2.isUndef())
	      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask, DL, DAG));

	    // Dedicated unpack instructions for masks that match their pattern.
	    if (SDValue Unpck = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	      return Unpck;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Shifted;

	  // VALIGN.
	  if (SDValue Aligned =
	          lowerShuffleAsRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Aligned;

	  // Byte rotation instructions, attempted only when BWI is available.
	  if (Subtarget.hasBWI()) {
	    if (SDValue ByteRotated =
	            lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return ByteRotated;
	  }

	  // Assume that a single SHUFPS is faster than using a permv shuffle.
	  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
	  if (RepeatsPerLane && isSingleSHUFPSMask(LaneMask)) {
	    SDValue FpV1 = DAG.getBitcast(MVT::v16f32, V1);
	    SDValue FpV2 = DAG.getBitcast(MVT::v16f32, V2);
	    SDValue FpShuffle =
	        lowerShuffleWithSHUFPS(DL, MVT::v16f32, LaneMask, FpV1, FpV2, DAG);
	    return DAG.getBitcast(VT, FpShuffle);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerShuffleToEXPAND(DL, VT, Zeroable, Mask, V1, V2,
	                                              DAG, Subtarget))
	    return Expanded;

	  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Blended;

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a shuffle of two v32i16 values (32 lanes of 16-bit integers).
	///
	/// AVX-512 BWI must be available; this is asserted on entry. Strategies are
	/// tried in the order written and the final PERMV lowering is returned
	/// unconditionally.
	static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  const APInt &Zeroable, SDValue V1, SDValue V2,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  const MVT VT = MVT::v32i16;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	  // A zero/any-extend is preferred over every alternative and frequently
	  // lets a memory operand fold into the shuffle, so it goes first.
	  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Ext;

	  // Dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpck;

	  // Shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue Rotated =
	          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  // For a single-input shuffle that repeats in every 128-bit lane, the
	  // repeated mask should be a strictly valid v8i16 mask, so the v8i16
	  // single-input lowering can handle even the v32 case.
	  SmallVector<int, 8> LaneMask;
	  if (V2.isUndef() && is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return lowerV8I16GeneralSingleInputShuffle(DL, VT, V1, LaneMask, Subtarget,
	                                               DAG);

	  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Blended;

	  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(DL, VT, Mask, V1, V2,
	                                                   Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// Lower a shuffle of two v64i8 values (64 lanes of 8-bit integers).
	///
	/// AVX-512 BWI must be available; this is asserted on entry. Strategies are
	/// tried in the order written. If none applies the shuffle is split and
	/// each part is lowered separately.
	static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                 const APInt &Zeroable, SDValue V1, SDValue V2,
	                                 const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  const MVT VT = MVT::v64i8;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	  // A zero/any-extend is preferred over every alternative and frequently
	  // lets a memory operand fold into the shuffle, so it goes first.
	  if (SDValue Ext = lowerShuffleAsZeroOrAnyExtend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	    return Ext;

	  // Dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG))
	    return Unpck;

	  // Dedicated pack instructions for masks that match their pattern.
	  if (SDValue Pack =
	          lowerShuffleWithPACK(DL, VT, Mask, V1, V2, DAG, Subtarget))
	    return Pack;

	  // Shift instructions.
	  if (SDValue Shifted = lowerShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Shifted;

	  // Byte rotation instructions.
	  if (SDValue Rotated =
	          lowerShuffleAsByteRotate(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  if (SDValue ByteShuffle = lowerShuffleWithPSHUFB(DL, VT, Mask, V1, V2,
	                                                   Zeroable, Subtarget, DAG))
	    return ByteShuffle;

	  // VBMI can use VPERMV/VPERMV3 byte shuffles.
	  if (Subtarget.hasVBMI())
	    return lowerShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);

	  // Build an in-lane repeating shuffle and then move the results into the
	  // lanes they belong in.
	  if (SDValue Repeated = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Repeated;

	  if (SDValue Blended = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG))
	    return Blended;

	  // For two-input shuffles only: merge 128-bit lanes so that a lane-based
	  // shuffle becomes possible.
	  if (!V2.isUndef()) {
	    if (SDValue Merged = lowerShuffleAsLanePermuteAndRepeatedMask(
	            DL, VT, V1, V2, Mask, Subtarget, DAG))
	      return Merged;
	  }

	  // FIXME: Implement direct support for this type!
	  return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
	}

	/// Entry point for lowering 512-bit x86 vector shuffles.
	///
	/// AVX-512 must be available; this is asserted on entry. A few
	/// type-independent special cases are tried first (element insertion, an
	/// undef half, a broadcast), then the shuffle is dispatched to the per-type
	/// lowering routine selected by \p VT.
	static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                  MVT VT, SDValue V1, SDValue V2,
	                                  const APInt &Zeroable,
	                                  const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/ basic ISA!");

	  // When exactly one mask entry reads V2 and it is the entry for element
	  // zero, try to express the shuffle as a cheap insertion into V1.
	  const int NumElts = Mask.size();
	  const int NumV2Elements =
	      count_if(Mask, [NumElts](int M) { return M >= NumElts; });
	  if (NumV2Elements == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Special cases where the lower or upper half of the result is UNDEF.
	  if (SDValue Half =
	          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Half;

	  // Single-element broadcast.
	  if (SDValue Splat =
	          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	    return Splat;

	  // Dispatch on the element type. Each per-type routine may assume that the
	  // ISA extensions it requires are available.
	  switch (VT.SimpleTy) {
	  case MVT::v8f64:
	    return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16f32:
	    return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i64:
	    return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i32:
	    return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i16:
	    return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v64i8:
	    return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 512-bit x86 vector type!");
	  }
	}

	// Check whether a vXi1 shuffle can be done with a single KSHIFT of one
	// input. MaskOffset is the mask index at which that input's elements start.
	// On success Opcode is set to X86ISD::KSHIFTL or X86ISD::KSHIFTR and the
	// shift amount is returned; otherwise -1 is returned and Opcode is left
	// untouched. This is a simplified version of matchShuffleAsShift.
	static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
	                                    int MaskOffset, const APInt &Zeroable) {
	  const int Size = Mask.size();

	  // A shift by Amt matches when the Amt elements it vacates are all zeroable
	  // and the remaining Size - Amt elements are sequential (or undef) reads of
	  // the input, displaced by Amt.
	  auto Matches = [&](int Amt, bool Left) {
	    // A left shift vacates the low elements, a right shift the high ones.
	    const int FirstVacated = Left ? 0 : (Size - Amt);
	    for (int Idx = 0; Idx < Amt; ++Idx)
	      if (!Zeroable[FirstVacated + Idx])
	        return false;

	    const unsigned Pos = Left ? Amt : 0;
	    const unsigned Len = Size - Amt;
	    const int Low = (Left ? 0 : Amt) + MaskOffset;
	    return isSequentialOrUndefInRange(Mask, Pos, Len, Low);
	  };

	  // Try every shift amount, preferring a left shift over a right shift.
	  for (int Amt = 1; Amt != Size; ++Amt) {
	    if (Matches(Amt, /*Left=*/true)) {
	      Opcode = X86ISD::KSHIFTL;
	      return Amt;
	    }
	    if (Matches(Amt, /*Left=*/false)) {
	      Opcode = X86ISD::KSHIFTR;
	      return Amt;
	    }
	  }

	  return -1;
	}


	// Lower vXi1 vector shuffles.
	// There is no dedicated AVX-512 instruction that shuffles mask registers.
	// Two cheap special cases are tried first: a subvector of V1 padded with
	// zeros, and a single KSHIFT of either input. Otherwise the mask vectors
	// are sign-extended to a SIMD vector type, shuffled there, and converted
	// back to a mask.
	static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                MVT VT, SDValue V1, SDValue V2,
	                                const APInt &Zeroable,
	                                const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  // The previous message talked about 512-bit vectors, which is not what
	  // this routine lowers.
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower vXi1 shuffles w/o AVX-512!");

	  unsigned NumElts = Mask.size();

	  // Try to recognize shuffles that are just padding a subvector with zeros.
	  // Count the leading mask entries that are undef or leave the element of
	  // V1 in place.
	  unsigned SubvecElts = 0;
	  for (int i = 0; i != (int)NumElts; ++i) {
	    if (Mask[i] >= 0 && Mask[i] != i)
	      break;

	    ++SubvecElts;
	  }
	  assert(SubvecElts != NumElts && "Identity shuffle?");

	  // Clip to a power of 2.
	  SubvecElts = PowerOf2Floor(SubvecElts);

	  // Make sure the number of zeroable bits in the top at least covers the bits
	  // not covered by the subvector.
	  if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
	    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
	                                  V1, DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                       getZeroVector(VT, Subtarget, DAG, DL),
	                       Extract, DAG.getIntPtrConstant(0, DL));
	  }

	  // Try to match KSHIFTs.
	  // TODO: Support narrower than legal shifts by widening and extracting.
	  if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
	    // Mask indices for V1 start at 0 and those for V2 start at NumElts.
	    unsigned Offset = 0;
	    for (SDValue V : { V1, V2 }) {
	      unsigned Opcode;
	      int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
	      if (ShiftAmt >= 0)
	        return DAG.getNode(Opcode, DL, VT, V,
	                           DAG.getConstant(ShiftAmt, DL, MVT::i8));
	      Offset += NumElts; // Increment for next iteration.
	    }
	  }

	  // Pick the SIMD vector type in which the shuffle is performed.
	  MVT ExtVT;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  case MVT::v2i1:
	    ExtVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    ExtVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
	    // shuffle.
	    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
	    break;
	  case MVT::v16i1:
	    // Take 512-bit type, unless we are avoiding 512-bit types and have the
	    // 256-bit operation available.
	    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
	    break;
	  case MVT::v32i1:
	    // Take 512-bit type, unless we are avoiding 512-bit types and have the
	    // 256-bit operation available.
	    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
	    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
	    break;
	  case MVT::v64i1:
	    ExtVT = MVT::v64i8;
	    break;
	  }

	  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
	  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

	  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
	  // The i1 elements were sign extended, so the comparison 0 > x recovers
	  // each mask bit. The original note says X86ISD::CVT2MASK can be used for
	  // this.
	  int NumElems = VT.getVectorNumElements();
	  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
	      (Subtarget.hasDQI() && (NumElems < 32)))
	    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
	                        Shuffle, ISD::SETGT);

	  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
	}

	/// Decide whether the operands of a shuffle should be swapped to put the
	/// mask into canonical form.
	///
	/// \returns true when V2 supplies more defined elements than V1, or when
	/// the counts are equal and the tie-breakers below favour commuting.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  const int NumElements = Mask.size();

	  // Tally how many defined mask entries select from each input.
	  int FromV1 = 0, FromV2 = 0;
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    if (M < NumElements)
	      ++FromV1;
	    else
	      ++FromV2;
	  }

	  // Commute so that more elements come from V1 than from V2. That way the
	  // lowering can match on the number of V1 elements without handling the
	  // symmetric cases.
	  if (FromV2 > FromV1)
	    return true;

	  assert(FromV1 > 0 && "No V1 indices");

	  // Only an exact tie with a used V2 needs the tie-breakers.
	  if (FromV2 == 0 || FromV1 != FromV2)
	    return false;

	  // Tie-breaker 1: minimize the uses of V2 in the low half of the vector.
	  int LowFromV1 = 0, LowFromV2 = 0;
	  for (int M : Mask.slice(0, NumElements / 2)) {
	    if (M >= NumElements)
	      ++LowFromV2;
	    else if (M >= 0)
	      ++LowFromV1;
	  }
	  if (LowFromV2 != LowFromV1)
	    return LowFromV2 > LowFromV1;

	  // Tie-breaker 2: the sum of the result positions fed by V1 should not
	  // exceed the sum of those fed by V2. Tie-breaker 3 uses the count of odd
	  // result positions in the same way.
	  int PosSumV1 = 0, PosSumV2 = 0;
	  int OddPosV1 = 0, OddPosV2 = 0;
	  for (int Pos = 0, End = Mask.size(); Pos < End; ++Pos) {
	    if (Mask[Pos] >= NumElements) {
	      PosSumV2 += Pos;
	      OddPosV2 += Pos % 2;
	    } else if (Mask[Pos] >= 0) {
	      PosSumV1 += Pos;
	      OddPosV1 += Pos % 2;
	    }
	  }
	  if (PosSumV2 != PosSumV1)
	    return PosSumV2 < PosSumV1;

	  return OddPosV2 < OddPosV1;
	}

	/// Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// \p Op must be a shuffle node (it is cast to ShuffleVectorSDNode). Several
	/// early exits return a freshly built, more canonical shuffle (commuted,
	/// undef-cleaned, or widened) instead of a final lowering. Everything else is
	/// dispatched by vector width to lower128BitShuffle / lower256BitShuffle /
	/// lower512BitShuffle, or to lower1BitShuffle for vXi1 types.
	static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	  ArrayRef<int> Mask = SVOp->getMask();
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  MVT VT = Op.getSimpleValueType();
	  int NumElements = VT.getVectorNumElements();
	  SDLoc DL(Op);
	  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	  // 64-bit vectors are only accepted here when they are vXi1 mask vectors.
	  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
	         "Can't lower MMX shuffles");

	  bool V1IsUndef = V1.isUndef();
	  bool V2IsUndef = V2.isUndef();
	  if (V1IsUndef && V2IsUndef)
	    return DAG.getUNDEF(VT);

	  // When we create a shuffle node we put the UNDEF node to second operand,
	  // but in some cases the first operand may be transformed to UNDEF.
	  // In this case we should just commute the node.
	  if (V1IsUndef)
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // Check for non-undef masks pointing at an undef vector and make the masks
	  // undef as well. This makes it easier to match the shuffle based solely on
	  // the mask.
	  if (V2IsUndef &&
	      any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
	    SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
	    for (int &M : NewMask)
	      if (M >= NumElements)
	        M = -1;
	    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	  }

	  // Check for illegal shuffle mask element index values. The (void) cast
	  // keeps the variable "used" when the assert is compiled out.
	  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	         "Out of bounds shuffle index");

	  // We actually see shuffles that are entirely re-arrangements of a set of
	  // zero inputs. This mostly happens while decomposing complex shuffles into
	  // simple ones. Directly lower these as a buildvector of zeros.
	  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
	  if (Zeroable.isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

	  // Create an alternative mask with info about zeroable elements.
	  // Here we do not set undef elements as zeroable.
	  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
	  if (V2IsZero) {
	    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
	    for (int i = 0; i != NumElements; ++i)
	      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
	        ZeroableMask[i] = SM_SentinelZero;
	  }

	  // Try to collapse shuffles into using a vector type with fewer elements but
	  // wider element types. We cap this to not form integers or floating point
	  // elements wider than 64 bits, but it might be interesting to form i128
	  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
	  SmallVector<int, 16> WidenedMask;
	  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
	    // Shuffle mask widening should not interfere with a broadcast opportunity
	    // by obfuscating the operands with bitcasts.
	    // TODO: Avoid lowering directly from this top-level function: make this
	    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
	    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG))
	      return Broadcast;

	    MVT NewEltVT = VT.isFloatingPoint()
	                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	    int NewNumElts = NumElements / 2;
	    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
	    // Make sure that the new vector type is legal. For example, v2f64 isn't
	    // legal on SSE1.
	    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	      if (V2IsZero) {
	        // Modify the new Mask to take all zeros from the all-zero vector.
	        // Choose indices that are blend-friendly.
	        bool UsedZeroVector = false;
	        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
	               "V2's non-undef elements are used?!");
	        for (int i = 0; i != NewNumElts; ++i)
	          if (WidenedMask[i] == SM_SentinelZero) {
	            WidenedMask[i] = i + NewNumElts;
	            UsedZeroVector = true;
	          }
	        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
	        // some elements to be undef.
	        if (UsedZeroVector)
	          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
	      }
	      V1 = DAG.getBitcast(NewVT, V1);
	      V2 = DAG.getBitcast(NewVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	    }
	  }

	  // Commute the shuffle if it will improve canonicalization.
	  if (canonicalizeShuffleMaskWithCommute(Mask))
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
	    return V;

	  // For each vector width, delegate to a specialized lowering routine.
	  if (VT.is128BitVector())
	    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (VT.is256BitVector())
	    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (VT.is512BitVector())
	    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  if (Is1BitVector)
	    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// Try to turn a VSELECT into an equivalent generic vector shuffle.
	///
	/// Only non-legal VSELECTs reach this lowering. If a shuffle mask can be
	/// derived from the condition operand, the select is re-expressed as a
	/// shuffle of its two data operands so the regular shuffle lowering (blends)
	/// can handle it. Returns a null SDValue when no mask can be created.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  SDValue Condition = Op.getOperand(0);
	  SDValue TrueVal = Op.getOperand(1);
	  SDValue FalseVal = Op.getOperand(2);
	  MVT ResultVT = Op.getSimpleValueType();

	  SmallVector<int, 32> BlendMask;
	  if (!createShuffleMaskFromVSELECT(BlendMask, Condition))
	    return SDValue();

	  SDLoc DL(Op);
	  return DAG.getVectorShuffle(ResultVT, DL, TrueVal, FalseVal, BlendMask);
	}

	/// Custom lowering for VSELECT.
	///
	/// Returns \p Op when the node can be matched as-is, a replacement node when
	/// it is rewritten (shuffle, mask-based select, resized condition, or byte
	/// blend), and a null SDValue when the generic expansion should be used.
	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Cond = Op.getOperand(0);
	  SDValue TrueV = Op.getOperand(1);
	  SDValue FalseV = Op.getOperand(2);

	  // A vselect where all conditions and data are constants can be optimized
	  // into a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
	  auto IsConstantBuildVector = [](SDValue V) {
	    return ISD::isBuildVectorOfConstantSDNodes(V.getNode());
	  };
	  if (IsConstantBuildVector(Cond) && IsConstantBuildVector(TrueV) &&
	      IsConstantBuildVector(FalseV))
	    return SDValue();

	  // Try to lower this to a blend-style vector shuffle. This can handle all
	  // constant condition cases.
	  if (SDValue Blend = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
	    return Blend;

	  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
	  // with patterns on the mask registers on AVX-512.
	  MVT CondVT = Cond.getSimpleValueType();
	  const unsigned CondBits = Cond.getScalarValueSizeInBits();
	  if (CondBits == 1)
	    return Op;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned NumElts = VT.getVectorNumElements();

	  // If the VSELECT is on a 512-bit type, we have to convert a non-i1
	  // condition into an i1 condition so that we can use the mask-based 512-bit
	  // blend instructions.
	  if (VT.getSizeInBits() == 512) {
	    // Build a mask by testing the condition against zero.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	    SDValue Zero = DAG.getConstant(0, DL, CondVT);
	    SDValue Mask = DAG.getSetCC(DL, MaskVT, Cond, Zero, ISD::SETNE);
	    // Now return a new VSELECT using the mask.
	    return DAG.getSelect(DL, VT, Mask, TrueV, FalseV);
	  }

	  // SEXT/TRUNC cases where the mask doesn't match the destination size.
	  if (CondBits != EltBits) {
	    // If we don't have a sign splat, rely on the expansion.
	    if (DAG.ComputeNumSignBits(Cond) != CondBits)
	      return SDValue();

	    MVT ResizedCondVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElts);
	    SDValue ResizedCond = DAG.getSExtOrTrunc(Cond, DL, ResizedCondVT);
	    return DAG.getNode(ISD::VSELECT, DL, VT, ResizedCond, TrueV, FalseV);
	  }

	  // Only some types will be legal on some subtargets. If we can emit a legal
	  // VSELECT-matching blend, return Op, but if we need to expand, return a
	  // null value.

	  // The byte blends for AVX vectors were introduced only in AVX2.
	  if (VT == MVT::v32i8)
	    return Subtarget.hasAVX2() ? Op : SDValue();

	  if (VT == MVT::v8i16 || VT == MVT::v16i16) {
	    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
	    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
	    SDValue ByteCond = DAG.getBitcast(ByteVT, Cond);
	    SDValue ByteTrue = DAG.getBitcast(ByteVT, TrueV);
	    SDValue ByteFalse = DAG.getBitcast(ByteVT, FalseV);
	    SDValue ByteSelect =
	        DAG.getNode(ISD::VSELECT, DL, ByteVT, ByteCond, ByteTrue, ByteFalse);
	    return DAG.getBitcast(VT, ByteSelect);
	  }

	  // Most of the vector types have blends past SSE4.1.
	  return Op;
	}

	/// SSE4.1-specific lowering of EXTRACT_VECTOR_ELT from a 128-bit vector.
	///
	/// Handles 8-bit results (via PEXTRB), f32 results whose single user is a
	/// store or a bitcast to i32, and i32/i64 results with a constant index.
	/// Returns a null SDValue for anything else, including sources that are not
	/// 128 bits wide.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
	    return SDValue();

	  if (VT.getSizeInBits() == 8) {
	    // PEXTRB produces an i32; truncate it back to the requested type.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // Exactly one use exists (checked above), so the first use is the user.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE ||
	         isNullConstant(Op.getOperand(1))) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
	                                  Op.getOperand(1));
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // ExtractPS/pextrq works with constant index.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return Op;
	  }

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// A variable index is handled by sign-extending the mask to a wider integer
	/// vector, extracting there and truncating. A constant index of 0 is returned
	/// unchanged; any other constant index shifts the wanted bit down to element
	/// 0 with KSHIFTR and extracts element 0.
	static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue Vec = Op.getOperand(0);
	  SDLoc dl(Vec);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  MVT EltVT = Op.getSimpleValueType();
	  unsigned NumElts = VecVT.getVectorNumElements();

	  assert((NumElts <= 16 || Subtarget.hasBWI()) &&
	         "Unexpected vector type in ExtractBitFromMaskVector");

	  // variable index can't be handled in mask registers,
	  // extend vector to VR512/128
	  if (!isa<ConstantSDNode>(Idx)) {
	    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
	    // than extending to 128/256bit.
	    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  // Extend to natively supported kshift.
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Use kshiftr instruction to move to the lower element.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for EXTRACT_VECTOR_ELT.
	///
	/// vXi1 sources are forwarded to ExtractBitFromMaskVector. Variable indices
	/// return a null SDValue so the default (through-memory) expansion is used.
	/// For constant indices, 256/512-bit sources are first narrowed to the
	/// 128-bit chunk holding the element; 128-bit sources are then handled per
	/// element size. Returns \p Op when the node is already legal and a null
	/// SDValue when no custom lowering applies.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);

	  if (VecVT.getVectorElementType() == MVT::i1)
	    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

	  if (!isa<ConstantSDNode>(Idx)) {
	    // It's more profitable to go through memory (1 cycles throughput)
	    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
	    // IACA tool was used to get performance estimation
	    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	    //
	    // example : extractelement <16 x i8> %a, i32 %i
	    //
	    // Block Throughput: 3.00 Cycles
	    // Throughput Bottleneck: Port5
	    //
	    // | Num Of |   Ports pressure in cycles  |    |
	    // |  Uops  | 0 - DV |  5  |  6  |  7  |    |
	    // ---------------------------------------------
	    // |   1    |        | 1.0 |     |     | CP | vmovd xmm1, edi
	    // |   1    |        | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
	    // |   2    |  1.0   | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
	    // Total Num Of Uops: 4
	    //
	    //
	    // Block Throughput: 1.00 Cycles
	    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	    //
	    // |    |        Ports pressure in cycles        |  |
	    // |Uops| 1 | 2 - D  |3 - D   | 4 | 5 |  |
	    // ---------------------------------------------------------
	    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
	    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
	    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
	    // Total Num Of Uops: 4

	    return SDValue();
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // If the source is a 256-bit or 512-bit vector, first extract the 128-bit
	  // vector and then extract the element from the 128-bit vector.
	  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
	    // Get the 128-bit vector.
	    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	    MVT EltVT = VecVT.getVectorElementType();

	    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	    // this can be done with a mask.
	    IdxVal &= ElemsPerChunk - 1;
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                       DAG.getIntPtrConstant(IdxVal, dl));
	  }

	  assert(VecVT.is128BitVector() && "Unexpected vector length");

	  MVT VT = Op.getSimpleValueType();

	  if (VT.getSizeInBits() == 16) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	    // we're going to zero extend the register or fold the store (SSE41 only).
	    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    // Transform it so it matches pextrw which produces a 32-bit result.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (Subtarget.hasSSE41())
	    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	      return Res;

	  // TODO: We only extract a single element from v16i8, we can probably afford
	  // to be more aggressive here before using the default approach of spilling to
	  // stack.
	  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	    // Extract either the lowest i32 or any i16, and extract the sub-byte.
	    int DWordIdx = IdxVal / 4;
	    if (DWordIdx == 0) {
	      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                DAG.getBitcast(MVT::v4i32, Vec),
	                                DAG.getIntPtrConstant(DWordIdx, dl));
	      int ShiftVal = (IdxVal % 4) * 8;
	      if (ShiftVal != 0)
	        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	                          DAG.getConstant(ShiftVal, dl, MVT::i8));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	    }

	    int WordIdx = IdxVal / 2;
	    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	                              DAG.getBitcast(MVT::v8i16, Vec),
	                              DAG.getIntPtrConstant(WordIdx, dl));
	    int ShiftVal = (IdxVal % 2) * 8;
	    if (ShiftVal != 0)
	      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	                        DAG.getConstant(ShiftVal, dl, MVT::i8));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	  }

	  if (VT.getSizeInBits() == 32) {
	    if (IdxVal == 0)
	      return Op;

	    // SHUFPS the element to the lowest double word, then movss.
	    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (VT.getSizeInBits() == 64) {
	    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	    //        to match extract_elt for f64.
	    if (IdxVal == 0)
	      return Op;

	    // UNPCKHPD the element to the lowest double word, then movsd.
	    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
	    int Mask[2] = { 1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return SDValue();
	}

	/// Insert a single bit into a mask vector such as v16i1 or v8i1 (AVX-512).
	///
	/// With a constant index, the bit is wrapped into a v1i1 vector and placed
	/// with INSERT_SUBVECTOR. With a variable index, vector and element are
	/// sign-extended to a wider integer type, the insert is performed there, and
	/// the result is truncated back to the mask type.
	static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  SDValue MaskVec = Op.getOperand(0);
	  SDValue Bit = Op.getOperand(1);
	  SDValue Index = Op.getOperand(2);
	  MVT MaskVT = MaskVec.getSimpleValueType();

	  if (isa<ConstantSDNode>(Index)) {
	    // Copy into a k-register, extract to v1i1 and insert_subvector.
	    SDValue BitAsVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Bit);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT, MaskVec, BitAsVec,
	                       Index);
	  }

	  // Non constant index. Extend source and destination,
	  // insert element and then truncate the result.
	  unsigned EltCount = MaskVT.getVectorNumElements();
	  MVT WideEltVT =
	      (EltCount <= 8) ? MVT::getIntegerVT(128 / EltCount) : MVT::i8;
	  MVT WideVecVT = MVT::getVectorVT(WideEltVT, EltCount);
	  SDValue WideVec = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVecVT, MaskVec);
	  SDValue WideBit = DAG.getNode(ISD::SIGN_EXTEND, DL, WideEltVT, Bit);
	  SDValue WideInsert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideVecVT,
	                                   WideVec, WideBit, Index);
	  return DAG.getNode(ISD::TRUNCATE, DL, MaskVT, WideInsert);
	}

	/// Custom lowering for INSERT_VECTOR_ELT.
	///
	/// vXi1 vectors are forwarded to InsertBitToMaskVector. A non-constant or
	/// out-of-range index returns a null SDValue. Otherwise the insert is turned
	/// into a blend with a zero/all-ones vector, performed on the containing
	/// 128-bit chunk for wider vectors, or mapped to PINSRB/PINSRW/INSERTPS/BLENDI
	/// as the element type and subtarget allow. Returns \p Op when the node can
	/// be matched as-is and a null SDValue when no custom lowering applies.
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  if (EltVT == MVT::i1)
	    return InsertBitToMaskVector(Op, DAG, Subtarget);

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2 = Op.getOperand(2);

	  // Only constant, in-range indices are handled here.
	  auto *N2C = dyn_cast<ConstantSDNode>(N2);
	  if (!N2C || N2C->getAPIntValue().uge(NumElts))
	    return SDValue();
	  uint64_t IdxVal = N2C->getZExtValue();

	  bool IsZeroElt = X86::isZeroNode(N1);
	  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	  // If we are inserting an element, see if we can do this more efficiently
	  // with a blend shuffle with a rematerializable vector than a costly integer
	  // insertion.
	  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
	      16 <= EltVT.getSizeInBits()) {
	    SmallVector<int, 8> BlendMask;
	    for (unsigned i = 0; i != NumElts; ++i)
	      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	                                  : getOnesVector(VT, DAG, dl);
	    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	  }

	  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	  // into that, and then insert the subvector back into the result.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    // With a 256-bit vector, we can insert into the zero element efficiently
	    // using a blend if we have AVX or AVX2 and the right data type.
	    if (VT.is256BitVector() && IdxVal == 0) {
	      // TODO: It is worthwhile to cast integer to floating point and back
	      // and incur a domain crossing penalty if that's what we'll end up
	      // doing anyway after extracting to a 128-bit vector.
	      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
	          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	        N2 = DAG.getIntPtrConstant(1, dl);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
	      }
	    }

	    // Get the desired 128-bit vector chunk.
	    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	    // Insert the element into the desired chunk.
	    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(NumEltsIn128));
	    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	                    DAG.getIntPtrConstant(IdxIn128, dl));

	    // Insert the changed part back into the bigger vector
	    return insert128BitVector(N0, V, IdxVal, DAG, dl);
	  }
	  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	  // This will be just movd/movq/movss/movsd.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
	      (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
	       EltVT == MVT::i64)) {
	    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	    return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
	  }

	  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
	  // argument. SSE41 required for pinsrb.
	  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    unsigned Opc;
	    if (VT == MVT::v8i16) {
	      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	      Opc = X86ISD::PINSRW;
	    } else {
	      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	      Opc = X86ISD::PINSRB;
	    }

	    if (N1.getValueType() != MVT::i32)
	      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	    if (N2.getValueType() != MVT::i32)
	      N2 = DAG.getIntPtrConstant(IdxVal, dl);
	    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	  }

	  if (Subtarget.hasSSE41()) {
	    if (EltVT == MVT::f32) {
	      // Bits [7:6] of the constant are the source select. This will always be
	      //   zero here. The DAG Combiner may combine an extract_elt index into
	      //   these bits. For example (insert (extract, 3), 2) could be matched by
	      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	      // Bits [5:4] of the constant are the destination select. This is the
	      //   value of the incoming immediate.
	      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	      //   combine either bitwise AND or insert of float 0.0 to set these bits.

	      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
	      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
	        // If this is an insertion of 32-bits into the low 32-bits of
	        // a vector, we prefer to generate a blend with immediate rather
	        // than an insertps. Blends are simpler operations in hardware and so
	        // will always have equal or better performance than insertps.
	        // But if optimizing for size and there's a load folding opportunity,
	        // generate insertps because blendps does not have a 32-bit memory
	        // operand form.
	        N2 = DAG.getIntPtrConstant(1, dl);
	        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
	      }
	      // Place the index in the destination-select field, bits [5:4].
	      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
	      // Create this as a scalar to vector..
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
	    }

	    // PINSR* works with constant index.
	    if (EltVT == MVT::i32 || EltVT == MVT::i64)
	      return Op;
	  }

	  return SDValue();
	}

	/// Custom lowering for SCALAR_TO_VECTOR.
	///
	/// A zero scalar becomes a zero vector. Results wider than 128 bits are
	/// built as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an undef
	/// vector. For the remaining 128-bit integer types, v4i32 is passed through
	/// and everything else goes through an any-extended i32 placed in a v4i32
	/// and bitcast to the result type.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  MVT ResultVT = Op.getSimpleValueType();
	  SDValue Scalar = Op.getOperand(0);

	  // It's always cheaper to replace a xor+movd with xorps and simplifies
	  // further combines.
	  if (X86::isZeroNode(Scalar))
	    return getZeroVector(ResultVT, Subtarget, DAG, DL);

	  if (!ResultVT.is128BitVector()) {
	    // Wider result: first build a 128-bit vector with the same element
	    // type...
	    unsigned WidthRatio = ResultVT.getSizeInBits() / 128;
	    unsigned NarrowElts = ResultVT.getVectorNumElements() / WidthRatio;
	    MVT NarrowVT = MVT::getVectorVT(ResultVT.getVectorElementType(), NarrowElts);
	    SDValue Narrow = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, NarrowVT, Scalar);

	    // ...then insert it as the low 128 bits of an undef wide vector.
	    SDValue WideUndef = DAG.getUNDEF(ResultVT);
	    return insert128BitVector(WideUndef, Narrow, 0, DAG, DL);
	  }
	  assert(ResultVT.is128BitVector() && ResultVT.isInteger() &&
	         ResultVT != MVT::v2i64 && "Expected an SSE type!");

	  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
	  if (ResultVT == MVT::v4i32)
	    return Op;

	  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
	  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Widened);
	  return DAG.getBitcast(ResultVT, AsV4I32);
	}

	// Custom lowering entry point for INSERT_SUBVECTOR nodes. Per the original
	// note, this may result in a simple superregister reference or explicit
	// instructions to insert the upper bits of a vector. Only vXi1 result types
	// are expected here (asserted); the work is delegated to insert1BitVector.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

	  SDValue Lowered = insert1BitVector(Op, DAG, Subtarget);
	  return Lowered;
	}

	/// Custom lowering for EXTRACT_SUBVECTOR on vXi1 mask vectors.
	///
	/// A non-constant index returns a null SDValue and a constant index of 0
	/// returns \p Op unchanged. Any other constant index shifts the source right
	/// with KSHIFTR (widening it first when the native width is not available)
	/// and extracts the subvector at position 0.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Only vXi1 extract_subvectors need custom lowering");

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  MVT VecVT = Vec.getSimpleValueType();
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // Extend to natively supported kshift.
	  MVT WideVecVT = VecVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
	    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
	                      DAG.getUNDEF(WideVecVT), Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }

	  // Shift to the LSB.
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
	                    DAG.getConstant(IdxVal, dl, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// Returns the appropriate wrapper opcode for a global reference: either
	// X86ISD::Wrapper or X86ISD::WrapperRIP.
	//
	// GV may be null. OpFlags is the operand flag of the reference; only
	// X86II::MO_GOTPCREL is examined here.
	unsigned X86TargetLowering::getGlobalWrapperKind(
	    const GlobalValue *GV, const unsigned char OpFlags) const {
	  // References to absolute symbols are never PC-relative.
	  if (GV && GV->isAbsoluteSymbolRef())
	    return X86ISD::Wrapper;

	  // RIP-relative PIC uses the RIP wrapper in the small and kernel code
	  // models.
	  CodeModel::Model M = getTargetMachine().getCodeModel();
	  if (Subtarget.isPICStyleRIPRel() &&
	      (M == CodeModel::Small || M == CodeModel::Kernel))
	    return X86ISD::WrapperRIP;

	  // GOTPCREL references must always use RIP.
	  if (OpFlags == X86II::MO_GOTPCREL)
	    return X86ISD::WrapperRIP;

	  return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
	// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
	// one of the above mentioned nodes. It has to be wrapped because otherwise
	// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
	// be used to form addressing mode. These wrapped nodes will be selected
	// into MOV32ri.
	//
	// This routine handles the ConstantPool case: it builds the wrapped target
	// constant-pool node and, when the local-reference classification yields a
	// non-zero flag, adds the global base register to it.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  const unsigned char RefFlags = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetCP = DAG.getTargetConstantPool(
	      PoolNode->getConstVal(), PtrVT, PoolNode->getAlignment(),
	      PoolNode->getOffset(), RefFlags);
	  SDLoc DL(PoolNode);
	  SDValue Address = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetCP);
	  if (!RefFlags)
	    return Address;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Address);
	}

	/// Lower a JumpTable node to a wrapped target jump-table node, adding the
	/// global base register when the local-reference classification yields a
	/// non-zero operand flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  const unsigned char RefFlags = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetJT =
	      DAG.getTargetJumpTable(TableNode->getIndex(), PtrVT, RefFlags);
	  SDLoc DL(TableNode);
	  SDValue Address = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
	  if (!RefFlags)
	    return Address;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Address);
	}

	/// Lower an ExternalSymbol node by forwarding to LowerGlobalOrExternal as a
	/// non-call reference.
	SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
	}

	/// Lower a BlockAddress node to a wrapped target block-address node, adding
	/// the global base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  // Classify the reference, then create the TargetBlockAddress node.
	  const unsigned char RefFlags = Subtarget.classifyBlockAddressReference();
	  const auto *BANode = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *Target = BANode->getBlockAddress();
	  int64_t ByteOffset = BANode->getOffset();

	  SDLoc DL(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetBA =
	      DAG.getTargetBlockAddress(Target, PtrVT, ByteOffset, RefFlags);
	  SDValue Address = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetBA);
	  if (!isGlobalRelativeToPICBase(RefFlags))
	    return Address;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, DL, PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Address);
	}

	/// Creates target global address or external symbol nodes for calls or
	/// other uses.
	///
	/// \p Op is either a GlobalAddress node or an ExternalSymbol node. \p ForCall
	/// selects the function-reference classification and allows the bare target
	/// node to be returned when no wrapper, load, PIC base or offset is needed.
	SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
	                                                 bool ForCall) const {
	  SDLoc DL(Op);

	  // Unpack the global address or external symbol.
	  const GlobalValue *Global = nullptr;
	  const char *SymbolName = nullptr;
	  int64_t PendingOffset = 0;
	  if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
	    Global = GA->getGlobal();
	    PendingOffset = GA->getOffset();
	  } else {
	    SymbolName = cast<ExternalSymbolSDNode>(Op)->getSymbol();
	  }

	  // Calculate some flags for address lowering.
	  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
	  const unsigned char RefFlags =
	      ForCall ? Subtarget.classifyGlobalFunctionReference(Global, Mod)
	              : Subtarget.classifyGlobalReference(Global, Mod);
	  const bool UsesPICBase = isGlobalRelativeToPICBase(RefFlags);
	  const bool LoadsFromStub = isGlobalStubReference(RefFlags);

	  CodeModel::Model CM = DAG.getTarget().getCodeModel();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Address;

	  if (!Global) {
	    // If this is not a global address, this must be an external symbol.
	    Address = DAG.getTargetExternalSymbol(SymbolName, PtrVT, RefFlags);
	  } else {
	    // Create a target global address if this is a global. If possible, fold
	    // the offset into the global address reference. Otherwise, ADD it on
	    // later.
	    int64_t FoldedOffset = 0;
	    if (RefFlags == X86II::MO_NO_FLAG &&
	        X86::isOffsetSuitableForCodeModel(PendingOffset, CM)) {
	      FoldedOffset = PendingOffset;
	      PendingOffset = 0;
	    }
	    Address =
	        DAG.getTargetGlobalAddress(Global, DL, PtrVT, FoldedOffset, RefFlags);
	  }

	  // If this is a direct call, avoid the wrapper if we don't need to do any
	  // loads or adds. This allows SDAG ISel to match direct calls.
	  const bool NeedsExtraWork =
	      LoadsFromStub || UsesPICBase || PendingOffset != 0;
	  if (ForCall && !NeedsExtraWork)
	    return Address;

	  Address = DAG.getNode(getGlobalWrapperKind(Global, RefFlags), DL, PtrVT,
	                        Address);

	  // With PIC, the address is actually $g + Offset.
	  if (UsesPICBase) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, DL, PtrVT);
	    Address = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Address);
	  }

	  // For globals that require a load from a stub to get the address, emit the
	  // load.
	  if (LoadsFromStub)
	    Address =
	        DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Address,
	                    MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	  // If there was a non-zero offset that we didn't fold, create an explicit
	  // addition for it.
	  if (PendingOffset != 0) {
	    SDValue OffsetNode = DAG.getConstant(PendingOffset, DL, PtrVT);
	    Address = DAG.getNode(ISD::ADD, DL, PtrVT, Address, OffsetNode);
	  }

	  return Address;
	}

	/// Lower a GlobalAddress node by forwarding to LowerGlobalOrExternal as a
	/// non-call reference.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
	}

	/// Emit a TLSADDR (or, when \p LocalDynamic is set, TLSBASEADDR) node for
	/// \p GA chained after \p Chain, and return a copy of \p ReturnReg glued to
	/// it.
	///
	/// \p InFlag may be null; when non-null, the glue value it points to is
	/// passed as an extra operand. The frame info is updated to record that the
	/// function adjusts the stack and has calls.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDLoc DL(GA);
	  SDValue TargetAddr =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);

	  X86ISD::NodeType Opcode =
	      LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;

	  // Operands are the chain and the address, plus the incoming glue if any.
	  SmallVector<SDValue, 3> Operands = {Chain, TargetAddr};
	  if (InFlag)
	    Operands.push_back(*InFlag);
	  SDValue Call = DAG.getNode(Opcode, DL, ResultTys, Operands);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  FrameInfo.setAdjustsStack(true);
	  FrameInfo.setHasCalls(true);

	  // Result 1 of the node is the glue that ties the register copy to it.
	  SDValue Glue = Call.getValue(1);
	  return DAG.getCopyFromReg(Call, DL, ReturnReg, PtrVT, Glue);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
	// The global base register is copied into EBX, and that copy's glue is
	// handed to GetTLSADDR; the result is read from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDLoc DL(GA); // ? function entry point might be better
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  // No incoming glue for the copy itself.
	  SDValue CopyToEBX =
	      DAG.getCopyToReg(DAG.getEntryNode(), DL, X86::EBX, PICBase, SDValue());
	  SDValue Glue = CopyToEBX.getValue(1);

	  return GetTLSADDR(DAG, CopyToEBX, GA, &Glue, PtrVT, X86::EAX,
	                    X86II::MO_TLSGD);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
	// The pseudo-call is chained to the entry node without any incoming glue,
	// and the result is read from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue EntryChain = DAG.getEntryNode();
	  SDValue *NoGlue = nullptr;
	  return GetTLSADDR(DAG, EntryChain, GA, NoGlue, PtrVT, X86::RAX,
	                    X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// The module's TLS block base is obtained through GetTLSADDR (with
	/// LocalDynamic set), and the variable's x@dtpoff offset is added to it.
	/// On 32-bit targets the global base register is first copied into EBX and
	/// that copy is glued to the TLS pseudo-call. Each call also bumps the
	/// per-function count of local-dynamic TLS accesses.
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	      .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	//
	// The thread pointer is loaded from offset 0 of the segment address space
	// (257 when is64Bit, 256 otherwise) and added to the variable's offset.
	// For local exec the offset is the wrapped symbol itself; for initial exec
	// the offset is loaded from the GOT (via the PIC base on 32-bit PIC).
	// Any other model is unreachable here.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Custom-lower a thread-local global address node.
	///
	/// Dispatch order: emulated TLS (if the target machine requests it), then
	/// ELF (one helper per TLS model), then Darwin (a single TLSCALL-based
	/// sequence), then Windows (implicit TLS via the TLS array and _tls_index).
	/// Any other target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	// Emulated TLS takes priority over every target-specific scheme below.
	if (DAG.getTarget().useEmulatedTLS())
	return LowerToTLSEmulatedModel(GA, DAG);

	const GlobalValue *GV = GA->getGlobal();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	bool PositionIndependent = isPositionIndependent();

	if (Subtarget.isTargetELF()) {
	// The target machine picks the TLS model for this global; each model has
	// its own lowering helper.
	TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	switch (model) {
	case TLSModel::GeneralDynamic:
	if (Subtarget.is64Bit())
	return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	case TLSModel::LocalDynamic:
	return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	Subtarget.is64Bit());
	case TLSModel::InitialExec:
	case TLSModel::LocalExec:
	return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	PositionIndependent);
	}
	llvm_unreachable("Unknown TLS model.");
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin only has one model of TLS. Lower to that.
	unsigned char OpFlag = 0;
	unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	X86ISD::WrapperRIP : X86ISD::Wrapper;

	// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	// global base reg.
	bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	if (PIC32)
	OpFlag = X86II::MO_TLVP_PIC_BASE;
	else
	OpFlag = X86II::MO_TLVP;
	SDLoc DL(Op);
	SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	GA->getValueType(0),
	GA->getOffset(), OpFlag);
	SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	// With PIC32, the address is actually $g + Offset.
	if (PIC32)
	Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	Offset);

	// Lowering the machine isd will make sure everything is in the right
	// location.
	// The TLSCALL is bracketed by a zero-sized call sequence; the glue result
	// of TLSCALL (value 1) is threaded into CALLSEQ_END.
	SDValue Chain = DAG.getEntryNode();
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	SDValue Args[] = { Chain, Offset };
	Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	DAG.getIntPtrConstant(0, DL, true),
	Chain.getValue(1), DL);

	// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setAdjustsStack(true);

	// And our return value (tls address) is in the standard call return value
	// location.
	unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	}

	if (Subtarget.isOSWindows()) {
	// Just use the implicit TLS architecture
	// Need to generate something similar to:
	// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	// ; from TEB
	// mov ecx, dword [rel _tls_index]; Load index (from C runtime)
	// mov rcx, qword [rdx+rcx*8]
	// mov eax, .tls$:tlsvar
	// [rax+rcx] contains the address
	// Windows 64bit: gs:0x58
	// Windows 32bit: fs:__tls_array

	SDLoc dl(GA);
	SDValue Chain = DAG.getEntryNode();

	// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	// use its literal value of 0x2C.
	// Address space 256 is used for the 64-bit case and 257 for the 32-bit
	// case.
	Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	? Type::getInt8PtrTy(*DAG.getContext(),
	256)
	: Type::getInt32PtrTy(*DAG.getContext(),
	257));

	SDValue TlsArray = Subtarget.is64Bit()
	? DAG.getIntPtrConstant(0x58, dl)
	: (Subtarget.isTargetWindowsGNU()
	? DAG.getIntPtrConstant(0x2C, dl)
	: DAG.getExternalSymbol("_tls_array", PtrVT));

	SDValue ThreadPointer =
	DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	SDValue res;
	if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	// Local-exec globals skip the _tls_index lookup and use the first slot
	// of the TLS array directly.
	res = ThreadPointer;
	} else {
	// Load the _tls_index variable
	SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	// _tls_index is read as a 32-bit value; in 64-bit mode it is
	// zero-extended to pointer width.
	if (Subtarget.is64Bit())
	IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	MachinePointerInfo(), MVT::i32);
	else
	IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	// Scale the index by the pointer size (as a left shift) to get the byte
	// offset of this module's slot in the TLS array.
	auto &DL = DAG.getDataLayout();
	SDValue Scale =
	DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
	IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	}

	// Load the pointer stored in the selected TLS array slot.
	res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	// Get the offset of start of .tls section
	SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	GA->getValueType(0),
	GA->getOffset(), X86II::MO_SECREL);
	SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	// The address of the thread local variable is the add of the thread
	// pointer with the offset of the variable.
	return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	}

	llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SHL_PARTS / SRA_PARTS / SRL_PARTS: a double-width shift expressed
	/// as (Lo, Hi, Amount) operands that yields a merged (Lo, Hi) pair.
	///
	/// A funnel shift produces the part that mixes both halves and a plain
	/// shift produces the other part; a select on the "amount >= part width"
	/// bit then swaps in the large-shift results.
	/// TODO: Can this be moved to general expansion code?
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  const MVT PartVT = Op.getSimpleValueType();
	  const unsigned PartBits = PartVT.getSizeInBits();
	  SDLoc DL(Op);
	  const bool IsLeft = Op.getOpcode() == ISD::SHL_PARTS;
	  const bool IsArith = Op.getOpcode() == ISD::SRA_PARTS;
	  SDValue LoPart = Op.getOperand(0);
	  SDValue HiPart = Op.getOperand(1);
	  SDValue Amount = Op.getOperand(2);

	  // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and
	  // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away
	  // during isel.
	  SDValue MaskedAmount =
	      DAG.getNode(ISD::AND, DL, MVT::i8, Amount,
	                  DAG.getConstant(PartBits - 1, DL, MVT::i8));

	  // Value that fills the vacated part for large shifts: sign bits of the
	  // high part for an arithmetic shift, zero otherwise.
	  SDValue Fill;
	  if (IsArith)
	    Fill = DAG.getNode(ISD::SRA, DL, PartVT, HiPart,
	                       DAG.getConstant(PartBits - 1, DL, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, DL, PartVT);

	  // Funnel shift across both parts, then the plain shift of a single part.
	  const unsigned FunnelOpc = IsLeft ? ISD::FSHL : ISD::FSHR;
	  SDValue Funnel =
	      DAG.getNode(FunnelOpc, DL, PartVT, HiPart, LoPart, Amount);

	  const unsigned PlainOpc =
	      IsLeft ? ISD::SHL : (IsArith ? ISD::SRA : ISD::SRL);
	  SDValue PlainSrc = IsLeft ? LoPart : HiPart;
	  SDValue Plain = DAG.getNode(PlainOpc, DL, PartVT, PlainSrc, MaskedAmount);

	  // If the shift amount is larger or equal than the width of a part we can't
	  // rely on the results of shld/shrd. Insert a test and select the appropriate
	  // values for large shift amounts.
	  SDValue WidthBit = DAG.getNode(ISD::AND, DL, MVT::i8, Amount,
	                                 DAG.getConstant(PartBits, DL, MVT::i8));
	  SDValue IsLarge = DAG.getSetCC(dl_unused_guard(DL), MVT::i8, WidthBit,
	                                 DAG.getConstant(0, DL, MVT::i8), ISD::SETNE);

	  // The mixed part is Hi for left shifts and Lo for right shifts; the other
	  // part receives either the fill value or the plain shift.
	  SDValue Mixed = DAG.getNode(ISD::SELECT, DL, PartVT, IsLarge, Plain, Funnel);
	  SDValue Other = DAG.getNode(ISD::SELECT, DL, PartVT, IsLarge, Fill, Plain);

	  SDValue Lo = IsLeft ? Other : Mixed;
	  SDValue Hi = IsLeft ? Mixed : Other;
	  return DAG.getMergeValues({ Lo, Hi }, DL);
	}

	/// Custom-lower ISD::FSHL / ISD::FSHR.
	///
	/// Vector types (asserted to require VBMI2) become VSHLD/VSHRD when the
	/// amount is a constant splat, otherwise VSHLDV/VSHRDV. Scalar i16/i32/i64
	/// become SHLD/SHRD, except that an empty SDValue is returned when SHLD is
	/// slow on the subtarget and the function is not optimized for size. For
	/// FSHR the two data operands are swapped before building the node.
	static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
	         "Unexpected funnel shift opcode!");

	  SDLoc DL(Op);
	  SDValue Op0 = Op.getOperand(0);
	  SDValue Op1 = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  bool IsFSHR = Op.getOpcode() == ISD::FSHR;

	  if (VT.isVector()) {
	    assert(Subtarget.hasVBMI2() && "Expected VBMI2");

	    if (IsFSHR)
	      std::swap(Op0, Op1);

	    // A splat constant amount is reduced modulo the element width and
	    // passed as an i8 immediate.
	    APInt APIntShiftAmt;
	    if (isConstantSplat(Amt, APIntShiftAmt)) {
	      uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
	      return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
	                         Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
	    }

	    return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
	                       Op0, Op1, Amt);
	  }

	  assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
	         "Unexpected funnel shift type!");

	  // Expand slow SHLD/SHRD cases if we are not optimizing for size.
	  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
	  if (!OptForSize && Subtarget.isSHLDSlow())
	    return SDValue();

	  if (IsFSHR)
	    std::swap(Op0, Op1);

	  // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
	  if (VT == MVT::i16)
	    Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
	                      DAG.getConstant(15, DL, Amt.getValueType()));

	  unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
	  return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
	}

	// Try to use a packed vector operation to handle i64 on 32-bit targets when
	// AVX512DQ is enabled.
	//
	// Returns an empty SDValue unless the subtarget has DQI, is not 64-bit, the
	// source is i64 and the result is f32 or f64. Otherwise the scalar is placed
	// in a vector, converted with the same opcode as Op, and element 0 of the
	// result is extracted.
	static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
	          Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();

	  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
	      (VT != MVT::f32 && VT != MVT::f64))
	    return SDValue();

	  // Pack the i64 into a vector, do the operation and extract.

	  // Using 256-bit to ensure result is 128-bits for f32 case.
	  // Without VLX an 8-element vector is used instead of 4.
	  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
	  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
	  MVT VecVT = MVT::getVectorVT(VT, NumElts);

	  SDLoc dl(Op);
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
	  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Return true if a vector form of the int-to-FP cast \p Opcode from
	/// \p FromVT to \p ToVT should be used on this subtarget.
	///
	/// Only v4i32 sources are accepted. SINT_TO_FP needs SSE2 (and AVX for a
	/// v4f64 result); UINT_TO_FP needs AVX512. Every other opcode returns false.
	static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
	                          const X86Subtarget &Subtarget) {
	  switch (Opcode) {
	  case ISD::SINT_TO_FP:
	    // TODO: Handle wider types with AVX/AVX512.
	    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
	      return false;
	    // CVTDQ2PS or (V)CVTDQ2PD
	    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

	  case ISD::UINT_TO_FP:
	    // TODO: Handle wider types and i64 elements.
	    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
	      return false;
	    // VCVTUDQ2PS or VCVTUDQ2PD
	    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;

	  default:
	    return false;
	  }
	}

	/// Given a scalar cast operation that is extracted from a vector, try to
	/// vectorize the cast op followed by extraction. This will avoid an expensive
	/// round-trip between XMM and GPR.
	///
	/// Returns an empty SDValue when the cast operand is not an
	/// EXTRACT_VECTOR_ELT with a constant index, or when useVectorCast rejects
	/// the 128-bit form of the cast.
	static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // TODO: This could be enhanced to handle smaller integer types by peeking
	  // through an extend.
	  SDValue Extract = Cast.getOperand(0);
	  MVT DestVT = Cast.getSimpleValueType();
	  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	      !isa<ConstantSDNode>(Extract.getOperand(1)))
	    return SDValue();

	  // See if we have a 128-bit vector cast op for this type of cast.
	  SDValue VecOp = Extract.getOperand(0);
	  MVT FromVT = VecOp.getSimpleValueType();
	  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
	  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
	  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
	  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
	    return SDValue();

	  // If we are extracting from a non-zero element, first shuffle the source
	  // vector to allow extracting from element zero.
	  SDLoc DL(Cast);
	  if (!isNullConstant(Extract.getOperand(1))) {
	    // Only lane 0 of the mask is defined; all other lanes are undef (-1).
	    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
	    Mask[0] = Extract.getConstantOperandVal(1);
	    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
	  }
	  // If the source vector is wider than 128-bits, extract the low part. Do not
	  // create an unnecessarily wide vector cast op.
	  if (FromVT != Vec128VT)
	    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

	  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
	  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
	  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Custom-lower ISD::SINT_TO_FP.
	///
	/// Tries, in order: the vectorized extract+cast form, the v2i32 -> v2f64
	/// vector case, returning Op unchanged for the scalar cases treated as
	/// legal, and the AVX512DQ i64 helper. Otherwise the source is spilled to a
	/// fresh stack object and converted through BuildFILD.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Input = Op.getOperand(0);
	  const MVT InVT = Input.getSimpleValueType();
	  const MVT OutVT = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  if (SDValue Vectorized = vectorizeExtractedCast(Op, DAG, Subtarget))
	    return Vectorized;

	  if (InVT.isVector()) {
	    // Only v2i32 -> v2f64 is handled; the source is widened to v4i32 with
	    // an undef upper half first.
	    if (InVT != MVT::v2i32 || OutVT != MVT::v2f64)
	      return SDValue();
	    SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Input,
	                                  DAG.getUNDEF(InVT));
	    return DAG.getNode(X86ISD::CVTSI2P, DL, OutVT, Widened);
	  }

	  assert(InVT <= MVT::i64 && InVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  const bool ResultInSSE = isScalarFPTypeInSSEReg(OutVT);
	  const bool Is64Bit = Subtarget.is64Bit();

	  // These are really Legal; return the operand so the caller accepts it as
	  // Legal.
	  if (ResultInSSE) {
	    if (InVT == MVT::i32)
	      return Op;
	    if (InVT == MVT::i64 && Is64Bit)
	      return Op;
	  }

	  if (SDValue ViaVector = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
	    return ViaVector;

	  // Bitcasting to f64 here allows us to do a single 64-bit store from
	  // an SSE register, avoiding the store forwarding penalty that would come
	  // with two 32-bit stores.
	  SDValue Spilled = Input;
	  if (InVT == MVT::i64 && ResultInSSE && !Is64Bit)
	    Spilled = DAG.getBitcast(MVT::f64, Spilled);

	  // Spill to a stack object whose size and alignment equal the source size
	  // in bytes, then let BuildFILD read it back.
	  const unsigned NumBytes = InVT.getSizeInBits() / 8;
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  const int FrameIdx =
	      MF.getFrameInfo().CreateStackObject(NumBytes, NumBytes, false);
	  SDValue Slot = DAG.getFrameIndex(FrameIdx, PtrVT);
	  SDValue StoreChain =
	      DAG.getStore(DAG.getEntryNode(), DL, Spilled, Slot,
	                   MachinePointerInfo::getFixedStack(MF, FrameIdx));
	  return BuildFILD(Op, InVT, StoreChain, Slot, DAG);
	}

	/// Build an X86 FILD node that loads an integer of type \p SrcVT from
	/// \p StackSlot, chained after \p Chain, producing a value of Op's type.
	///
	/// \p StackSlot is either a frame index or a load node; for a load node its
	/// memory operand is reused and its operand 1 is used as the address.
	/// When Op's type is a scalar FP type held in an SSE register, FILD_FLAG is
	/// used instead, and its result is stored to a new stack object with FST
	/// and loaded back as Op's type.
	SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
	SDValue StackSlot,
	SelectionDAG &DAG) const {
	// Build the FILD
	SDLoc DL(Op);
	SDVTList Tys;
	bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	// In the SSE case the node yields f64 plus chain and glue; otherwise it
	// yields Op's type plus chain.
	if (useSSE)
	Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
	else
	Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

	unsigned ByteSize = SrcVT.getSizeInBits() / 8;

	FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
	MachineMemOperand *LoadMMO;
	if (FI) {
	// Frame-index slot: describe a load of ByteSize bytes (aligned to
	// ByteSize) from that fixed stack slot.
	int SSFI = FI->getIndex();
	LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOLoad, ByteSize, ByteSize);
	} else {
	// Otherwise StackSlot must be a load node (cast<> asserts this): reuse its
	// memory operand and take its operand 1 as the address.
	LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
	StackSlot = StackSlot.getOperand(1);
	}
	SDValue FILDOps[] = {Chain, StackSlot};
	SDValue Result =
	DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
	Tys, FILDOps, SrcVT, LoadMMO);

	if (useSSE) {
	// Values 1 and 2 of FILD_FLAG are its chain and glue results.
	Chain = Result.getValue(1);
	SDValue InFlag = Result.getValue(2);

	// FIXME: Currently the FST is glued to the FILD_FLAG. This
	// shouldn't be necessary except that RFP cannot be live across
	// multiple blocks. When stackifier is fixed, they can be uncoupled.
	MachineFunction &MF = DAG.getMachineFunction();
	unsigned SSFISize = Op.getValueSizeInBits() / 8;
	int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
	auto PtrVT = getPointerTy(MF.getDataLayout());
	// Note: this StackSlot shadows the parameter and names the new temporary.
	SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	Tys = DAG.getVTList(MVT::Other);
	SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
	MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOStore, SSFISize, SSFISize);

	// Store the FILD result as Op's type, then reload it from the same slot.
	Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
	Op.getValueType(), StoreMMO);
	Result = DAG.getLoad(
	Op.getValueType(), DL, Chain, StackSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
	}

	return Result;
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// Interleaves the two 32-bit halves of the input with the constants in c0,
	/// reinterprets the result as two doubles, subtracts c1, and sums the two
	/// lanes (FHADD with SSE3, shuffle + FADD otherwise). Lane 0 of the sum is
	/// returned as f64.
	static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // This algorithm is not obvious. Here it is what we're trying to output:
	  /*
	     movq %rax, %xmm0
	     punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	     subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	     #ifdef __SSE3__
	     haddpd %xmm0, %xmm0
	     #else
	     pshufd $0x4e, %xmm0, %xmm1
	     addpd %xmm1, %xmm0
	     #endif
	  */

	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                            Op.getOperand(0));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  // The second constant load is chained after the first one.
	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  if (Subtarget.hasSSE3()) {
	    // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
	    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
	  }

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// 32-bit unsigned integer to float expansion.
	///
	/// The zero-extended input is OR-ed into the bit pattern of an f64 bias
	/// constant, the bias is subtracted again in floating point, and the
	/// difference is extended or rounded to Op's result type.
	static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  // FP constant to bias correct the final result.
	  SDValue BiasFP = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL,
	                                     MVT::f64);

	  // Load the 32-bit value into an XMM register.
	  SDValue InVec =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Op.getOperand(0));

	  // Zero out the upper parts of the register.
	  InVec = getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);

	  // Reinterpret the low 64 bits as an f64 scalar.
	  SDValue InAsV2F64 = DAG.getBitcast(MVT::v2f64, InVec);
	  SDValue LowF64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	                               InAsV2F64, DAG.getIntPtrConstant(0, DL));

	  // Or the load with the bias.
	  SDValue LowBits = DAG.getBitcast(
	      MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, LowF64));
	  SDValue BiasBits = DAG.getBitcast(
	      MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, BiasFP));
	  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v2i64, LowBits, BiasBits);
	  SDValue MergedF64 =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	                  DAG.getBitcast(MVT::v2f64, Merged),
	                  DAG.getIntPtrConstant(0, DL));

	  // Subtract the bias.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Unbiased = DAG.getNode(ISD::FSUB, DL, MVT::f64, MergedF64, BiasFP);

	  // Handle final rounding.
	  return DAG.getFPExtendOrRound(Unbiased, DL, Op.getSimpleValueType());
	}

	/// Lower UINT_TO_FP from v2i32 to v2f64; any other result type yields an
	/// empty SDValue.
	///
	/// With AVX512 a single CVTUI2P is emitted. Otherwise each element is split
	/// into 16-bit halves, both halves are converted with the signed CVTSI2P,
	/// and the result is hi * 2^16 + lo.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     const SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  // Legalize to v4i32 type.
	  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                             DAG.getUNDEF(MVT::v2i32));

	  if (Subtarget.hasAVX512())
	    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);

	  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
	  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
	  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

	  // Two to the power of half-word-size.
	  SDValue TwoPow16 =
	      DAG.getConstantFP(static_cast<double>(1 << 16), DL, MVT::v2f64);

	  // Clear upper part of LO, lower HI.
	  SDValue HiInt = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
	  SDValue LoInt = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);

	  SDValue HiFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HiInt);
	  SDValue HiScaled = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, HiFP, TwoPow16);
	  SDValue LoFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LoInt);

	  // Add the two halves.
	  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, HiScaled, LoFP);
	}

	/// Lower UINT_TO_FP from v4i32/v8i32 to the float vector of the same
	/// element count.
	///
	/// Returns an empty SDValue when unsafe-fp-math is enabled or when the
	/// requested result type is not v4f32 (for v4i32) / v8f32 (otherwise).
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  // (uint4) 0x53000000, 0xaa);
	  // #else
	  // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  // uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  // return (float4) lo + fhi;

	  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
	  // reassociate the two FADDs, and if we do that, the algorithm fails
	  // spectacularly (PR24512).
	  // FIXME: If we ever have some kind of Machine FMF, this should be marked
	  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
	  // there's also the MachineCombiner reassociations happening on Machine IR.
	  if (DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue V = Op->getOperand(0);
	  MVT VecIntVT = V.getSimpleValueType();
	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    // (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFAdd = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

	  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue FHigh =
	      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
	  // return (float4) lo + fhi;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Dispatch a vector UINT_TO_FP to the helper matching its source type.
	///
	/// v2i32 goes to lowerUINT_TO_FP_v2i32; v4i32 and v8i32 go to
	/// lowerUINT_TO_FP_vXi32 (asserting the subtarget lacks AVX512). Any other
	/// source type is unreachable.
	static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  const MVT InVT = Op.getOperand(0).getSimpleValueType();
	  SDLoc DL(Op);

	  if (InVT == MVT::v2i32)
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, DL);

	  if (InVT == MVT::v4i32 || InVT == MVT::v8i32) {
	    assert(!Subtarget.hasAVX512());
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
	  }

	  llvm_unreachable("Custom UINT_TO_FP is not supported!");
	}

	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	SelectionDAG &DAG) const {
	SDValue N0 = Op.getOperand(0);
	SDLoc dl(Op);
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	if (Op.getSimpleValueType().isVector())
	return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

	if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
	return Extract;

	MVT SrcVT = N0.getSimpleValueType();
	MVT DstVT = Op.getSimpleValueType();

	if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	(SrcVT == MVT::i32 \|\| (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	// Conversions from unsigned i32 to f32/f64 are legal,
	// using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
	return Op;
	}

	if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
	return V;

	if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
	if (SrcVT == MVT::i32 && X86ScalarSSEf64)
	return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
	if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	return SDValue();

	// Make a 64-bit buffer, and use it to build an FILD.
	SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
	if (SrcVT == MVT::i32) {
	SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
	StackSlot, MachinePointerInfo());
	SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	OffsetSlot, MachinePointerInfo());
	SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
	return Fild;
	}

	assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	SDValue ValueToStore = Op.getOperand(0);
	if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
	// Bitcasting to f64 here allows us to do a single 64-bit store from
	// an SSE register, avoiding the store forwarding penalty that would come
	// with two 32-bit stores.
	ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	MachinePointerInfo());
	// For i64 source, we need to add the appropriate power of 2 if the input
	// was negative. This is the same as the optimization in
	// DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
	// we must be careful to do the computation in x87 extended precision, not
	// in SSE. (The generic code can't know it's OK to do this, or how to.)
	int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOLoad, 8, 8);

	SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	SDValue Ops[] = { Store, StackSlot };
	SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
	MVT::i64, MMO);

	APInt FF(32, 0x5F800000ULL);

	// Check whether the sign bit is set.
	SDValue SignSet = DAG.getSetCC(
	dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
	SDValue FudgePtr = DAG.getConstantPool(
	ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

	// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	SDValue Zero = DAG.getIntPtrConstant(0, dl);
	SDValue Four = DAG.getIntPtrConstant(4, dl);
	SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
	FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	// Load the value out, extending it from f32 to f80.
	// FIXME: Avoid the extend by constructing the right constant pool?
	SDValue Fudge = DAG.getExtLoad(
	ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	/* Alignment = */ 4);
	// Extend everything to 80 bits to force it to be done on x87.
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	DAG.getIntPtrConstant(0, dl));
	}

	// Lower a scalar FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) by storing
	// the converted integer to a temporary stack slot with an
	// X86ISD::FP_TO_INT_IN_MEM node (FIST) and loading the result back.
	//
	// Returns an empty SDValue() when the source type is not f32, f64 or f80:
	// f16 must be promoted before using this routine and fp128 does not use this
	// lowering. Otherwise the conversion is assumed to be from one of f32, f64 or
	// f80 to i16, i32 or i64, and the value loaded back from the stack slot
	// (with the unsigned fixup applied when needed) is returned.
	SDValue
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	                                   bool IsSigned) const {
	  SDLoc DL(Op);

	  EVT DstTy = Op.getValueType();
	  EVT TheVT = Op.getOperand(0).getValueType();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	    // f16 must be promoted before using the lowering in this routine.
	    // fp128 does not use this lowering.
	    return SDValue();
	  }

	  // If using FIST to compute an unsigned i64, we'll need some fixup
	  // to handle values above the maximum signed i64. A FIST is always
	  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

	  if (!IsSigned && DstTy != MVT::i64) {
	    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	    // The low 32 bits of the fist result will have the correct uint32 result.
	    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	    DstTy = MVT::i64;
	  }

	  assert(DstTy.getSimpleVT() <= MVT::i64 &&
	         DstTy.getSimpleVT() >= MVT::i16 &&
	         "Unknown FP_TO_INT to lower!");

	  // The integer result is written by the FIST into a temporary stack slot
	  // sized (and aligned) for DstTy, and loaded back from there afterwards.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned MemSize = DstTy.getStoreSize();
	  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	  SDValue Chain = DAG.getEntryNode();
	  SDValue Value = Op.getOperand(0);
	  // i64 value, either 0 or the sign mask (0x8000000000000000); it is XOR'ed
	  // into the loaded result when UnsignedFixup is set.
	  SDValue Adjust;

	  if (UnsignedFixup) {
	    //
	    // Conversion to unsigned i64 is implemented with a select,
	    // depending on whether the source value fits in the range
	    // of a signed i64. Let Thresh be the FP equivalent of
	    // 0x8000000000000000ULL.
	    //
	    //  Adjust (i64) = (Value < Thresh) ? 0 : 0x8000000000000000;
	    //  FistSrc      = (Value < Thresh) ? Value : (Value - Thresh);
	    //  Fist-to-mem64 FistSrc
	    //  XOR the 64-bit result with Adjust, which is equivalent to adding
	    //  0 or 0x800...0ULL back to it.
	    //
	    // Being a power of 2, Thresh is exactly representable in all FP formats.
	    // For X87 we'd like to use the smallest FP type for this constant, but
	    // for DAG type consistency we have to match the FP operand type.

	    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	    bool LosesInfo = false;
	    if (TheVT == MVT::f64)
	      // The rounding mode is irrelevant as the conversion should be exact.
	      Status = Thresh.convert(APFloat::IEEEdouble(),
	                              APFloat::rmNearestTiesToEven, &LosesInfo);
	    else if (TheVT == MVT::f80)
	      Status = Thresh.convert(APFloat::x87DoubleExtended(),
	                              APFloat::rmNearestTiesToEven, &LosesInfo);

	    assert(Status == APFloat::opOK && !LosesInfo &&
	           "FP conversion should have been exact");

	    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	    // A single (Value < Thresh) comparison drives both selects below.
	    SDValue Cmp = DAG.getSetCC(DL,
	                               getSetCCResultType(DAG.getDataLayout(),
	                                                  *DAG.getContext(), TheVT),
	                               Value, ThreshVal, ISD::SETLT);
	    Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
	                           DAG.getConstant(0, DL, MVT::i64),
	                           DAG.getConstant(APInt::getSignMask(64),
	                                           DL, MVT::i64));
	    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
	    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
	  }

	  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

	  // FIXME This causes a redundant load/store if the SSE-class value is already
	  // in memory, such as if it is on the callstack.
	  if (isScalarFPTypeInSSEReg(TheVT)) {
	    // The value lives in an SSE register: spill it to the stack slot and
	    // reload it with an X86ISD::FLD so the FIST below can consume it.
	    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
	    SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
	    SDValue Ops[] = { Chain, StackSlot };

	    unsigned FLDSize = TheVT.getStoreSize();
	    assert(FLDSize <= MemSize && "Stack slot not big enough");
	    MachineMemOperand *MMO = MF.getMachineMemOperand(
	        MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
	    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
	    Chain = Value.getValue(1);
	  }

	  // Build the FP_TO_INT*_IN_MEM
	  MachineMemOperand *MMO = MF.getMachineMemOperand(
	      MPI, MachineMemOperand::MOStore, MemSize, MemSize);
	  SDValue Ops[] = { Chain, Value, StackSlot };
	  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
	                                         DAG.getVTList(MVT::Other),
	                                         Ops, DstTy, MMO);

	  // Load the result with the original result type of Op; for the promoted
	  // fp-to-uint32 case this reads an i32 from the start of the 64-bit slot.
	  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);

	  // If we need an unsigned fixup, XOR the result with adjust.
	  if (UnsignedFixup)
	    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

	  return Res;
	}

	// Lower a vector ANY_EXTEND / ZERO_EXTEND for AVX targets.
	//
	// - A v8i8 source is only handled for v8i8 -> v8i64 under
	//   ExperimentalVectorWideningLegalization; any other v8i8 extension returns
	//   an empty SDValue().
	// - With Int256 (AVX2) available, Op is returned unchanged.
	// - Otherwise the extension is built from two half-width pieces (an
	//   *_EXTEND_VECTOR_INREG for the low half and an unpack-high for the high
	//   half) that are concatenated back to VT.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);
	  unsigned Opc = Op.getOpcode();

	  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
	  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
	         "Unexpected extension opcode");
	  // The result must have as many elements as the *input*; comparing VT with
	  // itself would make this check vacuous.
	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Expected same number of elements");
	  assert((VT.getVectorElementType() == MVT::i16 ||
	          VT.getVectorElementType() == MVT::i32 ||
	          VT.getVectorElementType() == MVT::i64) &&
	         "Unexpected element type");
	  assert((InVT.getVectorElementType() == MVT::i8 ||
	          InVT.getVectorElementType() == MVT::i16 ||
	          InVT.getVectorElementType() == MVT::i32) &&
	         "Unexpected element type");

	  unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);

	  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
	  if (InVT == MVT::v8i8) {
	    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
	      return SDValue();

	    // Pad the source to v16i8 with undef and extend the low 8 elements.
	    In = DAG.getNode(ISD::CONCAT_VECTORS, dl,
	                     MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
	    return DAG.getNode(ExtendInVecOpc, dl, VT, In);
	  }

	  if (Subtarget.hasInt256())
	    return Op;

	  // Optimize vectors in AVX mode:
	  //
	  //   v8i16 -> v8i32
	  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
	  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
	  //   Concat upper and lower parts.
	  //
	  //   v4i32 -> v4i64
	  //   Use vpmovzdq for 2 lower elements  v4i32 -> v2i64.
	  //   Use vpunpckhdq for 2 upper elements  v4i32 -> v2i64.
	  //   Concat upper and lower parts.
	  //

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);

	  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

	  // Short-circuit if we can determine that each 128-bit half is the same value.
	  // Otherwise, this is difficult to match and optimize.
	  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
	    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
	      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

	  // Interleave the high input elements with zero (ZERO_EXTEND) or undef
	  // (ANY_EXTEND) and reinterpret the result as the high half of the output.
	  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
	  SDValue Undef = DAG.getUNDEF(InVT);
	  bool NeedZero = Opc == ISD::ZERO_EXTEND;
	  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
	  OpHi = DAG.getBitcast(HalfVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	// Extend a v16i1 mask to v16i8 or v16i16 without going through a 16 x i32
	// vector: each v8i1 half is extended to v8i16 with ExtOpc, the two halves are
	// concatenated into a v16i16 and the result is truncated to VT.
	static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");

	  // Pulls the v8i1 subvector starting at element FirstElt out of the mask.
	  auto ExtractHalf = [&](unsigned FirstElt) {
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
	                       DAG.getIntPtrConstant(FirstElt, dl));
	  };

	  SDValue LoMask = ExtractHalf(0);
	  SDValue HiMask = ExtractHalf(8);

	  SDValue LoExt = DAG.getNode(ExtOpc, dl, MVT::v8i16, LoMask);
	  SDValue HiExt = DAG.getNode(ExtOpc, dl, MVT::v8i16, HiMask);

	  SDValue Wide =
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, LoExt, HiExt);
	  return DAG.getNode(ISD::TRUNCATE, dl, VT, Wide);
	}

	// Lower a ZERO_EXTEND whose source is a vector of i1 (asserted).
	//
	// Non-i8 results are produced as sign_extend + logical shift right. i8
	// results are produced as select(mask, 1, 0), optionally computed in a wider
	// element type (no BWI) and/or a 512-bit vector (no VLX) and then narrowed
	// back to the requested type.
	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  MVT ResVT = Op->getSimpleValueType(0);
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	  SDLoc DL(Op);
	  unsigned EltCount = ResVT.getVectorNumElements();

	  // Everything except vXi8 can be done as a sign_extend followed by a shift
	  // that leaves only the low bit set, which avoids a constant pool load.
	  const bool IsByteResult = ResVT.getVectorElementType() == MVT::i8;
	  if (!IsByteResult) {
	    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ResVT, Mask);
	    return DAG.getNode(
	        ISD::SRL, DL, ResVT, SExt,
	        DAG.getConstant(ResVT.getScalarSizeInBits() - 1, DL, ResVT));
	  }

	  // Without BWI the select is done on i32 elements instead of i8.
	  MVT ExtVT = ResVT;
	  if (!Subtarget.hasBWI()) {
	    // When v16i32 has to be avoided, split the mask and concatenate instead.
	    if (EltCount == 16 && !Subtarget.canExtendTo512DQ())
	      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, ResVT, Mask, DL, DAG);

	    ExtVT = MVT::getVectorVT(MVT::i32, EltCount);
	  }

	  // Without VLX the operation has to be performed on a 512-bit vector, so
	  // pad the mask with undef elements up to the matching element count.
	  MVT WideVT = ExtVT;
	  const bool MustWiden = !ExtVT.is512BitVector() && !Subtarget.hasVLX();
	  if (MustWiden) {
	    EltCount *= 512 / ExtVT.getSizeInBits();
	    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MaskVT,
	                       DAG.getUNDEF(MaskVT), Mask,
	                       DAG.getIntPtrConstant(0, DL));
	    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), EltCount);
	  }

	  SDValue One = DAG.getConstant(1, DL, WideVT);
	  SDValue Zero = DAG.getConstant(0, DL, WideVT);

	  SDValue Result = DAG.getSelect(DL, WideVT, Mask, One, Zero);

	  // Narrow the elements back to i8 if they were extended above.
	  if (ResVT != ExtVT) {
	    WideVT = MVT::getVectorVT(MVT::i8, EltCount);
	    Result = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Result);
	  }

	  // Drop the padding elements again if the vector was widened.
	  if (WideVT != ResVT)
	    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Result,
	                         DAG.getIntPtrConstant(0, DL));

	  return Result;
	}

	// Custom lowering entry point for ZERO_EXTEND: sources with i1 elements are
	// handed to the mask lowering, everything else (AVX asserted) to the AVX
	// extension lowering.
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

	  if (!IsMaskSource) {
	    assert(Subtarget.hasAVX() && "Expected AVX support");
	    return LowerAVXExtend(Op, DAG, Subtarget);
	  }

	  return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
	}

	/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
	/// It makes use of the fact that vectors with enough leading sign/zero bits
	/// prevent the PACKSS/PACKUS from saturating the results.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
	/// within each 128-bit lane.
	///
	/// This helper does not itself inspect the sign/zero bits of \p In.
	///
	/// \param Opcode X86ISD::PACKSS or X86ISD::PACKUS (asserted).
	/// \param DstVT  Vector type to truncate to (asserted to be a vector).
	/// \param In     Vector value to truncate.
	/// \returns \p In itself when it already has type \p DstVT; an empty SDValue
	/// when the truncation is not handled here (no SSE2, destination size not a
	/// multiple of 64 bits, source size not a multiple of 128 bits, or a
	/// non-power-of-2 element count); otherwise the truncated value.
	static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
	const SDLoc &DL, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert((Opcode == X86ISD::PACKSS \|\| Opcode == X86ISD::PACKUS) &&
	"Unexpected PACK opcode");
	assert(DstVT.isVector() && "VT not a vector?");

	// Requires SSE2 but AVX512 has fast vector truncate.
	// NOTE(review): only SSE2 is checked here; there is no AVX512 check in this
	// function, so presumably callers handle that case -- confirm.
	if (!Subtarget.hasSSE2())
	return SDValue();

	EVT SrcVT = In.getValueType();

	// No truncation required, we might get here due to recursive calls.
	if (SrcVT == DstVT)
	return In;

	// We only support vector truncation to 64bits or greater from a
	// 128bits or greater source.
	unsigned DstSizeInBits = DstVT.getSizeInBits();
	unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	if ((DstSizeInBits % 64) != 0 \|\| (SrcSizeInBits % 128) != 0)
	return SDValue();

	unsigned NumElems = SrcVT.getVectorNumElements();
	if (!isPowerOf2_32(NumElems))
	return SDValue();

	LLVMContext &Ctx = *DAG.getContext();
	assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

	// Scalar type with half the source element width; element type of the
	// intermediate vectors handed to the recursive calls below.
	EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

	// Pack to the largest type possible:
	// vXi64/vXi32 -> PACKSDW and vXi16 -> PACKSWB.
	// The i32 -> i16 form is only selected for PACKSS, or for PACKUS when SSE4.1
	// is available; otherwise the i16 -> i8 form is used.
	EVT InVT = MVT::i16, OutVT = MVT::i8;
	if (SrcVT.getScalarSizeInBits() > 16 &&
	(Opcode == X86ISD::PACKSS \|\| Subtarget.hasSSE41())) {
	InVT = MVT::i32;
	OutVT = MVT::i16;
	}

	// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
	if (SrcVT.is128BitVector()) {
	InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
	OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
	In = DAG.getBitcast(InVT, In);
	// Pack the source with itself and keep only the low 64 bits of the result.
	SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
	Res = extractSubVector(Res, 0, DAG, DL, 64);
	return DAG.getBitcast(DstVT, Res);
	}

	// Extract lower/upper subvectors.
	unsigned NumSubElts = NumElems / 2;
	SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
	SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

	// From here on InVT/OutVT are vector types spanning one half of the source.
	unsigned SubSizeInBits = SrcSizeInBits / 2;
	InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
	OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

	// 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
	if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
	Lo = DAG.getBitcast(InVT, Lo);
	Hi = DAG.getBitcast(InVT, Hi);
	SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
	return DAG.getBitcast(DstVT, Res);
	}

	// AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
	// AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
	if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	Lo = DAG.getBitcast(InVT, Lo);
	Hi = DAG.getBitcast(InVT, Hi);
	SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

	// 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
	SmallVector<int, 64> Mask;
	int Scale = 64 / OutVT.getScalarSizeInBits();
	scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
	Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

	if (DstVT.is256BitVector())
	return DAG.getBitcast(DstVT, Res);

	// If 512bit -> 128bit truncate another stage.
	EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	Res = DAG.getBitcast(PackedVT, Res);
	return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	// Recursively pack lower/upper subvectors, concat result and pack again.
	assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
	EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
	Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
	Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

	// Rejoin the two half-truncated pieces; the final call either returns the
	// concatenation unchanged (it already has type DstVT) or truncates further.
	PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	// Lower a vector TRUNCATE whose result element type is i1 (asserted).
	//
	// The low bit of every source element is shifted into the sign-bit position
	// (the shift is skipped when ComputeNumSignBits reports that all bits of each
	// element already equal the sign bit), and the mask is then produced by a
	// SETCC against zero. Sources with elements of 16 bits or fewer are, without
	// BWI, first sign-extended to wider elements, or split into two halves that
	// are truncated separately and concatenated.
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue In = Op.getOperand(0);
	MVT InVT = In.getSimpleValueType();

	assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	// Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	// ShiftInx is the shift amount that moves bit 0 into the sign bit; it is
	// recomputed below if the input is extended to wider elements.
	unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	if (InVT.getScalarSizeInBits() <= 16) {
	if (Subtarget.hasBWI()) {
	// legal, will go to VPMOVB2M, VPMOVW2M
	if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	// We need to shift to get the lsb into sign position.
	// Shift packed bytes not supported natively, bitcast to word
	MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	In = DAG.getNode(ISD::SHL, DL, ExtVT,
	DAG.getBitcast(ExtVT, In),
	DAG.getConstant(ShiftInx, DL, ExtVT));
	In = DAG.getBitcast(InVT, In);
	}
	// 0 > In is true exactly for the elements whose sign bit is set.
	return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
	In, ISD::SETGT);
	}
	// Use TESTD/Q, extended vector to packed dword/qword.
	assert((InVT.is256BitVector() \|\| InVT.is128BitVector()) &&
	"Unexpected vector type.");
	unsigned NumElts = InVT.getVectorNumElements();
	assert((NumElts == 8 \|\| NumElts == 16) && "Unexpected number of elements");
	// We need to change to a wider element type that we have support for.
	// For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
	// For 16 element vectors we extend to v16i32 unless we are explicitly
	// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
	// we need to split into two 8 element vectors which we can extend to v8i32,
	// truncate and concat the results. There's an additional complication if
	// the original type is v16i8. In that case we can't split the v16i8 so
	// first we pre-extend it to v16i16 which we can split to v8i16, then extend
	// to v8i32, truncate that to v8i1 and concat the two halves.
	if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
	if (InVT == MVT::v16i8) {
	// First we need to sign extend up to 256-bits so we can split that.
	InVT = MVT::v16i16;
	In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
	}
	SDValue Lo = extract128BitVector(In, 0, DAG, DL);
	SDValue Hi = extract128BitVector(In, 8, DAG, DL);
	// We're split now, just emit two truncates and a concat. The two
	// truncates will trigger legalization to come back to this function.
	Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
	Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}
	// We either have 8 elements or we're allowed to use 512-bit vectors.
	// If we have VLX, we want to use the narrowest vector that can get the
	// job done so we use vXi32.
	// Without VLX the element width is chosen so the vector is 512 bits wide.
	MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
	MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
	In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	InVT = ExtVT;
	ShiftInx = InVT.getScalarSizeInBits() - 1;
	}

	if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	// We need to shift to get the lsb into sign position.
	In = DAG.getNode(ISD::SHL, DL, InVT, In,
	DAG.getConstant(ShiftInx, DL, InVT));
	}
	// If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
	if (Subtarget.hasDQI())
	return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
	// Otherwise compare the (shifted) value against zero for inequality.
	return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
	}

	// Custom lowering for a vector TRUNCATE (source and result are asserted to
	// have the same number of elements).
	//
	// Returns an empty SDValue when the input type is not legal, Op itself when
	// the AVX512 path decides the node can be kept as is, and otherwise a
	// replacement built, in order of preference, from: the vXi1 mask lowering,
	// PACKUS/PACKSS when enough leading zero/sign bits are known, dedicated
	// shuffle sequences for v4i64->v4i32, v8i32->v8i16 and v16i16->v16i8, and
	// finally a generic 256-bit to 128-bit shuffle + extract.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	SDLoc DL(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue In = Op.getOperand(0);
	MVT InVT = In.getSimpleValueType();
	unsigned InNumEltBits = InVT.getScalarSizeInBits();

	assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	"Invalid TRUNCATE operation");

	// If called by the legalizer just return.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
	return SDValue();

	if (VT.getVectorElementType() == MVT::i1)
	return LowerTruncateVecI1(Op, DAG, Subtarget);

	// vpmovqb/w/d, vpmovdb/w, vpmovwb
	if (Subtarget.hasAVX512()) {
	// word to byte only under BWI. Otherwise we have to promote to v16i32
	// and then truncate that. But we should only do that if we haven't been
	// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
	// handled by isel patterns.
	if (InVT != MVT::v16i16 \|\| Subtarget.hasBWI() \|\|
	Subtarget.canExtendTo512DQ())
	return Op;
	}

	// Number of low bits per element that a PACKSS / PACKUS based truncation
	// may leave unconstrained (at most 16; only 8 for PACKUS before SSE4.1).
	unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
	unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

	// Truncate with PACKUS if we are truncating a vector with leading zero bits
	// that extend all the way to the packed/truncated value.
	// Pre-SSE41 we can only use PACKUSWB.
	KnownBits Known = DAG.computeKnownBits(In);
	if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
	if (SDValue V =
	truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
	return V;

	// Truncate with PACKSS if we are truncating a vector with sign-bits that
	// extend all the way to the packed/truncated value.
	if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
	if (SDValue V =
	truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
	return V;

	if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	// On AVX2, v4i64 -> v4i32 becomes VPERMD.
	if (Subtarget.hasInt256()) {
	// Gather the even (low) i32 of every i64 into the low 128 bits.
	static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	In = DAG.getBitcast(MVT::v8i32, In);
	In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	DAG.getIntPtrConstant(0, DL));
	}

	// Without AVX2: split into two v2i64 halves, view them as v4i32 and pick
	// the even elements of both with a two-input shuffle.
	SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(0, DL));
	SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(2, DL));
	OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	static const int ShufMask[] = {0, 2, 4, 6};
	return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	}

	if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	if (Subtarget.hasInt256()) {
	In = DAG.getBitcast(MVT::v32i8, In);

	// The PSHUFB mask:
	// (keeps the two low bytes of every i32 in the low 8 bytes of each
	// 128-bit half; the remaining bytes are undef)
	static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
	-1, -1, -1, -1, -1, -1, -1, -1,
	16, 17, 20, 21, 24, 25, 28, 29,
	-1, -1, -1, -1, -1, -1, -1, -1 };
	In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	In = DAG.getBitcast(MVT::v4i64, In);

	// Move the two populated 64-bit chunks next to each other.
	static const int ShufMask2[] = {0, 2, -1, -1};
	In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getBitcast(VT, In);
	}

	SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	DAG.getIntPtrConstant(0, DL));

	SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	DAG.getIntPtrConstant(4, DL));

	OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	// The PSHUFB mask:
	static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
	-1, -1, -1, -1, -1, -1, -1, -1};

	OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	// The MOVLHPS Mask:
	// (low 64 bits of OpLo followed by low 64 bits of OpHi)
	static const int ShufMask2[] = {0, 1, 4, 5};
	SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	return DAG.getBitcast(MVT::v8i16, res);
	}

	if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
	// Use an AND to zero upper bits for PACKUS.
	In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));

	SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
	DAG.getIntPtrConstant(0, DL));
	SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
	DAG.getIntPtrConstant(8, DL));
	return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
	}

	// Handle truncation of V256 to V128 using shuffles.
	assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

	assert(Subtarget.hasAVX() && "256-bit vector without AVX!");

	// View the input as twice as many elements of the result element type,
	// shuffle the even-indexed ones to the front and extract the low half.
	unsigned NumElems = VT.getVectorNumElements();
	MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

	SmallVector<int, 16> MaskVec(NumElems * 2, -1);
	// Prepare truncation shuffle mask
	for (unsigned i = 0; i != NumElems; ++i)
	MaskVec[i] = i * 2;
	In = DAG.getBitcast(NVT, In);
	SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
	DAG.getIntPtrConstant(0, DL));
	}

	// Custom lowering for FP_TO_SINT / FP_TO_UINT.
	//
	// Vector results are only handled for v2f64 -> v2i1 and (with AVX512DQ+VL
	// asserted) v2f32 -> v2i64; any other vector conversion yields an empty
	// SDValue. Scalar conversions are either reported as legal by returning Op,
	// left to the default expansion (empty SDValue), promoted from i16 to i32, or
	// lowered through x87 by FP_TO_INTHelper.
	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();
	  const bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;

	  if (VT.isVector()) {
	    const bool IsV2F64ToV2I1 = VT == MVT::v2i1 && SrcVT == MVT::v2f64;

	    if (!IsV2F64ToV2I1) {
	      assert(Subtarget.hasDQI() && Subtarget.hasVLX() &&
	             "Requires AVX512DQVL!");
	      if (VT != MVT::v2i64 || SrcVT != MVT::v2f32)
	        return SDValue();

	      // Pad the v2f32 source to v4f32 with undef and convert with the
	      // X86-specific truncating conversion node.
	      SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                   DAG.getUNDEF(MVT::v2f32));
	      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
	                         VT, Padded);
	    }

	    // v2f64 -> v2i1: convert to i32 elements, truncate those to i1 and keep
	    // the low two elements.
	    MVT IntVT = MVT::v4i32;
	    MVT MaskVT = MVT::v4i1;
	    unsigned CvtOpc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	    if (!IsSigned && !Subtarget.hasVLX()) {
	      // Unsigned without VLX: widen the source to 512 bits and use the
	      // generic FP_TO_UINT on v8f64.
	      IntVT = MVT::v8i32;
	      MaskVT = MVT::v8i1;
	      CvtOpc = ISD::FP_TO_UINT;
	      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                        DAG.getUNDEF(MVT::v8f64),
	                        Src, DAG.getIntPtrConstant(0, dl));
	    }
	    SDValue Converted = DAG.getNode(CvtOpc, dl, IntVT, Src);
	    SDValue Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskVT, Converted);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Mask,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(!VT.isVector());

	  const bool SrcInSSEReg = isScalarFPTypeInSSEReg(SrcVT);

	  if (Subtarget.hasAVX512() && !IsSigned) {
	    // Unsigned conversions from f32/f64 should be legal.
	    if (SrcInSSEReg)
	      return Op;

	    // Use default expansion.
	    if (VT == MVT::i64)
	      return SDValue();
	  }

	  // Promote i16 to i32 if we can use a SSE operation.
	  if (SrcInSSEReg && VT == MVT::i16) {
	    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
	    SDValue Wide = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Wide);
	  }

	  // If this is a FP_TO_SINT using SSEReg we're done.
	  if (IsSigned && SrcInSSEReg)
	    return Op;

	  // Fall back to X87.
	  SDValue X87Result = FP_TO_INTHelper(Op, DAG, IsSigned);
	  if (X87Result)
	    return X87Result;

	  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
	}

	// Custom lowering for FP_EXTEND of a v2f32 source (asserted): the source is
	// padded to v4f32 with undef elements and extended with X86ISD::VFPEXT.
	static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

	  SDLoc DL(Op);
	  SDValue Padded = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
	                               DAG.getUNDEF(SrcVT));
	  return DAG.getNode(X86ISD::VFPEXT, DL, Op.getSimpleValueType(), Padded);
	}

	/// Decide whether a horizontal vector math instruction should be emitted.
	/// Horizontal ops may be slower than normal math with shuffles, so they are
	/// only used when the operation has more than one source, when the function
	/// is optimized for size, or when the subtarget reports fast horizontal ops.
	static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  if (!IsSingleSource)
	    return true;

	  if (DAG.getMachineFunction().getFunction().hasOptSize())
	    return true;

	  return Subtarget.hasFastHorizontalOps();
	}

	/// Depending on uarch and/or optimizing for size, we might prefer to use a
	/// vector operation in place of the typical scalar operation.
	///
	/// Tries to turn a scalar add/sub/fadd/fsub of two adjacent elements
	/// extracted from the same vector into an extract from a horizontal op on
	/// that vector.
	/// \returns \p Op unchanged whenever the pattern does not match or the
	/// transform is not considered worthwhile; otherwise an EXTRACT_VECTOR_ELT
	/// of the new horizontal op.
	static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// If both operands have other uses, this is probably not profitable.
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	if (!LHS.hasOneUse() && !RHS.hasOneUse())
	return Op;

	// FP horizontal add/sub were added with SSE3. Integer with SSSE3.
	bool IsFP = Op.getSimpleValueType().isFloatingPoint();
	if (IsFP && !Subtarget.hasSSE3())
	return Op;
	if (!IsFP && !Subtarget.hasSSSE3())
	return Op;

	// Extract from a common vector.
	// Both operands must be constant-index extracts from the same vector, and
	// the single-source horizontal op must be considered profitable.
	if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	LHS.getOperand(0) != RHS.getOperand(0) \|\|
	!isa<ConstantSDNode>(LHS.getOperand(1)) \|\|
	!isa<ConstantSDNode>(RHS.getOperand(1)) \|\|
	!shouldUseHorizontalOp(true, DAG, Subtarget))
	return Op;

	// Allow commuted 'hadd' ops.
	// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
	unsigned HOpcode;
	switch (Op.getOpcode()) {
	case ISD::ADD: HOpcode = X86ISD::HADD; break;
	case ISD::SUB: HOpcode = X86ISD::HSUB; break;
	case ISD::FADD: HOpcode = X86ISD::FHADD; break;
	case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
	default:
	llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
	}
	unsigned LExtIndex = LHS.getConstantOperandVal(1);
	unsigned RExtIndex = RHS.getConstantOperandVal(1);
	// For the add forms only, (odd, even) operand order is normalized to
	// (even, odd) by swapping the two indices.
	if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
	(HOpcode == X86ISD::HADD \|\| HOpcode == X86ISD::FHADD))
	std::swap(LExtIndex, RExtIndex);

	// Require an even left index immediately followed by the right index.
	if ((LExtIndex & 1) != 0 \|\| RExtIndex != (LExtIndex + 1))
	return Op;

	SDValue X = LHS.getOperand(0);
	EVT VecVT = X.getValueType();
	unsigned BitWidth = VecVT.getSizeInBits();
	unsigned NumLanes = BitWidth / 128;
	unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
	assert((BitWidth == 128 \|\| BitWidth == 256 \|\| BitWidth == 512) &&
	"Not expecting illegal vector widths here");

	// Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
	// equivalent, so extract the 256/512-bit source op to 128-bit if we can.
	SDLoc DL(Op);
	if (BitWidth == 256 \|\| BitWidth == 512) {
	// Narrow X to the 128-bit lane holding the element pair and rebase the
	// index so it is relative to that lane.
	unsigned LaneIdx = LExtIndex / NumEltsPerLane;
	X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
	LExtIndex %= NumEltsPerLane;
	}

	// add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
	// add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
	// add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
	// sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
	SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
	DAG.getIntPtrConstant(LExtIndex / 2, DL));
	}

	/// Custom lowering hook for scalar f32/f64 (asserted) operations: forwards
	/// to lowerAddSubToHorizontalOp, which may replace the scalar operation with
	/// a horizontal vector operation depending on uarch and/or size optimization.
	static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  EVT ScalarVT = Op.getValueType();
	  (void)ScalarVT; // Only read by the assert below.
	  assert((ScalarVT == MVT::f32 || ScalarVT == MVT::f64) &&
	         "Only expecting float/double");

	  return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
	}

	/// Custom-lower ISD::FABS and ISD::FNEG to an X86 FP logic op against a
	/// constant mask. The two opcodes differ only in the mask and the logic op:
	/// FABS uses FAND with 0x7f..., FNEG uses FXOR with 0x80.... FNEG(FABS(x)) is
	/// folded into a single FOR of x with the sign mask.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	  const unsigned Opc = Op.getOpcode();
	  assert((Opc == ISD::FABS || Opc == ISD::FNEG) &&
	         "Wrong opcode for lowering FABS or FNEG.");
	  const bool IsFABS = Opc == ISD::FABS;

	  // A FABS that feeds an FNEG is returned untouched so the pair can be folded
	  // into an FNABS. The FABS is lowered afterwards if it is still in use.
	  if (IsFABS) {
	    for (SDNode *User : Op->uses()) {
	      if (User->getOpcode() == ISD::FNEG)
	        return Op;
	    }
	  }

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  const bool IsF128 = VT == MVT::f128;
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFABSorFNEG");

	  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	  // decide if we should generate a 16-byte constant mask when we only need 4 or
	  // 8 bytes for the scalar case.

	  // SSE/AVX have no scalar bitwise logic instructions, so f32/f64 are handled
	  // as a "fake" 16-byte vector. The 16-byte mask lets the mask load fold into
	  // the logic op, which can save (~4 bytes) of code size.
	  const bool IsFakeVector = !(VT.isVector() || IsF128);
	  MVT LogicVT = VT;
	  if (IsFakeVector) {
	    if (VT == MVT::f64)
	      LogicVT = MVT::v2f64;
	    else
	      LogicVT = MVT::v4f32;
	  }

	  // Mask element: everything but the sign bit for FABS, only the sign bit for
	  // FNEG.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  APInt MaskElt = APInt::getSignMask(EltBits);
	  if (IsFABS)
	    MaskElt = APInt::getSignedMaxValue(EltBits);
	  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
	  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	  SDValue Src = Op.getOperand(0);
	  const bool IsFNABS = !IsFABS && Src.getOpcode() == ISD::FABS;
	  unsigned LogicOp = X86ISD::FXOR;
	  if (IsFABS)
	    LogicOp = X86ISD::FAND;
	  else if (IsFNABS)
	    LogicOp = X86ISD::FOR;
	  // For FNABS, operate directly on the input of the inner FABS.
	  SDValue Operand = IsFNABS ? Src.getOperand(0) : Src;

	  // Real vectors and f128 take the logic op directly.
	  if (!IsFakeVector)
	    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	  // Scalar case: widen to the 128-bit vector type, apply the logic op, and
	  // pull element 0 back out.
	  SDValue VecOperand =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, VecOperand, Mask);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom-lower FCOPYSIGN(Mag, Sign) as
	///   (Mag & ~SignMask) | (Sign & SignMask)
	/// using the X86 FP logic nodes. f32/f64 are processed as 16-byte vectors and
	/// element 0 is extracted at the end.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sign = Op.getOperand(1);

	  // Bring the sign operand to the result type: extend it if it is narrower,
	  // round it if it is wider.
	  const MVT SignVT = Sign.getSimpleValueType();
	  if (SignVT.bitsLT(VT))
	    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
	  else if (SignVT.bitsGT(VT))
	    Sign =
	        DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	  // At this point the operands and the result should have the same
	  // type, and that won't be f80 since that is not custom lowered.
	  const bool IsF128 = VT == MVT::f128;
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFCOPYSIGN");

	  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);

	  // SSE has no scalar FP logic instructions, so scalar operations are carried
	  // out on 16-byte vectors.
	  // TODO: This isn't necessary. If we used scalar types, we might avoid some
	  // unnecessary splats, but we might miss load folding opportunities. Should
	  // this decision be based on OptimizeForSize?
	  const bool IsFakeVector = !(VT.isVector() || IsF128);
	  MVT LogicVT = VT;
	  if (IsFakeVector) {
	    if (VT == MVT::f64)
	      LogicVT = MVT::v2f64;
	    else
	      LogicVT = MVT::v4f32;
	  }

	  // The mask constants are automatically splatted for vector types.
	  const unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  const APInt SignBitOnly = APInt::getSignMask(EltSizeInBits);
	  SDValue SignMask = DAG.getConstantFP(APFloat(Sem, SignBitOnly), dl, LogicVT);
	  const APInt AllButSign = APInt::getSignedMaxValue(EltSizeInBits);
	  SDValue MagMask = DAG.getConstantFP(APFloat(Sem, AllButSign), dl, LogicVT);

	  // Isolate the sign bit of the second operand.
	  if (IsFakeVector)
	    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	  // Strip the sign bit from the first operand (magnitude).
	  // TODO: If we had general constant folding for FP logic ops, this check
	  // wouldn't be necessary.
	  SDValue MagBits;
	  ConstantFPSDNode *MagC = isConstOrConstSplatFP(Mag);
	  if (MagC) {
	    // Constant magnitude: clear the sign directly on the constant.
	    APFloat Cleared = MagC->getValueAPF();
	    Cleared.clearSign();
	    MagBits = DAG.getConstantFP(Cleared, dl, LogicVT);
	  } else {
	    // Non-constant magnitude: AND out the sign bit.
	    if (IsFakeVector)
	      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	  }

	  // Merge magnitude and sign.
	  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	  if (!IsFakeVector)
	    return Or;
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Lower ISD::FGETSIGN of an f32/f64 to (AND (X86ISD::MOVMSK ...) 1): the
	/// scalar is placed in a 128-bit vector, MOVMSK produces an i32, and bit 0 of
	/// that value (resized to the result type) is returned.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  MVT ResVT = Op.getSimpleValueType();
	  MVT SrcVT = Src.getSimpleValueType();
	  assert((SrcVT == MVT::f32 || SrcVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  MVT VecVT = MVT::v2f64;
	  if (SrcVT == MVT::f32)
	    VecVT = MVT::v4f32;

	  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Src);
	  SDValue SignBits = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Vec);
	  SDValue Resized = DAG.getZExtOrTrunc(SignBits, dl, ResVT);
	  return DAG.getNode(ISD::AND, dl, ResVT, Resized,
	                     DAG.getConstant(1, dl, ResVT));
	}

	/// Build an i8 X86ISD::SETCC node that tests condition \p Cond (passed as an
	/// i8 constant) against the flags value \p EFLAGS.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  SDValue CondImm = DAG.getConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondImm, EFLAGS);
	}

	/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
	/// style scalarized (associative) reduction patterns.
	///
	/// \param Op     Root of the tree; its opcode must be \p BinOp.
	/// \param BinOp  The associative binary opcode forming the tree.
	/// \param SrcOps Receives each distinct source vector, in discovery order.
	///               It may have been appended to even when false is returned.
	/// \returns true iff every leaf of the \p BinOp tree is an
	/// EXTRACT_VECTOR_ELT with a constant index, all source vectors share one
	/// type, and every element of every source vector is extracted exactly once.
	static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
	                                SmallVectorImpl<SDValue> &SrcOps) {
	  assert(Op.getOpcode() == unsigned(BinOp) &&
	         "Unexpected bit reduction opcode");

	  // Worklist of tree operands still to be classified, and a per-source bitmap
	  // of which elements have been seen so far.
	  SmallVector<SDValue, 8> Opnds;
	  DenseMap<SDValue, APInt> SrcOpMap;
	  Opnds.push_back(Op.getOperand(0));
	  Opnds.push_back(Op.getOperand(1));

	  // BFS over all BinOp operands. The worklist grows while it is walked, so the
	  // bound is re-read on every iteration.
	  for (unsigned Slot = 0; Slot < Opnds.size(); ++Slot) {
	    // Take a copy of the operand: the push_back calls below may reallocate
	    // Opnds, which would invalidate any iterator or reference into it.
	    SDValue V = Opnds[Slot];
	    if (V.getOpcode() == unsigned(BinOp)) {
	      Opnds.push_back(V.getOperand(0));
	      Opnds.push_back(V.getOperand(1));
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return false;

	    // Quit if without a constant index.
	    SDValue Idx = V.getOperand(1);
	    if (!isa<ConstantSDNode>(Idx))
	      return false;

	    SDValue Src = V.getOperand(0);
	    DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
	    if (M == SrcOpMap.end()) {
	      EVT VT = Src.getValueType();
	      // Quit if not the same type.
	      if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
	        return false;
	      unsigned NumElts = VT.getVectorNumElements();
	      APInt EltCount = APInt::getNullValue(NumElts);
	      M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
	      SrcOps.push_back(Src);
	    }

	    // Quit if the index is out of range (the bitmap lookup would otherwise be
	    // out of bounds) or if the element was already used.
	    uint64_t CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
	    if (CIdx >= M->second.getBitWidth())
	      return false;
	    if (M->second[CIdx])
	      return false;
	    M->second.setBit(CIdx);
	  }

	  // Quit if not all elements are used.
	  for (const auto &Entry : SrcOpMap) {
	    if (!Entry.second.isAllOnesValue())
	      return false;
	  }

	  return true;
	}

	// Check whether an OR'd tree is PTEST-able. On success returns an
	// X86ISD::PTEST of the combined source vectors against themselves and stores
	// the condition code to use in X86CC; otherwise returns an empty SDValue.
	static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG, SDValue &X86CC) {
	  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

	  // Requires SSE4.1 and a root with a single use.
	  if (!(Subtarget.hasSSE41() && Op->hasOneUse()))
	    return SDValue();

	  SmallVector<SDValue, 8> VecIns;
	  if (!matchBitOpReduction(Op, ISD::OR, VecIns))
	    return SDValue();

	  // Only 128-bit and 256-bit source vectors are handled.
	  EVT VT = VecIns[0].getValueType();
	  const bool Is128 = VT.is128BitVector();
	  if (!Is128 && !VT.is256BitVector())
	    return SDValue();

	  SDLoc DL(Op);
	  MVT TestVT = Is128 ? MVT::v2i64 : MVT::v4i64;

	  // Cast all vectors into TestVT for PTEST.
	  for (SDValue &Vec : VecIns)
	    Vec = DAG.getBitcast(TestVT, Vec);

	  // With more than one source vector, OR them together pairwise: each step
	  // consumes two entries and appends their OR, until a single unconsumed
	  // entry (the OR of everything) remains at the back.
	  unsigned Next = 0;
	  while (VecIns.size() - Next > 1) {
	    SDValue LHS = VecIns[Next];
	    SDValue RHS = VecIns[Next + 1];
	    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
	    Next += 2;
	  }

	  X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
	                          MVT::i8);
	  SDValue Combined = VecIns.back();
	  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, Combined, Combined);
	}

	/// Return true if \c Op has a use that doesn't just read flags.
	///
	/// A user counts as flags-only when it is a BRCOND, a SETCC, or a SELECT that
	/// uses the value as operand 0. A single-use TRUNCATE is looked through, and
	/// the check is applied to the truncate's own user instead.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    // The use iterator dereferences to the using node; keep a pointer to it.
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look past the truncate. The operand number must be read before User
	      // is replaced by the truncate's user.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	///
	/// \param Op     Value to be tested against zero.
	/// \param X86CC  X86 condition code that will consume the flags; it decides
	///               whether CF and/or OF must be valid.
	/// \returns a flags value: an X86ISD::CMP of \p Op with 0, result 1 of
	/// \p Op's own node when that is already an X86ISD arithmetic node, or
	/// result 1 of a new X86ISD arithmetic node that replaces \p Op.
	/// Note: \p Subtarget is not referenced in this body.
	static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	// CF and OF aren't always set the way we want. Determine which
	// of these we need.
	bool NeedCF = false;
	bool NeedOF = false;
	switch (X86CC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	NeedCF = true;
	break;
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO: {
	// Check if we really need to set the
	// Overflow flag. If NoSignedWrap is present
	// that is not actually needed.
	switch (Op->getOpcode()) {
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	case ISD::SHL:
	// With the nsw flag, leave NeedOF false; otherwise fall through to the
	// default case and request OF.
	if (Op.getNode()->getFlags().hasNoSignedWrap())
	break;
	LLVM_FALLTHROUGH;
	default:
	NeedOF = true;
	break;
	}
	break;
	}
	}
	// See if we can use the EFLAGS value from the operand instead of
	// doing a separate TEST. TEST always sets OF and CF to 0, so unless
	// we prove that the arithmetic won't overflow, we can't use OF or CF.
	if (Op.getResNo() != 0 \|\| NeedOF \|\| NeedCF) {
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Opcode stays 0 unless an X86ISD flag-producing replacement is chosen below.
	unsigned Opcode = 0;
	unsigned NumOperands = 0;

	SDValue ArithOp = Op;

	// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	// which may be the result of a CAST. We use the variable 'Op', which is the
	// non-casted variable when we check for possible users.
	switch (ArithOp.getOpcode()) {
	case ISD::AND:
	// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	// because a TEST instruction will be better.
	if (!hasNonFlagsUse(Op))
	break;

	LLVM_FALLTHROUGH;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::OR:
	case ISD::XOR:
	// Transform to an x86-specific ALU node with flags if there is a chance of
	// using an RMW op or only the flags are used. Otherwise, leave
	// the node alone and emit a 'test' instruction.
	// Any user other than CopyToReg, SETCC or STORE jumps to default_case, which
	// leaves Opcode at 0.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	// Otherwise use a regular EFLAGS-setting instruction.
	switch (ArithOp.getOpcode()) {
	default: llvm_unreachable("unexpected operator!");
	case ISD::ADD: Opcode = X86ISD::ADD; break;
	case ISD::SUB: Opcode = X86ISD::SUB; break;
	case ISD::XOR: Opcode = X86ISD::XOR; break;
	case ISD::AND: Opcode = X86ISD::AND; break;
	case ISD::OR: Opcode = X86ISD::OR; break;
	}

	NumOperands = 2;
	break;
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::OR:
	case X86ISD::XOR:
	case X86ISD::AND:
	// Already an X86ISD arithmetic node: hand back result 1 of the same node.
	return SDValue(Op.getNode(), 1);
	default:
	default_case:
	break;
	}

	if (Opcode == 0) {
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Rebuild the operation as an X86ISD node with two results (the original
	// value type plus an i32), using the first NumOperands operands of Op.
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	// Redirect every user of the old result 0 to the new node, then return the
	// new node's result 1.
	DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
	return SDValue(New.getNode(), 1);
	}

	/// Emit nodes that will be selected as "cmp Op0,Op1", or something
	/// equivalent. The returned value is the i32 flags result of the emitted
	/// node.
	SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) const {
	  // A comparison against zero is handled by the TEST emitter.
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

	  EVT CmpVT = Op0.getValueType();

	  // Floating-point compares map straight onto X86ISD::CMP.
	  if (CmpVT.isFloatingPoint())
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

	  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
	          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");

	  // True when V is a constant that does not fit in a signed 8-bit immediate.
	  auto NeedsWideImm = [](SDValue V) {
	    auto *C = dyn_cast<ConstantSDNode>(V);
	    return C && !C->getAPIntValue().isSignedIntN(8);
	  };

	  // A 16-bit compare is promoted to i32 only when it involves an immediate
	  // that needs more than 8 bits; 16-bit immediates are to be avoided. This is
	  // skipped on Atom and for minsize functions.
	  const bool MayPromote =
	      CmpVT == MVT::i16 && !Subtarget.isAtom() &&
	      !DAG.getMachineFunction().getFunction().hasMinSize();
	  if (MayPromote && (NeedsWideImm(Op0) || NeedsWideImm(Op1))) {
	    unsigned ExtendOp =
	        isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;

	    if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
	      // For equality comparisons try to use SIGN_EXTEND if the input was
	      // truncated from something with enough sign bits. Op0 is examined
	      // first; Op1 is only considered when Op0 is not a truncate.
	      SDValue Trunc;
	      if (Op0.getOpcode() == ISD::TRUNCATE)
	        Trunc = Op0;
	      else if (Op1.getOpcode() == ISD::TRUNCATE)
	        Trunc = Op1;

	      if (Trunc.getNode()) {
	        SDValue In = Trunc.getOperand(0);
	        unsigned EffBits =
	            In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
	        if (EffBits <= 16)
	          ExtendOp = ISD::SIGN_EXTEND;
	      }
	    }

	    CmpVT = MVT::i32;
	    Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
	    Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
	  }

	  // Use SUB instead of CMP to enable CSE between SUB and CMP.
	  SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
	  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
	  return Sub.getValue(1);
	}

	/// Convert a comparison if required by the subtarget. A floating-point
	/// X86ISD::CMP on a subtarget without CMOV is rewritten so that its result is
	/// moved from FPSW into EFLAGS; any other input is returned unchanged.
	SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
	                                                 SelectionDAG &DAG) const {
	  // If the subtarget does not support the FUCOMI instruction, floating-point
	  // comparisons have to be converted.
	  const bool NeedsConversion =
	      !Subtarget.hasCMov() && Cmp.getOpcode() == X86ISD::CMP &&
	      Cmp.getOperand(0).getValueType().isFloatingPoint() &&
	      Cmp.getOperand(1).getValueType().isFloatingPoint();
	  if (!NeedsConversion)
	    return Cmp;

	  // The instruction selector will select an FUCOM instruction instead of
	  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
	  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
	  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
	  SDLoc dl(Cmp);
	  SDValue CmpAsI16 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
	  SDValue StatusWord =
	      DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, CmpAsI16);
	  SDValue ShiftAmt = DAG.getConstant(8, dl, MVT::i8);
	  SDValue HighByte =
	      DAG.getNode(ISD::SRL, dl, MVT::i16, StatusWord, ShiftAmt);
	  SDValue HighByteI8 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, HighByte);

	  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
	  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
	  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, HighByteI8);
	}

	/// Check if replacement of SQRT with RSQRT should be disabled. Returns false
	/// when an FRSQRT of the same input already exists in the DAG; otherwise
	/// reports the subtarget's fast-FSQRT property for vector or scalar types.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  const EVT VT = Op.getValueType();

	  // We never want to use both SQRT and RSQRT instructions for the same input.
	  const bool HasExistingRsqrt =
	      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op) != nullptr;
	  if (HasExistingRsqrt)
	    return false;

	  return VT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                       : Subtarget.hasFastScalarFSQRT();
	}

	/// Return a reciprocal-square-root estimate node for \p Op, or an empty
	/// SDValue when no estimate is used for its type on this subtarget.
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision),
	/// so an unspecified \p RefinementSteps is set to 1.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
	  // It is likely not profitable to do this for f64 because a double-precision
	  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
	  // instructions: convert to single, rsqrtss, convert back to double, refine
	  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.
	  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
	  // after legalize types.
	  bool HasEstimate = false;
	  if (VT == MVT::f32)
	    HasEstimate = Subtarget.hasSSE1();
	  else if (VT == MVT::v4f32)
	    HasEstimate = Reciprocal ? Subtarget.hasSSE1() : Subtarget.hasSSE2();
	  else if (VT == MVT::v8f32)
	    HasEstimate = Subtarget.hasAVX();
	  else if (VT == MVT::v16f32)
	    HasEstimate = Subtarget.useAVX512Regs();

	  if (!HasEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;
	  UseOneConstNR = false;

	  // v16f32 is lowered to RSQRT14; every other accepted type uses FRSQRT.
	  unsigned Opcode = X86ISD::FRSQRT;
	  if (VT == MVT::v16f32)
	    Opcode = X86ISD::RSQRT14;
	  return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
	}

	/// Return a reciprocal estimate node for \p Op, or an empty SDValue when no
	/// estimate is used for its type on this subtarget.
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision),
	/// so an unspecified \p RefinementSteps is set to 1.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  const EVT VT = Op.getValueType();

	  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
	  // It is likely not profitable to do this for f64 because a double-precision
	  // reciprocal estimate with refinement on x86 prior to FMA requires
	  // 15 instructions: convert to single, rcpss, convert back to double, refine
	  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.
	  bool HasEstimate = false;
	  if (VT == MVT::f32 || VT == MVT::v4f32)
	    HasEstimate = Subtarget.hasSSE1();
	  else if (VT == MVT::v8f32)
	    HasEstimate = Subtarget.hasAVX();
	  else if (VT == MVT::v16f32)
	    HasEstimate = Subtarget.useAVX512Regs();

	  if (!HasEstimate)
	    return SDValue();

	  // Enable estimate codegen with 1 refinement step for vector division.
	  // Scalar division estimates are disabled because they break too much
	  // real-world code. These defaults are intended to match GCC behavior.
	  if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;

	  // v16f32 is lowered to RCP14; every other accepted type uses FRCP.
	  unsigned Opcode = X86ISD::FRCP;
	  if (VT == MVT::v16f32)
	    Opcode = X86ISD::RCP14;
	  return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
	}

	/// Minimum number of divisions sharing one divisor before they are converted
	/// to multiplications by a reciprocal. Two is used here; this may need to be
	/// adjusted for a given CPU if a division's cost is not at least twice the
	/// cost of a multiplication, because one division is still needed to compute
	/// the reciprocal and each original division becomes a multiply by it.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinDivisionsSharingDivisor = 2;
	  return MinDivisionsSharingDivisor;
	}

	/// The result of \p And is being compared against zero; try to express that
	/// as an X86ISD::BT. On success returns the BT node and stores the condition
	/// code needed to consume it in \p X86CC. Returns an empty SDValue when no
	/// bit-test pattern is recognized.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG,
	                            SDValue &X86CC) {
	  assert(And.getOpcode() == ISD::AND && "Expected AND node!");

	  // Look through a truncate on either operand of the AND.
	  auto StripTruncate = [](SDValue V) {
	    return V.getOpcode() == ISD::TRUNCATE ? V.getOperand(0) : V;
	  };
	  SDValue LHS = StripTruncate(And.getOperand(0));
	  SDValue RHS = StripTruncate(And.getOperand(1));

	  // Put a shift, if there is one, on the left-hand side.
	  if (RHS.getOpcode() == ISD::SHL)
	    std::swap(LHS, RHS);

	  SDValue Src, BitNo;
	  if (LHS.getOpcode() == ISD::SHL) {
	    // Pattern: (1 << N) & X.
	    if (isOneConstant(LHS.getOperand(0))) {
	      // If we looked past a truncate, check that it's only truncating away
	      // known zeros.
	      const unsigned ShlBits = LHS.getValueSizeInBits();
	      const unsigned AndBits = And.getValueSizeInBits();
	      if (ShlBits > AndBits) {
	        KnownBits Known = DAG.computeKnownBits(LHS);
	        if (Known.countMinLeadingZeros() < ShlBits - AndBits)
	          return SDValue();
	      }
	      Src = RHS;
	      BitNo = LHS.getOperand(1);
	    }
	  } else if (RHS.getOpcode() == ISD::Constant) {
	    const uint64_t Imm = cast<ConstantSDNode>(RHS)->getZExtValue();
	    if (Imm == 1 && LHS.getOpcode() == ISD::SRL) {
	      // Pattern: (X >> N) & 1.
	      Src = LHS.getOperand(0);
	      BitNo = LHS.getOperand(1);
	    } else {
	      // Use BT if the immediate can't be encoded in a TEST instruction or we
	      // are optimizing for size and the immediate won't fit in a byte.
	      const bool OptForSize =
	          DAG.getMachineFunction().getFunction().hasOptSize();
	      const bool AvoidTestImm =
	          !isUInt<32>(Imm) || (OptForSize && !isUInt<8>(Imm));
	      if (AvoidTestImm && isPowerOf2_64(Imm)) {
	        Src = LHS;
	        BitNo = DAG.getConstant(Log2_64_Ceil(Imm), dl, Src.getValueType());
	      }
	    }
	  }

	  // No patterns found, give up.
	  if (!Src.getNode())
	    return SDValue();

	  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
	  // instruction. Since the shift amount is in-range-or-undefined, we know
	  // that doing a bittest on the i32 value is ok. We extend to i32 because
	  // the encoding for the i16 version is larger than the i32 version.
	  // Also promote i16 to i32 for performance / code size reason.
	  {
	    const EVT SrcVT = Src.getValueType();
	    if (SrcVT == MVT::i8 || SrcVT == MVT::i16)
	      Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
	  }

	  // See if we can use the 32-bit instruction instead of the 64-bit one for a
	  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
	  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
	  // known to be zero.
	  if (Src.getValueType() == MVT::i64) {
	    const APInt Bit5(BitNo.getValueSizeInBits(), 32);
	    if (DAG.MaskedValueIsZero(BitNo, Bit5))
	      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
	  }

	  // If the operand types disagree, extend the shift amount to match. Since
	  // BT ignores high bits (like shifts) we can use anyextend.
	  if (Src.getValueType() != BitNo.getValueType())
	    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

	  X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
	                          dl, MVT::i8);
	  return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
	}

	/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
	/// CMPs. \p Op0 and \p Op1 are swapped in place for the predicates that are
	/// expressed through their mirrored form.
	static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	                                   SDValue &Op1) {
	  // SSE Condition code mapping:
	  // 0 - EQ
	  // 1 - LT
	  // 2 - LE
	  // 3 - UNORD
	  // 4 - NEQ
	  // 5 - NLT
	  // 6 - NLE
	  // 7 - ORD
	  // SETUEQ and SETONE map to 8 and 12 respectively.
	  unsigned SSECC;
	  switch (SetCCOpcode) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    SSECC = 0;
	    break;
	  case ISD::SETOGT:
	  case ISD::SETGT:
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    SSECC = 1;
	    break;
	  case ISD::SETOGE:
	  case ISD::SETGE:
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    SSECC = 2;
	    break;
	  case ISD::SETUO:
	    SSECC = 3;
	    break;
	  case ISD::SETUNE:
	  case ISD::SETNE:
	    SSECC = 4;
	    break;
	  case ISD::SETULE:
	  case ISD::SETUGE:
	    SSECC = 5;
	    break;
	  case ISD::SETULT:
	  case ISD::SETUGT:
	    SSECC = 6;
	    break;
	  case ISD::SETO:
	    SSECC = 7;
	    break;
	  case ISD::SETUEQ:
	    SSECC = 8;
	    break;
	  case ISD::SETONE:
	    SSECC = 12;
	    break;
	  }

	  // These predicates reuse the code of their mirrored counterpart, so the
	  // operands have to trade places.
	  switch (SetCCOpcode) {
	  case ISD::SETOGT:
	  case ISD::SETGT:
	  case ISD::SETOGE:
	  case ISD::SETGE:
	  case ISD::SETULE:
	  case ISD::SETULT:
	    std::swap(Op0, Op1);
	    break;
	  default:
	    break;
	  }

	  return SSECC;
	}

	/// Split a SETCC with a 256-bit result into two SETCCs on the 128-bit halves
	/// of its operands and concatenate the two results back together.
	static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned NumElems = VT.getVectorNumElements();
	  const unsigned HalfElems = NumElems / 2;
	  SDValue CC = Op.getOperand(2);

	  // Low and high 128-bit halves of the left-hand operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract128BitVector(LHS, HalfElems, DAG, dl);

	  // Low and high 128-bit halves of the right-hand operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract128BitVector(RHS, HalfElems, DAG, dl);

	  // Compare each half with the same condition, then glue the halves together.
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElems);
	  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSLo, RHSLo, CC);
	  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSHi, RHSHi, CC);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Canonicalize an integer SETCC whose result is a vector of i1: move an
	/// all-zeros build vector to the right-hand side of an equality compare, and
	/// turn SETLT into SETGT by swapping the operands. Returns the rebuilt SETCC.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);

	  assert(VT.getVectorElementType() == MVT::i1 &&
	         "Cannot set masked compare for this operation");

	  ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();

	  // If this is a seteq make sure any build vectors of all zeros are on the RHS.
	  // This helps with vptestm matching.
	  // TODO: Should we just canonicalize the setcc during DAG combine?
	  const bool IsEquality = Pred == ISD::SETEQ || Pred == ISD::SETNE;
	  if (IsEquality && ISD::isBuildVectorAllZeros(LHS.getNode()))
	    std::swap(LHS, RHS);

	  // Prefer SETGT over SETLT.
	  if (Pred == ISD::SETLT) {
	    Pred = ISD::getSetCCSwappedOperands(Pred);
	    std::swap(LHS, RHS);
	  }

	  return DAG.getSetCC(dl, VT, LHS, RHS, Pred);
	}

	/// Given a buildvector constant, return a new vector constant with each element
	/// incremented (\p IsInc) or decremented. An empty value is returned when
	/// \p V is not a build vector, when any element is not a plain (non-opaque)
	/// constant of the vector's element type, or when the adjustment would wrap
	/// (incrementing the maximum value or decrementing zero).
	static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
	  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
	  if (!BV)
	    return SDValue();

	  SDLoc DL(V);
	  MVT VT = V.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  const unsigned NumElts = VT.getVectorNumElements();
	  const int Delta = IsInc ? 1 : -1;

	  SmallVector<SDValue, 8> NewVecC;
	  for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
	    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(EltIdx));
	    const bool IsSimpleConstant =
	        Elt && !Elt->isOpaque() && Elt->getSimpleValueType(0) == EltVT;
	    if (!IsSimpleConstant)
	      return SDValue();

	    // Avoid overflow/underflow.
	    const APInt &EltC = Elt->getAPIntValue();
	    const bool WouldWrap = IsInc ? EltC.isMaxValue() : EltC.isNullValue();
	    if (WouldWrap)
	      return SDValue();

	    NewVecC.push_back(DAG.getConstant(EltC + Delta, DL, EltVT));
	  }

	  return DAG.getBuildVector(VT, DL, NewVecC);
	}

	/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
	/// Op0 u<= Op1:
	/// t = psubus Op0, Op1
	/// pcmpeq t, <0..0>
	/// Only i8/i16 element types on SSE2 or later are handled, and only for the
	/// unsigned orderings; everything else yields an empty SDValue.
	static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
	                                    ISD::CondCode Cond, const SDLoc &dl,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  const MVT EltVT = VT.getVectorElementType();
	  if (!(EltVT == MVT::i8 || EltVT == MVT::i16))
	    return SDValue();

	  if (Cond == ISD::SETULT) {
	    // If the comparison is against a constant we can turn this into a
	    // setule. With psubus, setule does not require a swap. This is
	    // beneficial because the constant in the register is no longer
	    // destructed as the destination so it can be hoisted out of a loop.
	    // Only do this pre-AVX since vpcmp* is no longer destructive.
	    if (Subtarget.hasAVX())
	      return SDValue();
	    SDValue Decremented = incDecVectorConstant(Op1, DAG, false);
	    if (!Decremented)
	      return SDValue();
	    Op1 = Decremented;
	  } else if (Cond == ISD::SETUGT) {
	    // If the comparison is against a constant, we can turn this into a setuge.
	    // This is beneficial because materializing a constant 0 for the PCMPEQ is
	    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
	    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
	    SDValue Incremented = incDecVectorConstant(Op1, DAG, true);
	    if (!Incremented)
	      return SDValue();
	    Op1 = Op0;
	    Op0 = Incremented;
	  } else if (Cond == ISD::SETUGE) {
	    // Psubus is better than flip-sign because it requires no inversion.
	    std::swap(Op0, Op1);
	  } else if (Cond != ISD::SETULE) {
	    // SETULE needs no adjustment; any other predicate is not handled here.
	    return SDValue();
	  }

	  SDValue Diff = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
	  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Diff,
	                     DAG.getConstant(0, dl, VT));
	}

	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = X86ISD::CMPM;
	} else {
	Opc = X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	SDValue Cmp;
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
	if (SSECC >= 8 && !Subtarget.hasAVX()) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = X86ISD::FOR;
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = X86ISD::FAND;
	}

	SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC0, dl, MVT::i8));
	SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC1, dl, MVT::i8));
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	// Handle all other FP comparisons here.
	Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
	// result type of SETCC. The bitcast is expected to be optimized away
	// during combining/isel.
	if (Opc == X86ISD::CMPP)
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

	return Cmp;
	}

	MVT VTOp0 = Op0.getSimpleValueType();
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	// This is being called by type legalization because v2i32 is marked custom
	// for result type legalization for v2f32.
	if (VTOp0 == MVT::v2i32)
	return SDValue();

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In the AVX-512 architecture setcc returns a mask with i1 elements,
	// but there is no compare instruction for i8 and i16 elements in KNL.
	assert((VTOp0.getScalarSizeInBits() >= 32 \|\| Subtarget.hasBWI()) &&
	"Unexpected operand type");
	return LowerIntVSETCC_AVX512(Op, DAG);
	}

	// Lower using XOP integer comparisons.
	if (VT.is128BitVector() && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CmpMode, dl, MVT::i8));
	}

	// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
	// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
	if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
	SDValue BC0 = peekThroughBitcasts(Op0);
	if (BC0.getOpcode() == ISD::AND) {
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(BC0.getOperand(1),
	VT.getScalarSizeInBits(), UndefElts,
	EltBits, false, false)) {
	if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
	Cond = ISD::SETEQ;
	Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
	}
	}
	}
	}

	// ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
	if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
	Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
	ConstantSDNode *C1 = isConstOrConstSplat(Op1);
	if (C1 && C1->getAPIntValue().isPowerOf2()) {
	unsigned BitWidth = VT.getScalarSizeInBits();
	unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;

	SDValue Result = Op0.getOperand(0);
	Result = DAG.getNode(ISD::SHL, dl, VT, Result,
	DAG.getConstant(ShiftAmt, dl, VT));
	Result = DAG.getNode(ISD::SRA, dl, VT, Result,
	DAG.getConstant(BitWidth - 1, dl, VT));
	return Result;
	}
	}

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntVSETCC(Op, DAG);

	// If this is a SETNE against the signed minimum value, change it to SETGT.
	// If this is a SETNE against the signed maximum value, change it to SETLT,
	// which will be swapped to SETGT.
	// Otherwise we use PCMPEQ+invert.
	APInt ConstValue;
	if (Cond == ISD::SETNE &&
	ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
	if (ConstValue.isMinSignedValue())
	Cond = ISD::SETGT;
	else if (ConstValue.isMaxSignedValue())
	Cond = ISD::SETLT;
	}

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for unsigned compares.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (ISD::isUnsignedIntSetCC(Cond) &&
	(FlipSigns \|\| ISD::isTrueWhenEqual(Cond)) &&
	TLI.isOperationLegal(ISD::UMIN, VT)) {
	// If we have a constant operand, increment/decrement it and change the
	// condition to avoid an invert.
	if (Cond == ISD::SETUGT &&
	ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
	return !C->getAPIntValue().isMaxValue();
	})) {
	// X > C --> X >= (C+1) --> X == umax(X, C+1)
	Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
	Cond = ISD::SETUGE;
	}
	if (Cond == ISD::SETULT &&
	ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
	return !C->getAPIntValue().isNullValue();
	})) {
	// X < C --> X <= (C-1) --> X == umin(X, C-1)
	Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
	Cond = ISD::SETULE;
	}
	bool Invert = false;
	unsigned Opc;
	switch (Cond) {
	default: llvm_unreachable("Unexpected condition code");
	case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETULE: Opc = ISD::UMIN; break;
	case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
	case ISD::SETUGE: Opc = ISD::UMAX; break;
	}

	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	return Result;
	}

	// Try to use SUBUS and PCMPEQ.
	if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
	return V;

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
	} else {
	SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);

	// Cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Emulate PCMPGTQ with (hi1 > hi2) \| ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	return Result;
	}

	// Attempt to express an equality test of a bitcast vXi1 mask against zero or
	// all-ones as a KORTEST node. On success the KORTEST flags value is returned
	// and the X86 condition code to test is stored in X86CC; otherwise an empty
	// SDValue is returned.
	static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
	                           const SDLoc &dl, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget,
	                           SDValue &X86CC) {
	  // Anything other than an (in)equality test is out of scope.
	  const bool IsEq = CC == ISD::SETEQ;
	  if (!IsEq && CC != ISD::SETNE)
	    return SDValue();

	  // The compared value has to be a bitcast; its source type is checked next.
	  if (Op0.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue Mask = Op0.getOperand(0);
	  const MVT MaskVT = Mask.getSimpleValueType();

	  // Each mask width needs a particular subtarget feature.
	  bool Supported;
	  switch (MaskVT.SimpleTy) {
	  case MVT::v8i1:
	    Supported = Subtarget.hasDQI();
	    break;
	  case MVT::v16i1:
	    Supported = Subtarget.hasAVX512();
	    break;
	  case MVT::v32i1:
	  case MVT::v64i1:
	    Supported = Subtarget.hasBWI();
	    break;
	  default:
	    Supported = false;
	    break;
	  }
	  if (!Supported)
	    return SDValue();

	  // Comparing with zero tests the E/NE conditions; comparing with all-ones
	  // tests B/AE instead, since the C flag is set for all ones.
	  X86::CondCode X86Cond;
	  if (isNullConstant(Op1))
	    X86Cond = IsEq ? X86::COND_E : X86::COND_NE;
	  else if (isAllOnesConstant(Op1))
	    X86Cond = IsEq ? X86::COND_B : X86::COND_AE;
	  else
	    return SDValue();

	  // A single-use OR feeding the test can donate its two operands directly to
	  // the KORTEST; otherwise the mask is tested against itself.
	  const bool FoldOr = Mask.getOpcode() == ISD::OR && Mask.hasOneUse();
	  SDValue LHS = FoldOr ? Mask.getOperand(0) : Mask;
	  SDValue RHS = FoldOr ? Mask.getOperand(1) : Mask;

	  X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	}

	/// Build a flags-producing node for comparing Op0 with Op1 under CC, and store
	/// the X86 condition code constant that should be tested against those flags
	/// in X86CC. Returns an empty SDValue if the condition cannot be translated.
	SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
	                                             ISD::CondCode CC, const SDLoc &dl,
	                                             SelectionDAG &DAG,
	                                             SDValue &X86CC) const {
	  // Several of the special cases below only apply to SETEQ / SETNE.
	  bool IsEqualityCC;
	  switch (CC) {
	  case ISD::SETEQ:
	  case ISD::SETNE:
	    IsEqualityCC = true;
	    break;
	  default:
	    IsEqualityCC = false;
	    break;
	  }
	  const bool RHSIsZero = isNullConstant(Op1);

	  if (IsEqualityCC && RHSIsZero) {
	    // Optimize to BT if possible.
	    // Lower (X & (1 << N)) == 0 to BT(X, N).
	    // Lower ((X >>u N) & 1) != 0 to BT(X, N).
	    // Lower ((X >>s N) & 1) != 0 to BT(X, N).
	    if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse())
	      if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
	        return BT;

	    // Try to use PTEST for a tree of ORs that is equality-compared with 0.
	    // TODO: We could do AND tree with all 1s as well by using the C flag.
	    if (Op0.getOpcode() == ISD::OR)
	      if (SDValue PTEST =
	              LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
	        return PTEST;
	  }

	  // Try to lower using KORTEST.
	  if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
	    return KORTEST;

	  // X == 0, X == 1, X != 0 and X != 1 where X is itself an X86ISD::SETCC:
	  // reuse that setcc's flags, keeping or inverting its condition code.
	  if (IsEqualityCC && Op0.getOpcode() == X86ISD::SETCC) {
	    const bool RHSIsZeroOrOne = RHSIsZero ? true : isOneConstant(Op1);
	    if (RHSIsZeroOrOne) {
	      // "!= 0" and "== 1" keep the inner condition; "== 0" and "!= 1" flip it.
	      const bool Invert = (CC == ISD::SETNE) ^ RHSIsZero;
	      if (!Invert) {
	        X86CC = Op0.getOperand(0);
	      } else {
	        const X86::CondCode Inner =
	            static_cast<X86::CondCode>(Op0.getConstantOperandVal(0));
	        X86CC = DAG.getConstant(X86::GetOppositeBranchCondition(Inner), dl,
	                                MVT::i8);
	      }
	      return Op0.getOperand(1);
	    }
	  }

	  // Generic path: translate the condition and emit an explicit compare.
	  // NOTE(review): Op0/Op1 are handed to TranslateX86CC as lvalues and the same
	  // variables are then used for EmitCmp; keep it that way in case the
	  // translation adjusts the operands - confirm against its declaration.
	  const bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
	  const X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
	  if (CondCode == X86::COND_INVALID)
	    return SDValue();

	  SDValue EFLAGS =
	      ConvertCmpIfNecessary(EmitCmp(Op0, Op1, CondCode, dl, DAG), DAG);
	  X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
	  return EFLAGS;
	}

	/// Custom lowering for SETCC. Vector results are forwarded to LowerVSETCC;
	/// scalar results (which must be i8) become an X86ISD::SETCC that reads the
	/// flags and condition code produced by emitFlagsForSetcc. Returns an empty
	/// SDValue when no flags could be emitted.
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  const MVT ResVT = Op.getSimpleValueType();
	  if (ResVT.isVector())
	    return LowerVSETCC(Op, Subtarget, DAG);

	  assert(ResVT == MVT::i8 && "SetCC type must be 8-bit integer");

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDLoc dl(Op);
	  const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  SDValue X86CC;
	  if (SDValue Flags = emitFlagsForSetcc(LHS, RHS, CC, dl, DAG, X86CC))
	    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, Flags);

	  return SDValue();
	}

	/// Custom lowering for SETCCCARRY (operands: LHS, RHS, carry-in, condition
	/// code). The carry-in value is turned back into a flag by adding all-ones to
	/// it, an SBB of LHS and RHS consumes that flag, and the result is a setcc on
	/// the flags output of the SBB.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);

	  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  const ISD::CondCode ISDCC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
	  const X86::CondCode CC = TranslateIntegerX86CC(ISDCC);

	  // Recreate the carry flag: CarryIn + (-1) produces it as result #1 of the
	  // X86ISD::ADD node.
	  EVT CarryVT = CarryIn.getValueType();
	  SDVTList CarryVTs = DAG.getVTList(CarryVT, MVT::i32);
	  SDValue AllOnes = DAG.getConstant(
	      APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()), DL, CarryVT);
	  SDValue CarryFlag =
	      DAG.getNode(X86ISD::ADD, DL, CarryVTs, CarryIn, AllOnes).getValue(1);

	  // Subtract with borrow and test the requested condition on its flags.
	  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
	  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS, CarryFlag);
	  return getSETCC(CC, Sbb.getValue(1), DL, DAG);
	}

	// Lower an arithmetic-with-overflow node (SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO)
	// to the matching flag-setting X86 node. Three things come back: the pair
	// holds the arithmetic result and the EFLAGS result of the new node, and Cond
	// receives the condition code under which those flags signal overflow.
	static std::pair<SDValue, SDValue>
	getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getResNo() == 0 && "Unexpected result number!");
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  const unsigned Opcode = Op.getOpcode();

	  // Select the flag-setting X86 node for this operation.
	  unsigned BaseOp = 0;
	  switch (Opcode) {
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  case ISD::SADDO:
	  case ISD::UADDO:
	    BaseOp = X86ISD::ADD;
	    break;
	  case ISD::SSUBO:
	  case ISD::USUBO:
	    BaseOp = X86ISD::SUB;
	    break;
	  case ISD::SMULO:
	    BaseOp = X86ISD::SMUL;
	    break;
	  case ISD::UMULO:
	    BaseOp = X86ISD::UMUL;
	    break;
	  }

	  // Select the condition that reports overflow. Everything except the
	  // unsigned add/sub forms uses COND_O.
	  switch (Opcode) {
	  case ISD::UADDO:
	    // Adding the constant 1 is tested with COND_E instead of COND_B
	    // (presumably because x + 1 wraps exactly when the result is zero).
	    Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
	    break;
	  case ISD::USUBO:
	    Cond = X86::COND_B;
	    break;
	  default:
	    Cond = X86::COND_O;
	    break;
	  }

	  if (!BaseOp)
	    return std::make_pair(SDValue(), SDValue());

	  // The new node yields the value as result #0 and EFLAGS as result #1.
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	  SDValue Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
	  return std::make_pair(Value, Value.getValue(1));
	}

	// Lower an "add/sub/mul with overflow" node into the regular flag-setting
	// instruction plus a "setcc" that reads the overflow condition. The "brcond"
	// lowering looks for this combination and may drop the "setcc" again when it
	// has only one use.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);

	  X86::CondCode Cond;
	  const std::pair<SDValue, SDValue> ValueAndFlags =
	      getX86XALUOOp(Cond, Op, DAG);

	  SDValue OverflowBit = getSETCC(Cond, ValueAndFlags.second, DL, DAG);
	  assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");

	  // Result #0 is the arithmetic value, result #1 the overflow bit.
	  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
	                     ValueAndFlags.first, OverflowBit);
	}

	/// Return true if Op is an X86 logical comparison: either one of the dedicated
	/// compare nodes, or result #1 of one of the listed arithmetic/logic nodes.
	static bool isX86LogicalCmp(SDValue Op) {
	  switch (Op.getOpcode()) {
	  // Dedicated compare nodes qualify regardless of the result number.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::SAHF:
	    return true;

	  // These nodes only qualify when their result #1 is the one being used.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::UMUL:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return Op.getResNo() == 1;

	  default:
	    return false;
	  }
	}

	/// Return true if V is a TRUNCATE whose source is known to be zero in every
	/// bit that the truncation discards.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    const unsigned SrcBits = Src.getValueSizeInBits();
	    const unsigned DstBits = V.getValueSizeInBits();
	    // Mask covering exactly the (SrcBits - DstBits) topmost source bits.
	    const APInt DroppedBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
	    return DAG.MaskedValueIsZero(Src, DroppedBits);
	  }
	  return false;
	}

	/// Custom lowering for a SELECT node: operand 0 is the condition, operands 1
	/// and 2 are the values selected between. A series of special-case lowerings
	/// is tried in order (scalar FP compare + blend/logic sequences, AVX-512
	/// masked selects, i1-vector selects performed on their integer form, SBB and
	/// SETCC_CARRY based all-ones/zero idioms, widened i8/i16 cmovs); if none
	/// applies, an X86ISD::CMOV is emitted that consumes a flags-producing node
	/// (Cond) and an X86 condition code constant (CC).
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	// Stays true until Cond/CC already describe a flags producer and its
	// condition code; if still true later, an explicit compare of Cond against
	// zero is emitted.
	bool AddTest = true;
	SDValue Cond = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	SDLoc DL(Op);
	MVT VT = Op1.getSimpleValueType();
	SDValue CC;

	// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	// are available or VBLENDV if AVX is available.
	// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	if (Cond.getOpcode() == ISD::SETCC &&
	((Subtarget.hasSSE2() && VT == MVT::f64) \|\|
	(Subtarget.hasSSE1() && VT == MVT::f32)) &&
	VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	unsigned SSECC = translateX86FSETCC(
	cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

	// With AVX-512 the compare yields a v1i1 mask that drives a scalar select.
	if (Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
	CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
	assert(!VT.isVector() && "Not a scalar type?");
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// Predicate values of 8 and above are only used here when AVX is available.
	if (SSECC < 8 \|\| Subtarget.hasAVX()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	DAG.getConstant(SSECC, DL, MVT::i8));

	// If we have AVX, we can use a variable vector select (VBLENDV) instead
	// of 3 logic instructions for size savings and potentially speed.
	// Unfortunately, there is no scalar form of VBLENDV.

	// If either operand is a +0.0 constant, don't try this. We can expect to
	// optimize away at least one of the logic instructions later in that
	// case, so that sequence would be faster than a variable blend.

	// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	// uses XMM0 as the selection register. That may need just as many
	// instructions as the AND/ANDN/OR sequence due to register moves, so
	// don't bother.
	if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
	!isNullFPConstant(Op2)) {
	// Convert to vectors, do a VSELECT, and convert back to scalar.
	// All of the conversions should be optimized away.
	MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	// The select condition is given the integer vector type of the same shape.
	MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	VCmp = DAG.getBitcast(VCmpVT, VCmp);

	SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	VSel, DAG.getIntPtrConstant(0, DL));
	}
	// Logic-op fallback: combine (FANDN Cmp, Op2) and (FAND Cmp, Op1) with FOR.
	SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	}
	}

	// AVX512 fallback is to lower selects of scalar floats to masked moves.
	if ((VT == MVT::f64 \|\| VT == MVT::f32) && Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// For v64i1 without 64-bit support we need to split and rejoin.
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	assert(Subtarget.hasBWI() && "Expected BWI to be legal");
	SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
	SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
	SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
	SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
	SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
	SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	// Selects between i1 vectors: when both operands are available in a scalar
	// form (a constant build vector converted to an integer, or the source of a
	// bitcast), select between the scalars and convert the result back.
	if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
	SDValue Op1Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
	Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
	else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
	Op1Scalar = Op1.getOperand(0);
	SDValue Op2Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
	Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
	else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
	Op2Scalar = Op2.getOperand(0);
	if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
	SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
	Op1Scalar, Op2Scalar);
	if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, newSelect);
	// Sizes differ: view the scalar as v8i1 and take the leading VT-sized
	// subvector. NOTE(review): this assumes the scalar is 8 bits wide - confirm.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
	DAG.getIntPtrConstant(0, DL));
	}
	}

	// Lower a generic SETCC condition first so the checks below can look at the
	// resulting node.
	if (Cond.getOpcode() == ISD::SETCC) {
	if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	Cond = NewCond;
	// If the condition was updated, it's possible that the operands of the
	// select were also updated (for example, EmitTest has a RAUW). Refresh
	// the local references to the select operands in case they got stale.
	Op1 = Op.getOperand(1);
	Op2 = Op.getOperand(2);
	}
	}

	// (select (x == 0), -1, y) -> (sign_bit (x - 1)) \| y
	// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) \| y
	// (select (x != 0), y, -1) -> (sign_bit (x - 1)) \| y
	// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) \| y
	// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	// (select (and (x , 0x1) == 0), y, (z \| y) ) -> (-(and (x , 0x1)) & z ) \| y
	if (Cond.getOpcode() == X86ISD::SETCC &&
	Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(Cond.getOperand(1).getOperand(1))) {
	SDValue Cmp = Cond.getOperand(1);
	unsigned CondCode =
	cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

	if ((isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(CondCode == X86::COND_E \|\| CondCode == X86::COND_NE)) {
	// Y is whichever select operand is not the all-ones constant.
	SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
	SDValue CmpOp0 = Cmp.getOperand(0);

	// Apply further optimizations for special cases
	// (select (x != 0), -1, 0) -> neg & sbb
	// (select (x == 0), 0, -1) -> neg & sbb
	if (isNullConstant(Y) &&
	(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	Zero = DAG.getConstant(0, DL, Op.getValueType());
	return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
	}

	// General case: compare x against 1 and let SBB turn the flags into 0 / -1.
	Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
	Cmp = ConvertCmpIfNecessary(Cmp, DAG);

	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
	SDValue Res = // Res = 0 or -1.
	DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);

	// Invert the mask for the two patterns listed above with a leading '~'.
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	Res = DAG.getNOT(DL, Res, Res.getValueType());

	if (!isNullConstant(Op2))
	Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	return Res;
	} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	Cmp.getOperand(0).getOpcode() == ISD::AND &&
	isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	SDValue CmpOp0 = Cmp.getOperand(0);
	SDValue Src1, Src2;
	// true if Op2 is XOR or OR operator and one of its operands
	// is equal to Op1
	// ( a , a op b) \|\| ( b , a op b)
	// On a match, Src1 receives the other operand of Op2 and Src2 receives Op1.
	auto isOrXorPattern = [&]() {
	if ((Op2.getOpcode() == ISD::XOR \|\| Op2.getOpcode() == ISD::OR) &&
	(Op2.getOperand(0) == Op1 \|\| Op2.getOperand(1) == Op1)) {
	Src1 =
	Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	Src2 = Op1;
	return true;
	}
	return false;
	};

	if (isOrXorPattern()) {
	SDValue Neg;
	unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	// we need mask of all zeros or ones with same size of the other
	// operands.
	if (CmpSz > VT.getSizeInBits())
	Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	else if (CmpSz < VT.getSizeInBits())
	Neg = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	DAG.getConstant(1, DL, VT));
	else
	Neg = CmpOp0;
	SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	Neg); // -(and (x, 0x1))
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
	}
	}
	}

	// Look past (and (setcc_carry (cmp ...)), 1).
	if (Cond.getOpcode() == ISD::AND &&
	Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	isOneConstant(Cond.getOperand(1)))
	Cond = Cond.getOperand(0);

	// If condition flag is set by a X86ISD::CMP, then use it as the condition
	// setting operand in place of the X86ISD::SETCC.
	unsigned CondOpcode = Cond.getOpcode();
	if (CondOpcode == X86ISD::SETCC \|\|
	CondOpcode == X86ISD::SETCC_CARRY) {
	CC = Cond.getOperand(0);

	SDValue Cmp = Cond.getOperand(1);
	// Note: this VT shadows the function-level VT (taken from Op1) and is the
	// type of the select result itself.
	MVT VT = Op.getSimpleValueType();

	bool IllegalFPCMov = false;
	if (VT.isFloatingPoint() && !VT.isVector() &&
	!isScalarFPTypeInSSEReg(VT)) // FPStack?
	IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) \|\|
	Cmp.getOpcode() == X86ISD::BT) { // FIXME
	Cond = Cmp;
	AddTest = false;
	}
	} else if (CondOpcode == ISD::USUBO \|\| CondOpcode == ISD::SSUBO \|\|
	CondOpcode == ISD::UADDO \|\| CondOpcode == ISD::SADDO \|\|
	CondOpcode == ISD::UMULO \|\| CondOpcode == ISD::SMULO) {
	// The condition is the overflow result of an arithmetic-with-overflow node:
	// use the flags result of its X86 lowering directly as Cond.
	SDValue Value;
	X86::CondCode X86Cond;
	std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

	CC = DAG.getConstant(X86Cond, DL, MVT::i8);
	AddTest = false;
	}

	if (AddTest) {
	// Look past the truncate if the high bits are known zero.
	if (isTruncWithZeroHighBitsInput(Cond, DAG))
	Cond = Cond.getOperand(0);

	// We know the result of AND is compared against zero. Try to match
	// it to BT.
	if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	SDValue BTCC;
	if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
	CC = BTCC;
	Cond = BT;
	AddTest = false;
	}
	}
	}

	// No flags producer was found: compare the condition value against zero and
	// select on "not equal".
	if (AddTest) {
	CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
	Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
	X86::COND_NE, DL, DAG);
	}

	// a < b ? -1 : 0 -> RES = ~setcc_carry
	// a < b ? 0 : -1 -> RES = setcc_carry
	// a >= b ? -1 : 0 -> RES = setcc_carry
	// a >= b ? 0 : -1 -> RES = ~setcc_carry
	if (Cond.getOpcode() == X86ISD::SUB) {
	Cond = ConvertCmpIfNecessary(Cond, DAG);
	unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	if ((CondCode == X86::COND_AE \|\| CondCode == X86::COND_B) &&
	(isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(isNullConstant(Op1) \|\| isNullConstant(Op2))) {
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Cond);
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	return DAG.getNOT(DL, Res, Res.getValueType());
	return Res;
	}
	}

	// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	// widen the cmov and push the truncate through. This avoids introducing a new
	// branch during isel and doesn't add any extensions.
	if (Op.getValueType() == MVT::i8 &&
	Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	if (T1.getValueType() == T2.getValueType() &&
	// Blacklist CopyFromReg to avoid partial register stalls.
	T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
	CC, Cond);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}
	}

	// Or finally, promote i8 cmovs if we have CMOV,
	// or i16 cmovs if it won't prevent folding a load.
	// FIXME: we should not limit promotion of i8 case to only when the CMOV is
	// legal, but EmitLoweredSelect() can not deal with these extensions
	// being inserted between two CMOV's. (in i16 case too TBN)
	// https://bugs.llvm.org/show_bug.cgi?id=40974
	if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) \|\|
	(Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
	!MayFoldLoad(Op2))) {
	Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
	Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}

	// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	// condition is true.
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
	}

	// Lower an extension whose source is a vector of i1 elements. The work is
	// done in up to five steps: pick an element type the target can extend to,
	// widen to 512 bits when VLX is missing, perform the extension (natively or
	// as a select between all-ones and zero), truncate back to the requested
	// element type, and finally extract the originally requested vector width.
	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const MVT VT = Op->getSimpleValueType(0);
	  SDValue Src = Op->getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  assert(SrcVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");

	  const MVT ResEltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  // Step 1: without BWI, i8/i16 results are produced through i32 elements.
	  MVT ExtVT = VT;
	  if (ResEltVT.getSizeInBits() <= 16 && !Subtarget.hasBWI()) {
	    // If v16i32 is to be avoided, we'll need to split and concatenate.
	    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
	      return SplitAndExtendv16i1(Op.getOpcode(), VT, Src, dl, DAG);

	    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
	  }

	  // Step 2: without VLX, operate on a 512-bit vector; the mask is placed at
	  // index 0 of an undef mask with the widened element count.
	  MVT WideVT = ExtVT;
	  const bool NeedsWidening = !ExtVT.is512BitVector() && !Subtarget.hasVLX();
	  if (NeedsWidening) {
	    NumElts *= 512 / ExtVT.getSizeInBits();
	    SrcVT = MVT::getVectorVT(MVT::i1, NumElts);
	    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, SrcVT, DAG.getUNDEF(SrcVT),
	                      Src, DAG.getIntPtrConstant(0, dl));
	    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
	  }

	  // Step 3: emit the original opcode when the feature set covers this element
	  // width (DQI for 32 bits and up, BWI for 16 bits and below); otherwise
	  // select between all-ones and zero under the mask.
	  const MVT WideEltVT = WideVT.getVectorElementType();
	  bool UseOriginalOpcode =
	      Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32;
	  if (!UseOriginalOpcode)
	    UseOriginalOpcode = Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16;

	  SDValue V;
	  if (UseOriginalOpcode) {
	    V = DAG.getNode(Op.getOpcode(), dl, WideVT, Src);
	  } else {
	    SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
	    SDValue Zero = DAG.getConstant(0, dl, WideVT);
	    V = DAG.getSelect(dl, WideVT, Src, NegOne, Zero);
	  }

	  // Step 4: undo the i32 promotion from step 1, if any.
	  if (VT != ExtVT) {
	    WideVT = MVT::getVectorVT(ResEltVT, NumElts);
	    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
	  }

	  // Step 5: undo the widening from step 2, if any.
	  if (WideVT != VT)
	    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
	                    DAG.getIntPtrConstant(0, dl));

	  return V;
	}

	// Lower an extend node. Sources with i1 elements are handled by
	// LowerSIGN_EXTEND_Mask; every other source requires AVX and is passed on to
	// LowerAVXExtend.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  const MVT SrcVT = Op->getOperand(0).getSimpleValueType();
	  const bool IsMaskSource = SrcVT.getVectorElementType() == MVT::i1;

	  if (!IsMaskSource) {
	    assert(Subtarget.hasAVX() && "Expected AVX support");
	    return LowerAVXExtend(Op, DAG, Subtarget);
	  }

	  return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
	}

	// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
	// For sign extend this needs to handle all vector sizes and SSE4.1 and
	// non-SSE4.1 targets. For zero extend this should only handle inputs of
	// MVT::v64i8 when BWI is not supported, but AVX512 is.
	//
	// Returns an empty SDValue when the element types, or the combination of
	// result width and subtarget features, is not one of those checked below.
	// Otherwise one of three strategies is used, tried in this order:
	//   1. hasInt256(): re-emit the node (or a plain SIGN/ZERO_EXTEND) on the
	//      possibly narrowed input.
	//   2. hasAVX(): split a 256-bit result into two 128-bit in-reg extends.
	//   3. otherwise (sign extend only): shuffle + arithmetic shift.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue In = Op->getOperand(0);
	MVT VT = Op->getSimpleValueType(0);
	MVT InVT = In.getSimpleValueType();

	MVT SVT = VT.getVectorElementType();
	MVT InSVT = InVT.getVectorElementType();
	assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

	// Only i16/i32/i64 result elements are handled...
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	// ...and only i8/i16/i32 source elements.
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();
	// The result must be 128-bit with SSE2, 256-bit with AVX, or 512-bit with
	// AVX512; anything else is rejected.
	if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
	!(VT.is256BitVector() && Subtarget.hasAVX()) &&
	!(VT.is512BitVector() && Subtarget.hasAVX512()))
	return SDValue();

	SDLoc dl(Op);
	unsigned Opc = Op.getOpcode();
	unsigned NumElts = VT.getVectorNumElements();

	// For 256-bit vectors, we only need the lower (128-bit) half of the input.
	// For 512-bit vectors, we need 128-bits or 256-bits.
	if (InVT.getSizeInBits() > 128) {
	// Input needs to be at least the same number of elements as output, and
	// at least 128-bits.
	int InSize = InSVT.getSizeInBits() * NumElts;
	In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
	// InVT is refreshed because the checks below look at the narrowed input.
	InVT = In.getSimpleValueType();
	}

	// SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
	// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
	// need to be handled here for 256/512-bit results.
	if (Subtarget.hasInt256()) {
	assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");

	// Element counts differ: keep the same *_EXTEND_VECTOR_INREG opcode, now
	// applied to the (possibly narrowed) input.
	if (InVT.getVectorNumElements() != NumElts)
	return DAG.getNode(Op.getOpcode(), dl, VT, In);

	// FIXME: Apparently we create inreg operations that could be regular
	// extends.
	// Element counts match, so emit the ordinary extend of the same signedness.
	unsigned ExtOpc =
	Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
	: ISD::ZERO_EXTEND;
	return DAG.getNode(ExtOpc, dl, VT, In);
	}

	// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
	if (Subtarget.hasAVX()) {
	assert(VT.is256BitVector() && "256-bit vector expected");
	int HalfNumElts = NumElts / 2;
	MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);

	// Shuffle mask that moves source elements [HalfNumElts, 2 * HalfNumElts)
	// down to positions [0, HalfNumElts); every other lane stays undef.
	unsigned NumSrcElts = InVT.getVectorNumElements();
	SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
	for (int i = 0; i != HalfNumElts; ++i)
	HiMask[i] = HalfNumElts + i;

	// Extend the low part directly and the high part after shuffling it down,
	// then glue both halves back together.
	SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
	SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
	Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	// We should only get here for sign extend.
	assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
	assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");

	// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
	// Curr is the value before the arithmetic shift, SignExt the value after it;
	// both start out as the untouched input for the v4i32 -> v2i64 case.
	SDValue Curr = In;
	SDValue SignExt = Curr;

	// As SRAI is only available on i16/i32 types, we expand only up to i32
	// and handle i64 separately.
	if (InVT != MVT::v4i32) {
	// A v2i64 result is produced in two steps, so stop at v4i32 here.
	MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;

	unsigned DestWidth = DestVT.getScalarSizeInBits();
	// Number of source-sized slots per destination element.
	unsigned Scale = DestWidth / InSVT.getSizeInBits();

	unsigned InNumElts = InVT.getVectorNumElements();
	unsigned DestElts = DestVT.getVectorNumElements();

	// Build a shuffle mask that takes each input element and places it in the
	// MSBs of the new element size.
	SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
	for (unsigned i = 0; i != DestElts; ++i)
	Mask[i * Scale + (Scale - 1)] = i;

	Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
	Curr = DAG.getBitcast(DestVT, Curr);

	// Shift right arithmetically by the width difference so the source element
	// ends up in the low bits with its sign replicated above it.
	unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
	SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
	DAG.getConstant(SignExtShift, dl, MVT::i8));
	}

	if (VT == MVT::v2i64) {
	assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
	// Sign is the per-lane result of (0 > Curr); it supplies the upper 32 bits
	// of each 64-bit result element.
	SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
	SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
	// Interleave lanes 0 and 1 of SignExt with lanes 0 and 1 of Sign, then
	// reinterpret the four 32-bit lanes as two 64-bit elements.
	SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
	SignExt = DAG.getBitcast(VT, SignExt);
	}

	return SignExt;
	}

	/// Custom lowering for a vector ISD::SIGN_EXTEND node.
	///
	/// - Sources with i1 elements are forwarded to LowerSIGN_EXTEND_Mask.
	/// - A v8i8 source is only handled for a v8i64 result when
	///   ExperimentalVectorWideningLegalization is set; it is padded to v16i8
	///   and extended with SIGN_EXTEND_VECTOR_INREG. Any other v8i8 case returns
	///   an empty SDValue.
	/// - If the subtarget has Int256 the node is returned unchanged.
	/// - Otherwise the input is split into a low and a high half, each half is
	///   extended with SIGN_EXTEND_VECTOR_INREG, and the results are concatenated.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  if (InVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
	  // Compare the result type against the *input* type; comparing VT with
	  // itself would make this check vacuous.
	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Expected same number of elements");
	  assert((VT.getVectorElementType() == MVT::i16 ||
	          VT.getVectorElementType() == MVT::i32 ||
	          VT.getVectorElementType() == MVT::i64) &&
	         "Unexpected element type");
	  assert((InVT.getVectorElementType() == MVT::i8 ||
	          InVT.getVectorElementType() == MVT::i16 ||
	          InVT.getVectorElementType() == MVT::i32) &&
	         "Unexpected element type");

	  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
	  if (InVT == MVT::v8i8) {
	    if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
	      return SDValue();

	    // Pad the source with undef up to v16i8 so the in-register extend can
	    // consume it.
	    In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, In,
	                     DAG.getUNDEF(MVT::v8i8));
	    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
	  }

	  if (Subtarget.hasInt256())
	    return Op;

	  // Optimize vectors in AVX mode
	  // Sign extend  v8i16 to v8i32 and
	  //              v4i32 to v4i64
	  //
	  // Divide input vector into two parts
	  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
	  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
	  // concat the vectors to original VT

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);

	  // Low half: the in-register extend reads the low elements directly.
	  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);

	  // High half: shuffle the upper elements down to the low positions first.
	  unsigned NumElems = InVT.getVectorNumElements();
	  SmallVector<int,8> ShufMask(NumElems, -1);
	  for (unsigned i = 0; i != NumElems/2; ++i)
	    ShufMask[i] = i + NumElems/2;

	  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
	  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	/// Change a vector store into a pair of half-size vector stores.
	///
	/// \p Store must store a 256-bit or 512-bit vector. Returns a TokenFactor
	/// joining the chains of the two half stores, or an empty SDValue when the
	/// store is volatile.
	static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
	  SDValue StoredVal = Store->getValue();
	  assert((StoredVal.getValueType().is256BitVector() ||
	          StoredVal.getValueType().is512BitVector()) &&
	         "Expecting 256/512-bit op");

	  // Splitting volatile memory ops is not allowed unless the operation was not
	  // legal to begin with. We are assuming the input op is legal (this transform
	  // is only used for targets with AVX).
	  if (Store->isVolatile())
	    return SDValue();

	  MVT StoreVT = StoredVal.getSimpleValueType();
	  unsigned NumElems = StoreVT.getVectorNumElements();
	  unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
	  // Byte distance from the start of the low half to the start of the high
	  // half: 16 for a 256-bit store, 32 for a 512-bit store.
	  unsigned HalfOffset = HalfSize / 8;

	  SDLoc DL(Store);
	  SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
	  SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
	  SDValue Ptr0 = Store->getBasePtr();
	  SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
	  unsigned Alignment = Store->getAlignment();
	  // The low half keeps the original alignment; the high half can only be
	  // assumed aligned to what the original alignment and the offset share.
	  SDValue Ch0 =
	      DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
	                   Alignment, Store->getMemOperand()->getFlags());
	  SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
	                             Store->getPointerInfo().getWithOffset(HalfOffset),
	                             MinAlign(Alignment, HalfOffset),
	                             Store->getMemOperand()->getFlags());
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
	}

	/// Scalarize a vector store: the stored value is bitcast to \p StoreVT and one
	/// scalar store is emitted per element of that type.
	///
	/// Both \p StoreVT and the stored value must be 128-bit vectors. Returns a
	/// TokenFactor over all element stores, or an empty SDValue when the store is
	/// volatile.
	static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
	                                    SelectionDAG &DAG) {
	  SDValue Vec = Store->getValue();
	  assert(StoreVT.is128BitVector() &&
	         Vec.getValueType().is128BitVector() && "Expecting 128-bit op");
	  Vec = DAG.getBitcast(StoreVT, Vec);

	  // Volatile memory ops must not be split unless they were illegal to begin
	  // with; the incoming op is assumed to be legal.
	  if (Store->isVolatile())
	    return SDValue();

	  const MVT EltVT = StoreVT.getScalarType();
	  const unsigned EltBytes = EltVT.getStoreSize();
	  const unsigned BaseAlign = Store->getAlignment();
	  SDLoc DL(Store);

	  SmallVector<SDValue, 4> Chains;
	  for (unsigned Idx = 0, E = StoreVT.getVectorNumElements(); Idx != E; ++Idx) {
	    // Each element lives EltBytes further into the original location.
	    const unsigned ByteOff = Idx * EltBytes;
	    SDValue Addr = DAG.getMemBasePlusOffset(Store->getBasePtr(), ByteOff, DL);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
	                              DAG.getIntPtrConstant(Idx, DL));
	    Chains.push_back(
	        DAG.getStore(Store->getChain(), DL, Elt, Addr,
	                     Store->getPointerInfo().getWithOffset(ByteOff),
	                     MinAlign(BaseAlign, ByteOff),
	                     Store->getMemOperand()->getFlags()));
	  }
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	}

	/// Custom lowering for a store node.
	///
	/// Handles, in this order:
	///  - vectors of i1 (at most 8 elements): widened to v16i1, bitcast to i16,
	///    truncated to i8 and stored as a scalar;
	///  - truncating stores: not handled, returns an empty SDValue;
	///  - 256-bit vectors: split in two when the value has one use and
	///    collectConcatOps succeeds on it, otherwise returns an empty SDValue;
	///  - 64-bit vectors whose type action is TypeWidenVector: stored as a single
	///    64-bit scalar (SSE2) or through X86ISD::VEXTRACT_STORE (SSE1 only).
	static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
	  SDLoc dl(St);
	  SDValue Val = St->getValue();

	  // Re-emit the store with a replacement value, keeping the original chain,
	  // address, pointer info, alignment and memory-operand flags.
	  auto StoreInPlace = [&](SDValue NewVal) {
	    return DAG.getStore(St->getChain(), dl, NewVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  };

	  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
	  const EVT ValVT = Val.getValueType();
	  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) {
	    assert(ValVT.getVectorNumElements() <= 8 &&
	           "Unexpected VT");
	    assert(!St->isTruncatingStore() && "Expected non-truncating store");
	    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
	           "Expected AVX512F without AVX512DQI");

	    SDValue Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	                               DAG.getUNDEF(MVT::v16i1), Val,
	                               DAG.getIntPtrConstant(0, dl));
	    SDValue Bits = DAG.getBitcast(MVT::i16, Mask);
	    SDValue Byte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Bits);
	    return StoreInPlace(Byte);
	  }

	  if (St->isTruncatingStore())
	    return SDValue();

	  // If this is a 256-bit store of concatenated ops, we are better off splitting
	  // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
	  // and each half can execute independently. Some cores would split the op into
	  // halves anyway, so the concat (vinsertf128) is purely an extra op.
	  MVT StoreVT = Val.getSimpleValueType();
	  if (StoreVT.is256BitVector()) {
	    SmallVector<SDValue, 4> CatOps;
	    if (!Val.hasOneUse() || !collectConcatOps(Val.getNode(), CatOps))
	      return SDValue();
	    return splitVectorStore(St, DAG);
	  }

	  assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
	         "Unexpected VT");
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (TLI.getTypeAction(*DAG.getContext(), StoreVT) !=
	      TargetLowering::TypeWidenVector)
	    return SDValue();

	  // Double the element count by appending an undef upper half.
	  MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
	                                StoreVT.getVectorNumElements() * 2);
	  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Val,
	                             DAG.getUNDEF(StoreVT));

	  if (!Subtarget.hasSSE2()) {
	    // SSE1 only: hand the widened value to the VEXTRACT_STORE memory
	    // intrinsic with an i64 memory type.
	    assert(Subtarget.hasSSE1() && "Expected SSE");
	    SDVTList Tys = DAG.getVTList(MVT::Other);
	    SDValue Ops[] = {St->getChain(), Wide, St->getBasePtr()};
	    return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
	                                   MVT::i64, St->getMemOperand());
	  }

	  // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
	  // and store it.
	  MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
	  MVT CastVT = MVT::getVectorVT(StVT, 2);
	  SDValue Cast = DAG.getBitcast(CastVT, Wide);
	  SDValue Low64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, Cast,
	                              DAG.getIntPtrConstant(0, dl));
	  return StoreInPlace(Low64);
	}

	// Lower vector extended loads using a shuffle. If SSSE3 is not available we
	// may emit an illegal shuffle but the expansion is still better than scalar
	// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
	// we'll emit a shuffle and an arithmetic shift.
	// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
	// TODO: It is possible to support ZExt by zeroing the undef values during
	// the shuffle phase or after the shuffle.
	//
	// Only integer vector results are handled. The returned value merges the
	// loaded vector with the output chain of the new load(s).
	static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  MVT RegVT = Op.getSimpleValueType();
	  assert(RegVT.isVector() && "We only custom lower vector loads.");
	  assert(RegVT.isInteger() &&
	         "We only custom lower integer vector loads.");

	  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	  SDLoc dl(Ld);
	  EVT MemVT = Ld->getMemoryVT();

	  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
	  if (RegVT.getVectorElementType() == MVT::i1) {
	    assert(EVT(RegVT) == MemVT && "Expected non-extending load");
	    assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
	    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
	           "Expected AVX512F without AVX512DQI");

	    SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
	                                Ld->getPointerInfo(), Ld->getAlignment(),
	                                Ld->getMemOperand()->getFlags());

	    // Replace chain users with the new chain.
	    assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");

	    // Widen the loaded byte to i16, view it as v16i1 and take the low
	    // RegVT-sized subvector.
	    SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
	    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
	                      DAG.getBitcast(MVT::v16i1, Val),
	                      DAG.getIntPtrConstant(0, dl));
	    return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
	  }

	  // Nothing useful we can do without SSE2 shuffles.
	  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned RegSz = RegVT.getSizeInBits();

	  ISD::LoadExtType Ext = Ld->getExtensionType();

	  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
	         && "Only anyext and sext are currently implemented.");
	  assert(MemVT != RegVT && "Cannot extend to the same type");
	  assert(MemVT.isVector() && "Must load a vector from memory");

	  unsigned NumElems = RegVT.getVectorNumElements();
	  unsigned MemSz = MemVT.getSizeInBits();
	  assert(RegSz > MemSz && "Register size must be greater than the mem size");

	  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
	    // The only way in which we have a legal 256-bit vector result but not the
	    // integer 256-bit operations needed to directly lower a sextload is if we
	    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
	    // a 128-bit vector and a normal sign_extend to 256-bits that should get
	    // correctly legalized. We do this late to allow the canonical form of
	    // sextload to persist throughout the rest of the DAG combiner -- it wants
	    // to fold together any extensions it can, and so will fuse a sign_extend
	    // of an sextload into a sextload targeting a wider value.
	    SDValue Load;
	    if (MemSz == 128) {
	      // Just switch this to a normal load.
	      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
	                                       "it must be a legal 128-bit vector "
	                                       "type!");
	      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    } else {
	      assert(MemSz < 128 &&
	             "Can't extend a type wider than 128 bits to a 256 bit vector!");
	      // Do an sext load to a 128-bit vector type. We want to use the same
	      // number of elements, but elements half as wide. This will end up being
	      // recursively lowered by this routine, but will succeed as we definitely
	      // have all the necessary features if we're using AVX1.
	      EVT HalfEltVT =
	          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
	      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
	      Load =
	          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    }

	    // Replace chain users with the new chain.
	    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");

	    // Finally, do a normal sign-extend to the desired register.
	    SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
	    return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
	  }

	  // All sizes must be a power of two.
	  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
	         "Non-power-of-two elements are not custom lowered!");

	  // Attempt to load the original value using scalar loads.
	  // Find the largest scalar type that divides the total loaded size.
	  MVT SclrLoadTy = MVT::i8;
	  for (MVT Tp : MVT::integer_valuetypes()) {
	    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
	      SclrLoadTy = Tp;
	    }
	  }

	  // On 32bit systems, we can't load 64bit integers. Try bitcasting to F64.
	  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
	      (64 <= MemSz))
	    SclrLoadTy = MVT::f64;

	  // Calculate the number of scalar loads that we need to perform
	  // in order to load our vector from memory.
	  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

	  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
	         "Can only lower sext loads with a single scalar load!");

	  // Sign-extending loads into 256-bit or wider registers assemble the loaded
	  // data in a 128-bit vector first.
	  unsigned loadRegSize = RegSz;
	  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
	    loadRegSize = 128;

	  // If we don't have BWI we won't be able to create the shuffle needed for
	  // v8i8->v8i64.
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8)
	    loadRegSize = 128;

	  // Represent our vector as a sequence of elements which are the
	  // largest scalar that we can load.
	  EVT LoadUnitVecVT = EVT::getVectorVT(
	      *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());

	  // Represent the data using the same element type that is stored in
	  // memory. In practice, we "widen" MemVT.
	  EVT WideVecVT =
	      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                       loadRegSize / MemVT.getScalarSizeInBits());

	  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
	         "Invalid vector type");

	  // We can't shuffle using an illegal type.
	  assert(TLI.isTypeLegal(WideVecVT) &&
	         "We only lower types that form legal widened vector types");

	  SmallVector<SDValue, 8> Chains;
	  SDValue Ptr = Ld->getBasePtr();
	  // Byte stride between consecutive scalar loads.
	  unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
	  SDValue Increment = DAG.getConstant(OffsetInc, dl,
	                                      TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

	  unsigned Offset = 0;
	  for (unsigned i = 0; i < NumLoads; ++i) {
	    unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);

	    // Perform a single load.
	    SDValue ScalarLoad =
	        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
	                    Ld->getPointerInfo().getWithOffset(Offset),
	                    NewAlign, Ld->getMemOperand()->getFlags());
	    Chains.push_back(ScalarLoad.getValue(1));
	    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
	    // another round of DAGCombining.
	    if (i == 0)
	      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
	    else
	      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
	                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

	    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
	    Offset += OffsetInc;
	  }

	  // Join the chains of all scalar loads into the single output chain.
	  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

	  // Bitcast the loaded value to a vector of the original element type, in
	  // the size of the target vector type.
	  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
	  unsigned SizeRatio = RegSz / MemSz;

	  if (Ext == ISD::SEXTLOAD) {
	    SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
	    return DAG.getMergeValues({Sext, TF}, dl);
	  }

	  // Same v8i8 -> v8i64 case as above: use an in-vector zero extend in place
	  // of the shuffle.
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8) {
	    SDValue ZExt = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
	    return DAG.getMergeValues({ZExt, TF}, dl);
	  }

	  // Redistribute the loaded elements into the different locations.
	  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	  for (unsigned i = 0; i != NumElems; ++i)
	    ShuffleVec[i * SizeRatio] = i;

	  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
	                                       DAG.getUNDEF(WideVecVT), ShuffleVec);

	  // Bitcast to the requested type.
	  Shuff = DAG.getBitcast(RegVT, Shuff);
	  return DAG.getMergeValues({Shuff, TF}, dl);
	}

	/// Return true if \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes with exactly one use each.
	///
	/// \p Opc is always overwritten with the opcode of \p Op, even when the
	/// function returns false.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  switch (Opc) {
	  case ISD::AND:
	  case ISD::OR:
	    break;
	  default:
	    return false;
	  }

	  auto IsSingleUseSetCC = [](SDValue V) {
	    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
	  };
	  return IsSingleUseSetCC(Op.getOperand(0)) &&
	         IsSingleUseSetCC(Op.getOperand(1));
	}

	/// Return true if \p Op is an ISD::XOR whose second operand is the constant 1
	/// and whose first operand is an X86ISD::SETCC node with a single use.
	static bool isXor1OfSetCC(SDValue Op) {
	  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
	    return false;

	  SDValue Inner = Op.getOperand(0);
	  return Inner.getOpcode() == X86ISD::SETCC && Inner.hasOneUse();
	}

	/// Lower a conditional branch (operands: chain, condition, destination) to
	/// X86ISD::BRCOND.
	///
	/// The function tries to reuse an existing flag-producing node as the branch
	/// condition. Only if none of the recognized patterns matches (addTest is
	/// still true at the end) does it emit an explicit compare of the condition
	/// against zero. Some floating-point patterns (the AND/OR of two SETCCs,
	/// SETOEQ, SETUNE) are lowered to two chained X86ISD::BRCOND nodes.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  bool addTest = true;
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);
	  SDValue CC;
	  bool Inverted = false;

	  if (Cond.getOpcode() == ISD::SETCC) {
	    // Check for setcc([su]{add,sub,mul}o == 0).
	    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        Cond.getOperand(0).getResNo() == 1 &&
	        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
	         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
	      // Branching on "overflow flag == 0": branch on the overflow op itself
	      // and remember to invert the condition code later.
	      Inverted = true;
	      Cond = Cond.getOperand(0);
	    } else {
	      if (SDValue NewCond = LowerSETCC(Cond, DAG))
	        Cond = NewCond;
	    }
	  }
	#if 0
	  // FIXME: LowerXALUO doesn't handle these!!
	  else if (Cond.getOpcode() == X86ISD::ADD ||
	           Cond.getOpcode() == X86ISD::SUB ||
	           Cond.getOpcode() == X86ISD::SMUL ||
	           Cond.getOpcode() == X86ISD::UMUL)
	    Cond = LowerXALUO(Cond, DAG);
	#endif

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    unsigned Opc = Cmp.getOpcode();
	    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
	    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
	      Cond = Cmp;
	      addTest = false;
	    } else {
	      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
	      default: break;
	      case X86::COND_O:
	      case X86::COND_B:
	        // These can only come from an arithmetic instruction with overflow,
	        // e.g. SADDO, UADDO.
	        Cond = Cond.getOperand(1);
	        addTest = false;
	        break;
	      }
	    }
	  }
	  CondOpcode = Cond.getOpcode();
	  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	      CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
	    SDValue Value;
	    X86::CondCode X86Cond;
	    std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);

	    if (Inverted)
	      X86Cond = X86::GetOppositeBranchCondition(X86Cond);

	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    addTest = false;
	  } else {
	    unsigned CondOpc;
	    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
	      SDValue Cmp = Cond.getOperand(0).getOperand(1);
	      if (CondOpc == ISD::OR) {
	        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
	        // two branches instead of an explicit OR instruction with a
	        // separate test.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp)) {
	          CC = Cond.getOperand(0).getOperand(0);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = Cond.getOperand(1).getOperand(0);
	          Cond = Cmp;
	          addTest = false;
	        }
	      } else { // ISD::AND
	        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
	        // two branches instead of an explicit AND instruction with a
	        // separate test. However, we only do this if this block doesn't
	        // have a fall-through edge, because this requires an explicit
	        // jmp when the condition is false.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp) &&
	            Op.getNode()->hasOneUse()) {
	          X86::CondCode CCode =
	            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	          CCode = X86::GetOppositeBranchCondition(CCode);
	          CC = DAG.getConstant(CCode, dl, MVT::i8);
	          // The single user of this branch node (hasOneUse was checked above).
	          SDNode *User = *Op.getNode()->use_begin();
	          // Look for an unconditional branch following this conditional branch.
	          // We need this because we need to reverse the successors in order
	          // to implement FCMP_OEQ.
	          if (User->getOpcode() == ISD::BR) {
	            SDValue FalseBB = User->getOperand(1);
	            SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	            assert(NewBR == User);
	            (void)NewBR;
	            Dest = FalseBB;

	            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                                Chain, Dest, CC, Cmp);
	            X86::CondCode CCode =
	              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
	            CCode = X86::GetOppositeBranchCondition(CCode);
	            CC = DAG.getConstant(CCode, dl, MVT::i8);
	            Cond = Cmp;
	            addTest = false;
	          }
	        }
	      }
	    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
	      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
	      // It should be transformed during dag combiner except when the condition
	      // is set by a arithmetics with overflow node.
	      X86::CondCode CCode =
	        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	      CCode = X86::GetOppositeBranchCondition(CCode);
	      CC = DAG.getConstant(CCode, dl, MVT::i8);
	      Cond = Cond.getOperand(0).getOperand(1);
	      addTest = false;
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        // The single user of this branch node.
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;
	          Dest = FalseBB;

	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	        }
	      }
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test.
	      SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                Cond.getOperand(0), Cond.getOperand(1));
	      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	      CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	      Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                          Chain, Dest, CC, Cmp);
	      CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	      Cond = Cmp;
	      addTest = false;
	    }
	  }

	  if (addTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result of AND is compared against zero. Try to match
	    // it to BT.
	    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	      SDValue BTCC;
	      if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
	        CC = BTCC;
	        Cond = BT;
	        addTest = false;
	      }
	    }
	  }

	  if (addTest) {
	    // No reusable flag producer was found: compare the condition value
	    // against zero explicitly.
	    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
	                   X86Cond, dl, DAG);
	  }
	  Cond = ConvertCmpIfNecessary(Cond, DAG);
	  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                     Chain, Dest, CC, Cond);
	}

	// Lower dynamic stack allocation.
	//
	// Three strategies are used, depending on the target and function:
	//  - Default: subtract the size from the stack pointer register and, if the
	//    requested alignment exceeds the stack alignment, mask the result.
	//  - Split (segmented) stacks: emit X86ISD::SEG_ALLOCA.
	//  - Windows (non-MachO) targets or functions with a stack probe symbol:
	//    emit X86ISD::WIN_ALLOCA, i.e. a call to _alloca on Cygwin/Mingw.
	//    Calls to _alloca are needed to probe the stack when allocating more than
	//    4k bytes in one go. Touching the stack at 4K increments is necessary to
	//    ensure that the guard pages used by the OS virtual memory manager are
	//    allocated in correct sequence.
	//
	// Operands of Op: chain, size, alignment (constant). Returns the merged pair
	// {allocated address, output chain}.
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool SplitStack = MF.shouldSplitStack();
	  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
	  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
	               SplitStack || EmitStackProbe;
	  SDLoc dl(Op);

	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size  = Op.getOperand(1);
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  bool Is64Bit = Subtarget.is64Bit();
	  MVT SPTy = getPointerTy(DAG.getDataLayout());

	  SDValue Result;
	  if (!Lower) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                    " not tell us which reg is the stack pointer!");

	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	    Chain = SP.getValue(1);
	    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	    unsigned StackAlign = TFI.getStackAlignment();
	    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	    // Only mask when the request is stricter than the stack alignment.
	    if (Align > StackAlign)
	      Result = DAG.getNode(ISD::AND, dl, VT, Result,
	                           DAG.getConstant(-(uint64_t)Align, dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	  } else if (SplitStack) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();

	    if (Is64Bit) {
	      // The 64 bit implementation of segmented stacks needs to clobber both r10
	      // r11. This makes it impossible to use it along with nested parameters.
	      const Function &F = MF.getFunction();
	      for (const auto &A : F.args()) {
	        if (A.hasNestAttr())
	          report_fatal_error("Cannot use segmented stacks with functions that "
	                             "have nested arguments.");
	      }
	    }

	    // The size is passed to SEG_ALLOCA through a fresh virtual register.
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
	    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	                         DAG.getRegister(Vreg, SPTy));
	  } else {
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	    // Read the stack pointer back after the allocation node.
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    unsigned SPReg = RegInfo->getStackRegister();
	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	    Chain = SP.getValue(1);

	    if (Align) {
	      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	                       DAG.getConstant(-(uint64_t)Align, dl, VT));
	      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	    }

	    Result = SP;
	  }

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {Result, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// Lower a va_start node.
	///
	/// On 32-bit targets and for the Win64 calling convention this is a single
	/// store of the VarArgsFrameIndex address into the va_list location
	/// (operand 1). Otherwise four stores fill the __va_list_tag fields
	/// (gp_offset, fp_offset, overflow_arg_area, reg_save_area) and the result is
	/// a TokenFactor over those stores.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDLoc DL(Op);

	  if (!Subtarget.is64Bit() ||
	      Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	                        MachinePointerInfo(SV));
	  }

	  // __va_list_tag:
	  //   gp_offset         (0 - 6 * 8)
	  //   fp_offset         (48 - 48 + 8 * 16)
	  //   overflow_arg_area (point to parameters coming in memory).
	  //   reg_save_area
	  SmallVector<SDValue, 8> MemOps;
	  SDValue FIN = Op.getOperand(1);
	  // Store gp_offset
	  SDValue Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV));
	  MemOps.push_back(Store);

	  // Store fp_offset
	  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV, 4));
	  MemOps.push_back(Store);

	  // Store ptr to overflow_arg_area
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	  Store =
	      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	  MemOps.push_back(Store);

	  // Store ptr to reg_save_area.
	  // The previous field is a pointer: 8 bytes under LP64, 4 bytes otherwise,
	  // which puts this field at offset 16 or 12 respectively.
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
	      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
	  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL, RSFIN, FIN,
	      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
	  MemOps.push_back(Store);
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Lower a VAARG node for 64-bit targets.
	///
	/// Operands as read below: 0 = chain, 1 = pointer to the va_list, 2 = SrcValue
	/// for the va_list memory, 3 = constant alignment.
	///
	/// Win64 calling conventions are delegated to the generic
	/// SelectionDAG::expandVAArg. Otherwise an X86ISD::VAARG_64 memory intrinsic
	/// node is created that yields the argument's address, and the argument is
	/// then loaded from that address.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  if (ArgVT == MVT::f80) {
	    llvm_unreachable("va_arg for f80 not yet implemented");
	  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
	  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
	    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
	  } else {
	    llvm_unreachable("Unhandled argument type in LowerVAARG");
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  // The node is marked as both loading from and storing to the va_list
	  // memory described by SV.
	  SDValue VAARG = DAG.getMemIntrinsicNode(
	      X86ISD::VAARG_64, dl,
	      VTs, InstOps, MVT::i64,
	      MachinePointerInfo(SV),
	      /*Align=*/0,
	      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower a VACOPY node for 64-bit targets.
	///
	/// Operands as read below: 0 = chain, 1 = destination va_list pointer,
	/// 2 = source va_list pointer, 3/4 = SrcValues for destination and source.
	///
	/// Win64 calling conventions are delegated to SelectionDAG::expandVACopy;
	/// otherwise the whole va_list structure is copied with a 24-byte,
	/// 8-byte-aligned memcpy.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	        DAG.getMachineFunction().getFunction().getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  // 24 bytes = 4 + 4 + 8 + 8 for the struct described above.
	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
	                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
	                       false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	// Map a shift opcode (generic ISD shift or either X86 uniform vector shift
	// form) onto the X86 uniform vector shift of the same direction. The
	// variable-amount form is returned when IsVariable is set, the immediate
	// form otherwise. Any other opcode is unreachable.
	static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
	  unsigned VariableForm;
	  unsigned ImmediateForm;

	  switch (Opc) {
	  default:
	    llvm_unreachable("Unknown target vector shift node");
	  case ISD::SHL:
	  case X86ISD::VSHL:
	  case X86ISD::VSHLI:
	    VariableForm = X86ISD::VSHL;
	    ImmediateForm = X86ISD::VSHLI;
	    break;
	  case ISD::SRL:
	  case X86ISD::VSRL:
	  case X86ISD::VSRLI:
	    VariableForm = X86ISD::VSRL;
	    ImmediateForm = X86ISD::VSRLI;
	    break;
	  case ISD::SRA:
	  case X86ISD::VSRA:
	  case X86ISD::VSRAI:
	    VariableForm = X86ISD::VSRA;
	    ImmediateForm = X86ISD::VSRAI;
	    break;
	  }

	  if (IsVariable)
	    return VariableForm;
	  return ImmediateForm;
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \param Opc      X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI.
	/// \param VT       Result vector type; \p SrcOp is bitcast to it if it has a
	///                 different type.
	/// \param ShiftAmt Per-element shift amount in bits.
	/// \returns \p SrcOp (after the bitcast) for a zero shift; a zero vector for
	/// a non-VSRAI shift by at least the element width (VSRAI is clamped to
	/// width - 1 instead); a constant-folded build vector when \p SrcOp is a
	/// build vector of constants/undefs; otherwise an \p Opc node whose shift
	/// amount is an i8 constant.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    // Shift one constant element according to the requested opcode.
	    auto ShiftElt = [Opc, ShiftAmt](const APInt &C) -> APInt {
	      switch (Opc) {
	      default: llvm_unreachable("Unknown opcode!");
	      case X86ISD::VSHLI: return C.shl(ShiftAmt);
	      case X86ISD::VSRLI: return C.lshr(ShiftAmt);
	      case X86ISD::VSRAI: return C.ashr(ShiftAmt);
	      }
	    };

	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();
	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue CurrentOp = SrcOp->getOperand(i);
	      // Undef elements are passed through unchanged.
	      if (CurrentOp->isUndef()) {
	        Elts.push_back(CurrentOp);
	        continue;
	      }
	      auto *ND = cast<ConstantSDNode>(CurrentOp);
	      Elts.push_back(
	          DAG.getConstant(ShiftElt(ND->getAPIntValue()), dl, ElementType));
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// A constant \p ShAmt is forwarded to getTargetVShiftByConstNode. Otherwise
	/// \p Opc is switched to its variable-amount form and the scalar amount
	/// (i32 or i64) is placed in a 128-bit vector, which is bitcast to a vector
	/// with \p VT's element type before being used as the shift operand.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                   SDValue SrcOp, SDValue ShAmt,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT SVT = ShAmt.getSimpleValueType();
	  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

	  // Catch shift-by-constant.
	  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	                                      CShAmt->getZExtValue(), DAG);

	  // Change opcode to non-immediate version.
	  Opc = getTargetVShiftUniformOpcode(Opc, true);

	  // Need to build a vector containing shift amount.
	  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	  // +====================+============+=======================================+
	  // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
	  // +====================+============+=======================================+
	  // | i64                | Yes, No    | Use ShAmt as lowest elt               |
	  // | i32                | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
	  // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	  // +====================+============+=======================================+

	  if (SVT == MVT::i64)
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	  else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	           ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	           (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
	            ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
	    // Work on the narrow i8/i16 value underneath the zero-extend.
	    ShAmt = ShAmt.getOperand(0);
	    MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
	    if (Subtarget.hasSSE41())
	      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
	                          MVT::v2i64, ShAmt);
	    else {
	      // Without SSE4.1, shift the vector left and then right by the same
	      // number of bytes (128 bits minus one element) as v16i8.
	      SDValue ByteShift = DAG.getConstant(
	          (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
	      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
	      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
	                          ByteShift);
	      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
	                          ByteShift);
	    }
	  } else if (Subtarget.hasSSE41() &&
	             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	    ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
	                        MVT::v2i64, ShAmt);
	  } else {
	    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
	                        DAG.getUNDEF(SVT)};
	    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	  }

	  // The return type has to be a 128-bit type with the same element
	  // type as the input type.
	  MVT EltVT = VT.getVectorElementType();
	  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

	  ShAmt = DAG.getBitcast(ShVT, ShAmt);
	  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// Convert the scalar integer \p Mask into a value of the vXi1 type
	/// \p MaskVT, for use when lowering masking intrinsics.
	///
	/// All-ones and zero masks become constants of \p MaskVT. An i64 mask on a
	/// 32-bit subtarget is split into two i32 halves that are bitcast to v32i1
	/// and concatenated. Any other mask is bitcast to a vXi1 vector as wide as
	/// the scalar, from which the low \p MaskVT elements are extracted.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant masks need no conversion nodes at all.
	  if (isAllOnesConstant(Mask))
	    return DAG.getConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getConstant(0, dl, MaskVT);

	  assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");

	  const MVT ScalarTy = Mask.getSimpleValueType();
	  if (ScalarTy != MVT::i64 || !Subtarget.is32Bit()) {
	    // Common case: reinterpret the scalar as a vector of i1 and take the
	    // leading MaskVT-sized piece. For a v2i1 or v4i1 MaskVT this extracts
	    // just the low 2 or 4 elements.
	    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, ScalarTy.getSizeInBits());
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
	                       DAG.getBitcast(WideMaskVT, Mask),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // An i64 bitcast is illegal in 32-bit mode, so handle the two 32-bit
	  // halves of the mask separately.
	  assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");

	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));

	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// Apply \p Mask to the vector result \p Op when lowering masking intrinsics.
	///
	/// \returns \p Op unchanged when \p Mask is an all-ones constant; otherwise
	/// (vselect VMask, \p Op, \p PreservedSrc), where VMask is \p Mask converted
	/// by getMaskNode to a vXi1 vector with one element per element of \p Op.
	/// An undef \p PreservedSrc is replaced by a zero vector.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  const MVT ResultVT = Op.getSimpleValueType();
	  const MVT MaskVT =
	      MVT::getVectorVT(MVT::i1, ResultVT.getVectorNumElements());
	  SDLoc dl(Op);

	  // Every lane is selected: nothing to blend.
	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  // Masked-off lanes take the pass-through value, or zero if it is undef.
	  SDValue MaskedOffVal = PreservedSrc.isUndef()
	                             ? getZeroVector(ResultVT, Subtarget, DAG, dl)
	                             : PreservedSrc;
	  return DAG.getNode(ISD::VSELECT, dl, ResultVT, VMask, Op, MaskedOffVal);
	}

	/// Creates an SDNode for a predicated scalar operation.
	///
	/// The mask arrives as MVT::i8 and is converted to MVT::v1i1 (bit 0) while
	/// lowering masking intrinsics.
	///
	/// \returns
	///  - \p Op unchanged when \p Mask is a constant with bit 0 set;
	///  - (and \p Op, IMask) when \p Op is an X86ISD::FSETCCM,
	///    X86ISD::FSETCCM_SAE or X86ISD::VFPCLASSS node;
	///  - otherwise (X86ISD::SELECTS IMask, \p Op, \p PreservedSrc), with an
	///    undef \p PreservedSrc replaced by a zero vector.
	///
	/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
	/// "X86select" instead of "vselect". We just can't create the "vselect" node
	/// for a scalar instruction.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {

	  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
	    if (MaskConst->getZExtValue() & 0x1)
	      return Op;

	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
	  // Only the lowest mask bit matters: view the i8 as v8i1 and take element 0
	  // as a v1i1.
	  SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
	                              DAG.getBitcast(MVT::v8i1, Mask),
	                              DAG.getIntPtrConstant(0, dl));
	  if (Op.getOpcode() == X86ISD::FSETCCM ||
	      Op.getOpcode() == X86ISD::FSETCCM_SAE ||
	      Op.getOpcode() == X86ISD::VFPCLASSS)
	    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
	}

	/// Return the size in bytes of the EH registration node for \p Fn, based on
	/// its personality function. Reports a fatal error if \p Fn has no
	/// personality or the personality is neither MSVC x86 SEH nor MSVC C++ EH.
	static int getSEHRegistrationNodeSize(const Function *Fn) {
	  if (!Fn->hasPersonalityFn())
	    report_fatal_error(
	        "querying registration node size for function without personality");

	  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	  // WinEHStatePass for the full struct definition.
	  const auto Personality = classifyEHPersonality(Fn->getPersonalityFn());
	  if (Personality == EHPersonality::MSVC_X86SEH)
	    return 24;
	  if (Personality == EHPersonality::MSVC_CXX)
	    return 16;

	  report_fatal_error(
	      "can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// Recover the parent function's frame pointer from the EBP value that was
	/// live on entry (\p EntryEBP), for code the MSVC runtime transfers control
	/// to: an outlined function, or a parent frame resumed after catching an
	/// exception.
	///
	/// If \p Fn has no personality function, \p EntryEBP is returned as is.
	/// For 64-bit subtargets the result is EntryEBP + ParentFrameOffset.
	/// Otherwise:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP    = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize takes us to the offset of the registration node,
	/// and subtracting the offset (negative on x86) takes us back to the parent
	/// FP.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  SDLoc Loc;
	  const MVT PtrVT =
	      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // The parent may have lost its personality function if all exceptional
	  // code was optimized away; the incoming EBP is then used directly.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  // Look up the symbol that will resolve to the parent's frame offset (the
	  // frame offset of the EH registration, or the .seh_setframe offset) and
	  // wrap it in a LOCAL_RECOVER node.
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto &Ctx = MF.getMMI().getContext();
	  MCSymbol *OffsetSym = Ctx.getOrCreateParentFrameOffsetSymbol(
	      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
	  SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, Loc, PtrVT,
	                                          DAG.getMCSymbol(OffsetSym, PtrVT));

	  const auto &ST = static_cast<const X86Subtarget &>(DAG.getSubtarget());
	  if (!ST.is64Bit()) {
	    // 32-bit: step back over the registration node, then undo the offset.
	    const int RegNodeSize = getSEHRegistrationNodeSize(Fn);
	    SDValue RegNodeBase = DAG.getNode(ISD::SUB, Loc, PtrVT, EntryEBP,
	                                      DAG.getConstant(RegNodeSize, Loc, PtrVT));
	    return DAG.getNode(ISD::SUB, Loc, PtrVT, RegNodeBase, ParentFrameOffset);
	  }

	  // x64: adjust from RSP after the prologue to RBP in the parent function.
	  return DAG.getNode(ISD::ADD, Loc, PtrVT, EntryEBP, ParentFrameOffset);
	}

	SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
	return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;

	return false;
	};
	auto isRoundModeSAE = [](SDValue Rnd) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
	return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;

	return false;
	};
	auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
	if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
	RC = C->getZExtValue();
	if (RC & X86::STATIC_ROUNDING::NO_EXC) {
	// Clear the NO_EXC bit and check remaining bits.
	RC ^= X86::STATIC_ROUNDING::NO_EXC;
	return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT \|\|
	RC == X86::STATIC_ROUNDING::TO_NEG_INF \|\|
	RC == X86::STATIC_ROUNDING::TO_POS_INF \|\|
	RC == X86::STATIC_ROUNDING::TO_ZERO;
	}
	}

	return false;
	};

	SDLoc dl(Op);
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP: {
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(2);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Op.getOperand(1),
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
	}
	case INTR_TYPE_1OP_SAE: {
	SDValue Sae = Op.getOperand(2);

	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
	}
	case INTR_TYPE_2OP: {
	SDValue Src2 = Op.getOperand(2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(3);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Op.getOperand(1), Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}

	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Src2);
	}
	case INTR_TYPE_2OP_SAE: {
	SDValue Sae = Op.getOperand(3);

	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	}
	case INTR_TYPE_3OP:
	case INTR_TYPE_3OP_IMM8: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);

	if (IntrData->Type == INTR_TYPE_3OP_IMM8)
	Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Src1, Src2, Src3,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}

	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Src1, Src2, Src3);
	}
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RC Opcode is specified and
	// - RC is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return getVectorMaskingNode(
	DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
	Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
	Mask, PassThru, Subtarget, DAG);
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK_SAE: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue Rnd = Op.getOperand(4);

	unsigned Opc;
	if (isRoundModeCurDirection(Rnd))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Rnd))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	bool HasRounding = IntrWithRoundingModeOpcode != 0;
	if (Op.getNumOperands() == (5U + HasRounding)) {
	if (HasRounding) {
	SDValue Rnd = Op.getOperand(5);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	return getScalarMaskingNode(
	DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32)),
	Mask, passThru, Subtarget, DAG);
	if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2),
	Mask, passThru, Subtarget, DAG);
	}

	assert(Op.getNumOperands() == (6U + HasRounding) &&
	"Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	unsigned Opc = IntrData->Opc0;
	if (HasRounding) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrWithRoundingModeOpcode;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
	Src2, RoundingMode),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RND: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue Rnd = Op.getOperand(5);

	SDValue NewOp;
	unsigned RC = 0;
	if (isRoundModeCurDirection(Rnd))
	NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
	else if (isRoundModeSAEToX(Rnd, RC))
	NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	else
	return SDValue();

	return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue Sae = Op.getOperand(5);
	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue NewOp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	unsigned RC = 0;
	if (isRoundModeSAEToX(Rnd, RC))
	NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
	DAG.getTargetConstant(RC, dl, MVT::i32));
	else if (!isRoundModeCurDirection(Rnd))
	return SDValue();
	}
	if (!NewOp)
	NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
	return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(5);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}

	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	unsigned Opc;
	if (isRoundModeCurDirection(Sae))
	Opc = IntrData->Opc0;
	else if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else
	return SDValue();

	return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_SAE: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case BLENDV: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);

	EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
	Src3 = DAG.getBitcast(MaskVT, Src3);

	// Reverse the operands to match VSELECT order.
	return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
	}
	case VPERM_2OP : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);

	// Swap Src1 and Src2 in the node creation
	return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
	}
	case IFMA_OP:
	// NOTE: We need to swizzle the operands to pass the multiply operands
	// first.
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	DAG.getConstant(0, dl, MVT::v8i1),
	FPclassMask, DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(MVT::i8, Ins);
	}

	case CMP_MASK_CC: {
	MVT MaskVT = Op.getSimpleValueType();
	SDValue CC = Op.getOperand(3);
	CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(4);
	if (isRoundModeSAE(Sae))
	return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Sae);
	if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	//default rounding mode
	return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC);
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(5);
	if (isRoundModeSAE(Sae))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}
	//default rounding mode
	if (!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	DAG.getConstant(0, dl, MVT::v8i1),
	CmpMask, DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(MVT::i8, Ins);
	}
	case COMI: { // Comparison intrinsics
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 0 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 1 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
	SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
	SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8));
	else if (isRoundModeSAE(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8), Sae);
	else
	return SDValue();
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	DAG.getConstant(0, dl, MVT::v16i1),
	FCmp, DAG.getIntPtrConstant(0, dl));
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
	DAG.getBitcast(MVT::i16, Ins));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
	return Op.getOperand(1);

	// Avoid false dependency.
	if (PassThru.isUndef())
	PassThru = DAG.getConstant(0, dl, VT);

	return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
	Mask);
	}
	case FIXUPIMM:
	case FIXUPIMM_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Passthru = (IntrData->Type == FIXUPIMM)
	? Src1
	: getZeroVector(VT, Subtarget, DAG, dl);

	unsigned Opc = IntrData->Opc0;
	if (IntrData->Opc1 != 0) {
	SDValue Sae = Op.getOperand(6);
	if (isRoundModeSAE(Sae))
	Opc = IntrData->Opc1;
	else if (!isRoundModeCurDirection(Sae))
	return SDValue();
	}

	SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);

	if (Opc == X86ISD::VFIXUPIMM \|\| Opc == X86ISD::VFIXUPIMM_SAE)
	return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);

	return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
	}
	case ROUNDP: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(2),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), RoundingMode);
	}
	case ROUNDS: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(3),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Op.getOperand(2), RoundingMode);
	}
	// ADC/ADCX/SBB
	case ADX: {
	SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
	SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);

	SDValue Res;
	// If the carry in is zero, then we should just use ADD/SUB instead of
	// ADC/SBB.
	if (isNullConstant(Op.getOperand(1))) {
	Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
	Op.getOperand(3));
	} else {
	SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
	DAG.getConstant(-1, dl, MVT::i8));
	Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
	Op.getOperand(3), GenCF.getValue(1));
	}
	SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	SDValue Results[] = { SetCC, Res };
	return DAG.getMergeValues(Results, dl);
	}
	case CVTPD2PS_MASK:
	case CVTPD2DQ_MASK:
	case CVTQQ2PS_MASK:
	case TRUNCATE_TO_REG: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);

	if (isAllOnesConstant(Mask))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

	MVT SrcVT = Src.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
	Mask);
	}
	case CVTPS2PH_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue Rnd = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	if (isAllOnesConstant(Mask))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);

	MVT SrcVT = Src.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
	PassThru, Mask);

	}
	case CVTNEPS2BF16_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);

	if (ISD::isBuildVectorAllOnes(Mask.getNode()))
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);

	// Break false dependency.
	if (PassThru.isUndef())
	PassThru = DAG.getConstant(0, dl, PassThru.getValueType());

	return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
	Mask);
	}
	default:
	break;
	}
	}

	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	// ptest and testp intrinsics. The intrinsic these come from are designed to
	// return an integer value, not just an instruction so lower it to the ptest
	// or testp pattern and a setcc for the result.
	case Intrinsic::x86_avx512_ktestc_b:
	case Intrinsic::x86_avx512_ktestc_w:
	case Intrinsic::x86_avx512_ktestc_d:
	case Intrinsic::x86_avx512_ktestc_q:
	case Intrinsic::x86_avx512_ktestz_b:
	case Intrinsic::x86_avx512_ktestz_w:
	case Intrinsic::x86_avx512_ktestz_d:
	case Intrinsic::x86_avx512_ktestz_q:
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	unsigned TestOpc = X86ISD::PTEST;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx512_ktestc_b:
	case Intrinsic::x86_avx512_ktestc_w:
	case Intrinsic::x86_avx512_ktestc_d:
	case Intrinsic::x86_avx512_ktestc_q:
	// CF = 1
	TestOpc = X86ISD::KTEST;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx512_ktestz_b:
	case Intrinsic::x86_avx512_ktestz_w:
	case Intrinsic::x86_avx512_ktestz_d:
	case Intrinsic::x86_avx512_ktestz_q:
	TestOpc = X86ISD::KTEST;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	TestOpc = X86ISD::TESTP;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTR;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTR;
	X86CC = X86::COND_E;
	break;
	}
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
	SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTR;
	else
	Opcode = X86ISD::PCMPESTR;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	case Intrinsic::x86_sse42_pcmpistrm128:
	case Intrinsic::x86_sse42_pcmpestrm128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
	Opcode = X86ISD::PCMPISTR;
	else
	Opcode = X86ISD::PCMPESTR;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
	}

	case Intrinsic::eh_sjlj_lsda: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(getGlobalWrapperKind(), dl, VT,
	DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::eh_recoverfp: {
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.eh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else { // Handles the SP or FP case.
	bool CantUseFP = RegInfo->needsStackRealignment(MF);
	if (CantUseFP)
	Reg = RegInfo->getPtrSizedStackRegister(MF);
	else
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	}
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}

	case Intrinsic::x86_avx512_vp2intersect_q_512:
	case Intrinsic::x86_avx512_vp2intersect_q_256:
	case Intrinsic::x86_avx512_vp2intersect_q_128:
	case Intrinsic::x86_avx512_vp2intersect_d_512:
	case Intrinsic::x86_avx512_vp2intersect_d_256:
	case Intrinsic::x86_avx512_vp2intersect_d_128: {
	MVT MaskVT = Op.getSimpleValueType();

	SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
	SDLoc DL(Op);

	SDValue Operation =
	DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
	Op->getOperand(1), Op->getOperand(2));

	SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
	MaskVT, Operation);
	SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
	MaskVT, Operation);
	return DAG.getMergeValues({Result0, Result1}, DL);
	}
	}
	}

	/// Build the X86MaskedGatherSDNode for an AVX2 gather intrinsic.
	///
	/// Yields a merge of result 0 (the gathered value) and result 2 (the
	/// MVT::Other result) of the new node, or an empty SDValue when \p ScaleOp
	/// is not a constant. \p Opc is accepted but not consulted here.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  // The scale has to be a constant; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (!ScaleC)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, MVT::i8);

	  // Result list: gathered data, the mask with integer elements, MVT::Other.
	  EVT IntMaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), IntMaskVT, MVT::Other);

	  // When the source is undef, or the mask is all ones so the source won't be
	  // used, substitute a zero vector to break the register dependency.
	  // TODO: use undef instead and let BreakFalseDeps deal with it?
	  const bool SrcIsLive =
	      !Src.isUndef() && !ISD::isBuildVectorAllOnes(Mask.getNode());
	  if (!SrcIsLive)
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, DL);

	  auto *MemNode = cast<MemIntrinsicSDNode>(Op);
	  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
	  SDValue Gather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	      ResultTys, GatherOps, DL, MemNode->getMemoryVT(),
	      MemNode->getMemOperand());
	  return DAG.getMergeValues({Gather, Gather.getValue(2)}, DL);
	}

	/// Build the X86MaskedGatherSDNode for a gather intrinsic whose mask is
	/// either scalar or vXi1.
	///
	/// Yields a merge of result 0 (the gathered value) and result 2 (the
	/// MVT::Other result) of the new node, or an empty SDValue when \p ScaleOp
	/// is not a constant.
	static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  // The scale has to be a constant; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (!ScaleC)
	    return SDValue();

	  SDLoc DL(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, MVT::i8);

	  // The mask has one i1 lane per element of the narrower of the index and
	  // result vectors.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  unsigned NumLanes = std::min(NumIndexElts, ResVT.getVectorNumElements());
	  MVT LaneMaskVT = MVT::getVectorVT(MVT::i1, NumLanes);

	  // Two flavours of the gather intrinsics exist: scalar mask and vXi1 mask.
	  // Anything that is not already the vXi1 type gets converted.
	  if (Mask.getValueType() != LaneMaskVT)
	    Mask = getMaskNode(Mask, LaneMaskVT, Subtarget, DAG, DL);

	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), LaneMaskVT, MVT::Other);

	  // When the source is undef, or the mask is all ones so the source won't be
	  // used, substitute a zero vector to break the register dependency.
	  // TODO: use undef instead and let BreakFalseDeps deal with it?
	  const bool SrcIsLive =
	      !Src.isUndef() && !ISD::isBuildVectorAllOnes(Mask.getNode());
	  if (!SrcIsLive)
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, DL);

	  auto *MemNode = cast<MemIntrinsicSDNode>(Op);
	  SDValue GatherOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
	  SDValue Gather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	      ResultTys, GatherOps, DL, MemNode->getMemoryVT(),
	      MemNode->getMemOperand());
	  return DAG.getMergeValues({Gather, Gather.getValue(2)}, DL);
	}

	/// Build the X86MaskedScatterSDNode for a scatter intrinsic whose mask is
	/// either scalar or vXi1.
	///
	/// Yields result 1 (the MVT::Other result) of the new node, or an empty
	/// SDValue when \p ScaleOp is not a constant. \p Opc is accepted but not
	/// consulted here.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  // The scale has to be a constant; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (!ScaleC)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, MVT::i8);

	  // The mask has one i1 lane per element of the narrower of the index and
	  // source vectors.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  unsigned NumSrcElts = Src.getSimpleValueType().getVectorNumElements();
	  MVT LaneMaskVT = MVT::getVectorVT(MVT::i1, std::min(NumIndexElts, NumSrcElts));

	  // Two flavours of the scatter intrinsics exist: scalar mask and vXi1 mask.
	  // Anything that is not already the vXi1 type gets converted.
	  if (Mask.getValueType() != LaneMaskVT)
	    Mask = getMaskNode(Mask, LaneMaskVT, Subtarget, DAG, DL);

	  auto *MemNode = cast<MemIntrinsicSDNode>(Op);

	  SDVTList ResultTys = DAG.getVTList(LaneMaskVT, MVT::Other);
	  SDValue ScatterOps[] = {Chain, Src, Mask, Base, Index, ScaleImm};
	  SDValue Scatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	      ResultTys, ScatterOps, DL, MemNode->getMemoryVT(),
	      MemNode->getMemOperand());
	  return Scatter.getValue(1);
	}

	/// Build the machine node \p Opc for a gather/scatter prefetch intrinsic.
	///
	/// The node produces a single MVT::Other result, which is returned. An empty
	/// SDValue is returned when \p ScaleOp is not a constant.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  // The scale has to be a constant; bail out otherwise.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (!ScaleC)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue ScaleImm = DAG.getTargetConstant(ScaleC->getZExtValue(), DL, MVT::i8);
	  // Zero displacement and register 0 as the segment.
	  SDValue ZeroDisp = DAG.getTargetConstant(0, DL, MVT::i32);
	  SDValue NoSegment = DAG.getRegister(0, MVT::i32);

	  // One i1 mask lane per index element.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT LaneMaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue LaneMask = getMaskNode(Mask, LaneMaskVT, Subtarget, DAG, DL);

	  SDValue PrefetchOps[] = {LaneMask, Base,      ScaleImm, Index,
	                           ZeroDisp, NoSegment, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, DL, MVT::Other, PrefetchOps);
	  return SDValue(Prefetch, 0);
	}

	/// Expand a chained builtin intrinsic whose machine instruction delivers its
	/// value in registers EDX:EAX.
	///
	/// When \p SrcReg is a valid (non-zero) register identifier, operand 2 of
	/// \p N is first copied into it, on the assumption that \p SrcReg is an
	/// implicit input of \p TargetOpcode.
	///
	/// Two entries are appended to \p Results: the combined 64-bit value and the
	/// output chain. The returned Glue can be used to attach further
	/// copy-from-reg nodes when the expanded intrinsic implicitly defines more
	/// registers than just EDX:EAX.
	static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
	                                           SelectionDAG &DAG,
	                                           unsigned TargetOpcode,
	                                           unsigned SrcReg,
	                                           const X86Subtarget &Subtarget,
	                                           SmallVectorImpl<SDValue> &Results) {
	  SDValue InChain = N->getOperand(0);
	  SDValue InGlue;

	  // Feed the implicit input register, if the instruction has one.
	  if (SrcReg) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	    InChain = DAG.getCopyToReg(InChain, DL, SrcReg, N->getOperand(2), InGlue);
	    InGlue = InChain.getValue(1);
	  }

	  // The machine node takes the chain, plus the glue only when one exists.
	  SmallVector<SDValue, 2> InstrOps;
	  InstrOps.push_back(InChain);
	  if (InGlue.getNode())
	    InstrOps.push_back(InGlue);
	  SDNode *Instr = DAG.getMachineNode(
	      TargetOpcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), InstrOps);

	  // Copy the two halves out of the A and D registers, at pointer width,
	  // threading chain and glue from the instruction through both copies.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  SDValue Lo =
	      DAG.getCopyFromReg(SDValue(Instr, 0), DL, LoReg, HalfVT, SDValue(Instr, 1));
	  SDValue Hi =
	      DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT, Lo.getValue(2));

	  SDValue Combined;
	  if (Is64Bit) {
	    // Merge the two 32-bit values into a 64-bit one: (Hi << 32) | Lo.
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
	                                    DAG.getConstant(32, DL, MVT::i8));
	    Combined = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = {Lo, Hi};
	    Combined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Combined);
	  Results.push_back(Hi.getValue(1));
	  return Hi.getValue(2);
	}

	/// Lower the builtin intrinsics that read the time stamp counter (x86_rdtsc
	/// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
	///
	/// On return \p Results holds { counter, chain } for every opcode other than
	/// X86::RDTSCP, and { counter, ecx, chain } for X86::RDTSCP.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
	  // EDX:EAX registers: EDX gets the high-order 32 bits of the MSR and EAX the
	  // low-order 32 bits. The helper needs no implicit input register here.
	  SDValue OutGlue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
	                                                /* NoRegister */0, Subtarget,
	                                                Results);

	  if (Opcode == X86::RDTSCP) {
	    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
	    // the ECX register. Read it with an explicit copy glued to the previous
	    // copies, put that value where the chain was and append the new chain.
	    SDValue AuxValue =
	        DAG.getCopyFromReg(Results[1], DL, X86::ECX, MVT::i32, OutGlue);
	    Results[1] = AuxValue;
	    Results.push_back(AuxValue.getValue(1));
	  }
	}

	/// Custom-lower READCYCLECOUNTER by expanding it as an X86::RDTSC read and
	/// merging the values that expansion produces.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  SmallVector<SDValue, 3> Parts;
	  getReadTimeStampCounter(Op.getNode(), Loc, X86::RDTSC, DAG, Subtarget, Parts);
	  return DAG.getMergeValues(Parts, Loc);
	}

	/// Record the frame index given as operand 2 of \p Op in the function's
	/// WinEHFuncInfo (EHRegNodeFrameIndex) and return the incoming chain
	/// (operand 0). No DAG nodes are created.
	///
	/// Reports a fatal error when the function has no WinEHFuncInfo or when
	/// operand 2 is not a FrameIndexSDNode.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue RegNodeOp = Op.getOperand(2);

	  WinEHFuncInfo *WinEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (!WinEHInfo)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand must have resolved to a frame index (a static alloca).
	  auto *FrameIdx = dyn_cast<FrameIndexSDNode>(RegNodeOp);
	  if (!FrameIdx)
	    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");

	  WinEHInfo->EHRegNodeFrameIndex = FrameIdx->getIndex();

	  // Only the chain is passed through.
	  return InChain;
	}

	/// Record the frame index given as operand 2 of \p Op in the function's
	/// WinEHFuncInfo (EHGuardFrameIndex) and return the incoming chain
	/// (operand 0). No DAG nodes are created.
	///
	/// Reports a fatal error when the function has no WinEHFuncInfo or when
	/// operand 2 is not a FrameIndexSDNode.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue GuardOp = Op.getOperand(2);

	  WinEHFuncInfo *WinEHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (!WinEHInfo)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand must have resolved to a frame index (a static alloca).
	  auto *FrameIdx = dyn_cast<FrameIndexSDNode>(GuardOp);
	  if (!FrameIdx)
	    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");

	  WinEHInfo->EHGuardFrameIndex = FrameIdx->getIndex();

	  // Only the chain is passed through.
	  return InChain;
	}

	/// Emit a saturating truncating store: a TruncSStoreSDNode when \p SignedSat
	/// is true, a TruncUSStoreSDNode otherwise.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  // The fourth operand is an undef of the pointer's type.
	  SDValue StoreOps[] = {Chain, Val, Ptr, DAG.getUNDEF(Ptr.getValueType())};

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ChainOnly, StoreOps, Dl,
	                                                     MemVT, MMO);
	  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ChainOnly, StoreOps, Dl,
	                                                    MemVT, MMO);
	}

	/// Emit a masked saturating truncating store: a MaskedTruncSStoreSDNode when
	/// \p SignedSat is true, a MaskedTruncUSStoreSDNode otherwise.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {Chain, Val, Ptr, Mask};

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(ChainOnly, StoreOps,
	                                                           Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(ChainOnly, StoreOps,
	                                                          Dl, MemVT, MMO);
	}

	/// Custom-lower an intrinsic node that carries a chain. Operand 0 is the
	/// chain and operand 1 holds the intrinsic ID as a constant; the intrinsic's
	/// own arguments start at operand 2.
	///
	/// Intrinsics with no entry from getIntrinsicWithChain() are handled by the
	/// first switch, which returns an empty SDValue for anything it does not
	/// recognize. All other intrinsics are dispatched on IntrData->Type.
	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	// No table entry: handle the few intrinsics that are special-cased here.
	if (!IntrData) {
	switch (IntNo) {
	case llvm::Intrinsic::x86_seh_ehregnode:
	return MarkEHRegistrationNode(Op, DAG);
	case llvm::Intrinsic::x86_seh_ehguard:
	return MarkEHGuard(Op, DAG);
	case llvm::Intrinsic::x86_rdpkru: {
	SDLoc dl(Op);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	// Create a RDPKRU node and pass 0 to the ECX parameter.
	return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
	DAG.getConstant(0, dl, MVT::i32));
	}
	case llvm::Intrinsic::x86_wrpkru: {
	SDLoc dl(Op);
	// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
	// to the EDX and ECX parameters.
	return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
	Op.getOperand(0), Op.getOperand(2),
	DAG.getConstant(0, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	}
	case llvm::Intrinsic::x86_flags_read_u32:
	case llvm::Intrinsic::x86_flags_read_u64:
	case llvm::Intrinsic::x86_flags_write_u32:
	case llvm::Intrinsic::x86_flags_write_u64: {
	// We need a frame pointer because this will get lowered to a PUSH/POP
	// sequence.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setHasCopyImplyingStackAdjustment(true);
	// Don't do anything here, we will expand these intrinsics out later
	// during FinalizeISel in EmitInstrWithCustomInserter.
	return SDValue();
	}
	case Intrinsic::x86_lwpins32:
	case Intrinsic::x86_lwpins64:
	case Intrinsic::x86_umwait:
	case Intrinsic::x86_tpause: {
	SDLoc dl(Op);
	SDValue Chain = Op->getOperand(0);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	unsigned Opcode;

	// Map the intrinsic to its X86ISD opcode; all four take three arguments.
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic");
	case Intrinsic::x86_umwait:
	Opcode = X86ISD::UMWAIT;
	break;
	case Intrinsic::x86_tpause:
	Opcode = X86ISD::TPAUSE;
	break;
	case Intrinsic::x86_lwpins32:
	case Intrinsic::x86_lwpins64:
	Opcode = X86ISD::LWPINS;
	break;
	}

	SDValue Operation =
	DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
	Op->getOperand(3), Op->getOperand(4));
	// The intrinsic's value is a COND_B setcc on result 0 of the operation;
	// result 1 is forwarded as the chain.
	SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
	Operation.getValue(1));
	}
	case Intrinsic::x86_enqcmd:
	case Intrinsic::x86_enqcmds: {
	SDLoc dl(Op);
	SDValue Chain = Op.getOperand(0);
	SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	unsigned Opcode;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic!");
	case Intrinsic::x86_enqcmd:
	Opcode = X86ISD::ENQCMD;
	break;
	case Intrinsic::x86_enqcmds:
	Opcode = X86ISD::ENQCMDS;
	break;
	}
	SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
	Op.getOperand(3));
	// The intrinsic's value is a COND_E setcc on result 0 of the operation;
	// result 1 is forwarded as the chain.
	SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
	Operation.getValue(1));
	}
	}
	// Any other intrinsic without a table entry is not custom lowered here.
	return SDValue();
	}

	// From here on the intrinsic has a table entry; dispatch on its type.
	SDLoc dl(Op);
	switch(IntrData->Type) {
	default: llvm_unreachable("Unknown Intrinsic Type");
	case RDSEED:
	case RDRAND: {
	// Emit the node with the right value type.
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
	SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	// Otherwise return the value from Rand, which is always 0, casted to i32.
	SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	DAG.getConstant(1, dl, Op->getValueType(1)),
	DAG.getConstant(X86::COND_B, dl, MVT::i8),
	SDValue(Result.getNode(), 1) };
	SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

	// Return { result, isValid, chain }.
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	SDValue(Result.getNode(), 2));
	}
	case GATHER_AVX2: {
	// Operands after the intrinsic ID: src, base, index, mask, scale.
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case GATHER: {
	//gather(v1, mask, index, base, scale);
	// NOTE(review): the operands are read below in the order src, base, index,
	// mask, scale, which differs from the order in the comment above - confirm
	// which one is intended.
	SDValue Chain = Op.getOperand(0);
	SDValue Src = Op.getOperand(2);
	SDValue Base = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
	Chain, Subtarget);
	}
	case SCATTER: {
	//scatter(base, mask, index, v1, scale);
	SDValue Chain = Op.getOperand(0);
	SDValue Base = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue Index = Op.getOperand(4);
	SDValue Src = Op.getOperand(5);
	SDValue Scale = Op.getOperand(6);
	return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	Scale, Chain, Subtarget);
	}
	case PREFETCH: {
	// Operands after the intrinsic ID: mask, index, base, scale, hint.
	SDValue Hint = Op.getOperand(6);
	unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
	assert((HintVal == 2 \|\| HintVal == 3) &&
	"Wrong prefetch hint in intrinsic: should be 2 or 3");
	// Hint 2 selects Opc1; the other accepted hint (3) selects Opc0.
	unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	SDValue Chain = Op.getOperand(0);
	SDValue Mask = Op.getOperand(2);
	SDValue Index = Op.getOperand(3);
	SDValue Base = Op.getOperand(4);
	SDValue Scale = Op.getOperand(5);
	return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	Subtarget);
	}
	// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	case RDTSC: {
	SmallVector<SDValue, 2> Results;
	getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	Results);
	return DAG.getMergeValues(Results, dl);
	}
	// Read Performance Monitoring Counters.
	case RDPMC:
	// GetExtended Control Register.
	case XGETBV: {
	SmallVector<SDValue, 2> Results;

	// RDPMC uses ECX to select the index of the performance counter to read.
	// XGETBV uses ECX to select the index of the XCR register to return.
	// The result is stored into registers EDX:EAX.
	expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
	Subtarget, Results);
	return DAG.getMergeValues(Results, dl);
	}
	// XTEST intrinsics.
	case XTEST: {
	SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	// The intrinsic's value is the COND_NE setcc on the node, zero-extended
	// to the intrinsic's result type.
	SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	Ret, SDValue(InTrans.getNode(), 1));
	}
	case TRUNCATE_TO_MEM_VI8:
	case TRUNCATE_TO_MEM_VI16:
	case TRUNCATE_TO_MEM_VI32: {
	// Operands after the intrinsic ID: address, data, mask.
	SDValue Mask = Op.getOperand(4);
	SDValue DataToTruncate = Op.getOperand(3);
	SDValue Addr = Op.getOperand(2);
	SDValue Chain = Op.getOperand(0);

	MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	assert(MemIntr && "Expected MemIntrinsicSDNode!");

	EVT MemVT = MemIntr->getMemoryVT();

	uint16_t TruncationOp = IntrData->Opc0;
	switch (TruncationOp) {
	case X86ISD::VTRUNC: {
	if (isAllOnesConstant(Mask)) // return just a truncate store
	return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand());

	// Otherwise convert the mask to one i1 per stored element and emit a
	// masked truncating store.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
	MemIntr->getMemOperand(), true /* truncating */);
	}
	case X86ISD::VTRUNCUS:
	case X86ISD::VTRUNCS: {
	// Saturating variants: VTRUNCS is signed, VTRUNCUS unsigned.
	bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	if (isAllOnesConstant(Mask))
	return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	MemIntr->getMemOperand(), DAG);

	MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	VMask, MemVT, MemIntr->getMemOperand(), DAG);
	}
	default:
	llvm_unreachable("Unsupported truncstore intrinsic");
	}
	}
	}
	}

	/// Lower RETURNADDR: load the return address of the frame selected by the
	/// constant depth in operand 0.
	///
	/// Marks the return address as taken, and returns an empty SDValue when
	/// verifyReturnAddressArgumentIsConstant() reports a problem.
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc DL(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue RetAddrSlot;
	  if (Depth > 0) {
	    // For an outer frame, the slot is one slot size past the address that
	    // LowerFRAMEADDR yields for the same depth.
	    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    SDValue SlotOffset = DAG.getConstant(RegInfo->getSlotSize(), DL, PtrVT);
	    RetAddrSlot = DAG.getNode(ISD::ADD, DL, PtrVT, FrameAddr, SlotOffset);
	  } else {
	    // Depth 0: just use the return address frame index.
	    RetAddrSlot = getReturnAddressFrameIndex(DAG);
	  }

	  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), RetAddrSlot,
	                     MachinePointerInfo());
	}

	/// Lower an address-of-return-address query: flag the return address as
	/// taken and hand back the return-address frame index node.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setReturnAddressIsTaken(true);
	  return getReturnAddressFrameIndex(DAG);
	}

	/// Lower a frame-address query. Operand 0 of \p Op is the constant depth.
	///
	/// On targets whose MCAsmInfo reports Windows CFI, the depth is ignored and
	/// a (lazily created, cached in X86MachineFunctionInfo) fixed frame object
	/// is returned as a frame index. Otherwise the pointer-sized frame register
	/// is copied and then dereferenced \c Depth times.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  EVT VT = Op.getValueType();

	  MFI.setFrameAddressIsTaken(true);

	  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
	    // is not possible to crawl up the stack without looking at the unwind codes
	    // simultaneously.
	    int FrameAddrIndex = FuncInfo->getFAIndex();
	    if (!FrameAddrIndex) {
	      // No cached index yet: create the fixed frame object once and
	      // remember it for later queries in this function.
	      unsigned SlotSize = RegInfo->getSlotSize();
	      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
	          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
	      FuncInfo->setFAIndex(FrameAddrIndex);
	    }
	    return DAG.getFrameIndex(FrameAddrIndex, VT);
	  }

	  unsigned FrameReg =
	      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	  SDLoc dl(Op); // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
	          (FrameReg == X86::EBP && VT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Each iteration loads through the current frame address to reach the
	  // next outer one.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	/// Map a register name to its X86 register number. Only "esp", "rsp",
	/// "ebp" and "rbp" are recognised. Asking for ebp/rbp in a function without
	/// a frame pointer, or for any other name, is a fatal error.
	unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const MachineFunction &MF = DAG.getMachineFunction();

	  // Unknown names map to 0 and are rejected at the bottom.
	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                     .Case("esp", X86::ESP)
	                     .Case("rsp", X86::RSP)
	                     .Case("ebp", X86::EBP)
	                     .Case("rbp", X86::RBP)
	                     .Default(0);

	  if (Reg == X86::EBP || Reg == X86::RBP) {
	    if (!TFI.hasFP(MF))
	      report_fatal_error("register " + StringRef(RegName) +
	                         " is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	    else {
	      // Debug-only sanity check on the frame register the target reports.
	      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	      unsigned FrameReg =
	          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
	             "Invalid Frame Register!");
	    }
	#endif
	  }

	  if (Reg)
	    return Reg;

	  report_fatal_error("Invalid register name global variable");
	}

	/// Lower the frame-to-arguments offset query to a pointer-sized constant
	/// equal to twice the register slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
	  SDLoc dl(Op);
	  return DAG.getIntPtrConstant(2 * SlotSize, dl);
	}

	/// Return the register holding the exception pointer for the given
	/// personality: RDX/EDX for the CoreCLR personality, RAX/EAX otherwise.
	/// The 64-bit name is chosen when the subtarget is 64-bit LP64.
	unsigned X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) {
	    if (IsLP64)
	      return X86::RDX;
	    return X86::EDX;
	  }

	  if (IsLP64)
	    return X86::RAX;
	  return X86::EAX;
	}

	/// Return the register holding the exception selector: RDX on a 64-bit
	/// LP64 subtarget, EDX otherwise. Must not be called for funclet-based
	/// personalities (asserted).
	unsigned X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Funclet personalities don't use selectors (the runtime does the selection).
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Fixed catch objects are required exactly when targeting Win64.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower EH_RETURN. Operands of \p Op are (chain, offset, handler).
	///
	/// The handler is stored at FrameReg + SlotSize + Offset, that address is
	/// copied into RCX/ECX (chosen by pointer width), and an X86ISD::EH_RETURN
	/// node consuming that register is returned.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Offset = Op.getOperand(1);
	  SDValue Handler = Op.getOperand(2);
	  SDLoc dl (Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
	          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	  // StoreAddr = Frame + SlotSize + Offset.
	  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	                                                        dl));
	  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	  // Write the handler there, then publish the address in StoreAddrReg.
	  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	                     DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Lower the SjLj setjmp node to X86ISD::EH_SJLJ_SETJMP, forwarding the
	/// first two operands of \p Op and producing (i32, chain).
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(Op);

	  // On non-64-bit subtargets the global base register may be needed when
	  // the pseudo is expanded after isel, i.e. after the CGBR pass has run.
	  // Request it now so that pass materialises it; otherwise we could end up
	  // referencing a virtual register that is never defined.
	  const bool Is64Bit = Subtarget.is64Bit();
	  if (!Is64Bit) {
	    const X86InstrInfo *TII = Subtarget.getInstrInfo();
	    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
	  }

	  SDVTList ResultVTs = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultVTs,
	                     Op.getOperand(0), Op.getOperand(1));
	}

	/// Lower the SjLj longjmp node to X86ISD::EH_SJLJ_LONGJMP, forwarding the
	/// first two operands of \p Op; the result is a chain (MVT::Other).
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, First, Second);
	}

	/// Lower the SjLj setup-dispatch node to X86ISD::EH_SJLJ_SETUP_DISPATCH,
	/// forwarding operand 0 of \p Op; the result is a chain (MVT::Other).
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue First = Op.getOperand(0);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, First);
	}

	/// Lowering ADJUST_TRAMPOLINE needs no adjustment here: the result is
	/// simply operand 0 of \p Op.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue Passthrough = Op.getOperand(0);
	  return Passthrough;
	}

	/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
	/// machine code into memory.
	///
	/// Operands of \p Op: 0 = chain, 1 = trampoline address, 2 = nested
	/// function, 3 = 'nest' parameter value, 4 = SrcValue for the trampoline
	/// address, 5 = SrcValue naming the nested Function (read on 32-bit only).
	///
	/// 64-bit byte layout written here (offsets from the trampoline address):
	///   0: 2-byte REX+opcode (movabsq r11)   2: FPtr
	///  10: 2-byte REX+opcode (movabsq r10)  12: Nest
	///  20: 2-byte REX+opcode (jmpq *)       22: ModRM byte selecting r11
	///
	/// 32-bit byte layout:
	///   0: mov-immediate opcode byte (register encoded in the low 3 bits)
	///   1: Nest    5: jmp opcode byte    6: 32-bit displacement
	///
	/// All stores hang off the incoming chain and are joined by a TokenFactor.
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Root = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl (Op);

	  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  if (Subtarget.is64Bit()) {
	    SDValue OutChains[6];

	    // Large code-model.
	    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
	    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	    // Only the low three encoding bits go into the opcode/ModRM bytes.
	    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

	    // Load the pointer to the nested function into R11.
	    // The i16 store puts REX_WB in the low byte and the opcode in the high
	    // byte of the constant.
	    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
	    SDValue Addr = Trmp;
	    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(2, dl, MVT::i64));
	    OutChains[1] =
	        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	                     /* Alignment = */ 2);

	    // Load the 'nest' parameter value into R10.
	    // R10 is specified in X86CallingConv.td
	    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(10, dl, MVT::i64));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 10));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(12, dl, MVT::i64));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	                     /* Alignment = */ 2);

	    // Jump to the nested function.
	    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(20, dl, MVT::i64));
	    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 20));

	    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(22, dl, MVT::i64));
	    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 22));

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  } else {
	    const Function *Func =
	        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	    CallingConv::ID CC = Func->getCallingConv();
	    unsigned NestReg;

	    // Pick the register that carries 'nest' for this calling convention.
	    switch (CC) {
	    default:
	      llvm_unreachable("Unsupported calling convention");
	    case CallingConv::C:
	    case CallingConv::X86_StdCall: {
	      // Pass 'nest' parameter in ECX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::ECX;

	      // Check that ECX wasn't needed by an 'inreg' parameter.
	      FunctionType *FTy = Func->getFunctionType();
	      const AttributeList &Attrs = Func->getAttributes();

	      if (!Attrs.isEmpty() && !Func->isVarArg()) {
	        unsigned InRegCount = 0;
	        unsigned Idx = 1;

	        // Count the 32-bit registers consumed by 'inreg' parameters.
	        for (FunctionType::param_iterator I = FTy->param_begin(),
	             E = FTy->param_end(); I != E; ++I, ++Idx)
	          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	            auto &DL = DAG.getDataLayout();
	            // FIXME: should only count parameters that are lowered to integers.
	            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
	          }

	        if (InRegCount > 2) {
	          report_fatal_error("Nest register in use - reduce number of inreg"
	                             " parameters!");
	        }
	      }
	      break;
	    }
	    case CallingConv::X86_FastCall:
	    case CallingConv::X86_ThisCall:
	    case CallingConv::Fast:
	      // Pass 'nest' parameter in EAX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::EAX;
	      break;
	    }

	    SDValue OutChains[4];
	    SDValue Addr, Disp;

	    // Disp = FPtr - (Trmp + 10); Trmp + 10 is the address just past the
	    // 4-byte displacement stored at offset 6 below.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(10, dl, MVT::i32));
	    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	    // This is storing the opcode for MOV32ri.
	    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	    OutChains[0] =
	        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
	                     Trmp, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(1, dl, MVT::i32));
	    OutChains[1] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	                     /* Alignment = */ 1);

	    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(5, dl, MVT::i32));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 5),
	                                /* Alignment = */ 1);

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(6, dl, MVT::i32));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	                     /* Alignment = */ 1);

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  }
	}

	/// Lower FLT_ROUNDS_: store the FP control word to a fresh 2-byte stack
	/// slot (X86ISD::FNSTCW16m), reload it as i16, remap the two rounding-mode
	/// bits to the FLT_ROUNDS encoding, and resize the result to Op's type.
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  /*
	   The rounding mode is in bits 11:10 of FPSR, and has the following
	   settings:
	     00 Round to nearest
	     01 Round to -inf
	     10 Round to +inf
	     11 Round to 0

	   FLT_ROUNDS, on the other hand, expects the following:
	     -1 Undefined
	      0 Round to 0
	      1 Round to nearest
	      2 Round to +inf
	      3 Round to -inf

	   To perform the conversion, we do:
	     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
	  */

	  MachineFunction &MF = DAG.getMachineFunction();
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const unsigned StackAlignment = TFI.getStackAlignment();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  // Allocate the 2-byte slot that receives the control word.
	  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
	  SDValue StackSlot =
	      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                              MachineMemOperand::MOStore, 2, 2);

	  // Store the control word into the slot.
	  SDValue StoreOps[] = { DAG.getEntryNode(), StackSlot };
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, ChainOnly,
	                                          StoreOps, MVT::i16, MMO);

	  // Read it back as an ordinary i16 value.
	  SDValue CWD =
	      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

	  // Bit 11 moves down to bit 0.
	  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x800, DL, MVT::i16));
	  SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
	                             DAG.getConstant(11, DL, MVT::i8));

	  // Bit 10 moves down to bit 1.
	  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x400, DL, MVT::i16));
	  SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
	                             DAG.getConstant(9, DL, MVT::i8));

	  // ((CWD1 | CWD2) + 1) & 3
	  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2);
	  SDValue PlusOne = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
	                                DAG.getConstant(1, DL, MVT::i16));
	  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, PlusOne,
	                               DAG.getConstant(3, DL, MVT::i16));

	  // Narrow or widen the i16 result to the requested type.
	  const unsigned ResizeOpc =
	      VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
	  return DAG.getNode(ResizeOpc, DL, VT, RetVal);
	}

	// Split a unary integer vector op into two half-width ops on the low and
	// high halves of the source, then concatenate the two results.
	static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
	  const unsigned HalfSizeInBits = VT.getSizeInBits() / 2;
	  const unsigned Opcode = Op.getOpcode();

	  SDValue Src = Op.getOperand(0);
	  assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
	         "Src and Op should have the same element type!");

	  // Pull out the two halves of the source.
	  SDValue LoSrc = extractSubVector(Src, 0, DAG, dl, HalfSizeInBits);
	  SDValue HiSrc = extractSubVector(Src, HalfNumElems, DAG, dl, HalfSizeInBits);

	  // Apply the same opcode to each half and glue the results together.
	  MVT HalfVT = MVT::getVectorVT(EltVT, HalfNumElems);
	  SDValue LoRes = DAG.getNode(Opcode, dl, HalfVT, LoSrc);
	  SDValue HiRes = DAG.getNode(Opcode, dl, HalfVT, HiSrc);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoRes, HiRes);
	}

	// Split a 256-bit integer unary op into two 128-bit ops. The operand type
	// is checked by assertion only.
	static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // Only read by the assert below.
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	// Split a 512-bit integer unary op into two 256-bit ops. The operand type
	// is checked by assertion only.
	static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // Only read by the assert below.
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Only handle AVX 512-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
	//
	// i8/i16 vectors are implemented with the dword LZCNT vector instruction
	// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
	// split the vector, perform the operation on its Lo and Hi parts and
	// concatenate the results.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  assert(Op.getOpcode() == ISD::CTLZ);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = VT.getVectorNumElements();

	  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
	         "Unsupported element type");

	  // Split the vector; its Lo and Hi parts will be handled in the next
	  // iteration.
	  if (NumElems > 16 ||
	      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
	    return LowerVectorIntUnary(Op, DAG);

	  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
	         "Unsupported value type for operation");

	  // Use native supported vector instruction vplzcntd.
	  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	  // The zero-extension added (32 - element width) leading zeros; remove
	  // them from the count.
	  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is viewed as a byte vector, each nibble's leading-zero count
	// is looked up with PSHUFB, and the per-nibble counts are combined into
	// per-byte counts. The per-byte counts are then merged pairwise, doubling
	// the element width each round, until the element type matches Op's type.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table: entry N is the number of
	  // leading zero bits in the 4-bit value N.
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Replicate the 16-entry table across every 16 bytes of the vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = DAG.getConstant(0, DL, CurrVT);

	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = Op0;
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ;
	  if (CurrVT.is512BitVector()) {
	    // 512-bit compares produce an i1 mask vector; sign-extend it back to
	    // a byte vector so it can be used with AND.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
	    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	  } else {
	    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
	  }

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    if (CurrVT.is512BitVector()) {
	      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	    } else {
	      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    }
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	/// Pick a lowering strategy for a vector CTLZ, tried in this order:
	///  1. the AVX512CDI path (when CDI is available and either 512-bit DQ
	///     extension is possible or the element type is not i8),
	///  2. splitting 256-bit ops when Int256 is unavailable,
	///  3. splitting 512-bit ops when BWI is unavailable,
	///  4. the PSHUFB lookup-table path (SSSE3 is asserted).
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasCDI() &&
	      // vXi8 vectors need to be promoted to 512-bits for vXi32.
	      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	/// Lower CTLZ (and any other opcode routed here, which skips the zero-input
	/// fix-up). Vectors are delegated to LowerVectorCTLZ. Scalars are lowered
	/// to BSR, an optional CMOV for a zero input, and an XOR with NumBits-1;
	/// i8 is widened to i32 around the BSR and truncated again at the end.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  const unsigned NumBits = VT.getSizeInBits();
	  const unsigned Opc = Op.getOpcode();

	  if (VT.isVector())
	    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

	  const bool IsI8 = (VT == MVT::i8);
	  MVT OpVT = VT;
	  SDValue Src = Op.getOperand(0);
	  if (IsI8) {
	    // Zero extend to i32 since there is not an i8 bsr.
	    OpVT = MVT::i32;
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Src);
	  }

	  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
	  SDVTList BsrVTs = DAG.getVTList(OpVT, MVT::i32);
	  SDValue Res = DAG.getNode(X86ISD::BSR, dl, BsrVTs, Src);

	  if (Opc == ISD::CTLZ) {
	    // If src is zero (i.e. bsr sets ZF), select 2*NumBits-1 instead, so
	    // that the XOR below yields NumBits.
	    SDValue CMovOps[] = {
	      Res,
	      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
	      DAG.getConstant(X86::COND_E, dl, MVT::i8),
	      Res.getValue(1)
	    };
	    Res = DAG.getNode(X86ISD::CMOV, dl, OpVT, CMovOps);
	  }

	  // Finally xor with NumBits-1.
	  SDValue XorMask = DAG.getConstant(NumBits - 1, dl, OpVT);
	  Res = DAG.getNode(ISD::XOR, dl, OpVT, Res, XorMask);

	  if (!IsI8)
	    return Res;
	  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Res);
	}

	/// Lower scalar CTTZ to BSF followed by a CMOV that substitutes the bit
	/// width when the source is zero.
	static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  const unsigned NumBits = VT.getScalarSizeInBits();
	  SDValue Src = Op.getOperand(0);

	  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
	         "Only scalar CTTZ requires custom lowering");

	  // Issue a bsf (scan bits forward) which also sets EFLAGS.
	  SDVTList BsfVTs = DAG.getVTList(VT, MVT::i32);
	  SDValue Bsf = DAG.getNode(X86ISD::BSF, dl, BsfVTs, Src);

	  // If src is zero (i.e. bsf sets ZF), returns NumBits.
	  SDValue CMovOps[] = {
	    Bsf,
	    DAG.getConstant(NumBits, dl, VT),
	    DAG.getConstant(X86::COND_E, dl, MVT::i8),
	    Bsf.getValue(1)
	  };
	  return DAG.getNode(X86ISD::CMOV, dl, VT, CMovOps);
	}

	/// Split a binary 256-bit integer vector operation into two 128-bit
	/// operations on the low and high halves of both operands, and concatenate
	/// the two results back into a 256-bit vector.
	static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
	  const unsigned Opcode = Op.getOpcode();

	  // Halves of the left-hand operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract128BitVector(LHS, HalfNumElems, DAG, dl);

	  // Halves of the right-hand operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract128BitVector(RHS, HalfNumElems, DAG, dl);

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);

	  SDValue LoRes = DAG.getNode(Opcode, dl, HalfVT, LHSLo, RHSLo);
	  SDValue HiRes = DAG.getNode(Opcode, dl, HalfVT, LHSHi, RHSHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoRes, HiRes);
	}

	/// Split a binary 512-bit integer vector operation into two 256-bit
	/// operations on the low and high halves of both operands, and concatenate
	/// the two results back into a 512-bit vector.
	static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned HalfNumElems = VT.getVectorNumElements() / 2;
	  const unsigned Opcode = Op.getOpcode();

	  // Halves of the left-hand operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract256BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract256BitVector(LHS, HalfNumElems, DAG, dl);

	  // Halves of the right-hand operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract256BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract256BitVector(RHS, HalfNumElems, DAG, dl);

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElems);

	  SDValue LoRes = DAG.getNode(Opcode, dl, HalfVT, LHSLo, RHSLo);
	  SDValue HiRes = DAG.getNode(Opcode, dl, HalfVT, LHSHi, RHSHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoRes, HiRes);
	}

	/// Custom lowering for the add/sub nodes routed here:
	///  - scalar i16/i32 are forwarded to lowerAddSubToHorizontalOp,
	///  - i1 (scalar or vector-of-i1) becomes XOR of the two operands,
	///  - anything else must be a 256-bit integer vector and is split into
	///    two 128-bit operations.
	static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32)
	    return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);

	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
	                       Op.getOperand(0), Op.getOperand(1));

	  assert(Op.getSimpleValueType().is256BitVector() &&
	         Op.getSimpleValueType().isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return split256IntArith(Op, DAG);
	}

	/// Custom lowering for saturating add/sub:
	///  - i1 element types reduce to OR (addsat) or AND-NOT (subsat);
	///  - 128-bit vectors get a compare+select form for UADDSAT/USUBSAT when
	///    UMIN/UMAX (respectively) is not legal for the type, otherwise an
	///    empty SDValue is returned to request the default expansion;
	///  - everything else must be a 256-bit integer vector and is split.
	static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue X = Op.getOperand(0);
	  SDValue Y = Op.getOperand(1);
	  const unsigned Opcode = Op.getOpcode();

	  if (VT.getScalarType() == MVT::i1) {
	    SDLoc dl(Op);
	    // *addsat i1 X, Y --> X | Y
	    if (Opcode == ISD::UADDSAT || Opcode == ISD::SADDSAT)
	      return DAG.getNode(ISD::OR, dl, VT, X, Y);
	    // *subsat i1 X, Y --> X & ~Y
	    if (Opcode == ISD::USUBSAT || Opcode == ISD::SSUBSAT)
	      return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
	    llvm_unreachable("Expected saturated arithmetic opcode");
	  }

	  if (!VT.is128BitVector()) {
	    assert(Op.getSimpleValueType().is256BitVector() &&
	           Op.getSimpleValueType().isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return split256IntArith(Op, DAG);
	  }

	  // Avoid the generic expansion with min/max if we don't have pminu/pmaxu.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(),
	                                     *DAG.getContext(), VT);
	  SDLoc DL(Op);

	  if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
	    // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
	    SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, X, Y);
	    SDValue Overflowed = DAG.getSetCC(DL, CmpVT, X, Sum, ISD::SETUGT);
	    return DAG.getSelect(DL, VT, Overflowed, DAG.getAllOnesConstant(DL, VT),
	                         Sum);
	  }

	  if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
	    // usubsat X, Y --> (X >u Y) ? X - Y : 0
	    SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, X, Y);
	    SDValue NoBorrow = DAG.getSetCC(DL, CmpVT, X, Y, ISD::SETUGT);
	    return DAG.getSelect(DL, VT, NoBorrow, Diff, DAG.getConstant(0, DL, VT));
	  }

	  // Use default expansion.
	  return SDValue();
	}

	/// Custom lowering for ABS:
	///  - scalar i16/i32/i64: compute 0 - X with X86ISD::SUB and CMOV between
	///    X and the negation on COND_GE of the SUB's flags;
	///  - v2i64/v4i64 with SSE4.1: BLENDV between X and 0 - X, using X itself
	///    as the selector;
	///  - 256-bit vectors without Int256: split into two 128-bit ops;
	///  - otherwise return an empty SDValue to request expansion.
	static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
	    // Since X86 does not have CMOV for 8-bit integer, we don't convert
	    // 8-bit integer abs to NEG and CMOV.
	    SDLoc DL(Op);
	    SDValue N0 = Op.getOperand(0);
	    // Result 0 of the SUB is the negated value, result 1 its flags.
	    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                              DAG.getConstant(0, DL, VT), N0);
	    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
	                     SDValue(Neg.getNode(), 1)};
	    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
	  }

	  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
	  if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
	    SDLoc DL(Op);
	    SDValue Src = Op.getOperand(0);
	    SDValue Sub =
	        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
	    return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
	  }

	  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	    assert(VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return Lower256IntUnary(Op, DAG);
	  }

	  // Default to expand.
	  return SDValue();
	}

	/// Custom lowering for SMIN/SMAX/UMIN/UMAX:
	///  - 256-bit vectors whose element type is not i64 are split in two;
	///  - v8i16 (only UMIN/UMAX expected, asserted) is done by flipping the
	///    sign bit, using the signed min/max, and flipping it back;
	///  - everything else becomes a setcc + select.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // For AVX1 cases, split to use legal ops (everything but v4i64).
	  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
	    return split256IntArith(Op, DAG);

	  SDLoc DL(Op);
	  unsigned Opcode = Op.getOpcode();
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);

	  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
	  // using the SMIN/SMAX instructions and flipping the signbit back.
	  if (VT == MVT::v8i16) {
	    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
	           "Unexpected MIN/MAX opcode");
	    SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
	    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
	    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
	    Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
	    SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
	    return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
	  }

	  // Else, expand to a compare/select: pick N0 when (N0 CC N1) holds.
	  ISD::CondCode CC;
	  switch (Opcode) {
	  case ISD::SMIN: CC = ISD::CondCode::SETLT;  break;
	  case ISD::SMAX: CC = ISD::CondCode::SETGT;  break;
	  case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
	  case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
	  default: llvm_unreachable("Unknown MINMAX opcode");
	  }

	  SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
	  return DAG.getSelect(DL, VT, Cond, N0, N1);
	}

	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();

	if (VT.getScalarType() == MVT::i1)
	return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	// Decompose 256-bit ops into 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return split256IntArith(Op, DAG);

	SDValue A = Op.getOperand(0);
	SDValue B = Op.getOperand(1);

	// Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
	// vector pairs, multiply and truncate.
	if (VT == MVT::v16i8 \|\| VT == MVT::v32i8 \|\| VT == MVT::v64i8) {
	unsigned NumElts = VT.getVectorNumElements();

	if ((VT == MVT::v16i8 && Subtarget.hasInt256()) \|\|
	(VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
	MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	return DAG.getNode(
	ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::MUL, dl, ExVT,
	DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
	DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
	}

	MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Extract the lo/hi parts to any extend to i16.
	// We're going to mask off the low byte of each result element of the
	// pmullw, so it doesn't matter what's in the high byte of each 16-bit
	// element.
	SDValue Undef = DAG.getUNDEF(VT);
	SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
	SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));

	SDValue BLo, BHi;
	if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
	// If the LHS is a constant, manually unpackl/unpackh.
	SmallVector<SDValue, 16> LoOps, HiOps;
	for (unsigned i = 0; i != NumElts; i += 16) {
	for (unsigned j = 0; j != 8; ++j) {
	LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
	MVT::i16));
	HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
	MVT::i16));
	}
	}

	BLo = DAG.getBuildVector(ExVT, dl, LoOps);
	BHi = DAG.getBuildVector(ExVT, dl, HiOps);
	} else {
	BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
	BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
	}

	// Multiply, mask the lower 8bits of the lo/hi results and pack.
	SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	if (VT == MVT::v4i32) {
	assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	"Should not custom lower when pmulld is available!");

	// Extract the odd parts.
	static const int UnpackMask[] = { 1, -1, 3, -1 };
	SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	// Multiply the even parts.
	SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, A),
	DAG.getBitcast(MVT::v2i64, B));
	// Now multiply odd parts.
	SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, Aodds),
	DAG.getBitcast(MVT::v2i64, Bodds));

	Evens = DAG.getBitcast(VT, Evens);
	Odds = DAG.getBitcast(VT, Odds);

	// Merge the two vectors back together with a shuffle. This expands into 2
	// shuffles.
	static const int ShufMask[] = { 0, 4, 2, 6 };
	return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	}

	assert((VT == MVT::v2i64 \|\| VT == MVT::v4i64 \|\| VT == MVT::v8i64) &&
	"Only know how to lower V2I64/V4I64/V8I64 multiply");
	assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

	// Ahi = psrlqi(a, 32);
	// Bhi = psrlqi(b, 32);
	//
	// AloBlo = pmuludq(a, b);
	// AloBhi = pmuludq(a, Bhi);
	// AhiBlo = pmuludq(Ahi, b);
	//
	// Hi = psllqi(AloBhi + AhiBlo, 32);
	// return AloBlo + Hi;
	KnownBits AKnown = DAG.computeKnownBits(A);
	KnownBits BKnown = DAG.computeKnownBits(B);

	APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
	bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);

	APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
	bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);

	SDValue Zero = DAG.getConstant(0, dl, VT);

	// Only multiply lo/hi halves that aren't known to be zero.
	SDValue AloBlo = Zero;
	if (!ALoIsZero && !BLoIsZero)
	AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

	SDValue AloBhi = Zero;
	if (!ALoIsZero && !BHiIsZero) {
	SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
	}

	SDValue AhiBlo = Zero;
	if (!AHiIsZero && !BLoIsZero) {
	SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
	}

	SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	/// Custom lowering for vector ISD::MULHS / ISD::MULHU (the high half of an
	/// element-wise multiply).
	///
	/// \param Op        The MULHS/MULHU node; signedness is taken from its opcode.
	/// \param Subtarget Queried for the available vector feature levels.
	/// \param DAG       The DAG in which the replacement nodes are created.
	/// \returns The replacement value. vXi32 types are lowered with
	///          PMULDQ/PMULUDQ on the even and odd lanes; vXi8 types are widened
	///          to vXi16, multiplied, shifted right by 8 and narrowed again.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  bool IsSigned = Op->getOpcode() == ISD::MULHS;
	  unsigned NumElts = VT.getVectorNumElements();
	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Decompose 256-bit ops into 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return split256IntArith(Op, DAG);

	  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
	    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
	           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
	           (VT == MVT::v16i32 && Subtarget.hasAVX512()));

	    // PMULxD operations multiply each even value (starting at 0) of LHS with
	    // the related value of RHS and produce a widened result.
	    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	    // => <2 x i64> <ae|cg>
	    //
	    // In other words, to have all the results, we need to perform two PMULxD:
	    // 1. one with the even values.
	    // 2. one with the odd values.
	    // To achieve #2, we need to place the odd values at an even position.
	    //
	    // Place the odd value at an even position (basically, shift all values 1
	    // step to the left):
	    const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
	                        9, -1, 11, -1, 13, -1, 15, -1};
	    // <a|b|c|d> => <b|undef|d|undef>
	    SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
	                                        makeArrayRef(&Mask[0], NumElts));
	    // <e|f|g|h> => <f|undef|h|undef>
	    SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
	                                        makeArrayRef(&Mask[0], NumElts));

	    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
	    // ints.
	    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
	    unsigned Opcode =
	        (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
	    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	    // => <2 x i64> <ae|cg>
	    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
	                                                  DAG.getBitcast(MulVT, A),
	                                                  DAG.getBitcast(MulVT, B)));
	    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
	    // => <2 x i64> <bf|dh>
	    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
	                                                  DAG.getBitcast(MulVT, Odd0),
	                                                  DAG.getBitcast(MulVT, Odd1)));

	    // Shuffle it back into the right order: pick the odd (upper) i32 of each
	    // 64-bit product, alternating between Mul1 and Mul2.
	    SmallVector<int, 16> ShufMask(NumElts);
	    for (int i = 0; i != (int)NumElts; ++i)
	      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;

	    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);

	    // If we have a signed multiply but no PMULDQ fix up the result of an
	    // unsigned multiply.
	    if (IsSigned && !Subtarget.hasSSE41()) {
	      SDValue Zero = DAG.getConstant(0, dl, VT);
	      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
	      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	                               DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);

	      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
	    }

	    return Res;
	  }

	  // Only i8 vectors should need custom lowering after this.
	  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
	          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	         "Unsupported vector type");

	  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	  // logical shift down the upper half and pack back to i8.

	  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	  // and then ashr/lshr the upper bits down to the lower bits before multiply.
	  unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

	  if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
	      (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
	    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
	    SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
	    SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
	    Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	  }

	  // For signed 512-bit vectors, split into 256-bit vectors to allow the
	  // sign-extension to occur.
	  if (VT == MVT::v64i8 && IsSigned)
	    return split512IntArith(Op, DAG);

	  // Signed AVX2 implementation - extend xmm subvectors to ymm.
	  if (VT == MVT::v32i8 && IsSigned) {
	    MVT ExVT = MVT::v16i16;
	    SDValue ALo = extract128BitVector(A, 0, DAG, dl);
	    SDValue BLo = extract128BitVector(B, 0, DAG, dl);
	    SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
	    SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
	    ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
	    BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
	    AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
	    BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
	    SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	    SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
	    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);

	    // Bitcast back to VT and then pack all the even elements from Lo and Hi.
	    // Shuffle lowering should turn this into PACKUS+PERMQ
	    Lo = DAG.getBitcast(VT, Lo);
	    Hi = DAG.getBitcast(VT, Hi);
	    return DAG.getVectorShuffle(VT, dl, Lo, Hi,
	                                { 0,  2,  4,  6,  8, 10, 12, 14,
	                                 16, 18, 20, 22, 24, 26, 28, 30,
	                                 32, 34, 36, 38, 40, 42, 44, 46,
	                                 48, 50, 52, 54, 56, 58, 60, 62});
	  }

	  // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
	  // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
	  // shift the results and pack the half lane results back together.

	  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	  static const int PSHUFDMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
	                                   -1, -1, -1, -1, -1, -1, -1, -1};

	  // Extract the lo parts and zero/sign extend to i16.
	  // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
	  // shifts to sign extend. Using unpack for unsigned only requires an xor to
	  // create zeros and a copy due to tied registers constraints pre-avx. But
	  // using zero_extend_vector_inreg would require an additional pshufd for the
	  // high part.

	  SDValue ALo, AHi;
	  if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
	    ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);

	    AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
	    AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
	  } else if (IsSigned) {
	    // Unpack A into the upper byte of each i16, then arithmetic-shift it down
	    // so the lower byte's undef contents are discarded and the sign is kept.
	    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
	    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));

	    ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
	    AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
	  } else {
	    ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
	                                          DAG.getConstant(0, dl, VT)));
	    AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
	                                          DAG.getConstant(0, dl, VT)));
	  }

	  SDValue BLo, BHi;
	  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
	    // If B (the second operand) is a constant build vector, manually
	    // unpackl/unpackh and extend.
	    SmallVector<SDValue, 16> LoOps, HiOps;
	    for (unsigned i = 0; i != NumElts; i += 16) {
	      for (unsigned j = 0; j != 8; ++j) {
	        SDValue LoOp = B.getOperand(i + j);
	        SDValue HiOp = B.getOperand(i + j + 8);

	        if (IsSigned) {
	          LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
	          HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
	        } else {
	          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
	          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
	        }

	        LoOps.push_back(LoOp);
	        HiOps.push_back(HiOp);
	      }
	    }

	    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
	    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
	  } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
	    BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);

	    BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
	    BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
	  } else if (IsSigned) {
	    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
	    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));

	    BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
	    BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
	  } else {
	    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
	                                          DAG.getConstant(0, dl, VT)));
	    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
	                                          DAG.getConstant(0, dl, VT)));
	  }

	  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	  // pack back to vXi8.
	  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	  RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
	  RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);

	  // Pack the two vXi16 halves back into a single vXi8 result with PACKUS.
	  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	/// Lower a 128-bit integer division/remainder node to a runtime library call
	/// on Win64 targets.
	///
	/// Each operand is stored to a 16-byte aligned stack temporary and passed to
	/// the libcall by pointer. The call is described as returning a v2i64 value,
	/// which is bitcast back to the node's 128-bit integer type.
	///
	/// \param Op  An SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node of 128-bit integer
	///            type; any other opcode is unreachable.
	/// \param DAG The DAG in which the stores and the call are created.
	/// \returns The libcall result bitcast to the node's value type.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
	  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
	  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
	  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
	  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    // Spill the operand to a stack slot and pass the slot's address instead;
	    // the stores are chained so they all precede the call.
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    Entry.Node = StackPtr;
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MachinePointerInfo(), /* Alignment = */ 16);
	    // getTypeForEVT takes the LLVMContext by reference and yields a Type
	    // pointer, matching the use further down in this function.
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy,0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	// Return true if the required (according to Opcode) shift-imm form is natively
	// supported by the Subtarget.
	//
	// Vectors with elements narrower than 16 bits are never supported. 512-bit
	// vectors need AVX512, plus BWI when the elements are exactly 16 bits wide.
	// 128-bit vectors need SSE2 and 256-bit vectors need Int256; for ISD::SRA,
	// v2i64 and v4i64 additionally require AVX512.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  if (VT.getScalarSizeInBits() < 16)
	    return false;

	  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
	      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
	    return true;

	  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
	                (VT.is256BitVector() && Subtarget.hasInt256());

	  bool AShift = LShift && (Subtarget.hasAVX512() ||
	                           (VT != MVT::v2i64 && VT != MVT::v4i64));
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	// Uniform-amount shifts: the amount is not an immediate, yet every vector lane
	// is shifted by the same value. These instructions are defined together with
	// the shift-immediate forms, so support is answered by the same query.
	static
	bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
	                                      unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Return true if the required (according to Opcode) variable-shift form is
	// natively supported by the Subtarget.
	//
	// Requires Int256 and elements of at least 16 bits; 16-bit elements further
	// require BWI. With AVX512 every remaining case is supported. Otherwise only
	// 128/256-bit vectors are, and ISD::SRA is not available for v2i64/v4i64.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {

	  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
	    return false;

	  if (Subtarget.hasAVX512())
	    return true;

	  bool LShift = VT.is128BitVector() || VT.is256BitVector();
	  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	/// Try to lower a vector shift whose amount is a constant splat.
	///
	/// \param Op        The shift node; operand 0 is the value, operand 1 the
	///                  amount vector.
	/// \param DAG       The DAG in which replacement nodes are created.
	/// \param Subtarget Queried for the available vector feature levels.
	/// \returns An empty SDValue when the amount is not a constant splat or no
	///          strategy here applies, UNDEF when the splat amount is not smaller
	///          than the element width, and the lowered value otherwise.
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);

	  // Emulate a 64-bit element arithmetic shift right using 32-bit element
	  // shifts on a bitcast of R plus a shuffle to recombine the halves.
	  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
	    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	    SDValue Ex = DAG.getBitcast(ExVT, R);

	    // ashr(R, 63) === cmp_slt(R, 0)
	    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
	             "Unsupported PCMPGT op");
	      return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
	    }

	    if (ShiftAmt >= 32) {
	      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	      SDValue Upper =
	          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt - 32, DAG);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {9, 1, 11, 3, 13, 5, 15, 7});
	    } else {
	      // SRA upper i32, SRL whole i64 and select lower i32.
	      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt, DAG);
	      SDValue Lower =
	          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	      Lower = DAG.getBitcast(ExVT, Lower);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {8, 1, 10, 3, 12, 5, 14, 7});
	    }
	    return DAG.getBitcast(VT, Ex);
	  };

	  // Optimize shl/srl/sra with constant shift amount.
	  APInt APIntShiftAmt;
	  if (!isConstantSplat(Amt, APIntShiftAmt))
	    return SDValue();

	  // If the shift amount is out of range, return undef.
	  if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
	    return DAG.getUNDEF(VT);

	  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();

	  if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	    return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	  // i64 SRA needs to be performed as partial shifts.
	  if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
	       (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	      Op.getOpcode() == ISD::SRA)
	    return ArithmeticShiftRight64(ShiftAmt);

	  if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
	      VT == MVT::v64i8) {
	    unsigned NumElts = VT.getVectorNumElements();
	    MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	    // Simple i8 add case
	    if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	      return DAG.getNode(ISD::ADD, dl, VT, R, R);

	    // ashr(R, 7) === cmp_slt(R, 0)
	    if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	      SDValue Zeros = DAG.getConstant(0, dl, VT);
	      if (VT.is512BitVector()) {
	        assert(VT == MVT::v64i8 && "Unexpected element type!");
	        SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
	        return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	      }
	      return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	    }

	    // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	    if (VT == MVT::v16i8 && Subtarget.hasXOP())
	      return SDValue();

	    if (Op.getOpcode() == ISD::SHL) {
	      // Make a large shift.
	      SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
	                                               ShiftAmt, DAG);
	      SHL = DAG.getBitcast(VT, SHL);
	      // Zero out the rightmost bits.
	      APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
	      return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
	    }
	    if (Op.getOpcode() == ISD::SRL) {
	      // Make a large shift.
	      SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
	                                               ShiftAmt, DAG);
	      SRL = DAG.getBitcast(VT, SRL);
	      // Zero out the leftmost bits.
	      return DAG.getNode(ISD::AND, dl, VT, SRL,
	                         DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	    }
	    if (Op.getOpcode() == ISD::SRA) {
	      // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	      SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	      SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	      Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	      return Res;
	    }
	    llvm_unreachable("Unknown shift opcode.");
	  }

	  return SDValue();
	}

	/// Try to lower a vector shift whose amount is the same value in every lane
	/// (a splat) but is not handled as a constant immediate.
	///
	/// \param Op        The shift node; operand 0 is the value, operand 1 the
	///                  amount vector.
	/// \param DAG       The DAG in which replacement nodes are created.
	/// \param Subtarget Queried for the available vector feature levels.
	/// \returns The lowered value, or an empty SDValue when no strategy here
	///          applies.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
	  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);

	  if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
	    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
	      MVT EltVT = VT.getVectorElementType();
	      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
	      else if (EltVT.bitsLT(MVT::i32))
	        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
	    }

	    // vXi8 shifts - shift as v8i16 + mask result.
	    if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
	         (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
	         VT == MVT::v64i8) &&
	        !Subtarget.hasXOP()) {
	      unsigned NumElts = VT.getVectorNumElements();
	      MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
	      if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
	        // SRA is performed as a logical right shift and fixed up afterwards.
	        unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
	        unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
	        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	        // Create the mask using vXi16 shifts. For shift-rights we need to move
	        // the upper byte down before splatting the vXi8 mask.
	        SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
	        BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
	                                      BaseShAmt, Subtarget, DAG);
	        if (Opcode != ISD::SHL)
	          BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
	                                               8, DAG);
	        BitMask = DAG.getBitcast(VT, BitMask);
	        BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
	                                       SmallVector<int, 64>(NumElts, 0));

	        SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
	                                          DAG.getBitcast(ExtVT, R), BaseShAmt,
	                                          Subtarget, DAG);
	        Res = DAG.getBitcast(VT, Res);
	        Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);

	        if (Opcode == ISD::SRA) {
	          // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
	          // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
	          SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
	          SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
	                                         BaseShAmt, Subtarget, DAG);
	          SignMask = DAG.getBitcast(VT, SignMask);
	          Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
	          Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
	        }
	        return Res;
	      }
	    }
	  }

	  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
	      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
	    Amt = Amt.getOperand(0);
	    // Number of build-vector elements that make up one 64-bit shift amount.
	    unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
	    std::vector<SDValue> Vals(Ratio);
	    for (unsigned i = 0; i != Ratio; ++i)
	      Vals[i] = Amt.getOperand(i);
	    // Every later group of Ratio elements must repeat the first group,
	    // otherwise the amount is not uniform across the 64-bit lanes.
	    for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
	      for (unsigned j = 0; j != Ratio; ++j)
	        if (Vals[j] != Amt.getOperand(i + j))
	          return SDValue();
	    }

	    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
	      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
	  }
	  return SDValue();
	}

	// Convert a shift/rotate left amount to a multiplication scale factor.
	//
	// Amt is the per-lane shift amount vector. Returns a vector of the same type
	// holding (1 << Amt[i]) in each lane, or an empty SDValue if the type is not
	// handled here. For constant build vectors, undef lanes stay undef and lanes
	// whose amount is not smaller than the element width become undef.
	static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Amt.getSimpleValueType();
	  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
	        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
	        (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
	    return SDValue();

	  if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
	    SmallVector<SDValue, 8> Elts;
	    MVT SVT = VT.getVectorElementType();
	    unsigned SVTBits = SVT.getSizeInBits();
	    APInt One(SVTBits, 1);
	    unsigned NumElems = VT.getVectorNumElements();

	    for (unsigned i = 0; i != NumElems; ++i) {
	      SDValue Op = Amt->getOperand(i);
	      if (Op->isUndef()) {
	        Elts.push_back(Op);
	        continue;
	      }

	      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
	      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
	      uint64_t ShAmt = C.getZExtValue();
	      if (ShAmt >= SVTBits) {
	        Elts.push_back(DAG.getUNDEF(SVT));
	        continue;
	      }
	      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
	    }
	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  // If the target doesn't support variable shifts, use either FP conversion
	  // or integer multiplication to avoid shifting each element individually.
	  if (VT == MVT::v4i32) {
	    // Move the amount into bit 23 and up and add 0x3f800000, reinterpret the
	    // bits as v4f32, then convert that float back to an integer.
	    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
	    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
	                      DAG.getConstant(0x3f800000U, dl, VT));
	    Amt = DAG.getBitcast(MVT::v4f32, Amt);
	    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
	  }

	  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
	  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
	    // Zero-extend each half to v4i32 via unpack-with-zero, compute the scale
	    // with the v4i32 path above, then narrow the two halves back to v8i16.
	    SDValue Z = DAG.getConstant(0, dl, VT);
	    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
	    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
	    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
	    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
	    if (Subtarget.hasSSE41())
	      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);

	    return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
	                                DAG.getBitcast(VT, Hi),
	                                {0, 2, 4, 6, 8, 10, 12, 14});
	  }

	  return SDValue();
	}

	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	unsigned Opc = Op.getOpcode();
	unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
	unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	if (SupportedVectorVarShift(VT, Subtarget, Opc))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() && (VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	if (Opc == ISD::SRL \|\| Opc == ISD::SRA) {
	SDValue Zero = DAG.getConstant(0, dl, VT);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Opc == ISD::SHL \|\| Opc == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Opc == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Opc != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Opc == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a BLENDing shuffle instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes in parallel before blending.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(VT == MVT::v16i16 && Subtarget.hasInt256()))) {
	SDValue Amt1, Amt2;
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ShuffleMask;
	for (unsigned i = 0; i != NumElts; ++i) {
	SDValue A = Amt->getOperand(i);
	if (A.isUndef()) {
	ShuffleMask.push_back(SM_SentinelUndef);
	continue;
	}
	if (!Amt1 \|\| Amt1 == A) {
	ShuffleMask.push_back(i);
	Amt1 = A;
	continue;
	}
	if (!Amt2 \|\| Amt2 == A) {
	ShuffleMask.push_back(i + NumElts);
	Amt2 = A;
	continue;
	}
	break;
	}

	// Only perform this blend if we can perform it without loading a mask.
	if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
	(VT != MVT::v16i16 \|\|
	is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
	(VT == MVT::v4i32 \|\| Subtarget.hasSSE41() \|\| Opc != ISD::SHL \|\|
	canWidenShuffleElements(ShuffleMask))) {
	auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
	auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
	if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
	Cst2->getAPIntValue().ult(EltSizeInBits)) {
	SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
	Cst1->getZExtValue(), DAG);
	SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
	Cst2->getZExtValue(), DAG);
	return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
	}
	}
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	if (Opc == ISD::SHL)
	if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
	return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

	// Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
	// can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
	if (Opc == ISD::SRL && ConstantAmt &&
	(VT == MVT::v8i16 \|\| (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
	SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
	SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
	if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
	SDValue Zero = DAG.getConstant(0, dl, VT);
	SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
	SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
	return DAG.getSelect(dl, VT, ZAmt, R, Res);
	}
	}

	// Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
	// can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
	// TODO: Special case handling for shift by 0/1, really we can afford either
	// of these cases in pre-SSE41/XOP/AVX512 but not both.
	if (Opc == ISD::SRA && ConstantAmt &&
	(VT == MVT::v8i16 \|\| (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
	((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
	!Subtarget.hasAVX512()) \|\|
	DAG.isKnownNeverZero(Amt))) {
	SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
	SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
	if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
	SDValue Amt0 =
	DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
	SDValue Amt1 =
	DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
	SDValue Sra1 =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
	SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
	Res = DAG.getSelect(dl, VT, Amt0, R, Res);
	return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. On AVX we're better off
	// just zero-extending, but for SSE just duplicating the top 16-bits is
	// cheaper and has the same effect for out of range values.
	if (Subtarget.hasAVX()) {
	SDValue Z = DAG.getConstant(0, dl, VT);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	} else {
	SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
	SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{4, 5, 6, 7, -1, -1, -1, -1});
	Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{0, 1, 1, 1, -1, -1, -1, -1});
	Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
	{2, 3, 3, 3, -1, -1, -1, -1});
	Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
	{0, 1, 1, 1, -1, -1, -1, -1});
	Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
	{2, 3, 3, 3, -1, -1, -1, -1});
	}
	}

	unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
	SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
	SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
	SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
	SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));

	// Merge the shifted lane results optimally with/without PBLENDW.
	// TODO - ideally shuffle combining would handle this.
	if (Subtarget.hasSSE41()) {
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}
	SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
	SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	// NOTE: We honor prefered vector width before promoting to 512-bits.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) \|\|
	(Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) \|\|
	(Subtarget.canExtendTo512BW() && VT == MVT::v32i8) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
	assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
	"Unexpected vector type");
	MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Opc, dl, ExtVT, R, Amt));
	}

	// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
	// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
	if (ConstantAmt && (Opc == ISD::SRA \|\| Opc == ISD::SRL) &&
	(VT == MVT::v16i8 \|\| VT == MVT::v64i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
	!Subtarget.hasXOP()) {
	int NumElts = VT.getVectorNumElements();
	SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);

	// Extend constant shift amount to vXi16 (it doesn't matter if the type
	// isn't legal).
	MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
	Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
	Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
	assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
	"Constant build vector expected");

	if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
	R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
	: DAG.getZExtOrTrunc(R, dl, ExVT);
	R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
	R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
	return DAG.getZExtOrTrunc(R, dl, VT);
	}

	SmallVector<SDValue, 16> LoAmt, HiAmt;
	for (int i = 0; i != NumElts; i += 16) {
	for (int j = 0; j != 8; ++j) {
	LoAmt.push_back(Amt.getOperand(i + j));
	HiAmt.push_back(Amt.getOperand(i + j + 8));
	}
	}

	MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
	SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
	SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

	SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
	SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
	LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
	HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
	LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
	HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
	LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
	HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
	}

	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
	ISD::SETGT);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = DAG.getConstant(0, dl, SelVT);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
	Amt = DAG.getBitcast(VT, Amt);

	if (Opc == ISD::SHL \|\| Opc == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Opc == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
	SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
	MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
	MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte meaning that we can safely pack with PACKUSWB.
	RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
	RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = DAG.getConstant(0, dl, VT);
	SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
	SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
	SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
	SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
	Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
	Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	if (VT == MVT::v8i16) {
	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
	getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
	} else {
	Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
	}

	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into 128-bit shifts.
	if (VT.is256BitVector())
	return split256IntArith(Op, DAG);

	return SDValue();
	}

	/// Custom lowering for vector rotate nodes (the value type is asserted to be
	/// a vector). Operand 0 is the value to rotate and operand 1 the per-element
	/// rotate amount.
	///
	/// The strategies are tried in this order:
	///  - AVX512 with elements of at least 32 bits: rotate-by-immediate when the
	///    amount is a constant splat, otherwise the node is returned unchanged.
	///  - XOP (ROTL only): 256-bit vectors are split; a constant splat becomes a
	///    rotate-by-immediate, anything else is returned unchanged.
	///  - 256-bit vectors without AVX2 are split.
	///  - A uniform constant amount returns an empty SDValue.
	///  - Non-splat amounts on 8-bit elements: three rotate-and-select stages
	///    (by 4, 2 and 1), each keyed off one bit of the amount.
	///  - Splat amounts, legal variable shifts, or non-constant amounts on AVX2:
	///    (R << Amt) | (R >> (EltSizeInBits - Amt)).
	///  - Otherwise the amount is converted into a multiplication scale and the
	///    low and high halves of the product are OR'd together.
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	assert(VT.isVector() && "Custom lowering only for vector rotates!");

	SDLoc DL(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	unsigned Opcode = Op.getOpcode();
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	int NumElts = VT.getVectorNumElements();

	// Check for constant splat rotation amount.
	// CstSplatIndex ends up as the index of the last non-undef element when
	// every non-undef element holds the same constant, and stays/returns to -1
	// when the amount is not constant, is entirely undef, or the elements differ.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	int CstSplatIndex = -1;
	if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
	for (int i = 0; i != NumElts; ++i)
	if (!UndefElts[i]) {
	if (CstSplatIndex < 0 \|\| EltBits[i] == EltBits[CstSplatIndex]) {
	CstSplatIndex = i;
	continue;
	}
	CstSplatIndex = -1;
	break;
	}

	// AVX512 implicitly uses modulo rotation amounts.
	if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
	// Attempt to rotate by immediate.
	if (0 <= CstSplatIndex) {
	// NOTE: this local 'Op' (an opcode) shadows the SDValue parameter 'Op'
	// for the remainder of this inner scope.
	unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
	// Reduce the constant modulo the element width before encoding it as an
	// i8 immediate.
	uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
	return DAG.getNode(Op, DL, VT, R,
	DAG.getConstant(RotateAmt, DL, MVT::i8));
	}

	// Else, fall-back on VPROLV/VPRORV.
	return Op;
	}

	// Every path below this point only handles left rotates.
	assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	// XOP has 128-bit vector variable + immediate rotates.
	// +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
	// XOP implicitly uses modulo rotation amounts.
	if (Subtarget.hasXOP()) {
	if (VT.is256BitVector())
	return split256IntArith(Op, DAG);
	assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	// Attempt to rotate by immediate.
	if (0 <= CstSplatIndex) {
	uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
	return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
	DAG.getConstant(RotateAmt, DL, MVT::i8));
	}

	// Use general rotate by variable (per-element).
	return Op;
	}

	// Split 256-bit integers on pre-AVX2 targets.
	if (VT.is256BitVector() && !Subtarget.hasAVX2())
	return split256IntArith(Op, DAG);

	assert((VT == MVT::v4i32 \|\| VT == MVT::v8i16 \|\| VT == MVT::v16i8 \|\|
	((VT == MVT::v8i32 \|\| VT == MVT::v16i16 \|\| VT == MVT::v32i8) &&
	Subtarget.hasAVX2())) &&
	"Only vXi32/vXi16/vXi8 vector rotates supported");

	// Rotate by an uniform constant - expand back to shifts.
	// (Signalled to the caller by returning an empty SDValue.)
	if (0 <= CstSplatIndex)
	return SDValue();

	bool IsSplatAmt = DAG.isSplatValue(Amt);

	// v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
	// the amount bit.
	if (EltSizeInBits == 8 && !IsSplatAmt) {
	// Constant (non-uniform) amounts are not handled by this path.
	if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
	return SDValue();

	// We don't need ModuloAmt here as we just peek at individual bits.
	MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Picks V0 where the sign bit of the matching Sel lane is set and V1
	// otherwise, returning the result as SelVT.
	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = DAG.getConstant(0, DL, SelVT);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
	return DAG.getSelect(DL, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	// After the shift, bit 2 of each byte's amount sits in that byte's sign
	// bit; every later 'a += a' moves the next lower bit into the sign bit.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	// r = VSELECT(r, rot(r, 4), a);
	SDValue M;
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

	// r = VSELECT(r, rot(r, 2), a);
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);

	// return VSELECT(r, rot(r, 1), a);
	M = DAG.getNode(
	ISD::OR, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
	DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
	return SignBitSelect(VT, Amt, M, R);
	}

	// ISD::ROT* uses modulo rotate amounts.
	// Masking with (EltSizeInBits - 1) implements the modulo.
	Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
	DAG.getConstant(EltSizeInBits - 1, DL, VT));

	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
	bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
	SupportedVectorVarShift(VT, Subtarget, ISD::SRL);

	// Fallback for splats + all supported variable shifts.
	// Fallback for non-constants AVX2 vXi16 as well.
	// The rotate is rebuilt as (R << Amt) | (R >> (EltSizeInBits - Amt)).
	if (IsSplatAmt \|\| LegalVarShifts \|\| (Subtarget.hasAVX2() && !ConstantAmt)) {
	SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
	AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
	SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
	SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
	return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
	}

	// As with shifts, convert the rotation amount to a multiplication factor.
	SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
	assert(Scale && "Failed to convert ROTL amount to scale");

	// v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
	if (EltSizeInBits == 16) {
	SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
	SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
	return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	// v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
	// to v2i64 results at a time. The upper 32-bits contain the wrapped bits
	// that can then be OR'd with the lower 32-bits.
	assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
	// Move the odd lanes (1 and 3) into the even positions so a second
	// PMULUDQ can process them.
	static const int OddMask[] = {1, -1, 3, -1};
	SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
	SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);

	SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, R),
	DAG.getBitcast(MVT::v2i64, Scale));
	SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
	DAG.getBitcast(MVT::v2i64, R13),
	DAG.getBitcast(MVT::v2i64, Scale13));
	Res02 = DAG.getBitcast(VT, Res02);
	Res13 = DAG.getBitcast(VT, Res13);

	// The first shuffle gathers the low 32 bits of each product, the second
	// gathers the high 32 bits; OR-ing them yields the rotated lanes.
	return DAG.getNode(ISD::OR, DL, VT,
	DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
	DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
	}

	/// Tells whether an atomic operation on \p MemType has to go through
	/// cmpxchg8b / cmpxchg16b: the type must be exactly twice the native width
	/// and the matching instruction must be available on the subtarget.
	/// Callers use this when expanding atomic operations; when it returns false
	/// the operation is left alone (to become a __sync_fetch_and_... call).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  const unsigned BitWidth = MemType->getPrimitiveSizeInBits();

	  switch (BitWidth) {
	  case 64:
	    // 64 bits is only "double width" when not compiling for 64-bit mode.
	    return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// Decide whether an atomic store must be expanded at the IR level.
	/// A 64-bit store on a 32-bit target is kept as is when SSE2 is available
	/// and implicit floating-point use is permitted; everything else defers to
	/// needsCmpXchgNb().
	// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
	// TODO: In 32-bit mode, use FISTP when X87 is available?
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();

	  const bool NoImplicitFloatOps =
	      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool Is64BitOn32BitTarget =
	      StoredTy->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit();
	  const bool FPRegsUsable = !Subtarget.useSoftFloat() && !NoImplicitFloatOps;

	  // No expansion needed when the store can be done natively via SSE2.
	  if (Is64BitOn32BitTarget && FPRegsUsable && Subtarget.hasSSE2())
	    return false;

	  return needsCmpXchgNb(StoredTy);
	}

	/// Decide how an atomic load is expanded at the IR level.
	/// Note: loads that need it are turned into lock cmpxchg8b/16b.
	// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  Type *LoadedTy = LI->getType();

	  const bool NoImplicitFloatOps =
	      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool Is64BitOn32BitTarget =
	      LoadedTy->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit();
	  const bool FPRegsUsable = !Subtarget.useSoftFloat() && !NoImplicitFloatOps;

	  // A 64-bit atomic load on a 32-bit target can be done with movq when SSE2
	  // is enabled. With X87 the value can instead be loaded into an 80-bit X87
	  // register and stored to a stack temporary.
	  if (Is64BitOn32BitTarget && FPRegsUsable &&
	      (Subtarget.hasSSE2() || Subtarget.hasX87()))
	    return AtomicExpansionKind::None;

	  if (needsCmpXchgNb(LoadedTy))
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Decide how an atomicrmw instruction is expanded at the IR level: either
	/// left untouched (None) or rewritten into a cmpxchg loop (CmpXChg).
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();

	  // Operands wider than the native width: use cmpxchg8/16b when it is
	  // available, otherwise leave the instruction alone so that it defaults to
	  // a library call.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
	    if (needsCmpXchgNb(MemType))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // It's better to use the native xadd/xchg forms for these in all cases.
	    return AtomicExpansionKind::None;

	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // When nobody reads the result, a "lock" prefix on the normal
	    // instruction is enough; otherwise a cmpxchg loop is required.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;

	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	  case AtomicRMWInst::FAdd:
	  case AtomicRMWInst::FSub:
	    // These always require a non-trivial set of data operations on x86, so
	    // a cmpxchg loop is the only option.
	    return AtomicExpansionKind::CmpXChg;

	  default:
	    llvm_unreachable("Unknown atomic operation");
	  }
	}

	/// Try to replace an idempotent atomicrmw with an mfence followed by an
	/// atomic load of the same location.
	///
	/// On success the new load replaces all uses of \p AI, \p AI is erased and
	/// the load is returned. nullptr is returned (and nothing is changed) when:
	///  - the access is wider than the native width,
	///  - it is the canonical unused `or ..., 0` form handled elsewhere,
	///  - the sync scope is single-thread, or
	///  - the subtarget has no mfence.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
	    return nullptr;

	  // If this is a canonical idempotent atomicrmw w/no uses, we have a better
	  // lowering available in lowerAtomicArith.
	  // TODO: push more cases through this path.
	  if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
	    if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
	        AI->use_empty())
	      return nullptr;

	  auto Builder = IRBuilder<>(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // AcquireRelease orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread)
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;

	  if (!Subtarget.hasMFence())
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load.
	  // The alignment argument of CreateAlignedLoad is expressed in bytes, while
	  // getPrimitiveSizeInBits() reports bits; convert so that the load carries
	  // its natural alignment instead of one that is eight times too large.
	  const unsigned AlignInBytes = AI->getType()->getPrimitiveSizeInBits() / 8;
	  LoadInst *Loaded = Builder.CreateAlignedLoad(
	      AI->getType(), AI->getPointerOperand(), AlignInBytes);
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Emit a lock-prefixed OR of zero into a stack slot. No memory contents
	/// change, but the lock prefix is present. The slot is picked so that it is
	/// a) very likely touched by a single thread only (little cache traffic) and
	/// b) definitely dereferenceable. The returned value is the new chain.
	static SDValue emitLockedStackOp(SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget,
	                                 SDValue Chain, SDLoc DL) {
	  // Implementation notes:
	  // 1) A LOCK prefix acts as a full read/write reordering barrier for the
	  //    memory operations of the issuing processor, so which location is
	  //    referenced does not matter for ordering.
	  //    See: Intel 64 and IA-32 Architectures Software Developer's Manual,
	  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
	  // 2) An immediate operand appears to be the best encoding since no extra
	  //    register is needed.
	  // 3) OR appears to be very slightly faster than ADD (the difference is
	  //    small enough that it may be measurement noise).
	  // 4) Offset selection:
	  //    a) Without a redzone we default to TOS. (A cache-line aligned stack
	  //       object could improve this case.)
	  //    b) To reduce the chance of a false dependence, the access is offset
	  //       slightly from TOS.
	  //    c) To reduce cross-thread stack traffic - notably the idiomatic
	  //       MyThreadPool.run([&StackVars]() {...}) pattern, which captures
	  //       state in the TOS frame and reads it from many threads - the offset
	  //       is chosen to land in a different cache line than the TOS frame.
	  //
	  // For a general discussion of the tradeoffs and benchmark results, see:
	  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

	  auto &MF = DAG.getMachineFunction();
	  auto &TFL = *Subtarget.getFrameLowering();
	  const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;

	  // The 64-bit and 32-bit variants differ only in the stack pointer register
	  // and in the type used for the base/index registers.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const MVT AddrVT = Is64Bit ? MVT::i64 : MVT::i32;
	  const unsigned StackPtrReg = Is64Bit ? X86::RSP : X86::ESP;

	  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
	  SDValue Ops[] = {
	      DAG.getRegister(StackPtrReg, AddrVT),          // Base
	      DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
	      DAG.getRegister(0, AddrVT),                    // Index
	      DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
	      DAG.getRegister(0, MVT::i16),                  // Segment.
	      Zero,
	      Chain};
	  SDNode *LockedOr = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
	                                        MVT::Other, Ops);
	  // Result #1 of the machine node is the chain.
	  return SDValue(LockedOr, 1);
	}

	/// Lower an atomic fence node. Operand 0 is the chain, operand 1 holds the
	/// atomic ordering and operand 2 the synchronization scope (both constants).
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  const auto Ordering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  const auto Scope = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // Only a sequentially-consistent, cross-thread fence needs an actual
	  // instruction.
	  const bool NeedsInstruction =
	      Ordering == AtomicOrdering::SequentiallyConsistent &&
	      Scope == SyncScope::System;

	  // Everything else becomes MEMBARRIER, a compiler barrier that codegens to
	  // a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

	  // Without mfence, fall back to a locked operation on the stack.
	  SDValue Chain = Op.getOperand(0);
	  return emitLockedStackOp(DAG, Subtarget, Chain, dl);
	}

	/// Lower an atomic compare-and-swap node of type i8/i16/i32/i64 into an
	/// X86ISD::LCMPXCHG_DAG node. Operand 2 is copied into the accumulator
	/// register matching the value type; the produced values are the register
	/// contents after the operation, a success flag derived from EFLAGS
	/// (COND_E), and the output chain.
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  const MVT T = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  // Pick the accumulator register and the operand size in bytes.
	  unsigned AccReg = 0;
	  unsigned NumBytes = 0;
	  switch (T.SimpleTy) {
	  case MVT::i8:
	    AccReg = X86::AL;
	    NumBytes = 1;
	    break;
	  case MVT::i16:
	    AccReg = X86::AX;
	    NumBytes = 2;
	    break;
	  case MVT::i32:
	    AccReg = X86::EAX;
	    NumBytes = 4;
	    break;
	  case MVT::i64:
	    assert(Subtarget.is64Bit() && "Node not type legal!");
	    AccReg = X86::RAX;
	    NumBytes = 8;
	    break;
	  default:
	    llvm_unreachable("Invalid value type!");
	  }

	  // Place operand 2 in the accumulator; the copy is glued to the cmpxchg.
	  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
	                                    Op.getOperand(2), SDValue());

	  SDValue CmpXchgOps[] = {CopyIn.getValue(0), // chain
	                          Op.getOperand(1),
	                          Op.getOperand(3),
	                          DAG.getTargetConstant(NumBytes, DL, MVT::i8),
	                          CopyIn.getValue(1)}; // glue
	  SDVTList ResultTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL,
	                                            ResultTys, CmpXchgOps, T, MMO);

	  // Read back the accumulator, then EFLAGS, chaining and gluing each copy
	  // to the previous node.
	  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
	                                       CmpXchg.getValue(1));
	  SDValue EFLAGS = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
	                                      MVT::i32, CopyOut.getValue(2));
	  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), CopyOut, Success,
	                     EFLAGS.getValue(1));
	}

	// Build an X86ISD::MOVMSK of V producing an i32. A v32i8 input on a
	// subtarget without Int256 is split in two: each half gets its own MOVMSK
	// and the high half's mask is shifted left by 16 and OR'd into the low one.
	static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  const MVT InVT = V.getSimpleValueType();
	  const bool MustSplit = InVT == MVT::v32i8 && !Subtarget.hasInt256();

	  // Common case: a single MOVMSK covers the whole vector.
	  if (!MustSplit)
	    return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);

	  std::pair<SDValue, SDValue> Halves = DAG.SplitVector(V, DL);
	  SDValue LoMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Halves.first);
	  SDValue HiMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Halves.second);

	  SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i8);
	  HiMask = DAG.getNode(ISD::SHL, DL, MVT::i32, HiMask, ShiftAmt);
	  return DAG.getNode(ISD::OR, DL, MVT::i32, LoMask, HiMask);
	}

	/// Custom lowering for bitcast nodes. Returns a replacement value, \p Op
	/// itself when the conversion is already legal, or an empty SDValue when the
	/// conversion has to be expanded.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  const SDLoc Loc(Op);
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  const MVT DstVT = Op.getSimpleValueType();

	  // (v64i1 (bitcast i64 X)): split the i64 into two i32 halves, bitcast each
	  // half to v32i1 and concatenate.
	  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
	    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
	    assert(Subtarget.hasBWI() && "Expected BWI target");
	    auto HalfAsMask = [&](unsigned Idx) {
	      SDValue Half = DAG.getNode(ISD::EXTRACT_ELEMENT, Loc, MVT::i32, Src,
	                                 DAG.getIntPtrConstant(Idx, Loc));
	      return DAG.getBitcast(MVT::v32i1, Half);
	    };
	    SDValue LoMask = HalfAsMask(0);
	    SDValue HiMask = HalfAsMask(1);
	    return DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v64i1, LoMask, HiMask);
	  }

	  // Custom splitting for BWI types when AVX512F is available but BWI isn't:
	  // bitcast each half separately and concatenate the results.
	  const bool SrcIsBWIType = SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8;
	  if (SrcIsBWIType && DstVT.isVector() &&
	      DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
	    std::pair<SDValue, SDValue> Halves = DAG.SplitVector(Src, Loc);
	    EVT HalfDstVT = MVT::getVectorVT(DstVT.getVectorElementType(),
	                                     DstVT.getVectorNumElements() / 2);
	    SDValue LoCast = DAG.getBitcast(HalfDstVT, Halves.first);
	    SDValue HiCast = DAG.getBitcast(HalfDstVT, Halves.second);
	    return DAG.getNode(ISD::CONCAT_VECTORS, Loc, DstVT, LoCast, HiCast);
	  }

	  // v16i1/v32i1 -> scalar integer: sign-extend to bytes and use MOVMSK so
	  // the conversion is not scalarized.
	  const bool SrcIsNarrowMask = SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1;
	  if (SrcIsNarrowMask && DstVT.isScalarInteger()) {
	    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
	    const MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
	    SDValue Bytes = DAG.getSExtOrTrunc(Src, Loc, SExtVT);
	    SDValue Mask = getPMOVMSKB(Loc, Bytes, DAG, Subtarget);
	    return DAG.getZExtOrTrunc(Mask, Loc, DstVT);
	  }

	  const bool SrcIs64BitValue = SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
	                               SrcVT == MVT::v8i8 || SrcVT == MVT::i64;
	  if (SrcIs64BitValue) {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	    // Only f64, i64 and (from a vector source) x86mmx destinations are
	    // handled here; any other conversion needs to be expanded.
	    const bool ToMMXFromVector = DstVT == MVT::x86mmx && SrcVT.isVector();
	    if (!(DstVT == MVT::f64 || DstVT == MVT::i64 || ToMMXFromVector))
	      return SDValue();

	    if (!SrcVT.isVector()) {
	      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	             "Unexpected source type in LowerBITCAST");
	      Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v2i64, Src);
	    } else {
	      // Widen the input vector to twice as many elements, padding with
	      // undef. Example: MVT::v2i32 becomes MVT::v4i32.
	      const MVT WideVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
	                                          SrcVT.getVectorNumElements() * 2);
	      Src = DAG.getNode(ISD::CONCAT_VECTORS, Loc, WideVT, Src,
	                        DAG.getUNDEF(SrcVT));
	    }

	    const MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
	    Src = DAG.getNode(ISD::BITCAST, Loc, V2X64VT, Src);

	    if (DstVT == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MOVDQ2Q, Loc, DstVT, Src);

	    // Otherwise the result is element 0 of the 128-bit vector.
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, DstVT, Src,
	                       DAG.getIntPtrConstant(0, Loc));
	  }

	  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
	         Subtarget.hasMMX() && "Unexpected custom BITCAST");
	  assert((DstVT == MVT::i64 ||
	          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
	         "Unexpected custom BITCAST");

	  // i64 <=> MMX and MMX <=> MMX conversions are Legal; all other
	  // conversions need to be expanded.
	  const bool IsLegal = (SrcVT == MVT::i64 && DstVT.isVector()) ||
	                       (DstVT == MVT::i64 && SrcVT.isVector()) ||
	                       (SrcVT.isVector() && DstVT.isVector());
	  return IsLegal ? Op : SDValue();
	}

	/// Horizontally add groups of adjacent bytes of V, one group per element of
	/// VT.
	///
	/// V must be a byte vector and VT an integer vector of the same total width
	/// whose elements are wider than a byte (i16, i32 or i64). The element width
	/// of VT fixes how many bytes of V are added together for each result
	/// element.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  SDLoc DL(V);
	  MVT ByteVecVT = V.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
	         "Expected value to have byte element type.");
	  assert(EltVT != MVT::i8 &&
	         "Horizontal byte sum only makes sense for wider elements!");
	  unsigned VecSize = VT.getSizeInBits();
	  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

	  switch (EltVT.SimpleTy) {
	  case MVT::i64: {
	    // PSADBW against zero adds every byte of each 64-bit chunk and leaves
	    // the total in that chunk, which is exactly the v2i64/v4i64 answer.
	    SDValue ZeroBytes = DAG.getConstant(0, DL, ByteVecVT);
	    MVT QWordVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    SDValue Sums = DAG.getNode(X86ISD::PSADBW, DL, QWordVecVT, V, ZeroBytes);
	    return DAG.getBitcast(VT, Sums);
	  }
	  case MVT::i32: {
	    // Interleave the i32 lanes of the low and high halves with zeros so that
	    // a PSADBW on each half yields per-lane sums in i64 chunks; the two
	    // results are then narrowed and glued back together with PACKUS.
	    SDValue ZeroDWords = DAG.getConstant(0, DL, VT);
	    SDValue AsDWords = DAG.getBitcast(VT, V);
	    SDValue LoHalf = getUnpackl(DAG, DL, VT, AsDWords, ZeroDWords);
	    SDValue HiHalf = getUnpackh(DAG, DL, VT, AsDWords, ZeroDWords);

	    // Horizontal sums of each half, produced as i64 chunks.
	    SDValue ZeroBytes = DAG.getConstant(0, DL, ByteVecVT);
	    MVT QWordVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    SDValue LoBytes = DAG.getBitcast(ByteVecVT, LoHalf);
	    LoHalf = DAG.getNode(X86ISD::PSADBW, DL, QWordVecVT, LoBytes, ZeroBytes);
	    SDValue HiBytes = DAG.getBitcast(ByteVecVT, HiHalf);
	    HiHalf = DAG.getNode(X86ISD::PSADBW, DL, QWordVecVT, HiBytes, ZeroBytes);

	    // Pack the two partial results into a single vector.
	    MVT WordVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
	    SDValue LoWords = DAG.getBitcast(WordVecVT, LoHalf);
	    SDValue HiWords = DAG.getBitcast(WordVecVT, HiHalf);
	    SDValue Packed =
	        DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, LoWords, HiWords);
	    return DAG.getBitcast(VT, Packed);
	  }
	  default:
	    break;
	  }

	  // The only element type left is i16.
	  assert(EltVT == MVT::i16 && "Unknown how to handle type");

	  // Shift each i16 left by 8, add to the original as bytes, then shift each
	  // i16 right by 8. The shifts are done on i16 lanes because i8 vector shifts
	  // aren't directly supported.
	  SDValue Eight = DAG.getConstant(8, DL, VT);
	  SDValue AsWords = DAG.getBitcast(VT, V);
	  SDValue ShiftedUp = DAG.getNode(ISD::SHL, DL, VT, AsWords, Eight);
	  SDValue ShiftedBytes = DAG.getBitcast(ByteVecVT, ShiftedUp);
	  SDValue OrigBytes = DAG.getBitcast(ByteVecVT, V);
	  SDValue ByteSums =
	      DAG.getNode(ISD::ADD, DL, ByteVecVT, ShiftedBytes, OrigBytes);
	  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, ByteSums), Eight);
	}

	/// Lower a vXi8 population count using an in-register nibble lookup table.
	///
	/// \p Op is the byte vector whose per-byte pop count is wanted (asserted to
	/// have i8 elements). Each byte is split into its low and high nibble, both
	/// nibbles index a 16-entry pop-count table through PSHUFB, and the two
	/// lookups are added.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  int NumElts = VT.getVectorNumElements();
	  (void)EltVT;
	  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  //
	  // LUT[i] is the number of set bits in the 4-bit value i.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  // Repeat the 16-entry table across every 16 bytes of the vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, VT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, VT);
	  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);

	  // Low nibbles
	  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
	  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
	  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	/// Custom lowering for a vector ISD::CTPOP of a 128-, 256- or 512-bit type.
	///
	/// Strategy, in order:
	///  - with VPOPCNTDQ, zero-extend small i8/i16 vectors to i32 elements,
	///    count there and truncate back;
	///  - split 256-bit ops without AVX2 and 512-bit ops without BWI in half;
	///  - for elements wider than i8, count per byte and sum horizontally;
	///  - for i8 elements, use the PSHUFB lookup table when SSSE3 is available.
	///
	/// \returns the lowered value, or an empty SDValue to fall back on
	/// LegalizeDAG.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    assert((VT.getVectorElementType() == MVT::i8 ||
	            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
	    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
	      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	    }
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  // For element types greater than i8, do vXi8 pop counts and a bytesum.
	  if (VT.getScalarType() != MVT::i8) {
	    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
	    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
	    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
	  }

	  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
	  if (!Subtarget.hasSSSE3())
	    return SDValue();

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Custom CTPOP lowering entry point. Asserts that the operation has a
	/// vector type and forwards to LowerVectorCTPOP.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  const bool IsVectorOp = Op.getSimpleValueType().isVector();
	  (void)IsVectorOp; // Only read by the assertion below.
	  assert(IsVectorOp &&
	         "We only do custom lowering for vector population count.");
	  return LowerVectorCTPOP(Op, Subtarget, DAG);
	}

	/// Lower ISD::BITREVERSE using the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there and extracted
	/// again. 256-bit vectors are split into 128-bit halves. 128-bit vectors are
	/// lowered to a single VPPERM whose mask both reverses the bits of each byte
	/// and swaps the bytes within each element.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, its still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return Lower256IntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // Its best to shuffle using the second operand as this will implicitly allow
	  // memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk the bytes of each element from most to least significant so the
	    // byte order inside the element is swapped.
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Byte indices 16..31 select from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Custom lowering for ISD::BITREVERSE.
	///
	/// With XOP (and a non-512-bit type) this defers to LowerBITREVERSE_XOP.
	/// Otherwise only byte vectors are supported (asserted, SSSE3 required):
	/// each byte is split into nibbles, each nibble is bit-reversed through a
	/// PSHUFB table lookup that also moves it to the opposite nibble position,
	/// and the two halves are OR'd together.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasXOP() && !VT.is512BitVector())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[i]: the 4-bit value i bit-reversed and placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[i]: the 4-bit value i bit-reversed and placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat each 16-entry table across every 16 bytes of the vector.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Rewrite an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the matching
	/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node.
	///
	/// The new node takes the first three operands of \p N, keeps its memory
	/// operand, and produces (i32, chain). Any other opcode is unreachable.
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Lower atomic_load_ops into LOCK-prefixed operations.
	///
	/// When the loaded value is used, only ADD is kept as-is and SUB is turned
	/// into an ADD of the negated operand. When the value is unused, the node is
	/// replaced by (undef, chain), where the chain comes from a locked stack
	/// operation, a MEMBARRIER, or the LOCK-prefixed arithmetic node.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  auto *AtomicN = cast<AtomicSDNode>(N.getNode());
	  SDValue InChain = N->getOperand(0);
	  SDValue Ptr = N->getOperand(1);
	  SDValue Val = N->getOperand(2);
	  const unsigned Opcode = N->getOpcode();
	  const MVT ResVT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
	  // can only be lowered when the result is unused. They should have already
	  // been transformed into a cmpxchg loop in AtomicExpand.
	  if (N->hasAnyUseOfValue(0)) {
	    if (Opcode != ISD::ATOMIC_LOAD_SUB) {
	      assert(Opcode == ISD::ATOMIC_LOAD_ADD &&
	             "Used AtomicRMW ops other than Add should have been expanded!");
	      return N;
	    }
	    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
	    // select LXADD if LOCK_SUB can't be selected.
	    SDValue Zero = DAG.getConstant(0, DL, ResVT);
	    SDValue Negated = DAG.getNode(ISD::SUB, DL, ResVT, Zero, Val);
	    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, ResVT, InChain, Ptr,
	                         Negated, AtomicN->getMemOperand());
	  }

	  // From here on the result value is unused; only the output chain matters.
	  SDValue OutChain;
	  if (Opcode != ISD::ATOMIC_LOAD_OR || !isNullConstant(Val)) {
	    // General case: emit the LOCK-prefixed operation and keep its chain.
	    OutChain = lowerAtomicArithWithLOCK(N, DAG, Subtarget).getValue(1);
	  } else if (AtomicN->getOrdering() ==
	                 AtomicOrdering::SequentiallyConsistent &&
	             AtomicN->getSyncScopeID() == SyncScope::System) {
	    // Specialized lowering for the canonical form of an idempotent atomicrmw
	    // (or with zero). Since the memory location isn't actually changing, all
	    // we need is a lowering for the ordering impacts of the atomicrmw, so we
	    // can choose a different operation and memory location to minimize the
	    // impact on other code.
	    //
	    // On X86, the only ordering which actually requires an instruction is
	    // seq_cst which isn't SingleThread; everything else just needs to be
	    // preserved during codegen and then dropped. Note that we expect (but
	    // don't assume) that orderings other than seq_cst and acq_rel have been
	    // canonicalized to a store or load.
	    //
	    // Prefer a locked operation against a stack location to minimize cache
	    // traffic. This assumes that stack locations are very likely to be
	    // accessed only by the owning thread.
	    OutChain = emitLockedStackOp(DAG, Subtarget, InChain, DL);
	  } else {
	    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	    OutChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, InChain);
	  }

	  // RAUW the chain, but don't worry about the result, as it's unused.
	  assert(!N->hasAnyUseOfValue(0));
	  // NOTE: The getUNDEF is needed to give something for the unused result 0.
	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
	                     DAG.getUNDEF(ResVT), OutChain);
	}

	/// Custom lowering for ISD::ATOMIC_STORE.
	///
	/// A non-seq_cst store of a legal type is returned unchanged. An illegal i64
	/// store is emitted as a 64-bit vector extract-store when SSE2 may be used,
	/// followed by a locked stack operation if the store is seq_cst. Everything
	/// else becomes an ATOMIC_SWAP whose chain result is returned.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  auto *Node = cast<AtomicSDNode>(Op.getNode());
	  SDLoc Loc(Node);
	  const EVT MemVT = Node->getMemoryVT();

	  const bool SeqCst =
	      Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
	  const bool LegalVT = DAG.getTargetLoweringInfo().isTypeLegal(MemVT);

	  // If this store is not sequentially consistent and the type is legal
	  // we can just keep it.
	  if (LegalVT && !SeqCst)
	    return Op;

	  // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
	  // FIXME: Use movlps with SSE1.
	  // FIXME: Use fist with X87.
	  const auto CanUseSSE2Store = [&]() -> bool {
	    if (MemVT != MVT::i64 || LegalVT)
	      return false;
	    const bool NoImplicitFloatOps =
	        DAG.getMachineFunction().getFunction().hasFnAttribute(
	            Attribute::NoImplicitFloat);
	    return !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
	           Subtarget.hasSSE2();
	  };

	  if (CanUseSSE2Store()) {
	    SDValue AsVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, Loc, MVT::v2i64,
	                                   Node->getOperand(2));
	    SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	    SDValue StoreOps[] = {Node->getChain(), AsVector, Node->getBasePtr()};
	    SDValue OutChain =
	        DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, Loc, ChainOnly,
	                                StoreOps, MVT::i64, Node->getMemOperand());

	    // If this is a sequentially consistent store, also emit an appropriate
	    // barrier.
	    return SeqCst ? emitLockedStackOp(DAG, Subtarget, OutChain, Loc)
	                  : OutChain;
	  }

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  SDValue Xchg = DAG.getAtomic(ISD::ATOMIC_SWAP, Loc, Node->getMemoryVT(),
	                               Node->getOperand(0), Node->getOperand(1),
	                               Node->getOperand(2), Node->getMemOperand());
	  // Only the chain result of the swap is of interest.
	  return Xchg.getValue(1);
	}

	/// Custom lowering for ISD::ADDCARRY (and the subtract counterpart handled
	/// via X86ISD::SBB).
	///
	/// The incoming carry value is turned into a flag by adding all-ones to it,
	/// the flag feeds an ADC/SBB, and the outgoing carry is read back with
	/// SETCC(COND_B). Returns an empty SDValue if the result type isn't legal
	/// yet.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *Node = Op.getNode();
	  MVT ResVT = Node->getSimpleValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResVT))
	    return SDValue();

	  SDVTList ResAndFlags = DAG.getVTList(ResVT, MVT::i32);
	  SDLoc DL(Node);

	  // Set the carry flag: adding all-ones to the carry-in operand via
	  // X86ISD::ADD yields the flags value consumed below.
	  SDValue CarryIn = Op.getOperand(2);
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDValue AllOnesC = DAG.getConstant(AllOnes, DL, CarryVT);
	  SDValue FlagGen = DAG.getNode(
	      X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn, AllOnesC);

	  const bool IsAdd = Op.getOpcode() == ISD::ADDCARRY;
	  unsigned ArithOpc = IsAdd ? X86ISD::ADC : X86ISD::SBB;
	  SDValue Result = DAG.getNode(ArithOpc, DL, ResAndFlags, Op.getOperand(0),
	                               Op.getOperand(1), FlagGen.getValue(1));

	  // Materialize the carry-out, narrowing it when the node expects an i1.
	  SDValue CarryOut = getSETCC(X86::COND_B, Result.getValue(1), DL, DAG);
	  if (Node->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
	                     CarryOut);
	}

	/// Custom lowering for FSINCOS on 64-bit Darwin (asserted).
	///
	/// Emits a call to the SINCOS_STRET libcall for the argument type. For f64
	/// the call result is returned directly; for f32 the sine and cosine are
	/// extracted from elements 0 and 1 of the returned vector and merged.
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = TLI.getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64 results come back as a two-element struct; f32 results come back as
	  // a 4-element vector of which only the first two lanes are read below.
	  Type *RetTy = isF64 ? static_cast<Type *>(StructType::get(ArgTy, ArgTy))
	                      : static_cast<Type *>(VectorType::get(ArgTy, 4));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen a vector input to a vector of NVT. The
	/// input vector must have the same element type as NVT.
	///
	/// \param InOp           the vector to widen.
	/// \param NVT            the wider result type; its element count must be a
	///                       larger multiple of the input's (asserted).
	/// \param FillWithZeroes when true the added elements are zero, otherwise
	///                       they are undef.
	/// \returns \p InOp unchanged if it already has type NVT, undef of NVT if
	/// \p InOp is undef, and otherwise a BUILD_VECTOR (for constant build
	/// vectors) or an INSERT_SUBVECTOR at index 0 into the fill value.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  // Check if InOp already has the right width.
	  MVT InVT = InOp.getSimpleValueType();
	  if (InVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned InNumElts = InVT.getVectorNumElements();
	  unsigned WidenNumElts = NVT.getVectorNumElements();
	  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc dl(InOp);
	  // Look through a two-operand concat whose upper half is already the fill
	  // value (undef, or zero when zero-filling) and widen its lower half
	  // instead.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
	      InOp.getNumOperands() == 2) {
	    SDValue N1 = InOp.getOperand(1);
	    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
	        N1.isUndef()) {
	      InOp = InOp.getOperand(0);
	      InVT = InOp.getSimpleValueType();
	      InNumElts = InVT.getVectorNumElements();
	    }
	  }
	  // Constant build vectors are rebuilt directly at the wider type.
	  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
	    SmallVector<SDValue, 16> Ops;
	    for (unsigned i = 0; i < InNumElts; ++i)
	      Ops.push_back(InOp.getOperand(i));

	    EVT EltVT = InOp.getOperand(0).getValueType();

	    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
	      DAG.getUNDEF(EltVT);
	    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
	      Ops.push_back(FillVal);
	    return DAG.getBuildVector(NVT, dl, Ops);
	  }
	  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
	    DAG.getUNDEF(NVT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
	                     InOp, DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for a masked scatter (AVX-512 only, asserted).
	///
	/// v2f32 and v2i32 data are widened to four elements and handled specially;
	/// other types are widened to 512 bits when VLX is unavailable and then
	/// emitted as an X86MaskedScatterSDNode. Returns the chain result of the new
	/// node, or an empty SDValue when default handling should take over.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Subtarget.hasAVX512() &&
	"MGATHER/MSCATTER are supported on AVX-512 arch only");

	MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	SDValue Src = N->getValue();
	MVT VT = Src.getSimpleValueType();
	assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	SDLoc dl(Op);

	SDValue Scale = N->getScale();
	SDValue Index = N->getIndex();
	SDValue Mask = N->getMask();
	SDValue Chain = N->getChain();
	SDValue BasePtr = N->getBasePtr();

	if (VT == MVT::v2f32) {
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	// If the index is v2i64 and we have VLX we can use xmm for data and index.
	if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
	// Pad the data to v4f32 with undef; the mask stays v2i1.
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getUNDEF(MVT::v2f32));
	SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
	SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	// Result 1 of the new node is its chain.
	return SDValue(NewScatter.getNode(), 1);
	}
	// Otherwise no custom lowering is provided for v2f32.
	return SDValue();
	}

	if (VT == MVT::v2i32) {
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	// Pad the data to v4i32 with undef.
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	DAG.getUNDEF(MVT::v2i32));
	// If the index is v2i64 and we have VLX we can use xmm for data and index.
	if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
	SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
	SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	return SDValue(NewScatter.getNode(), 1);
	}
	// Custom widen all the operands to avoid promotion.
	EVT NewIndexVT = EVT::getVectorVT(
	*DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
	Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
	DAG.getUNDEF(Index.getValueType()));
	// The added mask lanes are zero so the padding elements are not stored.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getConstant(0, dl, MVT::v2i1));
	SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
	Ops, N->getMemOperand());
	}

	MVT IndexVT = Index.getSimpleValueType();
	MVT MaskVT = Mask.getSimpleValueType();

	// If the index is v2i32, we're being called by type legalization and we
	// should just let the default handling take care of it.
	if (IndexVT == MVT::v2i32)
	return SDValue();

	// If we don't have VLX and neither the source data nor the index is
	// 512-bits, we need to widen until one is.
	if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	!Index.getSimpleValueType().is512BitVector()) {
	// Determine how much we need to widen by to get a 512-bit type.
	unsigned Factor = std::min(512/VT.getSizeInBits(),
	512/IndexVT.getSizeInBits());
	unsigned NumElts = VT.getVectorNumElements() * Factor;

	VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
	IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
	MaskVT = MVT::getVectorVT(MVT::i1, NumElts);

	// Data and index are padded with undef; the mask is padded with zeroes.
	Src = ExtendToType(Src, VT, DAG);
	Index = ExtendToType(Index, IndexVT, DAG);
	Mask = ExtendToType(Mask, MaskVT, DAG, true);
	}

	SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
	SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
	SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	// Result 1 of the new node is its chain.
	return SDValue(NewScatter.getNode(), 1);
	}

	/// Custom lowering for a masked load.
	///
	/// Two situations are handled:
	///  - Masks whose elements are not i1 (AVX masked loads): a passthru other
	///    than undef/zero is emulated by loading with a zero passthru and
	///    blending the result with the original passthru.
	///  - i1 masks on AVX-512 without VLX for narrower-than-512-bit vectors
	///    (asserted): passthru and mask are widened to 512 bits, the load is
	///    done at the wide type and the original-width subvector is extracted.
	///
	/// \returns the (value, chain) pair merged into one node, or \p Op itself
	/// when it can be selected as-is.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {

	  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  MVT MaskVT = Mask.getSimpleValueType();
	  SDValue PassThru = N->getPassThru();
	  SDLoc dl(Op);

	  // Handle AVX masked loads which don't support passthru other than 0.
	  if (MaskVT.getVectorElementType() != MVT::i1) {
	    // We also allow undef in the isel pattern.
	    if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
	      return Op;

	    SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
	                                        N->getBasePtr(), Mask,
	                                        getZeroVector(VT, Subtarget, DAG, dl),
	                                        N->getMemoryVT(), N->getMemOperand(),
	                                        N->getExtensionType(),
	                                        N->isExpandingLoad());
	    // Emit a blend. The select produces the loaded data type, so it must be
	    // typed with VT (the type of NewLoad and PassThru), not with the mask
	    // type, which differs from VT for floating-point data.
	    SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad,
	                                 PassThru);
	    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
	  }

	  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
	  PassThru = ExtendToType(PassThru, WideDataVT, DAG);

	  // Mask element has to be i1.
	  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
	         "Unexpected mask type");

	  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

	  // The added mask lanes are zero-filled so the extra elements stay masked
	  // off.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
	                                      N->getBasePtr(), Mask, PassThru,
	                                      N->getMemoryVT(), N->getMemOperand(),
	                                      N->getExtensionType(),
	                                      N->isExpandingLoad());

	  // Narrow the wide result back to the requested type.
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	                                NewLoad.getValue(0),
	                                DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Custom lowering for a masked store on AVX-512 without VLX (asserted).
	///
	/// The data and the i1 mask of a narrower-than-512-bit store are widened to
	/// 512 bits (mask zero-filled) and the store is re-emitted at the wide type,
	/// keeping the original memory type, memory operand, and the truncating and
	/// compressing flags.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
	         "Unexpected mask type");

	  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);

	  // Data is padded with undef; the mask is padded with zeroes so the extra
	  // elements are never written.
	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            Mask, N->getMemoryVT(), N->getMemOperand(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Custom lowering for a masked gather (AVX2 or later, asserted).
	///
	/// On AVX-512 without VLX, operands narrower than 512 bits are widened first.
	/// The gather is emitted as an X86MaskedGatherSDNode and the subvector of the
	/// original type is extracted from its result. Returns the merged
	/// (value, chain) pair, or an empty SDValue when called with a v2i32 index.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

	  auto *Gather = cast<MaskedGatherSDNode>(Op.getNode());
	  SDLoc dl(Op);
	  MVT DataVT = Op.getSimpleValueType();
	  SDValue Idx = Gather->getIndex();
	  SDValue Msk = Gather->getMask();
	  SDValue Pass = Gather->getPassThru();
	  MVT IdxVT = Idx.getSimpleValueType();
	  MVT MskVT = Msk.getSimpleValueType();

	  assert(DataVT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	  // If the index is v2i32, we're being called by type legalization.
	  if (IdxVT == MVT::v2i32)
	    return SDValue();

	  // Remember the requested result type; DataVT may be widened below.
	  const MVT ResultVT = DataVT;

	  // If we don't have VLX and neither the passthru or index is 512-bits, we
	  // need to widen until one is.
	  const bool MustWiden = Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
	                         !DataVT.is512BitVector() && !IdxVT.is512BitVector();
	  if (MustWiden) {
	    // Determine how much we need to widen by to get a 512-bit type.
	    unsigned Scale = std::min(512 / DataVT.getSizeInBits(),
	                              512 / IdxVT.getSizeInBits());
	    unsigned WideElts = DataVT.getVectorNumElements() * Scale;

	    DataVT = MVT::getVectorVT(DataVT.getVectorElementType(), WideElts);
	    IdxVT = MVT::getVectorVT(IdxVT.getVectorElementType(), WideElts);
	    MskVT = MVT::getVectorVT(MVT::i1, WideElts);

	    // Passthru and index are padded with undef, the mask with zeroes.
	    Pass = ExtendToType(Pass, DataVT, DAG);
	    Idx = ExtendToType(Idx, IdxVT, DAG);
	    Msk = ExtendToType(Msk, MskVT, DAG, true);
	  }

	  SDValue GatherOps[] = {Gather->getChain(), Pass, Msk,
	                         Gather->getBasePtr(), Idx, Gather->getScale()};
	  SDVTList GatherVTs = DAG.getVTList(DataVT, MskVT, MVT::Other);
	  SDValue WideGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	      GatherVTs, GatherOps, dl, Gather->getMemoryVT(),
	      Gather->getMemOperand());

	  // Take the low subvector of the original type; result 2 is the chain.
	  SDValue Narrowed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT,
	                                 WideGather, DAG.getIntPtrConstant(0, dl));
	  return DAG.getMergeValues({Narrowed, WideGather.getValue(2)}, dl);
	}

	/// Lower ISD::GC_TRANSITION_START to an X86::NOOP machine node.
	///
	/// The NOOP is built from operand 0 of \p Op and, when \p Op has a glued
	/// node, its last operand as well. Result #0 of the new machine node is
	/// returned; the node is created with result types (MVT::Other, MVT::Glue).
	SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  Ops.push_back(Op.getOperand(0));
	  // Forward the trailing operand too when the node carries glue.
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the single named location rather than leaving it unused and
	  // constructing a second temporary SDLoc for the call.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  return SDValue(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
	}

	/// Lower ISD::GC_TRANSITION_END to an X86::NOOP machine node.
	///
	/// The NOOP is built from operand 0 of \p Op and, when \p Op has a glued
	/// node, its last operand as well. Result #0 of the new machine node is
	/// returned; the node is created with result types (MVT::Other, MVT::Glue).
	SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  Ops.push_back(Op.getOperand(0));
	  // Forward the trailing operand too when the node carries glue.
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the single named location rather than leaving it unused and
	  // constructing a second temporary SDLoc for the call.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  return SDValue(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);
	}

	/// Entry point for custom lowering: dispatch on the opcode of \p Op to the
	/// matching lowering routine and return whatever that routine produces.
	/// Any opcode without an entry here hits llvm_unreachable.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  // --- Atomics -----------------------------------------------------------
	  case ISD::ATOMIC_FENCE:
	    return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	    return LowerCMP_SWAP(Op, Subtarget, DAG);
	  case ISD::ATOMIC_LOAD_ADD:
	  case ISD::ATOMIC_LOAD_SUB:
	  case ISD::ATOMIC_LOAD_OR:
	  case ISD::ATOMIC_LOAD_XOR:
	  case ISD::ATOMIC_LOAD_AND:
	    return lowerAtomicArith(Op, DAG, Subtarget);
	  case ISD::ATOMIC_STORE:
	    return LowerATOMIC_STORE(Op, DAG, Subtarget);

	  // --- Bit counting and bit reversal --------------------------------------
	  case ISD::CTPOP:
	    return LowerCTPOP(Op, Subtarget, DAG);
	  case ISD::BITREVERSE:
	    return LowerBITREVERSE(Op, Subtarget, DAG);
	  case ISD::CTLZ:
	  case ISD::CTLZ_ZERO_UNDEF:
	    return LowerCTLZ(Op, Subtarget, DAG);
	  case ISD::CTTZ:
	  case ISD::CTTZ_ZERO_UNDEF:
	    return LowerCTTZ(Op, Subtarget, DAG);

	  // --- Vector construction and element/subvector access -------------------
	  case ISD::BUILD_VECTOR:
	    return LowerBUILD_VECTOR(Op, DAG);
	  case ISD::CONCAT_VECTORS:
	    return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	  case ISD::VECTOR_SHUFFLE:
	    return lowerVectorShuffle(Op, Subtarget, DAG);
	  case ISD::VSELECT:
	    return LowerVSELECT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT:
	    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::INSERT_VECTOR_ELT:
	    return LowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::INSERT_SUBVECTOR:
	    return LowerINSERT_SUBVECTOR(Op, Subtarget, DAG);
	  case ISD::EXTRACT_SUBVECTOR:
	    return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
	  case ISD::SCALAR_TO_VECTOR:
	    return LowerSCALAR_TO_VECTOR(Op, Subtarget, DAG);

	  // --- Addresses, symbols and tables ---------------------------------------
	  case ISD::ConstantPool:
	    return LowerConstantPool(Op, DAG);
	  case ISD::GlobalAddress:
	    return LowerGlobalAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:
	    return LowerGlobalTLSAddress(Op, DAG);
	  case ISD::ExternalSymbol:
	    return LowerExternalSymbol(Op, DAG);
	  case ISD::BlockAddress:
	    return LowerBlockAddress(Op, DAG);
	  case ISD::JumpTable:
	    return LowerJumpTable(Op, DAG);

	  // --- Shifts and rotates ---------------------------------------------------
	  case ISD::SHL_PARTS:
	  case ISD::SRA_PARTS:
	  case ISD::SRL_PARTS:
	    return LowerShiftParts(Op, DAG);
	  case ISD::FSHL:
	  case ISD::FSHR:
	    return LowerFunnelShift(Op, Subtarget, DAG);
	  case ISD::ROTL:
	  case ISD::ROTR:
	    return LowerRotate(Op, Subtarget, DAG);
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::SHL:
	    return LowerShift(Op, Subtarget, DAG);

	  // --- Conversions, extensions, truncation and bitcasts --------------------
	  case ISD::SINT_TO_FP:
	    return LowerSINT_TO_FP(Op, DAG);
	  case ISD::UINT_TO_FP:
	    return LowerUINT_TO_FP(Op, DAG);
	  case ISD::TRUNCATE:
	    return LowerTRUNCATE(Op, DAG);
	  case ISD::ZERO_EXTEND:
	    return LowerZERO_EXTEND(Op, Subtarget, DAG);
	  case ISD::SIGN_EXTEND:
	    return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	  case ISD::ANY_EXTEND:
	    return LowerANY_EXTEND(Op, Subtarget, DAG);
	  case ISD::ZERO_EXTEND_VECTOR_INREG:
	  case ISD::SIGN_EXTEND_VECTOR_INREG:
	    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    return LowerFP_TO_INT(Op, DAG);
	  case ISD::FP_EXTEND:
	    return LowerFP_EXTEND(Op, DAG);
	  case ISD::BITCAST:
	    return LowerBITCAST(Op, Subtarget, DAG);

	  // --- Loads, stores and their masked/gather/scatter forms ------------------
	  case ISD::LOAD:
	    return LowerLoad(Op, Subtarget, DAG);
	  case ISD::STORE:
	    return LowerStore(Op, Subtarget, DAG);
	  case ISD::MLOAD:
	    return LowerMLOAD(Op, Subtarget, DAG);
	  case ISD::MSTORE:
	    return LowerMSTORE(Op, Subtarget, DAG);
	  case ISD::MGATHER:
	    return LowerMGATHER(Op, Subtarget, DAG);
	  case ISD::MSCATTER:
	    return LowerMSCATTER(Op, Subtarget, DAG);

	  // --- Floating-point operations ---------------------------------------------
	  case ISD::FADD:
	  case ISD::FSUB:
	    return lowerFaddFsub(Op, DAG, Subtarget);
	  case ISD::FABS:
	  case ISD::FNEG:
	    return LowerFABSorFNEG(Op, DAG);
	  case ISD::FCOPYSIGN:
	    return LowerFCOPYSIGN(Op, DAG);
	  case ISD::FGETSIGN:
	    return LowerFGETSIGN(Op, DAG);
	  case ISD::FSINCOS:
	    return LowerFSINCOS(Op, Subtarget, DAG);

	  // --- Comparison, select and branch ------------------------------------------
	  case ISD::SETCC:
	    return LowerSETCC(Op, DAG);
	  case ISD::SETCCCARRY:
	    return LowerSETCCCARRY(Op, DAG);
	  case ISD::SELECT:
	    return LowerSELECT(Op, DAG);
	  case ISD::BRCOND:
	    return LowerBRCOND(Op, DAG);

	  // --- Variadic argument handling ----------------------------------------------
	  case ISD::VASTART:
	    return LowerVASTART(Op, DAG);
	  case ISD::VAARG:
	    return LowerVAARG(Op, DAG);
	  case ISD::VACOPY:
	    return LowerVACOPY(Op, Subtarget, DAG);

	  // --- Intrinsics and cycle counter ----------------------------------------------
	  case ISD::INTRINSIC_WO_CHAIN:
	    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	  case ISD::INTRINSIC_VOID:
	  case ISD::INTRINSIC_W_CHAIN:
	    return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	  case ISD::READCYCLECOUNTER:
	    return LowerREADCYCLECOUNTER(Op, Subtarget, DAG);

	  // --- Frame, return address and dynamic stack allocation -------------------------
	  case ISD::RETURNADDR:
	    return LowerRETURNADDR(Op, DAG);
	  case ISD::ADDROFRETURNADDR:
	    return LowerADDROFRETURNADDR(Op, DAG);
	  case ISD::FRAMEADDR:
	    return LowerFRAMEADDR(Op, DAG);
	  case ISD::FRAME_TO_ARGS_OFFSET:
	    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	  case ISD::DYNAMIC_STACKALLOC:
	    return LowerDYNAMIC_STACKALLOC(Op, DAG);

	  // --- Exception handling, trampolines and rounding mode ---------------------------
	  case ISD::EH_RETURN:
	    return LowerEH_RETURN(Op, DAG);
	  case ISD::EH_SJLJ_SETJMP:
	    return lowerEH_SJLJ_SETJMP(Op, DAG);
	  case ISD::EH_SJLJ_LONGJMP:
	    return lowerEH_SJLJ_LONGJMP(Op, DAG);
	  case ISD::EH_SJLJ_SETUP_DISPATCH:
	    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	  case ISD::INIT_TRAMPOLINE:
	    return LowerINIT_TRAMPOLINE(Op, DAG);
	  case ISD::ADJUST_TRAMPOLINE:
	    return LowerADJUST_TRAMPOLINE(Op, DAG);
	  case ISD::FLT_ROUNDS_:
	    return LowerFLT_ROUNDS_(Op, DAG);

	  // --- Integer arithmetic ------------------------------------------------------------
	  case ISD::MUL:
	    return LowerMUL(Op, Subtarget, DAG);
	  case ISD::MULHS:
	  case ISD::MULHU:
	    return LowerMULH(Op, Subtarget, DAG);
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:
	    return LowerXALUO(Op, DAG);
	  case ISD::ADDCARRY:
	  case ISD::SUBCARRY:
	    return LowerADDSUBCARRY(Op, DAG);
	  case ISD::ADD:
	  case ISD::SUB:
	    return lowerAddSub(Op, DAG, Subtarget);
	  case ISD::UADDSAT:
	  case ISD::SADDSAT:
	  case ISD::USUBSAT:
	  case ISD::SSUBSAT:
	    return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
	  case ISD::SMAX:
	  case ISD::SMIN:
	  case ISD::UMAX:
	  case ISD::UMIN:
	    return LowerMINMAX(Op, DAG);
	  case ISD::ABS:
	    return LowerABS(Op, Subtarget, DAG);

	  // --- GC transitions -------------------------------------------------------------------
	  case ISD::GC_TRANSITION_START:
	    return LowerGC_TRANSITION_START(Op, DAG);
	  case ISD::GC_TRANSITION_END:
	    return LowerGC_TRANSITION_END(Op, DAG);

	  // Nothing else is expected to be routed here.
	  default:
	    llvm_unreachable("Should not custom lower this!");
	  }
	}

	/// Run LowerOperation on result #0 of \p N and append the replacement values
	/// to \p Results. Their number and types must exactly match the original
	/// results of \p N. \p Results is left untouched when LowerOperation returns
	/// an empty value, which signals that the node is not to be custom lowered
	/// after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  const SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);

	  // An empty value means "do not custom lower".
	  if (!Lowered.getNode())
	    return;

	  const unsigned NumResults = N->getNumValues();

	  // Single-result node: use the returned value as is. It is not
	  // necessarily result number 0 of the node that produced it.
	  if (NumResults == 1) {
	    Results.push_back(Lowered);
	    return;
	  }

	  // Multi-result node: the replacement node must produce the same number
	  // of results as the original.
	  assert((NumResults == Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Emit the new values in order, based on N's result numbers.
	  for (unsigned Idx = 0; Idx != NumResults; ++Idx)
	    Results.push_back(Lowered.getValue(Idx));
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const {
	SDLoc dl(N);
	switch (N->getOpcode()) {
	default:
	#ifndef NDEBUG
	dbgs() << "ReplaceNodeResults: ";
	N->dump(&DAG);
	#endif
	llvm_unreachable("Do not know how to custom type legalize this operation!");
	case ISD::CTPOP: {
	assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
	// Use a v2i64 if possible.
	bool NoImplicitFloatOps =
	DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);
	if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
	SDValue Wide =
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
	Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
	// Bit count should fit in 32-bits, extract it as that and then zero
	// extend to i64. Otherwise we end up extracting bits 63:32 separately.
	Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
	Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
	DAG.getIntPtrConstant(0, dl));
	Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
	Results.push_back(Wide);
	}
	return;
	}
	case ISD::MUL: {
	EVT VT = N->getValueType(0);
	assert(VT.isVector() && "Unexpected VT");
	if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
	VT.getVectorNumElements() == 2) {
	// Promote to a pattern that will be turned into PMULUDQ.
	SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
	N->getOperand(0));
	SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
	N->getOperand(1));
	SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
	Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
	} else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
	VT.getVectorElementType() == MVT::i8) {
	// Pre-promote these to vXi16 to avoid op legalization thinking all 16
	// elements are needed.
	MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
	SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
	SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	unsigned NumConcats = 16 / VT.getVectorNumElements();
	SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
	ConcatOps[0] = Res;
	Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
	Results.push_back(Res);
	}
	return;
	}
	case ISD::UADDSAT:
	case ISD::SADDSAT:
	case ISD::USUBSAT:
	case ISD::SSUBSAT:
	case X86ISD::VPMADDWD:
	case X86ISD::AVG: {
	// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
	// X86ISD::AVG/VPMADDWD by widening.
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	EVT VT = N->getValueType(0);
	EVT InVT = N->getOperand(0).getValueType();
	assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
	"Expected a VT that divides into 128 bits.");
	unsigned NumConcat = 128 / InVT.getSizeInBits();

	EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
	InVT.getVectorElementType(),
	NumConcat * InVT.getVectorNumElements());
	EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
	VT.getVectorElementType(),
	NumConcat * VT.getVectorNumElements());

	SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	Ops[0] = N->getOperand(0);
	SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
	Ops[0] = N->getOperand(1);
	SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);

	SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	case ISD::ABS: {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	assert(N->getValueType(0) == MVT::i64 &&
	"Unexpected type (!= i64) on ABS.");
	MVT HalfT = MVT::i32;
	SDValue Lo, Hi, Tmp;
	SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);

	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	DAG.getConstant(0, dl, HalfT));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
	DAG.getConstant(1, dl, HalfT));
	Tmp = DAG.getNode(
	ISD::SRA, dl, HalfT, Hi,
	DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
	TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
	Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
	Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
	SDValue(Lo.getNode(), 1));
	Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
	Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
	Results.push_back(Lo);
	Results.push_back(Hi);
	return;
	}
	case ISD::SETCC: {
	// Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
	// setCC result type is v2i1 because type legalization will end up with
	// a v4i1 setcc plus an extend.
	assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
	if (N->getOperand(0).getValueType() != MVT::v2f32 \|\|
	getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
	return;
	SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
	SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(0), UNDEF);
	SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(1), UNDEF);
	SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
	N->getOperand(2));
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	case X86ISD::FMINC:
	case X86ISD::FMIN:
	case X86ISD::FMAXC:
	case X86ISD::FMAX: {
	EVT VT = N->getValueType(0);
	assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	SDValue UNDEF = DAG.getUNDEF(VT);
	SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(0), UNDEF);
	SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(1), UNDEF);
	Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	return;
	}
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM: {
	EVT VT = N->getValueType(0);
	if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
	// If this RHS is a constant splat vector we can widen this and let
	// division/remainder by constant optimize it.
	// TODO: Can we do something for non-splat?
	APInt SplatVal;
	if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
	unsigned NumConcats = 128 / VT.getSizeInBits();
	SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
	Ops0[0] = N->getOperand(0);
	EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
	SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
	SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
	SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
	Results.push_back(Res);
	}
	return;
	}

	if (VT == MVT::v2i32) {
	// Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
	// v2i64 and unroll later. But then we create i64 scalar ops which
	// might be slow in 64-bit mode or require a libcall in 32-bit mode.
	Results.push_back(DAG.UnrollVectorOp(N));
	return;
	}

	if (VT.isVector())
	return;

	LLVM_FALLTHROUGH;
	}
	case ISD::SDIVREM:
	case ISD::UDIVREM: {
	SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	Results.push_back(V);
	return;
	}
	case ISD::TRUNCATE: {
	MVT VT = N->getSimpleValueType(0);
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	return;

	// The generic legalizer will try to widen the input type to the same
	// number of elements as the widened result type. But this isn't always
	// the best thing so do some custom legalization to avoid some cases.
	MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
	SDValue In = N->getOperand(0);
	EVT InVT = In.getValueType();

	unsigned InBits = InVT.getSizeInBits();
	if (128 % InBits == 0) {
	// 128 bit and smaller inputs should avoid truncate altogether and
	// just use a build_vector that will become a shuffle.
	// TODO: Widen and use a shuffle directly?
	MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
	EVT EltVT = VT.getVectorElementType();
	unsigned WidenNumElts = WidenVT.getVectorNumElements();
	SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
	// Use the original element count so we don't do more scalar opts than
	// necessary.
	unsigned MinElts = VT.getVectorNumElements();
	for (unsigned i=0; i < MinElts; ++i) {
	SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
	DAG.getIntPtrConstant(i, dl));
	Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
	}
	Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
	return;
	}
	// With AVX512 there are some cases that can use a target specific
	// truncate node to go from 256/512 to less than 128 with zeros in the
	// upper elements of the 128 bit result.
	if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
	// We can use VTRUNC directly if for 256 bits with VLX or for any 512.
	if ((InBits == 256 && Subtarget.hasVLX()) \|\| InBits == 512) {
	Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
	return;
	}
	// There's one case we can widen to 512 bits and use VTRUNC.
	if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
	In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
	DAG.getUNDEF(MVT::v4i64));
	Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
	return;
	}
	}
	return;
	}
	case ISD::SIGN_EXTEND_VECTOR_INREG: {
	if (ExperimentalVectorWideningLegalization)
	return;

	EVT VT = N->getValueType(0);
	SDValue In = N->getOperand(0);
	EVT InVT = In.getValueType();
	if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
	(InVT == MVT::v16i16 \|\| InVT == MVT::v32i8)) {
	// Custom split this so we can extend i8/i16->i32 invec. This is better
	// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
	// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
	// we allow the sra from the extend to i32 to be shared by the split.
	EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
	InVT.getVectorElementType(),
	InVT.getVectorNumElements() / 2);
	MVT ExtendVT = MVT::getVectorVT(MVT::i32,
	VT.getVectorNumElements());
	In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
	In, DAG.getIntPtrConstant(0, dl));
	In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);

	// Fill a vector with sign bits for each element.
	SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
	SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);

	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

	// Create an unpackl and unpackh to interleave the sign bits then bitcast
	// to vXi64.
	SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
	Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
	SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
	Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);

	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	return;
	}
	return;
	}
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND: {
	EVT VT = N->getValueType(0);
	SDValue In = N->getOperand(0);
	EVT InVT = In.getValueType();
	if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
	(InVT == MVT::v4i16 \|\| InVT == MVT::v4i8) &&
	getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
	assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
	// Custom split this so we can extend i8/i16->i32 invec. This is better
	// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
	// sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
	// we allow the sra from the extend to i32 to be shared by the split.
	In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);

	// Fill a vector with sign bits for each element.
	SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
	SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);

	// Create an unpackl and unpackh to interleave the sign bits then bitcast
	// to v2i64.
	SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
	{0, 4, 1, 5});
	Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
	SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
	{2, 6, 3, 7});
	Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);

	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	return;
	}

	if (VT == MVT::v16i32 \|\| VT == MVT::v8i64) {
	if (!InVT.is128BitVector()) {
	// Not a 128 bit vector, but maybe type legalization will promote
	// it to 128 bits.
	if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
	return;
	InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
	if (!InVT.is128BitVector())
	return;

	// Promote the input to 128 bits. Type legalization will turn this into
	// zext_inreg/sext_inreg.
	In = DAG.getNode(N->getOpcode(), dl, InVT, In);
	}

	// Perform custom splitting instead of the two stage extend we would get
	// by default.
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
	assert(isTypeLegal(LoVT) && "Split VT not legal?");

	SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);

	// We need to shift the input over by half the number of elements.
	unsigned NumElts = InVT.getVectorNumElements();
	unsigned HalfNumElts = NumElts / 2;
	SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
	for (unsigned i = 0; i != HalfNumElts; ++i)
	ShufMask[i] = i + HalfNumElts;

	SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
	Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);

	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	Results.push_back(Res);
	}
	return;
	}
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
	EVT VT = N->getValueType(0);
	SDValue Src = N->getOperand(0);
	EVT SrcVT = Src.getValueType();

	// Promote these manually to avoid over promotion to v2i64. Type
	// legalization will revisit the v2i32 operation for more cleanup.
	if ((VT == MVT::v2i8 \|\| VT == MVT::v2i16) &&
	getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
	// AVX512DQ provides instructions that produce a v2i64 result.
	if (Subtarget.hasDQI())
	return;

	SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
	Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
	: ISD::AssertSext,
	dl, MVT::v2i32, Res,
	DAG.getValueType(VT.getVectorElementType()));
	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	Results.push_back(Res);
	return;
	}

	if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	return;

	// Try to create a 128 bit vector, but don't exceed a 32 bit element.
	unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
	MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
	VT.getVectorNumElements());
	SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);

	// Preserve what we know about the size of the original result. Except
	// when the result is v2i32 since we can't widen the assert.
	if (PromoteVT != MVT::v2i32)
	Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
	: ISD::AssertSext,
	dl, PromoteVT, Res,
	DAG.getValueType(VT.getVectorElementType()));

	// Truncate back to the original width.
	Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

	// Now widen to 128 bits.
	unsigned NumConcats = 128 / VT.getSizeInBits();
	MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
	VT.getVectorNumElements() * NumConcats);
	SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
	ConcatOps[0] = Res;
	Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
	Results.push_back(Res);
	return;
	}


	if (VT == MVT::v2i32) {
	assert((IsSigned \|\| Subtarget.hasAVX512()) &&
	"Can only handle signed conversion without AVX512");
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	bool Widenv2i32 =
	getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
	if (Src.getValueType() == MVT::v2f64) {
	unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	if (!IsSigned && !Subtarget.hasVLX()) {
	// If v2i32 is widened, we can defer to the generic legalizer.
	if (Widenv2i32)
	return;
	// Custom widen by doubling to a legal vector width. Isel will
	// further widen to v8f64.
	Opc = ISD::FP_TO_UINT;
	Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
	Src, DAG.getUNDEF(MVT::v2f64));
	}
	SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
	if (!Widenv2i32)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	if (SrcVT == MVT::v2f32 &&
	getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
	SDValue Idx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getUNDEF(MVT::v2f32));
	Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
	: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	Results.push_back(Res);
	return;
	}

	// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	// so early out here.
	return;
	}

	if (Subtarget.hasDQI() && VT == MVT::i64 &&
	(SrcVT == MVT::f32 \|\| SrcVT == MVT::f64)) {
	assert(!Subtarget.is64Bit() && "i64 should be legal");
	unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
	// Using a 256-bit input here to guarantee 128-bit input for f32 case.
	// TODO: Use 128-bit vectors for f64 case?
	// TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
	MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
	MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);

	SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
	DAG.getConstantFP(0.0, dl, VecInVT), Src,
	ZeroIdx);
	Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
	Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
	Results.push_back(Res);
	return;
	}

	if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
	Results.push_back(V);
	return;
	}
	case ISD::SINT_TO_FP: {
	assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	SDValue Src = N->getOperand(0);
	if (N->getValueType(0) != MVT::v2f32 \|\| Src.getValueType() != MVT::v2i64)
	return;
	Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
	return;
	}
	case ISD::UINT_TO_FP: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT VT = N->getValueType(0);
	if (VT != MVT::v2f32)
	return;
	SDValue Src = N->getOperand(0);
	EVT SrcVT = Src.getValueType();
	if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
	return;
	}
	if (SrcVT != MVT::v2i32)
	return;
	SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	SDValue VBias =
	DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	DAG.getBitcast(MVT::v2i64, VBias));
	Or = DAG.getBitcast(MVT::v2f64, Or);
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	return;
	}
	case ISD::FP_ROUND: {
	if (!isTypeLegal(N->getOperand(0).getValueType()))
	return;
	SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	Results.push_back(V);
	return;
	}
	case ISD::FP_EXTEND: {
	// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	// No other ValueType for FP_EXTEND should reach this point.
	assert(N->getValueType(0) == MVT::v2f32 &&
	"Do not know how to legalize this Node");
	return;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	switch (IntNo) {
	default : llvm_unreachable("Do not know how to custom type "
	"legalize this intrinsic operation!");
	case Intrinsic::x86_rdtsc:
	return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdtscp:
	return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdpmc:
	expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
	Results);
	return;
	case Intrinsic::x86_xgetbv:
	expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
	Results);
	return;
	}
	}
	case ISD::READCYCLECOUNTER: {
	return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	EVT T = N->getValueType(0);
	assert((T == MVT::i64 \|\| T == MVT::i128) && "can only expand cmpxchg pair");
	bool Regs64bit = T == MVT::i128;
	assert((!Regs64bit \|\| Subtarget.hasCmpxchg16b()) &&
	"64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
	MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	SDValue cpInL, cpInH;
	cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(0, dl, HalfT));
	cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(1, dl, HalfT));
	cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	cpInL, SDValue());
	cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	cpInH, cpInL.getValue(1));
	SDValue swapInL, swapInH;
	swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(0, dl, HalfT));
	swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(1, dl, HalfT));
	swapInH =
	DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	swapInH, cpInH.getValue(1));
	// If the current function needs the base pointer, RBX,
	// we shouldn't use cmpxchg directly.
	// Indeed the lowering of that instruction will clobber
	// that register and since RBX will be a reserved register
	// the register allocator will not make sure its value will
	// be properly saved and restored around this live-range.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	SDValue Result;
	SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	unsigned BasePtr = TRI->getBaseRegister();
	MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	(BasePtr == X86::RBX \|\| BasePtr == X86::EBX)) {
	// ISel prefers the LCMPXCHG64 variant.
	// If that assert breaks, that means it is not the case anymore,
	// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	// not just EBX. This is a matter of accepting i64 input for that
	// pseudo, and restoring into the register of the right wide
	// in expand pseudo. Everything else should just work.
	assert(((Regs64bit == (BasePtr == X86::RBX)) \|\| BasePtr == X86::EBX) &&
	"Saving only half of the RBX");
	unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX,
	HalfT, swapInH.getValue(1));
	SDValue Ops[] = {/Chain/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	RBXSave,
	/Glue/ RBXSave.getValue(2)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	} else {
	unsigned Opcode =
	Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX, swapInL,
	swapInH.getValue(1));
	SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	swapInL.getValue(1)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	}
	SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	HalfT, Result.getValue(1));
	SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	HalfT, cpOutL.getValue(2));
	SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	MVT::i32, cpOutH.getValue(2));
	SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	Results.push_back(Success);
	Results.push_back(EFLAGS.getValue(1));
	return;
	}
	case ISD::ATOMIC_LOAD: {
	assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
	bool NoImplicitFloatOps =
	DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);
	if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
	auto *Node = cast<AtomicSDNode>(N);
	if (Subtarget.hasSSE2()) {
	// Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
	// lower 64-bits.
	SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
	SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
	SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
	MVT::i64, Node->getMemOperand());
	SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Ld.getValue(1));
	return;
	}
	if (Subtarget.hasX87()) {
	// First load this into an 80-bit X87 register. This will put the whole
	// integer into the significand.
	// FIXME: Do we need to glue? See FIXME comment in BuildFILD.
	SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
	SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
	SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
	dl, Tys, Ops, MVT::i64,
	Node->getMemOperand());
	SDValue Chain = Result.getValue(1);
	SDValue InFlag = Result.getValue(2);

	// Now store the X87 register to a stack temporary and convert to i64.
	// This store is not atomic and doesn't need to be.
	// FIXME: We don't need a stack temporary if the result of the load
	// is already being stored. We could just directly store there.
	SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
	int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
	MachinePointerInfo MPI =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
	SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
	Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
	DAG.getVTList(MVT::Other), StoreOps,
	MVT::i64, MPI, 0 /Align/,
	MachineMemOperand::MOStore);

	// Finally load the value back from the stack temporary and return it.
	// This load is not atomic and doesn't need to be.
	// This load will be further type legalized.
	Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
	Results.push_back(Result);
	Results.push_back(Result.getValue(1));
	return;
	}
	}
	// TODO: Use MOVLPS when SSE1 is available?
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;
	}
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;

	case ISD::BITCAST: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT DstVT = N->getValueType(0);
	EVT SrcVT = N->getOperand(0).getValueType();

	// If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
	// we can split using the k-register rather than memory.
	if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
	assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
	Lo = DAG.getBitcast(MVT::i32, Lo);
	Hi = DAG.getBitcast(MVT::i32, Hi);
	SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
	Results.push_back(Res);
	return;
	}

	// Custom splitting for BWI types when AVX512F is available but BWI isn't.
	if ((DstVT == MVT::v32i16 \|\| DstVT == MVT::v64i8) &&
	SrcVT.isVector() && isTypeLegal(SrcVT)) {
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
	MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
	Lo = DAG.getBitcast(CastVT, Lo);
	Hi = DAG.getBitcast(CastVT, Hi);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
	Results.push_back(Res);
	return;
	}

	if (SrcVT != MVT::f64 \|\|
	(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) \|\|
	getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
	return;

	unsigned NumElts = DstVT.getVectorNumElements();
	EVT SVT = DstVT.getVectorElementType();
	EVT WiderVT = EVT::getVectorVT(DAG.getContext(), SVT, NumElts 2);
	SDValue Res;
	Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
	Res = DAG.getBitcast(WiderVT, Res);
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	return;
	}
	case ISD::MGATHER: {
	EVT VT = N->getValueType(0);
	if (VT == MVT::v2f32 && (Subtarget.hasVLX() \|\| !Subtarget.hasAVX512())) {
	auto *Gather = cast<MaskedGatherSDNode>(N);
	SDValue Index = Gather->getIndex();
	if (Index.getValueType() != MVT::v2i64)
	return;
	SDValue Mask = Gather->getMask();
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	Gather->getPassThru(),
	DAG.getUNDEF(MVT::v2f32));
	if (!Subtarget.hasVLX()) {
	// We need to widen the mask, but the instruction will only use 2
	// of its elements. So we can use undef.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getUNDEF(MVT::v2i1));
	Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	}
	SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
	Gather->getBasePtr(), Index, Gather->getScale() };
	SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
	Gather->getMemoryVT(), Gather->getMemOperand());
	Results.push_back(Res);
	Results.push_back(Res.getValue(2));
	return;
	}
	if (VT == MVT::v2i32) {
	auto *Gather = cast<MaskedGatherSDNode>(N);
	SDValue Index = Gather->getIndex();
	SDValue Mask = Gather->getMask();
	assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
	Gather->getPassThru(),
	DAG.getUNDEF(MVT::v2i32));
	// If the index is v2i64 we can use it directly.
	if (Index.getValueType() == MVT::v2i64 &&
	(Subtarget.hasVLX() \|\| !Subtarget.hasAVX512())) {
	if (!Subtarget.hasVLX()) {
	// We need to widen the mask, but the instruction will only use 2
	// of its elements. So we can use undef.
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getUNDEF(MVT::v2i1));
	Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	}
	SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
	Gather->getBasePtr(), Index, Gather->getScale() };
	SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
	Gather->getMemoryVT(), Gather->getMemOperand());
	SDValue Chain = Res.getValue(2);
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
	EVT IndexVT = Index.getValueType();
	EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
	IndexVT.getScalarType(), 4);
	// Otherwise we need to custom widen everything to avoid promotion.
	Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
	DAG.getUNDEF(IndexVT));
	Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	DAG.getConstant(0, dl, MVT::v2i1));
	SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
	Gather->getBasePtr(), Index, Gather->getScale() };
	SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
	Gather->getMemoryVT(), dl, Ops,
	Gather->getMemOperand());
	SDValue Chain = Res.getValue(1);
	if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	DAG.getIntPtrConstant(0, dl));
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	}
	return;
	}
	case ISD::LOAD: {
	// Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
	// avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
	// cast since type legalization will try to use an i64 load.
	MVT VT = N->getSimpleValueType(0);
	assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
	if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
	return;
	if (!ISD::isNON_EXTLoad(N))
	return;
	auto *Ld = cast<LoadSDNode>(N);
	if (Subtarget.hasSSE2()) {
	MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
	SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	SDValue Chain = Res.getValue(1);
	MVT WideVT = MVT::getVectorVT(LdVT, 2);
	Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
	MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
	VT.getVectorNumElements() * 2);
	Res = DAG.getBitcast(CastVT, Res);
	Results.push_back(Res);
	Results.push_back(Chain);
	return;
	}
	assert(Subtarget.hasSSE1() && "Expected SSE");
	SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
	SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
	SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
	MVT::i64, Ld->getMemOperand());
	Results.push_back(Res);
	Results.push_back(Res.getValue(1));
	return;
	}
	}
	}

	/// Return the printable name of an X86-specific SelectionDAG opcode.
	///
	/// \param Opcode the node opcode, interpreted as an X86ISD::NodeType.
	/// \returns "X86ISD::<enumerator>" for every opcode listed below, or nullptr
	/// for X86ISD::FIRST_NUMBER and for any value that is not listed.
	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	  // Derive each string from the enumerator itself so that the printed name
	  // can never drift from the enum spelling (e.g. a missing ':' in the prefix).
	#define NODE_NAME_CASE(NODE)                                                   \
	  case X86ISD::NODE:                                                           \
	    return "X86ISD::" #NODE;
	  switch ((X86ISD::NodeType)Opcode) {
	  case X86ISD::FIRST_NUMBER:
	    break;
	  NODE_NAME_CASE(BSF)
	  NODE_NAME_CASE(BSR)
	  NODE_NAME_CASE(SHLD)
	  NODE_NAME_CASE(SHRD)
	  NODE_NAME_CASE(FAND)
	  NODE_NAME_CASE(FANDN)
	  NODE_NAME_CASE(FOR)
	  NODE_NAME_CASE(FXOR)
	  NODE_NAME_CASE(FILD)
	  NODE_NAME_CASE(FILD_FLAG)
	  NODE_NAME_CASE(FIST)
	  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
	  NODE_NAME_CASE(FLD)
	  NODE_NAME_CASE(FST)
	  NODE_NAME_CASE(CALL)
	  NODE_NAME_CASE(BT)
	  NODE_NAME_CASE(CMP)
	  NODE_NAME_CASE(COMI)
	  NODE_NAME_CASE(UCOMI)
	  NODE_NAME_CASE(CMPM)
	  NODE_NAME_CASE(CMPM_SAE)
	  NODE_NAME_CASE(SETCC)
	  NODE_NAME_CASE(SETCC_CARRY)
	  NODE_NAME_CASE(FSETCC)
	  NODE_NAME_CASE(FSETCCM)
	  NODE_NAME_CASE(FSETCCM_SAE)
	  NODE_NAME_CASE(CMOV)
	  NODE_NAME_CASE(BRCOND)
	  NODE_NAME_CASE(RET_FLAG)
	  NODE_NAME_CASE(IRET)
	  NODE_NAME_CASE(REP_STOS)
	  NODE_NAME_CASE(REP_MOVS)
	  NODE_NAME_CASE(GlobalBaseReg)
	  NODE_NAME_CASE(Wrapper)
	  NODE_NAME_CASE(WrapperRIP)
	  NODE_NAME_CASE(MOVDQ2Q)
	  NODE_NAME_CASE(MMX_MOVD2W)
	  NODE_NAME_CASE(MMX_MOVW2D)
	  NODE_NAME_CASE(PEXTRB)
	  NODE_NAME_CASE(PEXTRW)
	  NODE_NAME_CASE(INSERTPS)
	  NODE_NAME_CASE(PINSRB)
	  NODE_NAME_CASE(PINSRW)
	  NODE_NAME_CASE(PSHUFB)
	  NODE_NAME_CASE(ANDNP)
	  NODE_NAME_CASE(BLENDI)
	  NODE_NAME_CASE(BLENDV)
	  NODE_NAME_CASE(HADD)
	  NODE_NAME_CASE(HSUB)
	  NODE_NAME_CASE(FHADD)
	  NODE_NAME_CASE(FHSUB)
	  NODE_NAME_CASE(CONFLICT)
	  NODE_NAME_CASE(FMAX)
	  NODE_NAME_CASE(FMAXS)
	  NODE_NAME_CASE(FMAX_SAE)
	  NODE_NAME_CASE(FMAXS_SAE)
	  NODE_NAME_CASE(FMIN)
	  NODE_NAME_CASE(FMINS)
	  NODE_NAME_CASE(FMIN_SAE)
	  NODE_NAME_CASE(FMINS_SAE)
	  NODE_NAME_CASE(FMAXC)
	  NODE_NAME_CASE(FMINC)
	  NODE_NAME_CASE(FRSQRT)
	  NODE_NAME_CASE(FRCP)
	  NODE_NAME_CASE(EXTRQI)
	  NODE_NAME_CASE(INSERTQI)
	  NODE_NAME_CASE(TLSADDR)
	  NODE_NAME_CASE(TLSBASEADDR)
	  NODE_NAME_CASE(TLSCALL)
	  NODE_NAME_CASE(EH_SJLJ_SETJMP)
	  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
	  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
	  NODE_NAME_CASE(EH_RETURN)
	  NODE_NAME_CASE(TC_RETURN)
	  NODE_NAME_CASE(FNSTCW16m)
	  NODE_NAME_CASE(FNSTSW16r)
	  NODE_NAME_CASE(LCMPXCHG_DAG)
	  NODE_NAME_CASE(LCMPXCHG8_DAG)
	  NODE_NAME_CASE(LCMPXCHG16_DAG)
	  NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
	  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
	  NODE_NAME_CASE(LADD)
	  NODE_NAME_CASE(LSUB)
	  NODE_NAME_CASE(LOR)
	  NODE_NAME_CASE(LXOR)
	  NODE_NAME_CASE(LAND)
	  NODE_NAME_CASE(VZEXT_MOVL)
	  NODE_NAME_CASE(VZEXT_LOAD)
	  NODE_NAME_CASE(VEXTRACT_STORE)
	  NODE_NAME_CASE(VTRUNC)
	  NODE_NAME_CASE(VTRUNCS)
	  NODE_NAME_CASE(VTRUNCUS)
	  NODE_NAME_CASE(VMTRUNC)
	  NODE_NAME_CASE(VMTRUNCS)
	  NODE_NAME_CASE(VMTRUNCUS)
	  NODE_NAME_CASE(VTRUNCSTORES)
	  NODE_NAME_CASE(VTRUNCSTOREUS)
	  NODE_NAME_CASE(VMTRUNCSTORES)
	  NODE_NAME_CASE(VMTRUNCSTOREUS)
	  NODE_NAME_CASE(VFPEXT)
	  NODE_NAME_CASE(VFPEXT_SAE)
	  NODE_NAME_CASE(VFPEXTS)
	  NODE_NAME_CASE(VFPEXTS_SAE)
	  NODE_NAME_CASE(VFPROUND)
	  NODE_NAME_CASE(VMFPROUND)
	  NODE_NAME_CASE(VFPROUND_RND)
	  NODE_NAME_CASE(VFPROUNDS)
	  NODE_NAME_CASE(VFPROUNDS_RND)
	  NODE_NAME_CASE(VSHLDQ)
	  NODE_NAME_CASE(VSRLDQ)
	  NODE_NAME_CASE(VSHL)
	  NODE_NAME_CASE(VSRL)
	  NODE_NAME_CASE(VSRA)
	  NODE_NAME_CASE(VSHLI)
	  NODE_NAME_CASE(VSRLI)
	  NODE_NAME_CASE(VSRAI)
	  NODE_NAME_CASE(VSHLV)
	  NODE_NAME_CASE(VSRLV)
	  NODE_NAME_CASE(VSRAV)
	  NODE_NAME_CASE(VROTLI)
	  NODE_NAME_CASE(VROTRI)
	  NODE_NAME_CASE(VPPERM)
	  NODE_NAME_CASE(CMPP)
	  NODE_NAME_CASE(PCMPEQ)
	  NODE_NAME_CASE(PCMPGT)
	  NODE_NAME_CASE(PHMINPOS)
	  NODE_NAME_CASE(ADD)
	  NODE_NAME_CASE(SUB)
	  NODE_NAME_CASE(ADC)
	  NODE_NAME_CASE(SBB)
	  NODE_NAME_CASE(SMUL)
	  NODE_NAME_CASE(UMUL)
	  NODE_NAME_CASE(OR)
	  NODE_NAME_CASE(XOR)
	  NODE_NAME_CASE(AND)
	  NODE_NAME_CASE(BEXTR)
	  NODE_NAME_CASE(BZHI)
	  NODE_NAME_CASE(MUL_IMM)
	  NODE_NAME_CASE(MOVMSK)
	  NODE_NAME_CASE(PTEST)
	  NODE_NAME_CASE(TESTP)
	  NODE_NAME_CASE(KORTEST)
	  NODE_NAME_CASE(KTEST)
	  NODE_NAME_CASE(KADD)
	  NODE_NAME_CASE(KSHIFTL)
	  NODE_NAME_CASE(KSHIFTR)
	  NODE_NAME_CASE(PACKSS)
	  NODE_NAME_CASE(PACKUS)
	  NODE_NAME_CASE(PALIGNR)
	  NODE_NAME_CASE(VALIGN)
	  NODE_NAME_CASE(VSHLD)
	  NODE_NAME_CASE(VSHRD)
	  NODE_NAME_CASE(VSHLDV)
	  NODE_NAME_CASE(VSHRDV)
	  NODE_NAME_CASE(PSHUFD)
	  NODE_NAME_CASE(PSHUFHW)
	  NODE_NAME_CASE(PSHUFLW)
	  NODE_NAME_CASE(SHUFP)
	  NODE_NAME_CASE(SHUF128)
	  NODE_NAME_CASE(MOVLHPS)
	  NODE_NAME_CASE(MOVHLPS)
	  NODE_NAME_CASE(MOVDDUP)
	  NODE_NAME_CASE(MOVSHDUP)
	  NODE_NAME_CASE(MOVSLDUP)
	  NODE_NAME_CASE(MOVSD)
	  NODE_NAME_CASE(MOVSS)
	  NODE_NAME_CASE(UNPCKL)
	  NODE_NAME_CASE(UNPCKH)
	  NODE_NAME_CASE(VBROADCAST)
	  NODE_NAME_CASE(VBROADCASTM)
	  NODE_NAME_CASE(SUBV_BROADCAST)
	  NODE_NAME_CASE(VPERMILPV)
	  NODE_NAME_CASE(VPERMILPI)
	  NODE_NAME_CASE(VPERM2X128)
	  NODE_NAME_CASE(VPERMV)
	  NODE_NAME_CASE(VPERMV3)
	  NODE_NAME_CASE(VPERMI)
	  NODE_NAME_CASE(VPTERNLOG)
	  NODE_NAME_CASE(VFIXUPIMM)
	  NODE_NAME_CASE(VFIXUPIMM_SAE)
	  NODE_NAME_CASE(VFIXUPIMMS)
	  NODE_NAME_CASE(VFIXUPIMMS_SAE)
	  NODE_NAME_CASE(VRANGE)
	  NODE_NAME_CASE(VRANGE_SAE)
	  NODE_NAME_CASE(VRANGES)
	  NODE_NAME_CASE(VRANGES_SAE)
	  NODE_NAME_CASE(PMULUDQ)
	  NODE_NAME_CASE(PMULDQ)
	  NODE_NAME_CASE(PSADBW)
	  NODE_NAME_CASE(DBPSADBW)
	  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
	  NODE_NAME_CASE(VAARG_64)
	  NODE_NAME_CASE(WIN_ALLOCA)
	  NODE_NAME_CASE(MEMBARRIER)
	  NODE_NAME_CASE(MFENCE)
	  NODE_NAME_CASE(SEG_ALLOCA)
	  NODE_NAME_CASE(SAHF)
	  NODE_NAME_CASE(RDRAND)
	  NODE_NAME_CASE(RDSEED)
	  NODE_NAME_CASE(RDPKRU)
	  NODE_NAME_CASE(WRPKRU)
	  NODE_NAME_CASE(VPMADDUBSW)
	  NODE_NAME_CASE(VPMADDWD)
	  NODE_NAME_CASE(VPSHA)
	  NODE_NAME_CASE(VPSHL)
	  NODE_NAME_CASE(VPCOM)
	  NODE_NAME_CASE(VPCOMU)
	  NODE_NAME_CASE(VPERMIL2)
	  NODE_NAME_CASE(FMSUB)
	  NODE_NAME_CASE(FNMADD)
	  NODE_NAME_CASE(FNMSUB)
	  NODE_NAME_CASE(FMADDSUB)
	  NODE_NAME_CASE(FMSUBADD)
	  NODE_NAME_CASE(FMADD_RND)
	  NODE_NAME_CASE(FNMADD_RND)
	  NODE_NAME_CASE(FMSUB_RND)
	  NODE_NAME_CASE(FNMSUB_RND)
	  NODE_NAME_CASE(FMADDSUB_RND)
	  NODE_NAME_CASE(FMSUBADD_RND)
	  NODE_NAME_CASE(VPMADD52H)
	  NODE_NAME_CASE(VPMADD52L)
	  NODE_NAME_CASE(VRNDSCALE)
	  NODE_NAME_CASE(VRNDSCALE_SAE)
	  NODE_NAME_CASE(VRNDSCALES)
	  NODE_NAME_CASE(VRNDSCALES_SAE)
	  NODE_NAME_CASE(VREDUCE)
	  NODE_NAME_CASE(VREDUCE_SAE)
	  NODE_NAME_CASE(VREDUCES)
	  NODE_NAME_CASE(VREDUCES_SAE)
	  NODE_NAME_CASE(VGETMANT)
	  NODE_NAME_CASE(VGETMANT_SAE)
	  NODE_NAME_CASE(VGETMANTS)
	  NODE_NAME_CASE(VGETMANTS_SAE)
	  NODE_NAME_CASE(PCMPESTR)
	  NODE_NAME_CASE(PCMPISTR)
	  NODE_NAME_CASE(XTEST)
	  NODE_NAME_CASE(COMPRESS)
	  NODE_NAME_CASE(EXPAND)
	  NODE_NAME_CASE(SELECTS)
	  NODE_NAME_CASE(ADDSUB)
	  NODE_NAME_CASE(RCP14)
	  NODE_NAME_CASE(RCP14S)
	  NODE_NAME_CASE(RCP28)
	  NODE_NAME_CASE(RCP28_SAE)
	  NODE_NAME_CASE(RCP28S)
	  NODE_NAME_CASE(RCP28S_SAE)
	  NODE_NAME_CASE(EXP2)
	  NODE_NAME_CASE(EXP2_SAE)
	  NODE_NAME_CASE(RSQRT14)
	  NODE_NAME_CASE(RSQRT14S)
	  NODE_NAME_CASE(RSQRT28)
	  NODE_NAME_CASE(RSQRT28_SAE)
	  NODE_NAME_CASE(RSQRT28S)
	  NODE_NAME_CASE(RSQRT28S_SAE)
	  NODE_NAME_CASE(FADD_RND)
	  NODE_NAME_CASE(FADDS)
	  NODE_NAME_CASE(FADDS_RND)
	  NODE_NAME_CASE(FSUB_RND)
	  NODE_NAME_CASE(FSUBS)
	  NODE_NAME_CASE(FSUBS_RND)
	  NODE_NAME_CASE(FMUL_RND)
	  NODE_NAME_CASE(FMULS)
	  NODE_NAME_CASE(FMULS_RND)
	  NODE_NAME_CASE(FDIV_RND)
	  NODE_NAME_CASE(FDIVS)
	  NODE_NAME_CASE(FDIVS_RND)
	  NODE_NAME_CASE(FSQRT_RND)
	  NODE_NAME_CASE(FSQRTS)
	  NODE_NAME_CASE(FSQRTS_RND)
	  NODE_NAME_CASE(FGETEXP)
	  NODE_NAME_CASE(FGETEXP_SAE)
	  NODE_NAME_CASE(FGETEXPS)
	  NODE_NAME_CASE(FGETEXPS_SAE)
	  NODE_NAME_CASE(SCALEF)
	  NODE_NAME_CASE(SCALEF_RND)
	  NODE_NAME_CASE(SCALEFS)
	  NODE_NAME_CASE(SCALEFS_RND)
	  NODE_NAME_CASE(AVG)
	  NODE_NAME_CASE(MULHRS)
	  NODE_NAME_CASE(SINT_TO_FP_RND)
	  NODE_NAME_CASE(UINT_TO_FP_RND)
	  NODE_NAME_CASE(CVTTP2SI)
	  NODE_NAME_CASE(CVTTP2UI)
	  NODE_NAME_CASE(MCVTTP2SI)
	  NODE_NAME_CASE(MCVTTP2UI)
	  NODE_NAME_CASE(CVTTP2SI_SAE)
	  NODE_NAME_CASE(CVTTP2UI_SAE)
	  NODE_NAME_CASE(CVTTS2SI)
	  NODE_NAME_CASE(CVTTS2UI)
	  NODE_NAME_CASE(CVTTS2SI_SAE)
	  NODE_NAME_CASE(CVTTS2UI_SAE)
	  NODE_NAME_CASE(CVTSI2P)
	  NODE_NAME_CASE(CVTUI2P)
	  NODE_NAME_CASE(MCVTSI2P)
	  NODE_NAME_CASE(MCVTUI2P)
	  NODE_NAME_CASE(VFPCLASS)
	  NODE_NAME_CASE(VFPCLASSS)
	  NODE_NAME_CASE(MULTISHIFT)
	  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
	  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
	  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
	  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
	  NODE_NAME_CASE(CVTPS2PH)
	  NODE_NAME_CASE(MCVTPS2PH)
	  NODE_NAME_CASE(CVTPH2PS)
	  NODE_NAME_CASE(CVTPH2PS_SAE)
	  NODE_NAME_CASE(CVTP2SI)
	  NODE_NAME_CASE(CVTP2UI)
	  NODE_NAME_CASE(MCVTP2SI)
	  NODE_NAME_CASE(MCVTP2UI)
	  NODE_NAME_CASE(CVTP2SI_RND)
	  NODE_NAME_CASE(CVTP2UI_RND)
	  NODE_NAME_CASE(CVTS2SI)
	  NODE_NAME_CASE(CVTS2UI)
	  NODE_NAME_CASE(CVTS2SI_RND)
	  NODE_NAME_CASE(CVTS2UI_RND)
	  NODE_NAME_CASE(CVTNE2PS2BF16)
	  NODE_NAME_CASE(CVTNEPS2BF16)
	  NODE_NAME_CASE(MCVTNEPS2BF16)
	  NODE_NAME_CASE(DPBF16PS)
	  NODE_NAME_CASE(LWPINS)
	  NODE_NAME_CASE(MGATHER)
	  NODE_NAME_CASE(MSCATTER)
	  NODE_NAME_CASE(VPDPBUSD)
	  NODE_NAME_CASE(VPDPBUSDS)
	  NODE_NAME_CASE(VPDPWSSD)
	  NODE_NAME_CASE(VPDPWSSDS)
	  NODE_NAME_CASE(VPSHUFBITQMB)
	  NODE_NAME_CASE(GF2P8MULB)
	  NODE_NAME_CASE(GF2P8AFFINEQB)
	  NODE_NAME_CASE(GF2P8AFFINEINVQB)
	  NODE_NAME_CASE(NT_CALL)
	  NODE_NAME_CASE(NT_BRIND)
	  NODE_NAME_CASE(UMWAIT)
	  NODE_NAME_CASE(TPAUSE)
	  // These two previously printed as "X86ISD:ENQCMD[S]" (single colon).
	  NODE_NAME_CASE(ENQCMD)
	  NODE_NAME_CASE(ENQCMDS)
	  NODE_NAME_CASE(VP2INTERSECT)
	  }
	#undef NODE_NAME_CASE
	  // FIRST_NUMBER and any unlisted opcode have no name.
	  return nullptr;
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS,
	Instruction *I) const {
	// X86 supports extremely general addressing modes.
	CodeModel::Model M = getTargetMachine().getCodeModel();

	// X86 allows a sign-extended 32-bit immediate field as a displacement.
	if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	return false;

	if (AM.BaseGV) {
	unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	// If a reference to this global requires an extra load, we can't fold it.
	if (isGlobalStubReference(GVFlags))
	return false;

	// If BaseGV requires a register for the PIC base, we cannot also have a
	// BaseReg specified.
	if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	return false;

	// If lower 4G is not available, then we must use rip-relative addressing.
	if ((M != CodeModel::Small \|\| isPositionIndependent()) &&
	Subtarget.is64Bit() && (AM.BaseOffs \|\| AM.Scale > 1))
	return false;
	}

	switch (AM.Scale) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
	// These scales always work.
	break;
	case 3:
	case 5:
	case 9:
	// These scales are formed with basereg+scalereg. Only accept if there is
	// no basereg yet.
	if (AM.HasBaseReg)
	return false;
	break;
	default: // Other stuff never works.
	return false;
	}

	return true;
	}

	/// Return true when shifting a vector of type \p Ty by a scalar (uniform)
	/// amount is significantly cheaper than shifting by a fully general vector
	/// of amounts on the current subtarget.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  const unsigned Bits = Ty->getScalarSizeInBits();

	  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
	  // particularly cheaper than those without.
	  if (Bits == 8)
	    return false;

	  // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. The 8-bit element
	  // case has already been rejected above, so only the wider ones are tested.
	  if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
	      (Bits == 16 || Bits == 32 || Bits == 64))
	    return false;

	  // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
	  // shifts just as cheap as scalar ones.
	  if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
	    return false;

	  // AVX512BW has shifts such as vpsllvw.
	  if (Subtarget.hasBWI() && Bits == 16)
	    return false;

	  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
	  // fully general vector.
	  return true;
	}

	/// Report whether \p Opcode names a binary operation. A handful of X86ISD
	/// opcodes are recognised here; everything else is answered by
	/// TargetLoweringBase::isBinOp.
	bool X86TargetLowering::isBinOp(unsigned Opcode) const {
	  // These are non-commutative binops.
	  // TODO: Add more X86ISD opcodes once we have test coverage.
	  const bool IsX86NonCommutativeBinOp =
	      Opcode == X86ISD::ANDNP || Opcode == X86ISD::PCMPGT ||
	      Opcode == X86ISD::FMAX || Opcode == X86ISD::FMIN ||
	      Opcode == X86ISD::FANDN;

	  // The base-class query is only made when no X86 opcode matched.
	  return IsX86NonCommutativeBinOp || TargetLoweringBase::isBinOp(Opcode);
	}

	/// Report whether \p Opcode names a commutative binary operation. The X86ISD
	/// opcodes listed below are accepted directly; any other opcode is deferred
	/// to TargetLoweringBase::isCommutativeBinOp.
	bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
	  switch (Opcode) {
	  // TODO: Add more X86ISD opcodes once we have test coverage.
	  case X86ISD::PCMPEQ:
	  case X86ISD::PMULDQ:
	  case X86ISD::PMULUDQ:
	  case X86ISD::FMAXC:
	  case X86ISD::FMINC:
	  case X86ISD::FAND:
	  case X86ISD::FOR:
	  case X86ISD::FXOR:
	    return true;
	  default:
	    return TargetLoweringBase::isCommutativeBinOp(Opcode);
	  }
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// An integer-compare immediate is legal when \p Imm is representable as a
	/// signed 32-bit value.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsInSigned32 = isInt<32>(Imm);
	  return FitsInSigned32;
	}

	/// An add immediate is legal when \p Imm is representable as a signed 32-bit
	/// value.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  // Can also use sub to handle negated immediates.
	  const bool FitsInSigned32 = isInt<32>(Imm);
	  return FitsInSigned32;
	}

	/// A store immediate is legal when \p Imm passes isInt<32>.
	bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// Return true when truncating \p VT1 to \p VT2 is reported as free: both
	/// must be integer value types and \p VT1 must be strictly wider than \p VT2.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  // Only integer-to-integer truncation is considered.
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;

	  unsigned SrcBits = VT1.getSizeInBits();
	  unsigned DstBits = VT2.getSizeInBits();
	  return SrcBits > DstBits;
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Return true when zero-extending \p VT1 to \p VT2 is free. This is only
	/// reported for i32 -> i64 on a 64-bit subtarget.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  // Anything other than i32 -> i64 is never reported as free here.
	  if (VT1 != MVT::i32 || VT2 != MVT::i64)
	    return false;

	  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	  return Subtarget.is64Bit();
	}

	/// Return true when zero-extending the value \p Val to \p VT2 is free.
	/// Besides the type-only rule (see the EVT overload), a load of a simple
	/// i8/i16/i32 value being extended to a simple integer type is also free.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  EVT VT1 = Val.getValueType();
	  if (isZExtFree(VT1, VT2))
	    return true;

	  // The remaining free case only applies to loads.
	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  if (!VT1.isSimple() || !VT1.isInteger() ||
	      !VT2.isSimple() || !VT2.isInteger())
	    return false;

	  switch (VT1.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    // X86 has 8, 16, and 32-bit zero-extending loads.
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true when folding the extension \p ExtVal into a vector load is
	/// desirable. That holds for every source element type except i1.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
	  const EVT SourceVT = ExtVal.getOperand(0).getValueType();

	  // There is no extending load for vXi1.
	  return !(SourceVT.getScalarType() == MVT::i1);
	}

	/// Return true when a fused multiply-add on \p VT is faster than a separate
	/// multiply and add. Requires a subtarget with any FMA support and a scalar
	/// element type of f32 or f64.
	bool
	X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Vectors are judged by their element type.
	  const EVT ElementVT = VT.getScalarType();
	  if (!ElementVT.isSimple())
	    return false;

	  const MVT::SimpleValueType ElementTy = ElementVT.getSimpleVT().SimpleTy;
	  return ElementTy == MVT::f32 || ElementTy == MVT::f64;
	}

	/// Return true when narrowing an operation from \p VT1 to \p VT2 is
	/// profitable. Every combination is accepted except i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
	  if (IsI32ToI16)
	    return false;
	  return true;
	}

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks.
	/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
	/// are assumed to be legal.
	///
	/// On X86 the mask itself is not inspected: the answer depends only on \p VT,
	/// which must be a simple, legal vector type that is neither an i1 vector nor
	/// 64 bits wide.
	bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  if (!VT.isSimple())
	    return false;

	  const MVT SimpleVT = VT.getSimpleVT();

	  // Not for i1 vectors
	  const bool IsMaskVector = SimpleVT.getScalarType() == MVT::i1;
	  // Very little shuffling can be done for 64-bit vectors right now.
	  const bool Is64BitVector = SimpleVT.getSizeInBits() == 64;
	  if (IsMaskVector || Is64BitVector)
	    return false;

	  // We only care that the types being shuffled are legal. The lowering can
	  // handle any possible shuffle mask that results.
	  return isTypeLegal(SimpleVT);
	}

	/// Return true when the clear-mask shuffle described by \p Mask on type
	/// \p VT is legal. v32i8 and v16i16 are rejected without AVX2; everything
	/// else follows isShuffleMaskLegal.
	bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
	                                               EVT VT) const {
	  // Don't convert an 'and' into a shuffle that we don't directly support.
	  // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
	  if (!Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16))
	    return false;

	  // Just delegate to the generic legality, clear masks aren't special.
	  return isShuffleMaskLegal(Mask, VT);
	}

	/// Return true when jump tables may be generated for \p Fn. They are never
	/// allowed when the subtarget uses retpolines for indirect branches;
	/// otherwise the generic TargetLowering answer is used.
	bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
	  // If the subtarget is using retpolines, we need to not generate jump tables.
	  // Otherwise, fallback on the generic logic.
	  return !Subtarget.useRetpolineIndirectBranches() &&
	         TargetLowering::areJTsAllowed(Fn);
	}

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// Expands the pseudo \p MI (operand 0 is the result register) into a small
	/// CFG: XBEGIN appended to the original block, a block that materializes -1
	/// on the fallthrough path, a block that copies EAX after XABORT_DEF on the
	/// abort path, and a join block that merges both values with a PHI.
	/// \p MI is erased.
	///
	/// \returns the join block, which also receives every instruction that
	/// followed \p MI in \p MBB together with \p MBB's original successor edges.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  // The new blocks are laid out right after the original one, in this order.
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // One fresh virtual register per incoming PHI value, in the same register
	  // class as the pseudo's result.
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}



	/// Expand the VAARG_64 pseudo \p MI into machine instructions that compute
	/// the address of the next variadic argument and update the va_list fields.
	///
	/// When ArgMode is 0 the argument is always taken from the overflow area and
	/// no control flow is added. Otherwise the block is split: the gp_offset or
	/// fp_offset field is compared against a bound, the argument address comes
	/// from either the register save area or the overflow area, and a PHI in the
	/// join block selects the result. \p MI is erased.
	///
	/// \returns the block in which lowering continues: the new join block, or
	/// \p MBB itself when no branch was created.
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	// 0 ) Output : destination address (reg)
	// 1-5) Input : va_list address (addr, i64mem)
	// 6 ) ArgSize : Size (in bytes) of vararg type
	// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	// 8 ) Align : Alignment of type
	// 9 ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	unsigned DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	unsigned Align = MI.getOperand(8).getImm();

	MachineFunction *MF = MBB->getParent();

	// Memory Reference
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");

	MachineMemOperand *OldMMO = MI.memoperands().front();

	// Clone the MMO into two separate MMOs for loading and storing
	MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
	OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
	MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
	OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	//   i32 gp_offset
	//   i32 fp_offset
	//   i64 overflow_area (address)
	//   i64 reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8
	//
	// The displacements used below follow this layout: gp_offset at +0,
	// fp_offset at +4, overflow_area at +8 and reg_save_area at +16.

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound for the offset field in use: 8 bytes per integer register,
	// plus 16 bytes per XMM register when fp_offset is the field being used.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	bool NeedsAlign = (Align > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow.
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//         thisMBB
	//         |     .
	//         |        .
	//     offsetMBB   overflowMBB
	//         |        .
	//         |     .
	//        endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// Check if there is enough room left to pull this argument.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
	.addMBB(overflowMBB).addImm(X86::COND_AE);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address.
	unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// Zero-extend the offset
	unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument
	// (advance by 16 when using fp_offset, by 8 when using gp_offset).
	unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(StoreOnlyMMO);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//
	// Note: when no branch was created, overflowMBB is thisMBB, so these
	// instructions are appended to the original block.

	// Load the overflow_area address into a register.
	unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(LoadOnlyMMO);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
	unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Align-1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Align-1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(StoreOnlyMMO);

	// If we branched, emit the PHI to the front of endMBB.
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Expand the pseudo \p MI that saves the XMM argument registers for
	/// va_start.
	///
	/// Operand 0 is the count register, operand 1 the register-save frame index,
	/// operand 2 the offset of the FP area within that slot, and operands 3 up to
	/// (but excluding) the last one are the registers to store. \p MBB is split
	/// into the original block, a block holding the stores, and an end block.
	/// Unless the function uses the Win64 calling convention, a test of the count
	/// register branches around the stores when it is zero. \p MI is erased.
	///
	/// \returns the end block, which receives the instructions that followed
	/// \p MI and the original successor edges.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  unsigned CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers.
	  // Each register gets its own 16-byte slot starting at VarArgsFPOffset.
	  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, /*Align=*/16);
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// ISel may leave the EFLAGS operand of SelectItr without a kill marker when
	// EFLAGS had several uses and it could not tell which one was last. Decide
	// whether SelectItr is in fact the last user; if so, add the kill marker.
	// Returns true when EFLAGS is killed by SelectItr, false when it is still
	// read later in BB or is live into one of BB's successors.
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock* BB,
	                                     const TargetRegisterInfo* TRI) {
	  // Walk the instructions after SelectItr until EFLAGS is read or redefined.
	  const MachineBasicBlock::iterator BlockEnd = BB->end();
	  MachineBasicBlock::iterator Cursor = std::next(SelectItr);
	  bool RedefinedInBlock = false;
	  while (Cursor != BlockEnd) {
	    if (Cursor->readsRegister(X86::EFLAGS))
	      return false; // A later reader exists, so no kill here.
	    if (Cursor->definesRegister(X86::EFLAGS)) {
	      RedefinedInBlock = true; // Should have kill-flag - update below.
	      break;
	    }
	    ++Cursor;
	  }

	  // If we ran off the end of the block, EFLAGS may still be live into a
	  // successor.
	  if (!RedefinedInBlock) {
	    for (MachineBasicBlock *Succ : BB->successors())
	      if (Succ->isLiveIn(X86::EFLAGS))
	        return false;
	  }

	  // Either a def was found, or the block ended and EFLAGS is not live out:
	  // the select is the last user and gets the kill flag.
	  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
	  return true;
	}

	// Tell whether MI is one of the CMOV pseudo-opcodes that may be grouped with
	// neighbouring CMOV pseudos and lowered together into a single basic block
	// guarded by one conditional jump.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  switch (MI.getOpcode()) {
	  default:
	    return false;

	  // Scalar FP and general-purpose register classes.
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // Vector register classes.
	  case X86::CMOV_VR128:
	  case X86::CMOV_VR128X:
	  case X86::CMOV_VR256:
	  case X86::CMOV_VR256X:
	  case X86::CMOV_VR512:
	  // Mask register classes.
	  case X86::CMOV_VK2:
	  case X86::CMOV_VK4:
	  case X86::CMOV_VK8:
	  case X86::CMOV_VK16:
	  case X86::CMOV_VK32:
	  case X86::CMOV_VK64:
	    return true;
	  }
	}

	// Helper function, which inserts PHI functions into SinkMBB:
	//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
	// in the [MIItBegin, MIItEnd) range. The condition code of the first CMOV is
	// the reference; a CMOV with the opposite condition has its operands swapped.
	// Returns the MachineInstrBuilder of the last PHI inserted.
	static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
	    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
	    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
	    MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

	  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    unsigned DestReg = MIIt->getOperand(0).getReg();
	    unsigned Op1Reg = MIIt->getOperand(1).getReg();
	    unsigned Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    // An operand defined by an earlier PHI of this run is replaced by the
	    // value that PHI receives along the matching edge (one lookup each).
	    auto Op1Entry = RegRewriteTable.find(Op1Reg);
	    if (Op1Entry != RegRewriteTable.end())
	      Op1Reg = Op1Entry->second.first;

	    auto Op2Entry = RegRewriteTable.find(Op2Reg);
	    if (Op2Entry != RegRewriteTable.end())
	      Op2Reg = Op2Entry->second.second;

	    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
	              .addReg(Op1Reg)
	              .addMBB(FalseMBB)
	              .addReg(Op2Reg)
	              .addMBB(TrueMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  return MIB;
	}

	// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
	//
	// Both CMOV pseudos are replaced by two successive conditional branches to a
	// common sink block, a three-input PHI that defines FirstCMOV's result, and a
	// COPY of that result into SecondCascadedCMOV's destination register. Both
	// pseudos are erased. Returns the sink block, which receives the rest of
	// ThisMBB and its successor edges.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
	MachineInstr &SecondCascadedCMOV,
	MachineBasicBlock *ThisMBB) const {
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = FirstCMOV.getDebugLoc();

	// We lower cascaded CMOVs such as
	//
	//   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
	//
	// to two successive branches.
	//
	// Without this, we would add a PHI between the two jumps, which ends up
	// creating a few copies all around. For instance, for
	//
	//    (sitofp (zext (fcmp une)))
	//
	// we would generate:
	//
	//         ucomiss %xmm1, %xmm0
	//         movss  <1.0f>, %xmm0
	//         movaps  %xmm0, %xmm1
	//         jne     .LBB5_2
	//         xorps   %xmm1, %xmm1
	// .LBB5_2:
	//         jp      .LBB5_4
	//         movaps  %xmm1, %xmm0
	// .LBB5_4:
	//         retq
	//
	// because this custom-inserter would have generated:
	//
	//   A
	//   | \
	//   |  B
	//   | /
	//   C
	//   | \
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// B: empty
	// C: Z = PHI [X, A], [Y, B]
	// D: empty
	// E: PHI [X, C], [Z, D]
	//
	// If we lower both CMOVs in a single step, we can instead generate:
	//
	//   A
	//   | \
	//   |  C
	//   | /|
	//   |/ |
	//   |  |
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// D: empty
	// E: PHI [X, A], [X, C], [Y, D]
	//
	// Which, in our sitofp/fcmp example, gives us something like:
	//
	//         ucomiss %xmm1, %xmm0
	//         movss  <1.0f>, %xmm0
	//         jne     .LBB5_4
	//         jp      .LBB5_4
	//         xorps   %xmm0, %xmm0
	// .LBB5_4:
	//         retq
	//

	// We lower cascaded CMOV into two successive branches to the same block.
	// EFLAGS is used by both, so mark it as live in the second.
	const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	MachineFunction *F = ThisMBB->getParent();
	MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator It = ++ThisMBB->getIterator();
	F->insert(It, FirstInsertedMBB);
	F->insert(It, SecondInsertedMBB);
	F->insert(It, SinkMBB);

	// For a cascaded CMOV, we lower it to two successive branches to
	// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
	// the FirstInsertedMBB.
	FirstInsertedMBB->addLiveIn(X86::EFLAGS);

	// If the EFLAGS register isn't dead in the terminator, then claim that it's
	// live into the sink and copy blocks.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
	!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
	SecondInsertedMBB->addLiveIn(X86::EFLAGS);
	SinkMBB->addLiveIn(X86::EFLAGS);
	}

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	// The range starts right after FirstCMOV, so SecondCascadedCMOV moves to
	// SinkMBB as well; it is erased at the end of this function.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(FirstCMOV)),
	ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Fallthrough block for ThisMBB.
	ThisMBB->addSuccessor(FirstInsertedMBB);
	// The true block target of the first branch is always SinkMBB.
	ThisMBB->addSuccessor(SinkMBB);
	// Fallthrough block for FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
	// The true block for the branch of FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SinkMBB);
	// This is fallthrough.
	SecondInsertedMBB->addSuccessor(SinkMBB);

	// Create the conditional branch instructions.
	X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
	BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);

	X86::CondCode SecondCC =
	X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
	BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);

	// SinkMBB:
	// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ],
	//               [ %TrueValue, FirstInsertedMBB ]
	// (the third incoming pair is appended just below)
	unsigned DestReg = FirstCMOV.getOperand(0).getReg();
	unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
	unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
	MachineInstrBuilder MIB =
	BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
	.addReg(Op1Reg)
	.addMBB(SecondInsertedMBB)
	.addReg(Op2Reg)
	.addMBB(ThisMBB);

	// The edge from FirstInsertedMBB provides the same incoming value as the
	// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
	MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
	// Copy the PHI result to the register defined by the second CMOV.
	BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
	TII->get(TargetOpcode::COPY),
	SecondCascadedCMOV.getOperand(0).getReg())
	.addReg(FirstCMOV.getOperand(0).getReg());

	// Now remove the CMOVs.
	FirstCMOV.eraseFromParent();
	SecondCascadedCMOV.eraseFromParent();

	return SinkMBB;
	}

	/// Lower the CMOV pseudo \p MI (and any directly following CMOV pseudos that
	/// use the same or the opposite condition) into a conditional branch plus
	/// PHIs in a new sink block.
	///
	/// A cascaded pair of the form (CMOV (CMOV F, T, cc1), T, cc2) is handed to
	/// EmitLoweredCascadedSelect instead. All lowered pseudos are erased.
	///
	/// \returns the sink block, which receives the rest of \p ThisMBB and its
	/// successor edges.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *ThisMBB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern.  The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between and a branch opcode to use.

	  //  ThisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC copy1MBB
	  //   fallthrough --> FalseMBB

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block.  The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2:
	  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
	  // function - EmitLoweredCascadedSelect.

	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineInstr *LastCMOV = &MI;
	  MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition. Skip over
	    // intervening debug insts.
	    // (The first iteration looks at MI itself, so NextMIIt ends up pointing
	    // past the last CMOV of the run.)
	    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      ++NextMIIt;
	      NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
	  }

	  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	  MachineFunction *F = ThisMBB->getParent();
	  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  MachineFunction::iterator It = ++ThisMBB->getIterator();
	  F->insert(It, FalseMBB);
	  F->insert(It, SinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
	    FalseMBB->addLiveIn(X86::EFLAGS);
	    SinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer any debug instructions inside the CMOV sequence to the sunk block.
	  // Next is captured before removal so the walk survives unlinking DbgIt.
	  auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
	  auto DbgIt = MachineBasicBlock::iterator(MI);
	  while (DbgIt != DbgEnd) {
	    auto Next = std::next(DbgIt);
	    if (DbgIt->isDebugInstr())
	      SinkMBB->push_back(DbgIt->removeFromParent());
	    DbgIt = Next;
	  }

	  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	  SinkMBB->splice(SinkMBB->end(), ThisMBB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)),
	                  ThisMBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	  // Fallthrough block for ThisMBB.
	  ThisMBB->addSuccessor(FalseMBB);
	  // The true block target of the first (or only) branch is always a SinkMBB.
	  ThisMBB->addSuccessor(SinkMBB);
	  // Fallthrough block for FalseMBB.
	  FalseMBB->addSuccessor(SinkMBB);

	  // Create the conditional branch instruction.
	  BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

	  //  SinkMBB:
	  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

	  // Now remove the CMOV(s).
	  ThisMBB->erase(MIItBegin, MIItEnd);

	  return SinkMBB;
	}

	/// Expand the segmented-stack dynamic alloca pseudo \p MI.
	///
	/// The block containing \p MI is split. The prospective stack pointer
	/// (current SP minus the requested size) is compared against a slot in the
	/// TLS segment; depending on the result the allocation is served either by
	/// moving the stack pointer or by calling __morestack_allocate_stack_space.
	/// A PHI at the head of the continuation block merges both results into the
	/// pseudo's destination register.
	///
	/// \param [in] MI The pseudo instruction. Operand 0 is the result register,
	///                operand 1 the requested size. It is erased on return.
	/// \param [in] BB The block that contains \p MI.
	/// \return The continuation block that received the rest of \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and displacement of the slot the prospective stack
	  // pointer is compared against.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  // Virtual registers are created in the same order as before so that
	  // register numbering is unchanged.
	  const unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass);
	  const unsigned bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass);
	  const unsigned tmpSPVReg = MRI.createVirtualRegister(AddrRegClass);
	  const unsigned SPLimitVReg = MRI.createVirtualRegister(AddrRegClass);
	  const unsigned sizeVReg = MI.getOperand(1).getReg();
	  const unsigned physSPReg =
	      (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  // Everything after the pseudo, and the outgoing edges, move to continueMBB.
	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
	      .addReg(tmpSPVReg)
	      .addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
	      .addReg(0)
	      .addImm(1)
	      .addReg(0)
	      .addImm(TlsOffset)
	      .addReg(TlsReg)
	      .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::RDI, RegState::Implicit)
	        .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EDI, RegState::Implicit)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is passed on the stack. The 12 bytes subtracted here
	    // plus the 4-byte push add up to the 16 bytes restored after the call.
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg)
	        .addReg(physSPReg)
	        .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg)
	        .addReg(physSPReg)
	        .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	      .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	/// Lower a catchret pseudo. On 32-bit targets a fresh block is placed
	/// between \p BB and the return destination; it holds an EH_RESTORE followed
	/// by a JMP_4 to the original destination, and the catchret is retargeted to
	/// it. On all other targets nothing is changed.
	///
	/// \param [in] MI The catchret instruction; operand 0 is the destination.
	/// \param [in] BB The block containing \p MI.
	/// \return \p BB in every case.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  MachineFunction *Fn = BB->getParent();
	  MachineBasicBlock *ReturnDest = MI.getOperand(0).getMBB();

	  assert(!isAsynchronousEHPersonality(
	             classifyEHPersonality(Fn->getFunction().getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Only 32-bit EH needs to worry about manually restoring stack pointers.
	  if (!Subtarget.is32Bit())
	    return BB;

	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  // C++ EH: create the block that will hold the restore code and place it
	  // right after BB in the function layout.
	  MachineBasicBlock *RestoreBlock =
	      Fn->CreateMachineBasicBlock(BB->getBasicBlock());
	  assert(BB->succ_size() == 1);
	  Fn->insert(std::next(BB->getIterator()), RestoreBlock);

	  // The new block inherits BB's outgoing edge; BB now flows into it and the
	  // catchret points at it instead of the final destination.
	  RestoreBlock->transferSuccessorsAndUpdatePHIs(BB);
	  BB->addSuccessor(RestoreBlock);
	  MI.getOperand(0).setMBB(RestoreBlock);

	  // The block is still empty, so appending yields EH_RESTORE followed by a
	  // normal JMP_4 to the return destination.
	  BuildMI(RestoreBlock, Loc, InstrInfo.get(X86::EH_RESTORE));
	  BuildMI(RestoreBlock, Loc, InstrInfo.get(X86::JMP_4)).addMBB(ReturnDest);
	  return BB;
	}

	/// Lower a catchpad pseudo: emit an EH_RESTORE in front of it when the
	/// function uses an asynchronous (SEH) personality on a 32-bit target, then
	/// erase the pseudo.
	///
	/// \param [in] MI The catchpad pseudo instruction; it is erased.
	/// \param [in] BB The block containing \p MI.
	/// \return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  const Constant *Personality =
	      BB->getParent()->getFunction().getPersonalityFn();
	  const bool UsesSEH =
	      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

	  // Only 32-bit SEH requires special handling for catchpad.
	  if (UsesSEH && Subtarget.is32Bit()) {
	    DebugLoc Loc = MI.getDebugLoc();
	    const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	    BuildMI(*BB, MI, Loc, InstrInfo.get(X86::EH_RESTORE));
	  }

	  MI.eraseFromParent();
	  return BB;
	}

	/// Bracket a TLSADDR instruction with call-frame setup/destroy markers.
	///
	/// TLSADDR is lowered into calls inside MC, so without the two markers
	/// shrink-wrapping may push the prologue/epilogue past it. The sequence
	/// becomes: adjust_stackdown -> TLSADDR -> adjust_stackup.
	///
	/// \param [in] MI The TLSADDR instruction; it is kept in place.
	/// \param [in] BB The block containing \p MI.
	/// \return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  MachineFunction &Fn = *BB->getParent();
	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  // CALLSEQ_START goes immediately before the instruction.
	  const unsigned FrameSetupOpc = InstrInfo.getCallFrameSetupOpcode();
	  MachineInstrBuilder FrameSetup = BuildMI(Fn, Loc, InstrInfo.get(FrameSetupOpc))
	                                       .addImm(0)
	                                       .addImm(0)
	                                       .addImm(0);
	  BB->insert(MachineBasicBlock::iterator(MI), FrameSetup);

	  // CALLSEQ_END goes immediately after it. The original instruction is
	  // deliberately not erased.
	  const unsigned FrameDestroyOpc = InstrInfo.getCallFrameDestroyOpcode();
	  MachineInstrBuilder FrameDestroy =
	      BuildMI(Fn, Loc, InstrInfo.get(FrameDestroyOpc)).addImm(0).addImm(0);
	  BB->insertAfter(MachineBasicBlock::iterator(MI), FrameDestroy);

	  return BB;
	}

	/// Lower the Darwin TLS call pseudo.
	///
	/// The value loaded from the relocation is placed in RDI (x86-64) or EAX
	/// (32-bit) and an indirect call through that register is emitted; the
	/// result is defined in RAX/EAX. The pseudo is erased afterwards.
	///
	/// \param [in] MI The pseudo instruction; operand 3 is the global.
	/// \param [in] BB The block containing \p MI.
	/// \return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  MachineFunction *Fn = BB->getParent();
	  const X86InstrInfo *InstrInfo = Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64 = Subtarget.is64Bit();

	  // Get a register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *PreservedMask =
	      Is64 ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
	           : Subtarget.getRegisterInfo()->getCallPreservedMask(
	                 *Fn, CallingConv::C);

	  // The three variants differ only in opcodes, the register that carries the
	  // loaded pointer, the result register and the base of the load address.
	  const unsigned LoadOpc = Is64 ? X86::MOV64rm : X86::MOV32rm;
	  const unsigned CallOpc = Is64 ? X86::CALL64m : X86::CALL32m;
	  const unsigned PtrReg = Is64 ? X86::RDI : X86::EAX;
	  const unsigned ResultReg = Is64 ? X86::RAX : X86::EAX;

	  // 64-bit: RIP-relative. 32-bit PIC: relative to the global base register.
	  // 32-bit non-PIC: no base register.
	  unsigned BaseReg = 0;
	  if (Is64)
	    BaseReg = X86::RIP;
	  else if (isPositionIndependent())
	    BaseReg = InstrInfo->getGlobalBaseReg(Fn);

	  // Load the pointer from the relocation.
	  BuildMI(*BB, MI, Loc, InstrInfo->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
	                        MI.getOperand(3).getTargetFlags())
	      .addReg(0);

	  // Indirect call through the loaded pointer.
	  MachineInstrBuilder Call = BuildMI(*BB, MI, Loc, InstrInfo->get(CallOpc));
	  addDirectMem(Call, PtrReg);
	  Call.addReg(ResultReg, RegState::ImplicitDefine).addRegMask(PreservedMask);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	/// Map a retpoline pseudo opcode to the direct call / tail-call opcode that
	/// replaces it. Any other opcode is a programming error.
	static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
	  if (RPOpc == X86::RETPOLINE_CALL32)
	    return X86::CALLpcrel32;
	  if (RPOpc == X86::RETPOLINE_CALL64)
	    return X86::CALL64pcrel32;
	  if (RPOpc == X86::RETPOLINE_TCRETURN32)
	    return X86::TCRETURNdi;
	  if (RPOpc == X86::RETPOLINE_TCRETURN64)
	    return X86::TCRETURNdi64;
	  llvm_unreachable("not retpoline opcode");
	}

	/// Return the name of the retpoline thunk to call for scratch register
	/// \p Reg.
	///
	/// When using an external thunk for retpolines, we pick names that match the
	/// names GCC happens to use as well. This helps simplify the implementation
	/// of the thunks for kernels where they have no easy ability to create
	/// aliases and are doing non-trivial configuration of the thunk's body. For
	/// example, the Linux kernel will do boot-time hot patching of the thunk
	/// bodies and cannot easily export aliases of these to loaded modules.
	///
	/// Note that at any point in the future, we may need to change the semantics
	/// of how we implement retpolines and at that time will likely change the
	/// name of the called thunk. Essentially, there is no hard guarantee that
	/// LLVM will generate calls to specific thunks, we merely make a best-effort
	/// attempt to help out kernels and other systems where duplicating the
	/// thunks is costly.
	///
	/// When targeting an internal COMDAT thunk an LLVM-specific name is used.
	static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
	                                      unsigned Reg) {
	  const bool External = Subtarget.useRetpolineExternalThunk();

	  switch (Reg) {
	  case X86::EAX:
	    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	    return External ? "__x86_indirect_thunk_eax" : "__llvm_retpoline_eax";
	  case X86::ECX:
	    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	    return External ? "__x86_indirect_thunk_ecx" : "__llvm_retpoline_ecx";
	  case X86::EDX:
	    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	    return External ? "__x86_indirect_thunk_edx" : "__llvm_retpoline_edx";
	  case X86::EDI:
	    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	    return External ? "__x86_indirect_thunk_edi" : "__llvm_retpoline_edi";
	  case X86::R11:
	    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	    return External ? "__x86_indirect_thunk_r11" : "__llvm_retpoline_r11";
	  }
	  llvm_unreachable("unexpected reg for retpoline");
	}

	/// Rewrite a retpoline pseudo call / tail call into a direct call to the
	/// retpoline thunk of a scratch register that holds the callee.
	///
	/// \param [in] MI The retpoline pseudo; operand 0 is the callee register. It
	///                is rewritten in place, not erased.
	/// \param [in] BB The block containing \p MI.
	/// \return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  // Copy the callee virtual register into a scratch physical register and
	  // call the retpoline thunk that belongs to that register.
	  DebugLoc DL = MI.getDebugLoc();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  unsigned CalleeVReg = MI.getOperand(0).getReg();
	  unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());

	  // Find an available scratch register to hold the callee. On 64-bit, we can
	  // just use R11, but we scan for uses anyway to ensure we don't generate
	  // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
	  // already a register use operand to the call to hold the callee. If none
	  // are available, use EDI instead. EDI is chosen because EBX is the PIC base
	  // register and ESI is the base pointer to realigned stack frames with VLAs.
	  //
	  // Up to four candidates are stored, so the inline capacity is four.
	  SmallVector<unsigned, 4> AvailableRegs;
	  if (Subtarget.is64Bit())
	    AvailableRegs.push_back(X86::R11);
	  else
	    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

	  // Zero out any registers that are already used.
	  for (const auto &MO : MI.operands()) {
	    if (MO.isReg() && MO.isUse())
	      for (unsigned &Reg : AvailableRegs)
	        if (Reg == MO.getReg())
	          Reg = 0;
	  }

	  // Choose the first remaining non-zero available register.
	  unsigned AvailableReg = 0;
	  for (unsigned MaybeReg : AvailableRegs) {
	    if (MaybeReg) {
	      AvailableReg = MaybeReg;
	      break;
	    }
	  }
	  if (!AvailableReg)
	    report_fatal_error("calling convention incompatible with retpoline, no "
	                       "available registers");

	  const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);

	  BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
	      .addReg(CalleeVReg);

	  // Turn the pseudo into a direct call to the thunk symbol and record that it
	  // implicitly reads (and kills) the scratch register.
	  MI.getOperand(0).ChangeToES(Symbol);
	  MI.setDesc(TII->get(Opc));
	  MachineInstrBuilder(*BB->getParent(), &MI)
	      .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
	  return BB;
	}

	/// Save the current shadow stack pointer (SSP) into the setjmp buffer.
	///
	/// A later LongJmp does not return normally: it repairs the stack and takes
	/// an indirect branch using the registers stored in the jump buffer by
	/// SetJmp. With the shadow stack enabled the shadow stack has to be repaired
	/// as well, because some return addresses are skipped. This function stores
	/// the SSP so that emitLongJmpShadowStackFix can repair it later.
	/// \sa emitLongJmpShadowStackFix
	/// \param [in] MI The temporary Machine Instruction for the builtin.
	/// \param [in] MBB The Machine Basic Block that will be modified.
	void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
	                                                 MachineBasicBlock *MBB) const {
	  MachineFunction *Fn = MBB->getParent();
	  MachineRegisterInfo &RegInfo = Fn->getRegInfo();
	  const TargetInstrInfo *InstrInfo = Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();

	  // Memory references carried over to the store emitted below.
	  SmallVector<MachineMemOperand *, 2> MemRefs(MI.memoperands_begin(),
	                                              MI.memoperands_end());

	  MVT PtrVT = getPointerTy(Fn->getDataLayout());
	  const bool Is64BitPtr = (PtrVT == MVT::i64);
	  const TargetRegisterClass *PtrRC = getRegClassFor(PtrVT);

	  // Zero a fresh register by xor-ing it with itself.
	  unsigned ZeroReg = RegInfo.createVirtualRegister(PtrRC);
	  BuildMI(*MBB, MI, Loc,
	          InstrInfo->get(Is64BitPtr ? X86::XOR64rr : X86::XOR32rr))
	      .addDef(ZeroReg)
	      .addReg(ZeroReg, RegState::Undef)
	      .addReg(ZeroReg, RegState::Undef);

	  // Read the current SSP, with the zeroed register as input.
	  unsigned SavedSSPReg = RegInfo.createVirtualRegister(PtrRC);
	  BuildMI(*MBB, MI, Loc,
	          InstrInfo->get(Is64BitPtr ? X86::RDSSPQ : X86::RDSSPD), SavedSSPReg)
	      .addReg(ZeroReg);

	  // Store the SSP into slot 3 of the buffer addressed by MI's memory
	  // operands, which start at operand index 1.
	  MachineInstrBuilder Store = BuildMI(
	      *MBB, MI, Loc, InstrInfo->get(Is64BitPtr ? X86::MOV64mr : X86::MOV32mr));
	  const int64_t SlotOffset = 3 * PtrVT.getStoreSize();
	  const unsigned FirstMemOperand = 1;
	  for (unsigned Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
	    const MachineOperand &AddrPart = MI.getOperand(FirstMemOperand + Idx);
	    if (Idx == X86::AddrDisp)
	      Store.addDisp(AddrPart, SlotOffset);
	    else
	      Store.add(AddrPart);
	  }
	  Store.addReg(SavedSSPReg);
	  Store.setMemRefs(MemRefs);
	}

	/// Expand the SjLj setjmp pseudo \p MI.
	///
	/// The address of a newly created restore block is stored into the jump
	/// buffer at slot 1, followed (when the "cf-protection-return" module flag
	/// is present) by the shadow stack pointer, and an EH_SjLj_Setup is emitted.
	/// The result register is a PHI of 0 (normal path) and 1 (restore path).
	///
	/// \param [in] MI The setjmp pseudo. Operand 0 is the i32 result register,
	///                the following operands form the buffer address. Erased.
	/// \param [in] MBB The block containing \p MI.
	/// \return The sink block that received the remainder of \p MBB.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
	                                           MI.memoperands_end());

	  unsigned DstReg;
	  unsigned MemOpndSlot = 0;

	  unsigned CurOp = 0;

	  DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI; // Only used by the assertion above.
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  // The memory operands start right after the destination register.
	  MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	                .addReg(X86::RIP)
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB)
	                .addReg(0);
	    } else {
	      // getGlobalBaseReg is only available on the X86-specific instruction
	      // info, so view TII through a pointer to that type.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	                .addReg(XII->getGlobalBaseReg(MF))
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	                .addReg(0);
	    }
	  } else
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOs);

	  // With the module flag present, also save the shadow stack pointer.
	  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
	    emitSetJmpShadowStackFix(MI, thisMBB);
	  }

	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	            .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
	    unsigned BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	        .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Fix the shadow stack using the previously saved SSP pointer.
	///
	/// Six new blocks are inserted after \p MBB. Everything from \p MI to the
	/// end of \p MBB, including \p MI itself, is moved into the last of them
	/// (the sink), so the caller continues expanding \p MI there.
	/// \sa emitSetJmpShadowStackFix
	/// \param [in] MI The temporary Machine Instruction for the builtin.
	/// \param [in] MBB The Machine Basic Block that will be modified.
	/// \return The sink MBB that will perform the future indirect branch.
	MachineBasicBlock *
	X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	DebugLoc DL = MI.getDebugLoc();
	MachineFunction *MF = MBB->getParent();
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MF->getRegInfo();

	// Memory Reference
	SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
	MI.memoperands_end());

	MVT PVT = getPointerTy(MF->getDataLayout());
	const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

	// checkSspMBB:
	// xor vreg1, vreg1
	// rdssp vreg1
	// test vreg1, vreg1
	// je sinkMBB # Jump if Shadow Stack is not supported
	// fallMBB:
	// mov buf+24/12(%rip), vreg2
	// sub vreg1, vreg2
	// jbe sinkMBB # No need to fix the Shadow Stack
	// fixShadowMBB:
	// shr 3/2, vreg2
	// incssp vreg2 # fix the SSP according to the lower 8 bits
	// shr 8, vreg2
	// je sinkMBB
	// fixShadowLoopPrepareMBB:
	// shl vreg2
	// mov 128, vreg3
	// fixShadowLoopMBB:
	// incssp vreg3
	// dec vreg2
	// jne fixShadowLoopMBB # Iterate until you finish fixing
	// # the Shadow Stack
	// sinkMBB:

	MachineFunction::iterator I = ++MBB->getIterator();
	const BasicBlock *BB = MBB->getBasicBlock();

	// Create the new blocks and insert them, in layout order, right after MBB.
	MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
	MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
	MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
	MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
	MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	MF->insert(I, checkSspMBB);
	MF->insert(I, fallMBB);
	MF->insert(I, fixShadowMBB);
	MF->insert(I, fixShadowLoopPrepareMBB);
	MF->insert(I, fixShadowLoopMBB);
	MF->insert(I, sinkMBB);

	// Transfer the remainder of BB and its successor edges to sinkMBB.
	// The spliced range starts at MI itself, so MI ends up in sinkMBB.
	sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
	MBB->end());
	sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	MBB->addSuccessor(checkSspMBB);

	// Initialize a register with zero.
	unsigned ZReg = MRI.createVirtualRegister(PtrRC);
	unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
	BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
	.addDef(ZReg)
	.addReg(ZReg, RegState::Undef)
	.addReg(ZReg, RegState::Undef);

	// Read the current SSP Register value to the zeroed register.
	unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
	unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
	BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);

	// Check whether the result of the SSP register is zero and jump directly
	// to the sink.
	unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
	BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
	.addReg(SSPCopyReg)
	.addReg(SSPCopyReg);
	BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
	checkSspMBB->addSuccessor(sinkMBB);
	checkSspMBB->addSuccessor(fallMBB);

	// Reload the previously saved SSP register value.
	// NOTE(review): `SPPOffset` looks like a misspelling of `SSPOffset`; it is
	// the same 3 * store-size slot that emitSetJmpShadowStackFix writes.
	unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
	unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	const int64_t SPPOffset = 3 * PVT.getStoreSize();
	MachineInstrBuilder MIB =
	BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
	// The buffer address is taken from MI's operands starting at index 0.
	for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	const MachineOperand &MO = MI.getOperand(i);
	if (i == X86::AddrDisp)
	MIB.addDisp(MO, SPPOffset);
	else if (MO.isReg()) // Don't add the whole operand, we don't want to
	// preserve kill flags.
	MIB.addReg(MO.getReg());
	else
	MIB.add(MO);
	}
	MIB.setMemRefs(MMOs);

	// Subtract the current SSP from the previous SSP.
	unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
	unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
	BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
	.addReg(PrevSSPReg)
	.addReg(SSPCopyReg);

	// Jump to sink in case PrevSSPReg <= SSPCopyReg.
	BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
	fallMBB->addSuccessor(sinkMBB);
	fallMBB->addSuccessor(fixShadowMBB);

	// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
	unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
	unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
	unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
	BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
	.addReg(SspSubReg)
	.addImm(Offset);

	// Increase SSP when looking only on the lower 8 bits of the delta.
	unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
	BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);

	// Reset the lower 8 bits.
	unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
	BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
	.addReg(SspFirstShrReg)
	.addImm(8);

	// Jump if the result of the shift is zero.
	BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
	fixShadowMBB->addSuccessor(sinkMBB);
	fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);

	// Do a single shift left.
	// The remaining count is in units of 256; doubling it gives the number of
	// loop iterations because each iteration below advances by 128.
	unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
	unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
	BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
	.addReg(SspSecondShrReg);

	// Save the value 128 to a register (will be used next with incssp).
	unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
	unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
	BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
	.addImm(128);
	fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);

	// Since incssp only looks at the lower 8 bits, we might need to do several
	// iterations of incssp until we finish fixing the shadow stack.
	// The counter is a PHI of the initial count (from the prepare block) and the
	// decremented value (from the loop's back edge).
	unsigned DecReg = MRI.createVirtualRegister(PtrRC);
	unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
	BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
	.addReg(SspAfterShlReg)
	.addMBB(fixShadowLoopPrepareMBB)
	.addReg(DecReg)
	.addMBB(fixShadowLoopMBB);

	// Every iteration we increase the SSP by 128.
	BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);

	// Every iteration we decrement the counter by 1.
	unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
	BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);

	// Jump if the counter is not zero yet.
	BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
	fixShadowLoopMBB->addSuccessor(sinkMBB);
	fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);

	return sinkMBB;
	}

	/// Expand the SjLj longjmp pseudo \p MI.
	///
	/// Reloads the frame pointer (buffer slot 0), the target address (slot 1)
	/// and the stack pointer (slot 2) from the buffer addressed by \p MI's
	/// operands, then branches indirectly to the target address. When the
	/// "cf-protection-return" module flag is present the shadow stack is fixed
	/// up first, which moves the expansion into a new block.
	///
	/// \param [in] MI The longjmp pseudo; its first X86::AddrNumOperands
	///                operands form the buffer address. It is erased.
	/// \param [in] MBB The block containing \p MI.
	/// \return The block that ends up holding the expanded sequence.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
	                                           MI.memoperands_end());

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  unsigned SP = RegInfo->getStackRegister();

	  MachineInstrBuilder MIB;

	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  MachineBasicBlock *thisMBB = MBB;

	  // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
	  // The fix-up returns the block MI now lives in.
	  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
	    thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
	  }

	  // Reload FP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if (MO.isReg()) // Don't add the whole operand, we don't want to
	                    // preserve kill flags.
	      MIB.addReg(MO.getReg());
	    else
	      MIB.add(MO);
	  }
	  MIB.setMemRefs(MMOs);

	  // Reload IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MO, LabelOffset);
	    else if (MO.isReg()) // Don't add the whole operand, we don't want to
	                         // preserve kill flags.
	      MIB.addReg(MO.getReg());
	    else
	      MIB.add(MO);
	  }
	  MIB.setMemRefs(MMOs);

	  // Reload SP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), SPOffset);
	    else
	      MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
	                                 // the last instruction of the expansion.
	  }
	  MIB.setMemRefs(MMOs);

	  // Jump
	  BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return thisMBB;
	}

	/// Store the address of \p DispatchBB into the frame object \p FI.
	///
	/// The store is emitted in front of \p MI. With the small code model and no
	/// PIC the block address is stored as an immediate; otherwise it is first
	/// materialised into a virtual register with an LEA.
	///
	/// \param [in] MI Insertion point; also supplies the debug location.
	/// \param [in] MBB The block that receives the new instructions.
	/// \param [in] DispatchBB The block whose address is stored.
	/// \param [in] FI Frame index of the object written to.
	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0;
	  unsigned VR = 0;

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    // Materialise the dispatch block's address in VR.
	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  // NOTE(review): 56 / 36 are presumably the byte offsets of the return-slot
	  // field inside the function context object - confirm against its layout.
	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	/// Expand the SjLj dispatch-setup pseudo \p MI by building the dispatch code
	/// that jumps through a jump table to the function's landing pads.
	///
	/// The expansion:
	///  - groups the function's EH pads by call-site number and lays them out, in
	///    call-site order, as the entries of a new jump table;
	///  - appends three new blocks to the function: DispatchBB (marked as an EH
	///    pad), DispContBB (performs the indexed jump) and TrapBB (reached when
	///    the loaded index is not below the number of table entries);
	///  - stores DispatchBB's address into the function context via
	///    SetupEntryBlockForSjLj;
	///  - replaces the EH-pad successors of every landing-pad predecessor with
	///    DispatchBB, and adds to the last call in each such block an
	///    implicit-def/dead operand for every callee-saved register that is not
	///    already a register operand of that call;
	///  - clears the EH-pad flag on the former landing pads and erases \p MI.
	///
	/// \returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                         MachineBasicBlock *BB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = BB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  int FI = MF->getFrameInfo().getFunctionContextIndex();

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (auto &MBB : *MF) {
	    if (!MBB.isEHPad())
	      continue;

	    // The first non-debug instruction of an EH pad is expected to be its
	    // EH_LABEL; take the landing pad symbol from it.
	    MCSymbol *Sym = nullptr;
	    for (const auto &MI : MBB) {
	      if (MI.isDebugInstr())
	        continue;

	      assert(MI.isEHLabel() && "expected EH_LABEL");
	      Sym = MI.getOperand(0).getMCSymbol();
	      break;
	    }

	    if (!MF->hasCallSiteLandingPad(Sym))
	      continue;

	    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	      CallSiteNumToLPad[CSI].push_back(&MBB);
	      MaxCSNum = std::max(MaxCSNum, CSI);
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock *> LPadList;
	  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());

	  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	    for (auto &LP : CallSiteNumToLPad[CSI]) {
	      LPadList.push_back(LP);
	      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad(true);

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert MBBs.
	  MF->push_back(DispatchBB);
	  MF->push_back(DispContBB);
	  MF->push_back(TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	  // Create the jump table and associated information
	  unsigned JTE = getJumpTableEncoding();
	  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  const X86RegisterInfo &RI = TII->getRegisterInfo();
	  // Add a register mask with no preserved registers. This results in all
	  // registers being marked as clobbered.
	  if (RI.hasBasePointer(*MF)) {
	    // With a base pointer, the instruction carrying the mask is the reload of
	    // the base register from its save slot relative to the frame register.
	    const bool FPIs64Bit =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
	    MFI->setRestoreBasePointer(MF);

	    unsigned FP = RI.getFrameRegister(*MF);
	    unsigned BP = RI.getBaseRegister();
	    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	                 MFI->getRestoreBasePointerOffset())
	        .addRegMask(RI.getNoPreservedMask());
	  } else {
	    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	        .addRegMask(RI.getNoPreservedMask());
	  }

	  // IReg is used as an index in a memory operand and therefore can't be SP
	  unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
	  // Load the 32-bit table index from the function context and trap if it is
	  // not below the number of jump table entries (unsigned compare).
	  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	                    Subtarget.is64Bit() ? 8 : 4);
	  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	      .addReg(IReg)
	      .addImm(LPadList.size());
	  BuildMI(DispatchBB, DL, TII->get(X86::JCC_1))
	      .addMBB(TrapBB)
	      .addImm(X86::COND_AE);

	  if (Subtarget.is64Bit()) {
	    unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
	    unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

	    // leaq .LJTI0_0(%rip), BReg
	    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
	        .addReg(X86::RIP)
	        .addImm(1)
	        .addReg(0)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	    // movzx IReg64, IReg
	    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
	        .addImm(0)
	        .addReg(IReg)
	        .addImm(X86::sub_32bit);

	    switch (JTE) {
	    case MachineJumpTableInfo::EK_BlockAddress:
	      // jmpq *(BReg,IReg64,8)
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
	          .addReg(BReg)
	          .addImm(8)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      break;
	    case MachineJumpTableInfo::EK_LabelDifference32: {
	      unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	      unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
	      unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

	      // movl (BReg,IReg64,4), OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
	          .addReg(BReg)
	          .addImm(4)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      // movsx OReg64, OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
	      // addq BReg, OReg64, TReg
	      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
	          .addReg(OReg64)
	          .addReg(BReg);
	      // jmpq *TReg
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
	      break;
	    }
	    default:
	      llvm_unreachable("Unexpected jump table encoding");
	    }
	  } else {
	    // jmpl *.LJTI0_0(,IReg,4)
	    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
	        .addReg(0)
	        .addImm(4)
	        .addReg(IReg)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	  }

	  // Add the jump table entries as successors to the MBB.
	  // A landing pad may appear several times in the table; add it only once.
	  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	  for (auto &LP : LPadList)
	    if (SeenMBBs.insert(LP).second)
	      DispContBB->addSuccessor(LP);

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  SmallVector<MachineBasicBlock *, 64> MBBLPads;
	  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	  for (MachineBasicBlock *MBB : InvokeBBs) {
	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block.
	    // Keep a copy of Successors since it's modified inside the loop.
	    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	                                                   MBB->succ_rend());
	    // FIXME: Avoid quadratic complexity.
	    for (auto MBBS : Successors) {
	      if (MBBS->isEHPad()) {
	        MBB->removeSuccessor(MBBS);
	        MBBLPads.push_back(MBBS);
	      }
	    }

	    MBB->addSuccessor(DispatchBB);

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled. This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (auto &II : reverse(*MBB)) {
	      if (!II.isCall())
	        continue;

	      // Registers that are already operands of the call are left alone.
	      DenseMap<unsigned, bool> DefRegs;
	      for (auto &MOp : II.operands())
	        if (MOp.isReg())
	          DefRegs[MOp.getReg()] = true;

	      // SavedRegs is a zero-terminated list.
	      MachineInstrBuilder MIB(*MF, &II);
	      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
	        unsigned Reg = SavedRegs[RI];
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      // Only the last call in the block is annotated.
	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads. The dispatch is the only
	  // landing pad now.
	  for (auto &LP : MBBLPads)
	    LP->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the pseudo instruction \p MI, located in \p BB, that requires custom
	/// insertion.
	///
	/// Most opcodes are forwarded to a dedicated Emit*/emit* routine; a few
	/// (RDFLAGS/WRFLAGS, the FP*_TO_INT*_IN_MEM family and the LCMPXCHG variants)
	/// are expanded inline here.  An opcode that is not listed is a programming
	/// error and reaches llvm_unreachable.
	///
	/// \returns the block returned by the selected expansion routine; the inline
	///          expansions return \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::RETPOLINE_CALL32:
	  case X86::RETPOLINE_CALL64:
	  case X86::RETPOLINE_TCRETURN32:
	  case X86::RETPOLINE_TCRETURN64:
	    return EmitLoweredRetpoline(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR32X:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR64X:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_VR128:
	  case X86::CMOV_VR128X:
	  case X86::CMOV_VR256:
	  case X86::CMOV_VR256X:
	  case X86::CMOV_VR512:
	  case X86::CMOV_VK2:
	  case X86::CMOV_VK4:
	  case X86::CMOV_VK8:
	  case X86::CMOV_VK16:
	  case X86::CMOV_VK32:
	  case X86::CMOV_VK64:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    // Read the flags by pushing them and popping the value into the result
	    // register.
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the EFLAGS and DF registers without them being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
	           "Unexpected register in operand!");
	    Push->getOperand(2).setIsUndef();
	    assert(Push->getOperand(3).getReg() == X86::DF &&
	           "Unexpected register in operand!");
	    Push->getOperand(3).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    // Write the flags by pushing the source register and popping it into the
	    // flags register.
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);

	    // Load the old value of the control word...
	    unsigned OldCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
	                      OrigCWFrameIdx);

	    // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
	    unsigned NewCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
	    BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
	        .addReg(OldCW, RegState::Kill).addImm(0xC00);

	    // Extract to 16 bits.
	    unsigned NewCW16 =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
	        .addReg(NewCW, RegState::Kill, X86::sub_16bit);

	    // Prepare memory for FLDCW.
	    int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
	                      NewCWFrameIdx)
	        .addReg(NewCW16, RegState::Kill);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    // Emit the integer store to the pseudo's memory operand; the value
	    // operand follows the address operands.
	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), OrigCWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }

	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process.  We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    return emitXRayCustomEvent(MI, BB);

	  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
	    return emitXRayTypedEvent(MI, BB);

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::iterator MBBI(MI);
	    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
	           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
	      --MBBI;
	    addFullAddress(
	        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    // Make the CMPXCHG8B address operand refer to the precomputed register.
	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    // Ensure EBX/RBX is recorded as live-in to this block.
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Try to replace the constant mask of a scalar ISD::AND with a zero-extend
	/// style mask (low 8, 16, 32, ... bits set), given that only the bits in
	/// \p Demanded of the result are used.
	///
	/// \param Op        Node to inspect; only ISD::AND with a constant RHS and a
	///                  non-vector type is handled.
	/// \param Demanded  Bits of the result that the user actually needs.
	/// \param TLO       Receives the replacement node when one is created.
	///
	/// \returns true when the mask already is the zero-extend mask (so that the
	///          caller leaves it alone), otherwise the result of
	///          TLO.CombineTo when the constant is replaced; false when nothing
	///          applies.
	bool
	X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
	                                                const APInt &Demanded,
	                                                TargetLoweringOpt &TLO) const {
	  // Only optimize Ands to prevent shrinking a constant that could be
	  // matched by movzx.
	  if (Op.getOpcode() != ISD::AND)
	    return false;

	  EVT VT = Op.getValueType();

	  // Ignore vectors.
	  if (VT.isVector())
	    return false;

	  unsigned Size = VT.getSizeInBits();

	  // Make sure the RHS really is a constant.
	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!C)
	    return false;

	  const APInt &Mask = C->getAPIntValue();

	  // Clear all non-demanded bits initially.
	  APInt ShrunkMask = Mask & Demanded;

	  // Find the width of the shrunk mask.
	  unsigned Width = ShrunkMask.getActiveBits();

	  // If the mask is all 0s there's nothing to do here.
	  if (Width == 0)
	    return false;

	  // Find the next power of 2 width, rounding up to a byte.
	  Width = PowerOf2Ceil(std::max(Width, 8U));
	  // Truncate the width to size to handle illegal types.
	  Width = std::min(Width, Size);

	  // Calculate a possible zero extend mask for this constant.
	  APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);

	  // If we aren't changing the mask, just return true to keep it and prevent
	  // the caller from optimizing.
	  if (ZeroExtendMask == Mask)
	    return true;

	  // Make sure the new mask can be represented by a combination of mask bits
	  // and non-demanded bits.
	  if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
	    return false;

	  // Replace the constant with the zero extend mask.
	  SDLoc DL(Op);
	  SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
	  SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
	  return TLO.CombineTo(Op, NewOp);
	}

	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();
	assert((Opc >= ISD::BUILTIN_OP_END \|\|
	Opc == ISD::INTRINSIC_WO_CHAIN \|\|
	Opc == ISD::INTRINSIC_W_CHAIN \|\|
	Opc == ISD::INTRINSIC_VOID) &&
	"Should use MaskedValueIsZero if you don't know whether Op"
	" is a target node!");

	Known.resetAll();
	switch (Opc) {
	default: break;
	case X86ISD::SETCC:
	Known.Zero.setBitsFrom(1);
	break;
	case X86ISD::MOVMSK: {
	unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	Known.Zero.setBitsFrom(NumLoBits);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
	Op.getConstantOperandVal(1));
	Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
	Known = Known.zextOrTrunc(BitWidth, false);
	Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
	break;
	}
	case X86ISD::VSRAI:
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
	Known.setAllZero();
	break;
	}

	Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
	unsigned ShAmt = ShiftImm->getZExtValue();
	if (Opc == X86ISD::VSHLI) {
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// Low bits are known zero.
	Known.Zero.setLowBits(ShAmt);
	} else if (Opc == X86ISD::VSRLI) {
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);
	// High bits are known zero.
	Known.Zero.setHighBits(ShAmt);
	} else {
	Known.Zero.ashrInPlace(ShAmt);
	Known.One.ashrInPlace(ShAmt);
	}
	}
	break;
	}
	case X86ISD::PACKUS: {
	// PACKUS is just a truncation if the upper half is zero.
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	Known.One = APInt::getAllOnesValue(BitWidth * 2);
	Known.Zero = APInt::getAllOnesValue(BitWidth * 2);

	KnownBits Known2;
	if (!!DemandedLHS) {
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	if (!!DemandedRHS) {
	Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}

	if (Known.countMinLeadingZeros() < BitWidth)
	Known.resetAll();
	Known = Known.trunc(BitWidth);
	break;
	}
	case X86ISD::ANDNP: {
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// ANDNP = (~X & Y);
	Known.One &= Known2.Zero;
	Known.Zero \|= Known2.One;
	break;
	}
	case X86ISD::FOR: {
	KnownBits Known2;
	Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
	Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);

	// Output known-0 bits are only known if clear in both the LHS & RHS.
	Known.Zero &= Known2.Zero;
	// Output known-1 are known to be set if set in either the LHS \| RHS.
	Known.One \|= Known2.One;
	break;
	}
	case X86ISD::CMOV: {
	Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	}

	// Handle target shuffles.
	// TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
	if (isTargetShuffle(Opc)) {
	bool IsUnary;
	SmallVector<int, 64> Mask;
	SmallVector<SDValue, 2> Ops;
	if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
	IsUnary)) {
	unsigned NumOps = Ops.size();
	unsigned NumElts = VT.getVectorNumElements();
	if (Mask.size() == NumElts) {
	SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
	Known.Zero.setAllBits(); Known.One.setAllBits();
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!DemandedElts[i])
	continue;
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	// For UNDEF elements, we don't know anything about the common state
	// of the shuffle result.
	Known.resetAll();
	break;
	} else if (M == SM_SentinelZero) {
	Known.One.clearAllBits();
	continue;
	}
	assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
	"Shuffle index out of range");

	unsigned OpIdx = (unsigned)M / NumElts;
	unsigned EltIdx = (unsigned)M % NumElts;
	if (Ops[OpIdx].getValueType() != VT) {
	// TODO - handle target shuffle ops with different value types.
	Known.resetAll();
	break;
	}
	DemandedOps[OpIdx].setBit(EltIdx);
	}
	// Known bits are the values that are shared by every demanded element.
	for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
	if (!DemandedOps[i])
	continue;
	KnownBits Known2 =
	DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	}
	}
	}
	}
	}

	/// Compute the number of sign bits reported for the demanded elements of
	/// target node \p Op.
	///
	/// \param Op            Node to analyse.
	/// \param DemandedElts  Vector elements the caller is interested in.
	/// \param DAG           Used to query the sign bits of the operands.
	/// \param Depth         Current recursion depth; operand queries use
	///                      Depth + 1.
	///
	/// A fixed set of X86ISD opcodes is handled first, then target shuffles whose
	/// mask can be decoded.  Everything else yields the fallback answer 1.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  const EVT VT = Op.getValueType();
	  const unsigned VTBits = VT.getScalarSizeInBits();
	  const unsigned Opcode = Op.getOpcode();

	  switch (Opcode) {
	  default:
	    break;

	  // SETCC_CARRY sets the dest to ~0 for true or 0 for false, and the vector
	  // compares return zero/all-bits result values: every bit is a sign bit.
	  case X86ISD::SETCC_CARRY:
	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    return VTBits;

	  case X86ISD::VTRUNC: {
	    // TODO: Add DemandedElts support.
	    const SDValue Wide = Op.getOperand(0);
	    const unsigned WideBits = Wide.getScalarValueSizeInBits();
	    assert(VTBits < WideBits && "Illegal truncation input type");
	    const unsigned WideSignBits = DAG.ComputeNumSignBits(Wide, Depth + 1);
	    // Sign bits survive only to the extent they exceed the truncated part.
	    const unsigned Dropped = WideBits - VTBits;
	    return WideSignBits > Dropped ? WideSignBits - Dropped : 1;
	  }

	  case X86ISD::PACKSS: {
	    // PACKSS is just a truncation if the sign bits extend to the packed size.
	    APInt LHSElts, RHSElts;
	    getPackDemandedElts(VT, DemandedElts, LHSElts, RHSElts);

	    const unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
	    // An operand with no demanded element does not constrain the result.
	    unsigned LHSSignBits = SrcBits;
	    if (LHSElts.getBoolValue())
	      LHSSignBits =
	          DAG.ComputeNumSignBits(Op.getOperand(0), LHSElts, Depth + 1);
	    unsigned RHSSignBits = SrcBits;
	    if (RHSElts.getBoolValue())
	      RHSSignBits =
	          DAG.ComputeNumSignBits(Op.getOperand(1), RHSElts, Depth + 1);

	    const unsigned Common = std::min(LHSSignBits, RHSSignBits);
	    const unsigned Dropped = SrcBits - VTBits;
	    return Common > Dropped ? Common - Dropped : 1;
	  }

	  case X86ISD::VSHLI: {
	    const APInt &Amt = Op.getConstantOperandAPInt(1);
	    // Shifted all bits out --> zero.
	    if (Amt.uge(VTBits))
	      return VTBits;
	    const unsigned SrcSignBits =
	        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    // Shifted all sign bits out --> unknown.
	    if (Amt.uge(SrcSignBits))
	      return 1;
	    return SrcSignBits - Amt.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    APInt Total = Op.getConstantOperandAPInt(1);
	    // Sign splat.
	    if (Total.uge(VTBits - 1))
	      return VTBits;
	    // Each shifted-in position adds one more copy of the sign bit; the
	    // result is capped at the element width.
	    Total += DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    if (Total.uge(VTBits))
	      return VTBits;
	    return Total.getZExtValue();
	  }

	  case X86ISD::ANDNP: {
	    const unsigned First =
	        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
	    // Early out.
	    if (First == 1)
	      return 1;
	    const unsigned Second =
	        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
	    return std::min(First, Second);
	  }

	  case X86ISD::CMOV: {
	    const unsigned First = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    // Early out.
	    if (First == 1)
	      return 1;
	    const unsigned Second = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    return std::min(First, Second);
	  }
	  }

	  // Handle target shuffles.
	  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
	  if (!isTargetShuffle(Opcode))
	    return 1; // Fallback case.

	  bool IsUnary;
	  SmallVector<int, 64> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
	                            IsUnary))
	    return 1; // Fallback case.

	  const unsigned NumOps = Ops.size();
	  const unsigned NumElts = VT.getVectorNumElements();
	  if (Mask.size() != NumElts)
	    return 1; // Fallback case.

	  // For each shuffle input, collect the elements that feed a demanded lane.
	  SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
	  for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
	    if (!DemandedElts[Lane])
	      continue;

	    const int M = Mask[Lane];
	    // For UNDEF elements, we don't know anything about the common state of
	    // the shuffle result.
	    if (M == SM_SentinelUndef)
	      return 1;
	    // Zero = all sign bits.
	    if (M == SM_SentinelZero)
	      continue;

	    assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
	           "Shuffle index out of range");

	    const unsigned OpIdx = (unsigned)M / NumElts;
	    const unsigned EltIdx = (unsigned)M % NumElts;
	    // TODO - handle target shuffle ops with different value types.
	    if (Ops[OpIdx].getValueType() != VT)
	      return 1;
	    DemandedOps[OpIdx].setBit(EltIdx);
	  }

	  // The answer is the minimum over all referenced inputs; stop as soon as it
	  // cannot get any smaller.
	  unsigned Result = VTBits;
	  for (unsigned Idx = 0; Idx != NumOps && Result > 1; ++Idx) {
	    if (!DemandedOps[Idx])
	      continue;
	    const unsigned InputSignBits =
	        DAG.ComputeNumSignBits(Ops[Idx], DemandedOps[Idx], Depth + 1);
	    Result = std::min(Result, InputSignBits);
	  }
	  return Result;
	}

	/// Strip an X86ISD::Wrapper or X86ISD::WrapperRIP node from \p N.
	///
	/// \returns operand 0 of \p N when \p N is one of the two wrapper opcodes,
	///          otherwise \p N unchanged.
	SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
	  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
	    return N->getOperand(0);
	  return N;
	}

	// Attempt to match a combined shuffle mask against supported unary shuffle
	// instructions.
	//
	// On success returns true and sets Shuffle to the matched opcode and
	// SrcVT/DstVT to the operand and result types to use.  V1 is only modified by
	// the extension match, which may replace it with a subvector extracted from
	// element 0.  Returns false when no pattern matches.
	//
	// Patterns are tried in this order: VZEXT_MOVL on 32-bit elements,
	// ANY/ZERO_EXTEND(_VECTOR_INREG), the general VZEXT_MOVL form, then
	// MOVDDUP/MOVSLDUP/MOVSHDUP for 128-, 256- and 512-bit vectors.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                              bool AllowFloatDomain, bool AllowIntDomain,
	                              SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget, unsigned &Shuffle,
	                              MVT &SrcVT, MVT &DstVT) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

	  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
	  if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
	    Shuffle = X86ISD::VZEXT_MOVL;
	    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
	    return true;
	  }

	  // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
	    // Try extension factors 2, 4, ... up to a 64-bit destination element.
	    unsigned MaxScale = 64 / MaskEltSize;
	    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
	      bool MatchAny = true;
	      bool MatchZero = true;
	      unsigned NumDstElts = NumMaskElts / Scale;
	      for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
	        // The first mask element of each group must select source element i.
	        if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
	          MatchAny = MatchZero = false;
	          break;
	        }
	        // The rest of the group must be undef (any-extend) or undef/zero
	        // (zero-extend).
	        MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
	        MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
	      }
	      if (MatchAny || MatchZero) {
	        assert(MatchZero && "Failed to match zext but matched aext?");
	        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
	        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
	                                            MVT::getIntegerVT(MaskEltSize);
	        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

	        // Narrow the input when the source type is smaller than the mask type.
	        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
	          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

	        Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
	        if (SrcVT.getVectorNumElements() != NumDstElts)
	          Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

	        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
	        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
	        return true;
	      }
	    }
	  }

	  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
	  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
	      isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
	    Shuffle = X86ISD::VZEXT_MOVL;
	    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
	    return true;
	  }

	  // Check if we have SSE3 which will let us use MOVDDUP etc. The
	  // instructions are no slower than UNPCKLPD but has the option to
	  // fold the input operand into even an unaligned memory load.
	  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
	    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v2f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	  }

	  // Same duplication patterns for 256-bit vectors.
	  if (MaskVT.is256BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v4f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	  }

	  // Same duplication patterns for 512-bit vectors.
	  if (MaskVT.is512BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX512() &&
	           "AVX512 required for 512-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v8f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// MaskVT is the vector type the mask is interpreted in and Mask holds one
	// entry per element (an input index, SM_SentinelUndef or SM_SentinelZero).
	// Zeroable is only consulted by the shift matcher. AllowFloatDomain and
	// AllowIntDomain select which instruction domains may be used.
	//
	// Returns true on a match, with Shuffle set to the X86ISD opcode, ShuffleVT to
	// the type the node must be created with, and PermuteImm to its 8-bit
	// immediate. The outputs are only meaningful when true is returned.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                     const APInt &Zeroable,
	                                     bool AllowFloatDomain, bool AllowIntDomain,
	                                     const X86Subtarget &Subtarget,
	                                     unsigned &Shuffle, MVT &ShuffleVT,
	                                     unsigned &PermuteImm) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned InputSizeInBits = MaskVT.getSizeInBits();
	  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

	  // None of the pure permutes below can produce zero elements; only the
	  // shift matcher at the end is allowed to see zero sentinels.
	  bool ContainsZeros =
	      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
	  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	    // Check for lane crossing permutes.
	    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	        Shuffle = X86ISD::VPERMI;
	        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	        PermuteImm = getV4X86ShuffleImm(Mask);
	        return true;
	      }
	      // The 512-bit form applies one immediate to both 256-bit halves, so
	      // the mask has to repeat across them.
	      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	        SmallVector<int, 4> RepeatedMask;
	        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	          Shuffle = X86ISD::VPERMI;
	          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	          return true;
	        }
	      }
	    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
	      // VPERMILPD can permute with a non-repeating shuffle.
	      Shuffle = X86ISD::VPERMILPI;
	      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	      PermuteImm = 0;
	      // One immediate bit per element: it picks the low or high f64 of the
	      // element's own 128-bit lane.
	      for (int i = 0, e = Mask.size(); i != e; ++i) {
	        int M = Mask[i];
	        if (M == SM_SentinelUndef)
	          continue;
	        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	        PermuteImm |= (M & 1) << i;
	      }
	      return true;
	    }
	  }

	  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
	      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Narrow the repeated mask to create 32-bit element permutes.
	      SmallVector<int, 4> WordMask = RepeatedMask;
	      if (MaskScalarSizeInBits == 64)
	        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

	      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	      PermuteImm = getV4X86ShuffleImm(WordMask);
	      return true;
	    }
	  }

	  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	    SmallVector<int, 8> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Build the immediate from the lane-repeated mask rather than from the
	      // first 8 entries of Mask. The one immediate is applied to every
	      // 128-bit lane, and an element that is undef in the first lane may be
	      // defined in a later one; RepeatedMask carries that merged
	      // requirement, the first lane of Mask alone does not.
	      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	      // PSHUFLW: permute lower 4 elements only.
	      if (isUndefOrInRange(LoMask, 0, 4) &&
	          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	        Shuffle = X86ISD::PSHUFLW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(LoMask);
	        return true;
	      }

	      // PSHUFHW: permute upper 4 elements only.
	      if (isUndefOrInRange(HiMask, 4, 8) &&
	          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	        // Offset the HiMask so that we can create the shuffle immediate.
	        // Negative (undef) entries are passed through unchanged.
	        int OffsetHiMask[4];
	        for (int i = 0; i != 4; ++i)
	          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	        Shuffle = X86ISD::PSHUFHW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	        return true;
	      }
	    }
	  }

	  // Attempt to match against byte/bit shifts.
	  // FIXME: Add 512-bit support.
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	    // The shift matcher fills in ShuffleVT and Shuffle itself; a positive
	    // result is the shift amount, which becomes the immediate.
	    int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
	                                       Mask, 0, Zeroable, Subtarget);
	    if (0 < ShiftAmt) {
	      PermuteImm = (unsigned)ShiftAmt;
	      return true;
	    }
	  }

	  return false;
	}

	// Try to express a combined shuffle mask as one of the two-operand X86
	// shuffle nodes that take no immediate. Candidates are tried in this order:
	// MOVLHPS/MOVHLPS (or UNPCKL/UNPCKH on v2f64), MOVSD, MOVSS, PACKSS/PACKUS
	// and finally UNPCKL/UNPCKH.
	//
	// On a match the function returns true with Shuffle set to the opcode,
	// SrcVT/DstVT set to the operand and result types to use, and V1/V2 updated
	// to the operands the node should be given. IsUnary is forwarded to the
	// UNPCK matcher.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                               bool AllowFloatDomain, bool AllowIntDomain,
	                               SDValue &V1, SDValue &V2, const SDLoc &DL,
	                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
	                               unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
	                               bool IsUnary) {
	  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
	  const bool Is128 = MaskVT.is128BitVector();
	  const bool Is256 = MaskVT.is256BitVector();
	  const bool Is512 = MaskVT.is512BitVector();
	  const bool HasSSE2 = Subtarget.hasSSE2();

	  if (Is128) {
	    // Duplicate the low 64 bits into both halves.
	    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
	      V2 = V1;
	      if (Mask[0] == SM_SentinelUndef)
	        V1 = DAG.getUNDEF(MVT::v4f32);
	      if (HasSSE2) {
	        Shuffle = X86ISD::UNPCKL;
	        SrcVT = DstVT = MVT::v2f64;
	      } else {
	        Shuffle = X86ISD::MOVLHPS;
	        SrcVT = DstVT = MVT::v4f32;
	      }
	      return true;
	    }

	    // Duplicate the high 64 bits into both halves.
	    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
	      V2 = V1;
	      if (HasSSE2) {
	        Shuffle = X86ISD::UNPCKH;
	        SrcVT = DstVT = MVT::v2f64;
	      } else {
	        Shuffle = X86ISD::MOVHLPS;
	        SrcVT = DstVT = MVT::v4f32;
	      }
	      return true;
	    }

	    // The scalar-move forms are used when the float domain is allowed, or
	    // when SSE4.1 is not available.
	    const bool FloatOrPreSSE41 = AllowFloatDomain || !Subtarget.hasSSE41();

	    if (isTargetShuffleEquivalent(Mask, {0, 3}) && HasSSE2 &&
	        FloatOrPreSSE41) {
	      std::swap(V1, V2);
	      Shuffle = X86ISD::MOVSD;
	      SrcVT = DstVT = MVT::v2f64;
	      return true;
	    }

	    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && FloatOrPreSSE41) {
	      Shuffle = X86ISD::MOVSS;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	  }

	  // Unary or binary PACKSS/PACKUS: only for i16/i8 element vectors on a
	  // subtarget that supports the corresponding vector width.
	  const bool PackCandidate =
	      ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && HasSSE2) ||
	      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) &&
	       Subtarget.hasInt256()) ||
	      ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI());
	  if (PackCandidate &&
	      matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
	                                 Subtarget)) {
	    DstVT = MaskVT;
	    return true;
	  }

	  // Unary or binary UNPCKL/UNPCKH.
	  const bool UnpckCandidate =
	      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (Is128 && HasSSE2) ||
	      (Is256 && 32 <= ScalarBits && Subtarget.hasAVX()) ||
	      (Is256 && Subtarget.hasAVX2()) || (Is512 && Subtarget.hasAVX512());
	  if (UnpckCandidate &&
	      matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
	                                  DAG, Subtarget)) {
	    SrcVT = DstVT = MaskVT;
	    // 256-bit vectors without AVX2 are always unpacked with a
	    // floating-point type.
	    if (Is256 && !Subtarget.hasAVX2())
	      SrcVT = DstVT = (32 == ScalarBits ? MVT::v8f32 : MVT::v4f64);
	    return true;
	  }

	  return false;
	}

	// Try to express a combined shuffle mask as a two-operand X86 shuffle node
	// that carries an 8-bit immediate. Candidates are tried in this order:
	// PALIGNR, BLENDI, INSERTPS, SHUFPD, SHUFPS.
	//
	// On a match the function returns true with Shuffle, ShuffleVT and
	// PermuteImm describing the node to build, and V1/V2 holding the operands it
	// should be given (either may have been replaced, e.g. by a zero or undef
	// vector). The outputs are only meaningful when true is returned.
	static bool matchBinaryPermuteShuffle(
	    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
	    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
	    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
	    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
	  const unsigned NumElts = Mask.size();
	  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
	  const bool Is128 = MaskVT.is128BitVector();
	  const bool Is256 = MaskVT.is256BitVector();
	  const bool Is512 = MaskVT.is512BitVector();

	  // PALIGNR byte rotate.
	  const bool TryByteRotate =
	      AllowIntDomain &&
	      ((Is128 && Subtarget.hasSSSE3()) || (Is256 && Subtarget.hasAVX2()));
	  if (TryByteRotate) {
	    int Rotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	    if (Rotation > 0) {
	      Shuffle = X86ISD::PALIGNR;
	      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	      PermuteImm = Rotation;
	      return true;
	    }
	  }

	  // X86ISD::BLENDI.
	  const bool TryBlend =
	      (NumElts <= 8 && ((Subtarget.hasSSE41() && Is128) ||
	                        (Subtarget.hasAVX() && Is256))) ||
	      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2());
	  if (TryBlend) {
	    uint64_t BlendBits = 0;
	    bool ZeroV1 = false, ZeroV2 = false;
	    SmallVector<int, 8> BlendShufMask(Mask.begin(), Mask.end());
	    if (matchVectorShuffleAsBlend(V1, V2, BlendShufMask, ZeroV1, ZeroV2,
	                                  BlendBits)) {
	      // Writes the outputs for a successful blend match, replacing any
	      // operand the blend matcher asked to be forced to zero.
	      auto EmitBlend = [&](unsigned Imm) {
	        if (ZeroV1)
	          V1 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	        if (ZeroV2)
	          V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	        PermuteImm = Imm;
	        Shuffle = X86ISD::BLENDI;
	        ShuffleVT = MaskVT;
	      };

	      if (MaskVT != MVT::v16i16) {
	        EmitBlend(static_cast<unsigned>(BlendBits));
	        return true;
	      }

	      // We can only use v16i16 PBLENDW if the lanes are repeated; if they
	      // are not, fall through to the remaining matchers.
	      SmallVector<int, 8> LaneMask;
	      if (isRepeatedTargetShuffleMask(128, MaskVT, BlendShufMask, LaneMask)) {
	        assert(LaneMask.size() == 8 &&
	               "Repeated mask size doesn't match!");
	        // Bit Idx is set when element Idx of the lane comes from the second
	        // operand (index >= 8 in the repeated mask).
	        unsigned Imm = 0;
	        for (unsigned Idx = 0; Idx != 8; ++Idx)
	          if (LaneMask[Idx] >= 8)
	            Imm |= 1u << Idx;
	        EmitBlend(Imm);
	        return true;
	      }
	    }
	  }

	  // INSERTPS: only attempted when at least one element is zeroable.
	  if (AllowFloatDomain && ScalarBits == 32 && Subtarget.hasSSE41() && Is128 &&
	      Zeroable.getBoolValue() &&
	      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	    Shuffle = X86ISD::INSERTPS;
	    ShuffleVT = MVT::v4f32;
	    return true;
	  }

	  // SHUFPD.
	  const bool TryShufPD =
	      AllowFloatDomain && ScalarBits == 64 &&
	      ((Is128 && Subtarget.hasSSE2()) || (Is256 && Subtarget.hasAVX()) ||
	       (Is512 && Subtarget.hasAVX512()));
	  if (TryShufPD && matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
	    Shuffle = X86ISD::SHUFP;
	    ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	    return true;
	  }

	  // SHUFPS is the last candidate, so every failure from here on is final.
	  const bool TryShufPS =
	      AllowFloatDomain && ScalarBits == 32 &&
	      ((Is128 && Subtarget.hasSSE1()) || (Is256 && Subtarget.hasAVX()) ||
	       (Is512 && Subtarget.hasAVX512()));
	  if (!TryShufPS)
	    return false;

	  SmallVector<int, 4> LaneMask;
	  if (!isRepeatedTargetShuffleMask(128, MaskVT, Mask, LaneMask))
	    return false;

	  // Classify one half (two elements) of the repeated mask. The half must be
	  // entirely undef, entirely undef/zero, or draw only from a single input;
	  // the source vector for that half is returned and the two selector values
	  // are written to S0/S1. A half mixing both inputs yields a null SDValue.
	  auto ClassifyHalf = [&](unsigned Offset, int &S0, int &S1) -> SDValue {
	    const int M0 = LaneMask[Offset];
	    const int M1 = LaneMask[Offset + 1];
	    const bool M0Undef = (SM_SentinelUndef == M0);
	    const bool M1Undef = (SM_SentinelUndef == M1);

	    if (isUndefInRange(LaneMask, Offset, 2))
	      return DAG.getUNDEF(MaskVT);

	    if (isUndefOrZeroInRange(LaneMask, Offset, 2)) {
	      S0 = M0Undef ? -1 : 0;
	      S1 = M1Undef ? -1 : 1;
	      return getZeroVector(MaskVT, Subtarget, DAG, DL);
	    }

	    if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
	      S0 = M0Undef ? -1 : M0 & 3;
	      S1 = M1Undef ? -1 : M1 & 3;
	      return V1;
	    }

	    if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
	      S0 = M0Undef ? -1 : M0 & 3;
	      S1 = M1Undef ? -1 : M1 & 3;
	      return V2;
	    }

	    return SDValue();
	  };

	  int Selectors[4] = {-1, -1, -1, -1};
	  SDValue LoSrc = ClassifyHalf(0, Selectors[0], Selectors[1]);
	  SDValue HiSrc = ClassifyHalf(2, Selectors[2], Selectors[3]);
	  if (!LoSrc || !HiSrc)
	    return false;

	  V1 = LoSrc;
	  V2 = HiSrc;
	  Shuffle = X86ISD::SHUFP;
	  ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	  PermuteImm = getV4X86ShuffleImm(Selectors);
	  return true;
	}

	// Forward declaration: combineX86ShuffleChain below calls this helper as a
	// fallback before the helper's definition appears later in the file. The
	// parameter list matches that of combineX86ShuffleChain.
	static SDValue combineX86ShuffleChainWithExtract(
	ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget);

	/// Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	///
	/// \param Inputs One or two shuffle inputs (asserted); bitcasts on them are
	///        looked through.
	/// \param Root The node being replaced; every result is bitcast back to its
	///        type. When Depth == 1 and Root already has the matched opcode the
	///        function returns an empty SDValue instead of rebuilding the node.
	/// \param BaseMask The accumulated, non-empty shuffle mask. Entries are
	///        input element indices or the SM_SentinelUndef / SM_SentinelZero
	///        markers.
	/// \param Depth Number of shuffles folded into BaseMask so far. Variable
	///        mask shuffles are never formed when Depth < 2, and domain crossing
	///        is permitted when Depth > 3.
	/// \param HasVariableMask Allows variable mask shuffles regardless of the
	///        depth threshold.
	/// \param AllowVariableMask Master switch for forming variable mask
	///        shuffles.
	/// \returns The replacement value, or an empty SDValue if nothing matched.
	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask,
	bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	// A single-element mask can only be the identity (asserted below), so the
	// bitcast input is the result.
	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	return DAG.getBitcast(RootVT, V1);
	}

	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	// Prefer the floating-point domain if either input is floating point, if
	// the root is floating point and more than one shuffle has been combined,
	// or if this is a 256-bit shuffle on a target without AVX2.
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.isFloatingPoint() && Depth >= 2) \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are a AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	// TODO - this currently prevents all lane shuffles from occurring.
	// TODO - check for writemasks usage instead of always preventing combining.
	// TODO - attempt to narrow Mask back to writemask size.
	bool IsEVEXShuffle =
	RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128);

	// Attempt to match a subvector broadcast.
	// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
	if (UnaryShuffle &&
	(BaseMaskEltSizeInBits == 128 \|\| BaseMaskEltSizeInBits == 256)) {
	SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
	if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
	// Note: this inspects the original input, not the bitcast-stripped V1.
	SDValue Src = Inputs[0];
	if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
	Src.getOperand(0).isUndef() &&
	Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
	MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
	return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
	Src.getValueType(),
	Src.getOperand(1)));
	}
	}
	}

	// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

	// Handle 128-bit lane shuffles of 256-bit vectors.
	// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
	// we need to use the zeroing feature.
	// TODO - this should support binary shuffles.
	if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
	!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
	!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
	return SDValue(); // Nothing to do!
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
	// One control nibble per 128-bit result half: bit 0 selects which half of
	// the input to use, and any negative mask entry (undef or zero sentinel)
	// sets bit 3 (0x8), the zeroing feature mentioned above.
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getConstant(PermMask, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
	} else {
	Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	// Elements narrower than 32 bits are always treated as integers.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return SDValue();

	// Attempt to match the mask against known shuffle patterns.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	// TODO: Should we indicate which domain is preferred if both are allowed?
	bool AllowFloatDomain = FloatDomain \|\| (Depth > 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth > 3)) && Subtarget.hasSSE2() &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	APInt Zeroable(NumMaskElts, 0);
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (isUndefOrZero(Mask[i]))
	Zeroable.setBit(i);

	if (UnaryShuffle) {
	// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
	// directly if we don't shuffle the lower element and we shuffle the upper
	// (zero) elements within themselves.
	if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
	(cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
	MaskEltSizeInBits) == 0) {
	// Scale is the number of mask elements covered by the loaded scalar.
	unsigned Scale =
	cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
	MaskEltSizeInBits;
	ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
	if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
	isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
	return DAG.getBitcast(RootVT, V1);
	}
	}

	// Attempt to match against broadcast-from-vector.
	// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
	if ((Subtarget.hasAVX2() \|\| (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
	&& (!IsEVEXShuffle \|\| NumRootElts == NumMaskElts)) {
	SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
	if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
	// Broadcast directly from the scalar when it is a foldable load.
	if (V1.getValueType() == MaskVT &&
	V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	MayFoldLoad(V1.getOperand(0))) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
	return SDValue(); // Nothing to do!
	Res = V1.getOperand(0);
	Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}
	// Otherwise broadcast from the vector itself, which needs AVX2.
	if (Subtarget.hasAVX2()) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(MaskVT, V1);
	Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}
	}
	}

	SDValue NewV1 = V1; // Save operand in case early exit happens.
	if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
	DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	ShuffleVT) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
	PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleVT, V1);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// The binary matchers take their operands by reference, so they are given
	// copies and V1/V2 stay intact for the later attempts.
	SDValue NewV1 = V1; // Save operands in case early exit happens.
	SDValue NewV2 = V2;
	if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
	NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	ShuffleVT, UnaryShuffle) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
	NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
	return DAG.getBitcast(RootVT, Res);
	}

	NewV1 = V1; // Save operands in case early exit happens.
	NewV2 = V2;
	if (matchBinaryPermuteShuffle(
	MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
	NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
	NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	uint64_t BitLen, BitIdx;
	if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	V2 = DAG.getBitcast(IntMaskVT, V2);
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 2)
	return SDValue();

	// Depth threshold above which we can efficiently use variable mask shuffles.
	// A chain that already contains a variable mask bypasses the threshold.
	int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
	AllowVariableMask &= (Depth >= VariableShuffleDepth) \|\| HasVariableMask;

	bool MaskContainsZeros =
	any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	// Every path inside this block returns, so the code after it only ever sees
	// masks that stay within their 128-bit lanes.
	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	Res = DAG.getBitcast(MaskVT, V1);
	// Note the operand order: VPERMV takes the mask first, then the source.
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	return DAG.getBitcast(RootVT, Res);
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && AllowVariableMask &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	// Mask is modified in place; that is safe because this branch always
	// returns.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	Res = DAG.getBitcast(MaskVT, V1);
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	return DAG.getBitcast(RootVT, Res);
	}

	// If that failed and either input is extracted then try to combine as a
	// shuffle with the larger type.
	if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
	Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
	DAG, Subtarget))
	return WideShuffle;

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if (AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	return DAG.getBitcast(RootVT, Res);
	}
	return SDValue();
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	// Kept elements get an all-ones constant, zeroed elements keep the initial
	// zero constant, and undef elements are recorded in UndefElts.
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	Res = DAG.getBitcast(MaskVT, V1);
	unsigned AndOpcode =
	FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a single input shuffle with different shuffle patterns in the
	// 128-bit lanes, use the variable mask form of VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	SmallVector<SDValue, 16> VPermIdx;
	// Each index is reduced modulo 4, i.e. to a position within a 128-bit
	// lane of four f32 elements.
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	Res = DAG.getBitcast(MaskVT, V1);
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if (AllowVariableMask && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	if (M == SM_SentinelZero) {
	// Selector 8 sets the match bit (bit 3); the immediate is set to 2
	// whenever any element must be zeroed.
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	// In-lane element position, offset by NumEltsPerLane when the element
	// comes from the second input (M >= NumMaskElts).
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	// The PD selector lives in bits [2:1], so shift 64-bit indices up by one.
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getConstant(M2ZImm, DL, MVT::i8));
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replaces 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && AllowVariableMask &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	// Ratio is the number of bytes covered by one mask element.
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	// A control byte of 255 (bit 7 set) makes PSHUFB write a zero byte.
	PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
	continue;
	}
	// Expand the element index to the index of this particular byte.
	M = Ratio * M + i % Ratio;
	assert((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	// 128 == (4 << 5): the ZERO permute operation from the table above.
	VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	V2 = DAG.getBitcast(ByteVT, V2);
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	return DAG.getBitcast(RootVT, Res);
	}

	// If that failed and either input is extracted then try to combine as a
	// shuffle with the larger type.
	if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
	Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
	DAG, Subtarget))
	return WideShuffle;

	// If we have a dual input shuffle then lower to VPERMV3.
	if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v2i64 \|\| MaskVT == MVT::v4f64 \|\|
	MaskVT == MVT::v4i64 \|\| MaskVT == MVT::v4f32 \|\| MaskVT == MVT::v4i32 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() &&
	(MaskVT == MVT::v8i16 \|\| MaskVT == MVT::v16i16)) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() &&
	(MaskVT == MVT::v16i8 \|\| MaskVT == MVT::v32i8)))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	V1 = DAG.getBitcast(MaskVT, V1);
	V2 = DAG.getBitcast(MaskVT, V2);
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	return DAG.getBitcast(RootVT, Res);
	}

	// Failed to find any combines.
	return SDValue();
	}

	// Combine an arbitrary chain of shuffles + extract_subvectors into a single
	// instruction if possible.
	//
	// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
	// type size to attempt to combine:
	// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
	// -->
	// extract_subvector(shuffle(x,y,m2),0)
	//
	// Returns the low Root-sized part of the widened shuffle, bitcast to Root's
	// type. Returns an empty SDValue when there are no inputs, when every input is
	// taken from offset zero, when the widened sources have an illegal type or
	// mismatched scalar types, when more than two distinct sources remain, or when
	// combineX86ShuffleChain cannot match the widened shuffle.
	static SDValue combineX86ShuffleChainWithExtract(
	ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	unsigned NumMaskElts = BaseMask.size();
	unsigned NumInputs = Inputs.size();
	if (NumInputs == 0)
	return SDValue();

	// WideInputs[i] becomes the vector that Inputs[i] was extracted from and
	// Offsets[i] records where inside that vector the extraction started.
	SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
	SmallVector<unsigned, 4> Offsets(NumInputs, 0);

	// Peek through subvectors.
	// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
	unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
	for (unsigned i = 0; i != NumInputs; ++i) {
	SDValue &Src = WideInputs[i];
	unsigned &Offset = Offsets[i];
	Src = peekThroughBitcasts(Src);
	// Type of the input before any EXTRACT_SUBVECTOR is stripped.
	EVT BaseVT = Src.getValueType();
	// Accumulate the constant extraction indices of nested extracts.
	while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	isa<ConstantSDNode>(Src.getOperand(1))) {
	Offset += Src.getConstantOperandVal(1);
	Src = Src.getOperand(0);
	}
	WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
	assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
	"Unexpected subvector extraction");
	// Convert the accumulated element offset into a count of whole BaseVT-sized
	// subvectors, then rescale that count to shuffle mask elements.
	Offset /= BaseVT.getVectorNumElements();
	Offset *= NumMaskElts;
	}

	// Bail if we're always extracting from the lowest subvectors,
	// combineX86ShuffleChain should match this for the current width.
	if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
	return SDValue();

	EVT RootVT = Root.getValueType();
	unsigned RootSizeInBits = RootVT.getSizeInBits();
	// Number of Root-sized pieces that fit in the widest source vector.
	unsigned Scale = WideSizeInBits / RootSizeInBits;
	assert((WideSizeInBits % RootSizeInBits) == 0 &&
	"Unexpected subvector extraction");

	// If the src vector types aren't the same, see if we can extend
	// them to match each other.
	// TODO: Support different scalar types?
	// Give up if any source has an illegal type or a scalar type that differs
	// from the first source's scalar type.
	EVT WideSVT = WideInputs[0].getValueType().getScalarType();
	if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
	return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) \|\|
	Op.getValueType().getScalarType() != WideSVT;
	}))
	return SDValue();

	// Widen every narrower source up to WideSizeInBits.
	for (SDValue &NewInput : WideInputs) {
	assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
	"Shuffle vector size mismatch");
	if (WideSizeInBits > NewInput.getValueSizeInBits())
	NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
	SDLoc(NewInput), WideSizeInBits);
	assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
	"Unexpected subvector extraction");
	}

	// Create new mask for larger type.
	// Each widened input spans Scale * NumMaskElts indices in the wide mask, so
	// bias the offset of input i by i such spans.
	for (unsigned i = 1; i != NumInputs; ++i)
	Offsets[i] += i * Scale * NumMaskElts;

	SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
	for (int &M : WideMask) {
	// Negative entries are sentinels and are kept unchanged.
	if (M < 0)
	continue;
	M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
	}
	// Pad the mask out to the wide element count with undef entries; only the low
	// RootSizeInBits of the wide result are extracted below.
	WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);

	// Remove unused/repeated shuffle source ops.
	resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
	assert(!WideInputs.empty() && "Shuffle with no inputs detected");

	// combineX86ShuffleChain is only called with at most two inputs.
	if (WideInputs.size() > 2)
	return SDValue();

	// Increase depth for every upper subvector we've peeked through.
	Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });

	// Attempt to combine wider chain.
	// TODO: Can we use a better Root?
	SDValue WideRoot = WideInputs[0];
	if (SDValue WideShuffle = combineX86ShuffleChain(
	WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
	AllowVariableMask, DAG, Subtarget)) {
	// Take the low RootSizeInBits of the wide shuffle and return it in Root's
	// type.
	WideShuffle =
	extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
	return DAG.getBitcast(RootVT, WideShuffle);
	}
	return SDValue();
	}

	// Attempt to constant fold a shuffle whose source ops are all constants.
	//
	// Every entry of Ops must provide constant bits at the mask element width,
	// otherwise nothing is folded. On success the folded constant vector is
	// returned bitcast to Root's type; on failure an empty SDValue is returned.
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
	                                           ArrayRef<int> Mask, SDValue Root,
	                                           bool HasVariableMask,
	                                           SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  const MVT RootVT = Root.getSimpleValueType();
	  const unsigned NumElts = Mask.size();
	  const unsigned EltBits = RootVT.getSizeInBits() / NumElts;
	  const unsigned NumSrcs = Ops.size();

	  // Pull the per-element constant bits (and undef flags) out of each source.
	  bool AnySingleUse = false;
	  SmallVector<APInt, 16> SrcUndefs(NumSrcs);
	  SmallVector<SmallVector<APInt, 16>, 16> SrcBits(NumSrcs);
	  for (unsigned SrcNo = 0; SrcNo != NumSrcs; ++SrcNo) {
	    SDValue Src = Ops[SrcNo];
	    if (Src.hasOneUse())
	      AnySingleUse = true;
	    const bool IsConstant = getTargetConstantBitsFromNode(
	        Src, EltBits, SrcUndefs[SrcNo], SrcBits[SrcNo]);
	    if (!IsConstant)
	      return SDValue();
	  }

	  // To avoid constant pool bloat, fold only when some constant has a single
	  // use or the combined shuffle already included a variable mask shuffle.
	  if (!(AnySingleUse || HasVariableMask))
	    return SDValue();

	  // Route the constant bits through the shuffle mask, classifying every
	  // result element as undef, zero or a non-zero constant.
	  APInt UndefElts(NumElts, 0);
	  APInt ZeroElts(NumElts, 0);
	  APInt ConstantElts(NumElts, 0);
	  SmallVector<APInt, 8> FoldedBits(NumElts, APInt::getNullValue(EltBits));
	  for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
	    const int M = Mask[Elt];
	    if (M == SM_SentinelUndef) {
	      UndefElts.setBit(Elt);
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      ZeroElts.setBit(Elt);
	      continue;
	    }
	    assert(0 <= M && M < (int)(NumElts * NumSrcs));

	    // Split the mask index into (source op, element within that op).
	    const unsigned SrcNo = (unsigned)M / NumElts;
	    const unsigned SrcElt = (unsigned)M % NumElts;

	    if (SrcUndefs[SrcNo][SrcElt]) {
	      UndefElts.setBit(Elt);
	    } else {
	      const APInt &Bits = SrcBits[SrcNo][SrcElt];
	      if (!Bits) {
	        ZeroElts.setBit(Elt);
	      } else {
	        ConstantElts.setBit(Elt);
	        FoldedBits[Elt] = Bits;
	      }
	    }
	  }
	  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

	  // Build the constant in a vector type with the mask's element width, using
	  // floating point elements only for 32/64-bit elements of an FP root type.
	  const bool UseFPElts =
	      RootVT.isFloatingPoint() && (EltBits == 32 || EltBits == 64);
	  const MVT EltVT = UseFPElts ? MVT::getFloatingPointVT(EltBits)
	                              : MVT::getIntegerVT(EltBits);
	  const MVT CstVT = MVT::getVectorVT(EltVT, NumElts);

	  SDLoc DL(Root);
	  SDValue Folded = getConstVector(FoldedBits, UndefElts, CstVT, DAG, DL);
	  return DAG.getBitcast(RootVT, Folded);
	}

	/// Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	/// equivalent. In most cases, this is just an encoding size win, but
	/// sometimes we will collapse multiple generic shuffles into a single
	/// special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	/// instructions, and replace them with the slightly more expensive SSSE3
	/// PSHUFB instruction if available. We do this as the last combining step
	/// to ensure we avoid using PSHUFB if we can implement the shuffle with
	/// a suitable short sequence of other instructions. The PSHUFB will either
	/// use a register or have to read from memory and so is slightly (but only
	/// slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	///
	/// \p SrcOps is the list of source operands accumulated so far and
	/// \p SrcOpIndex selects the one to look through on this step. \p RootMask is
	/// the accumulated mask expressed in terms of \p SrcOps, and \p Root supplies
	/// the result type. \p SrcNodes lists the shuffle nodes already merged; it is
	/// used to decide whether a multi-use source may still get a variable mask.
	/// \p HasVariableMask is set once any merged shuffle had a variable mask.
	/// \p AllowVariableMask is forwarded to the final chain combine. Returns the
	/// replacement value, or an empty SDValue if nothing was combined.
	static SDValue combineX86ShufflesRecursively(
	ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
	ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
	bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Bound the depth of our recursive combine because this is ultimately
	// quadratic in nature.
	const unsigned MaxRecursionDepth = 8;
	if (Depth > MaxRecursionDepth)
	return SDValue();

	// Directly rip through bitcasts to find the underlying operand.
	SDValue Op = SrcOps[SrcOpIndex];
	Op = peekThroughOneUseBitcasts(Op);

	MVT VT = Op.getSimpleValueType();
	if (!VT.isVector())
	return SDValue(); // Bail if we hit a non-vector.

	assert(Root.getSimpleValueType().isVector() &&
	"Shuffles operate on vector types!");
	assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
	"Can only combine shuffles of the same vector register size.");

	// Extract target shuffle mask and resolve sentinels and inputs.
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
	return SDValue();

	// Add the inputs to the Ops list, avoiding duplicates.
	SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

	// Returns the index in Ops that refers to Input: an existing entry that
	// matches after peeking through bitcasts, otherwise the overwritten
	// InsertionPoint slot when one is given, otherwise a newly appended slot.
	auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
	// Attempt to find an existing match.
	SDValue InputBC = peekThroughBitcasts(Input);
	for (int i = 0, e = Ops.size(); i < e; ++i)
	if (InputBC == peekThroughBitcasts(Ops[i]))
	return i;
	// Match failed - should we replace an existing Op?
	if (InsertionPoint >= 0) {
	Ops[InsertionPoint] = Input;
	return InsertionPoint;
	}
	// Add to the end of the Ops list.
	Ops.push_back(Input);
	return Ops.size() - 1;
	};

	// The first input of Op may take over Op's own slot (SrcOpIndex); any further
	// inputs are matched against existing entries or appended.
	SmallVector<int, 2> OpInputIdx;
	for (SDValue OpInput : OpInputs)
	OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));

	assert(((RootMask.size() > OpMask.size() &&
	RootMask.size() % OpMask.size() == 0) \|\|
	(OpMask.size() > RootMask.size() &&
	OpMask.size() % RootMask.size() == 0) \|\|
	OpMask.size() == RootMask.size()) &&
	"The smaller number of elements must divide the larger.");

	// This function can be performance-critical, so we rely on the power-of-2
	// knowledge that we have about the mask sizes to replace div/rem ops with
	// bit-masks and shifts.
	assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	// The merged mask uses the finer of the two granularities. RootRatio and
	// OpRatio are how many merged-mask elements correspond to one element of the
	// root mask and of the op mask respectively; at most one of them exceeds 1.
	unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	assert((RootRatio == 1 \|\| OpRatio == 1) &&
	"Must not have a ratio for both incoming and op masks!");

	assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

	// Merge this shuffle operation's mask into our accumulated mask. Note that
	// this shuffle's mask will be the first applied to the input, followed by the
	// root mask to get us all the way to the root value arrangement. The reason
	// for this order is that we are recursing up the operation chain.
	for (unsigned i = 0; i < MaskWidth; ++i) {
	unsigned RootIdx = i >> RootRatioLog2;
	if (RootMask[RootIdx] < 0) {
	// This is a zero or undef lane, we're done.
	Mask[i] = RootMask[RootIdx];
	continue;
	}

	// Rescale the root mask entry to the merged mask's granularity.
	unsigned RootMaskedIdx =
	RootRatio == 1
	? RootMask[RootIdx]
	: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	// Just insert the scaled root mask value if it references an input other
	// than the SrcOp we're currently inserting.
	if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) \|\|
	(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	Mask[i] = RootMaskedIdx;
	continue;
	}

	// Reduce to an index within the current source op, then find the matching
	// op mask element.
	RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	if (OpMask[OpIdx] < 0) {
	// The incoming lanes are zero or undef, it doesn't matter which ones we
	// are using.
	Mask[i] = OpMask[OpIdx];
	continue;
	}

	// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	unsigned OpMaskedIdx =
	OpRatio == 1
	? OpMask[OpIdx]
	: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

	// Keep only the element index within the input, then rebase it onto the slot
	// in Ops that now holds that input.
	OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
	assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
	OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;

	Mask[i] = OpMaskedIdx;
	}

	// Handle the all undef/zero cases early.
	if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
	return DAG.getUNDEF(Root.getValueType());

	// TODO - should we handle the mixed zero/undef case as well? Just returning
	// a zero mask will lose information on undef elements possibly reducing
	// future combine possibilities.
	if (all_of(Mask, [](int Idx) { return Idx < 0; }))
	return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
	SDLoc(Root));

	// Remove unused/repeated shuffle source ops.
	resolveTargetShuffleInputsAndMask(Ops, Mask);
	assert(!Ops.empty() && "Shuffle with no inputs detected");

	HasVariableMask \|= isTargetShuffleVariableMask(Op.getOpcode());

	// Update the list of shuffle nodes that have been combined so far.
	SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	SrcNodes.end());
	CombinedNodes.push_back(Op.getNode());

	// See if we can recurse into each shuffle source op (if it's a target
	// shuffle). The source op should only be generally combined if it either has
	// a single use (i.e. current Op) or all its users have already been combined,
	// if not then we can still combine but should prevent generation of variable
	// shuffles to avoid constant pool bloat.
	// Don't recurse if we already have more source ops than we can combine in
	// the remaining recursion depth.
	if (Ops.size() < (MaxRecursionDepth - Depth)) {
	for (int i = 0, e = Ops.size(); i < e; ++i) {
	bool AllowVar = false;
	if (Ops[i].getNode()->hasOneUse() \|\|
	SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	AllowVar = AllowVariableMask;
	// The first successful deeper combine wins.
	if (SDValue Res = combineX86ShufflesRecursively(
	Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
	AllowVar, DAG, Subtarget))
	return Res;
	}
	}

	// Attempt to constant fold all of the constant source ops.
	if (SDValue Cst = combineX86ShufflesConstants(
	Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
	return Cst;

	// We can only combine unary and binary shuffle mask cases.
	if (Ops.size() <= 2) {
	// Minor canonicalization of the accumulated shuffle mask to make it easier
	// to match below. All this does is detect masks with sequential pairs of
	// elements, and shrink them to the half-width mask. It does this in a loop
	// so it will reduce the size of the mask to the minimal width mask which
	// performs an equivalent shuffle.
	SmallVector<int, 64> WidenedMask;
	while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	Mask = std::move(WidenedMask);
	}

	// Canonicalization of binary shuffle masks to improve pattern matching by
	// commuting the inputs.
	if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	ShuffleVectorSDNode::commuteMask(Mask);
	std::swap(Ops[0], Ops[1]);
	}

	// Finally, try to combine into a single shuffle instruction.
	return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
	AllowVariableMask, DAG, Subtarget);
	}

	// If that failed and any input is extracted then try to combine as a
	// shuffle with the larger type.
	return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
	HasVariableMask, AllowVariableMask,
	DAG, Subtarget);
	}

	/// Helper entry wrapper to combineX86ShufflesRecursively.
	///
	/// Starts the recursive combine with \p Op as both the only source operand
	/// and the root, a single-element identity root mask, no previously combined
	/// nodes, a depth of 1, no variable mask seen so far, and variable masks
	/// allowed. Returns whatever the recursive combine returns.
	static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  // The argument-name comments must be real block comments: a bare `/Depth/ 1`
	  // does not parse.
	  return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	                                       /*HasVarMask*/ false,
	                                       /*AllowVarMask*/ true, DAG, Subtarget);
	}

	/// Get the PSHUF-style mask from PSHUF node.
	///
	/// Thin wrapper around getTargetShuffleMask that reduces the decoded mask of
	/// a PSHUFD, PSHUFLW or PSHUFHW node to a v4 PSHUF-style mask that can be
	/// reused with such instructions. For PSHUFHW the four high-word entries are
	/// kept and rebased to start at zero.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  const MVT VT = N.getSimpleValueType();
	  SmallVector<int, 4> Mask;
	  SmallVector<SDValue, 2> Ops;
	  bool IsUnary;
	  const bool HaveMask =
	      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
	  (void)HaveMask;
	  assert(HaveMask);

	  // For vectors wider than 128 bits only the low 128-bit lane of the mask is
	  // kept; debug builds verify the upper lanes merely repeat it.
	  const unsigned VectorBits = VT.getSizeInBits();
	  if (VectorBits > 128) {
	    const int LaneElts = 128 / VT.getScalarSizeInBits();
	#ifndef NDEBUG
	    const int NumLanes = VectorBits / 128;
	    for (int Lane = 1; Lane < NumLanes; ++Lane)
	      for (int Elt = 0; Elt < LaneElts; ++Elt)
	        assert(Mask[Elt] == Mask[Lane * LaneElts + Elt] - (LaneElts * Lane) &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	#endif
	    Mask.resize(LaneElts);
	  }

	  const unsigned Opc = N.getOpcode();

	  // Dword shuffle: the lane mask is already the v4 mask.
	  if (Opc == X86ISD::PSHUFD)
	    return Mask;

	  // Low-word shuffle: keep only the first four word entries.
	  if (Opc == X86ISD::PSHUFLW) {
	    Mask.resize(4);
	    return Mask;
	  }

	  // High-word shuffle: drop the first four word entries and rebase the rest.
	  if (Opc == X86ISD::PSHUFHW) {
	    Mask.erase(Mask.begin(), Mask.begin() + 4);
	    for (int &Entry : Mask)
	      Entry -= 4;
	    return Mask;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \p N must be a PSHUFD node (asserted) and \p Mask is indexed as its
	/// 4-element dword mask. On success \p Mask is overwritten with the merged
	/// mask and the rebuilt chain, bitcast to N's type if necessary, is returned.
	/// Otherwise an empty SDValue is returned and \p Mask is left unmodified.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(N.getOpcode() == X86ISD::PSHUFD &&
	"Called with something other than an x86 128-bit half shuffle!");
	SDLoc DL(N);

	// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	// of the shuffles in the chain so that we can form a fresh chain to replace
	// this one.
	SmallVector<SDValue, 8> Chain;
	SDValue V = N.getOperand(0);
	for (; V.hasOneUse(); V = V.getOperand(0)) {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing combined!

	case ISD::BITCAST:
	// Skip bitcasts as we always know the type for the target specific
	// instructions.
	continue;

	case X86ISD::PSHUFD:
	// Found another dword shuffle.
	break;

	case X86ISD::PSHUFLW:
	// Check that the low words (being shuffled) are the identity in the
	// dword shuffle, and the high words are self-contained.
	if (Mask[0] != 0 \|\| Mask[1] != 1 \|\|
	!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	return SDValue();

	Chain.push_back(V);
	continue;

	case X86ISD::PSHUFHW:
	// Check that the high words (being shuffled) are the identity in the
	// dword shuffle, and the low words are self-contained.
	if (Mask[2] != 2 \|\| Mask[3] != 3 \|\|
	!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	return SDValue();

	Chain.push_back(V);
	continue;

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	// shuffle into a preceding word shuffle.
	if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	V.getSimpleValueType().getVectorElementType() != MVT::i16)
	return SDValue();

	// Search for a half-shuffle which we can combine with.
	unsigned CombineOp =
	V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	// Only unpacks of a value with itself are handled, and the unpack must be
	// the sole user of that value.
	if (V.getOperand(0) != V.getOperand(1) \|\|
	!V->isOnlyUserOf(V.getOperand(0).getNode()))
	return SDValue();
	Chain.push_back(V);
	V = V.getOperand(0);
	// Walk further up through bitcasts and the non-matching half shuffle until
	// the half shuffle with opcode CombineOp is found.
	do {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing to combine.

	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	if (V.getOpcode() == CombineOp)
	break;

	Chain.push_back(V);

	LLVM_FALLTHROUGH;
	case ISD::BITCAST:
	V = V.getOperand(0);
	continue;
	}
	break;
	} while (V.hasOneUse());
	break;
	}
	// Break out of the loop if we break out of the switch.
	break;
	}

	if (!V.hasOneUse())
	// We fell out of the loop without finding a viable combining instruction.
	return SDValue();

	// Merge this node's mask and our incoming mask.
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	for (int &M : Mask)
	M = VMask[M];
	// Re-emit the found shuffle with the merged mask as its immediate.
	V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Rebuild the chain around this new shuffle.
	// Chain was pushed walking up from N, so popping from the back rebuilds the
	// skipped nodes starting with the one closest to the new shuffle.
	while (!Chain.empty()) {
	SDValue W = Chain.pop_back_val();

	// Insert a bitcast when the rebuilt operand's type differs from what W
	// originally consumed.
	if (V.getValueType() != W.getOperand(0).getValueType())
	V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	switch (W.getOpcode()) {
	default:
	llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	break;

	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Reuse the original immediate operand of W.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	break;
	}
	}
	if (V.getValueType() != N.getValueType())
	V = DAG.getBitcast(N.getValueType(), V);

	// Return the new chain to replace N.
	return V;
	}

	/// Try to combine x86 target specific shuffles.
	///
	/// Dispatches on the opcode of \p N. Most cases return from inside the first
	/// switch; only PSHUFD/PSHUFLW/PSHUFHW decode their mask and fall through to
	/// the mask-based simplifications at the end. Returns the replacement value,
	/// or an empty SDValue when no combine applies.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	// Only filled in for the PSHUF* opcodes below.
	SmallVector<int, 4> Mask;
	unsigned Opcode = N.getOpcode();

	// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
	// single instruction.
	if (VT.getScalarSizeInBits() == 64 &&
	(Opcode == X86ISD::MOVSD \|\| Opcode == X86ISD::UNPCKH \|\|
	Opcode == X86ISD::UNPCKL)) {
	auto BC0 = peekThroughBitcasts(N.getOperand(0));
	auto BC1 = peekThroughBitcasts(N.getOperand(1));
	EVT VT0 = BC0.getValueType();
	EVT VT1 = BC1.getValueType();
	unsigned Opcode0 = BC0.getOpcode();
	unsigned Opcode1 = BC1.getOpcode();
	if (Opcode0 == Opcode1 && VT0 == VT1 &&
	(Opcode0 == X86ISD::FHADD \|\| Opcode0 == X86ISD::HADD \|\|
	Opcode0 == X86ISD::FHSUB \|\| Opcode0 == X86ISD::HSUB \|\|
	Opcode0 == X86ISD::PACKSS \|\| Opcode0 == X86ISD::PACKUS)) {
	// Pick the operand of each inner op that feeds the low and the high half
	// of the result. MOVSD takes the first operand of N's second input and
	// the second operand of N's first input. UNPCKL/UNPCKH take the first
	// (resp. second) operand of both inputs.
	SDValue Lo, Hi;
	if (Opcode == X86ISD::MOVSD) {
	Lo = BC1.getOperand(0);
	Hi = BC0.getOperand(1);
	} else {
	Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	}
	SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
	return DAG.getBitcast(VT, Horiz);
	}
	}

	switch (Opcode) {
	case X86ISD::VBROADCAST: {
	SDValue Src = N.getOperand(0);
	SDValue BC = peekThroughBitcasts(Src);
	EVT SrcVT = Src.getValueType();
	EVT BCVT = BC.getValueType();

	// If broadcasting from another shuffle, attempt to simplify it.
	// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
	if (isTargetShuffle(BC.getOpcode()) &&
	VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
	// Only the first Scale elements of BC are demanded; every other element
	// is marked undef in the root mask.
	unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
	SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
	SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i)
	DemandedMask[i] = i;
	if (SDValue Res = combineX86ShufflesRecursively(
	{BC}, 0, BC, DemandedMask, {}, /Depth/ 1,
	/HasVarMask/ false, /AllowVarMask/ true, DAG, Subtarget))
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	DAG.getBitcast(SrcVT, Res));
	}

	// broadcast(bitcast(src)) -> bitcast(broadcast(src))
	// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
	if (Src.getOpcode() == ISD::BITCAST &&
	SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
	EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
	VT.getVectorNumElements());
	return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
	}

	// Reduce broadcast source vector to lowest 128-bits.
	if (SrcVT.getSizeInBits() > 128)
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	extract128BitVector(Src, 0, DAG, DL));

	// broadcast(scalar_to_vector(x)) -> broadcast(x).
	if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

	// Share broadcast with the longest vector and extract low subvector (free).
	for (SDNode *User : Src->uses())
	if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
	User->getValueSizeInBits(0) > VT.getSizeInBits()) {
	return extractSubVector(SDValue(User, 0), 0, DAG, DL,
	VT.getSizeInBits());
	}

	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);

	// blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
	// TODO: Handle MVT::v16i16 repeated blend mask.
	if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
	N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
	MVT SrcVT = N0.getOperand(0).getSimpleValueType();
	// Requires the source element size to divide the blend element size and
	// to be at least 32 bits.
	if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
	SrcVT.getScalarSizeInBits() >= 32) {
	// This local Mask is the blend immediate and shadows the outer Mask.
	unsigned Mask = N.getConstantOperandVal(2);
	unsigned Size = VT.getVectorNumElements();
	unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
	unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
	N1.getOperand(0),
	DAG.getConstant(ScaleMask, DL, MVT::i8)));
	}
	}
	return SDValue();
	}
	case X86ISD::VPERMI: {
	// vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
	// TODO: Remove when we have preferred domains in combineX86ShuffleChain.
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	unsigned EltSizeInBits = VT.getScalarSizeInBits();
	if (N0.getOpcode() == ISD::BITCAST &&
	N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
	SDValue Src = N0.getOperand(0);
	EVT SrcVT = Src.getValueType();
	SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
	return DAG.getBitcast(VT, Res);
	}
	return SDValue();
	}
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Decode the 4-element mask and continue after the switch.
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);

	// Canonicalize scalar FPOps:
	// MOVS(N0, OP(N0, N1)) --> MOVS(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
	// If commutable, allow OP(N1[0], N0[0]).
	unsigned Opcode1 = N1.getOpcode();
	if (Opcode1 == ISD::FADD \|\| Opcode1 == ISD::FMUL \|\| Opcode1 == ISD::FSUB \|\|
	Opcode1 == ISD::FDIV) {
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);
	if (N10 == N0 \|\|
	(N11 == N0 && (Opcode1 == ISD::FADD \|\| Opcode1 == ISD::FMUL))) {
	// Put N0 first so the scalar op is always OP(N0[0], other[0]).
	if (N10 != N0)
	std::swap(N10, N11);
	MVT SVT = VT.getVectorElementType();
	SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
	N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
	N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
	SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
	SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	return DAG.getNode(Opcode, DL, VT, N0, SclVec);
	}
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	SDValue Op2 = N.getOperand(2);
	// Decode the immediate: bits [7:6] are the source element index, bits [5:4]
	// the destination element index, and bits [3:0] the zero mask.
	unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
	int M = TargetMask1[SrcIdx];
	if (isUndefOrZero(M)) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	// Mask values 0-3 select the inner shuffle's first input, 4-7 its second.
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
	return SDValue();

	// Updated records whether the immediate or Op0 changed. UseInput00 and
	// UseInput01 record which inputs of the inner shuffle are still referenced.
	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	int M = TargetMask0[i];
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (isUndefOrZero(M)) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Only the PSHUFD/PSHUFLW/PSHUFHW cases reach this point, with Mask decoded.

	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return N.getOperand(0);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	// Swap the two dwords covered by this half shuffle and leave the other
	// two in place.
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	// Build the combined 8-word mask of the two half shuffles. N and V have
	// different opcodes, so between them they fill both halves.
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	// Try to merge this dword shuffle with a shuffle further up its chain.
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Returns true when \p Mask takes every defined lane from the same-numbered
	/// lane of one of its two inputs, with all even lanes fed by one input and all
	/// odd lanes fed by the other, e.g. <0, 5, 2, 7> or
	/// <8, 1, 10, 3, 12, 5, 14, 7>. Undefined (negative) lanes are ignored.
	/// On success \p Op0Even is set to true iff the even lanes come from the first
	/// input; on failure \p Op0Even is left untouched.
	static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
	  const int NumElts = Mask.size();

	  // Input index feeding the even / odd lanes; -1 until a defined lane is seen.
	  int EvenSrc = -1;
	  int OddSrc = -1;

	  for (int Lane = 0; Lane < NumElts; ++Lane) {
	    const int Elt = Mask[Lane];
	    if (Elt < 0)
	      continue;

	    // The element has to stay in its own lane.
	    if (Elt % NumElts != Lane)
	      return false;

	    // All lanes of one parity must read from the same input.
	    int &ParitySrc = (Lane % 2 == 0) ? EvenSrc : OddSrc;
	    const int Input = Elt / NumElts;
	    if (ParitySrc >= 0 && ParitySrc != Input)
	      return false;
	    ParitySrc = Input;
	  }

	  // Both parities must be defined and must use different inputs.
	  if (EvenSrc < 0 || OddSrc < 0 || EvenSrc == OddSrc)
	    return false;

	  Op0Even = (EvenSrc == 0);
	  return true;
	}

	/// Decides whether the shuffle node \p N can be replaced by an ADDSUB or
	/// SUBADD operation. On success the shared operands are written to \p Opnd0
	/// and \p Opnd1 (in the order used by the FSUB) and \p IsSubAdd tells whether
	/// the even lanes come from the FADD. On failure the outputs are not written.
	///
	/// The match is done directly on the abstract vector shuffle nodes so it is
	/// easier to generically match. We also insert dummy vector shuffle nodes for
	/// the operands which explicitly discard the lanes which are unused by this
	/// operation to try to flow through the rest of the combiner the fact that
	/// they're unused.
	static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
	                             bool &IsSubAdd) {
	  // Needs SSE3 and a legal floating-point result type.
	  EVT VT = N->getValueType(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const bool Supported = Subtarget.hasSSE3() && TLI.isTypeLegal(VT) &&
	                         VT.getSimpleVT().isFloatingPoint();
	  if (!Supported)
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);

	  // Exactly one input must be an FADD and the other an FSUB.
	  const bool V1IsAdd = V1.getOpcode() == ISD::FADD;
	  const bool V1IsSub = V1.getOpcode() == ISD::FSUB;
	  const bool V2IsAdd = V2.getOpcode() == ISD::FADD;
	  const bool V2IsSub = V2.getOpcode() == ISD::FSUB;
	  if (!((V1IsAdd && V2IsSub) || (V1IsSub && V2IsAdd)))
	    return false;

	  // Folding is only possible when the shuffle is the sole user of both.
	  if (!(V1->hasOneUse() && V2->hasOneUse()))
	    return false;

	  // The FSUB fixes the operand order; the FADD may match it either way round
	  // because its operands can be commuted.
	  SDValue FSub = V1IsSub ? V1 : V2;
	  SDValue FAdd = V1IsSub ? V2 : V1;
	  assert(FSub.getOpcode() == ISD::FSUB && "Unexpected opcode");
	  SDValue LHS = FSub->getOperand(0);
	  SDValue RHS = FSub->getOperand(1);
	  const bool SameOrder =
	      FAdd->getOperand(0) == LHS && FAdd->getOperand(1) == RHS;
	  const bool Commuted =
	      FAdd->getOperand(0) == RHS && FAdd->getOperand(1) == LHS;
	  if (!SameOrder && !Commuted)
	    return false;

	  bool Op0Even;
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  if (!isAddSubOrSubAddMask(Mask, Op0Even))
	    return false;

	  // It's a subadd if the vector in the even parity is an FADD.
	  SDValue EvenSrc = Op0Even ? V1 : V2;
	  IsSubAdd = EvenSrc->getOpcode() == ISD::FADD;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// Fold a shuffle that interleaves an FMA node and an X86ISD::FMSUB node with
	/// identical operands into a single FMADDSUB or FMSUBADD node. Returns an
	/// empty SDValue when the pattern does not match.
	static SDValue combineShuffleToFMAddSub(SDNode *N,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!(Subtarget.hasAnyFMA() && TLI.isTypeLegal(VT)))
	    return SDValue();

	  // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)) with the
	  // two inputs in either order.
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  const bool Op1IsFMSub = Op1.getOpcode() == X86ISD::FMSUB;
	  SDValue FMAdd = Op1IsFMSub ? Op0 : Op1;
	  SDValue FMSub = Op1IsFMSub ? Op1 : Op0;

	  if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB)
	    return SDValue();
	  if (!FMAdd.hasOneUse() || !FMSub.hasOneUse())
	    return SDValue();

	  // Both nodes must compute on exactly the same three operands.
	  for (unsigned OpIdx = 0; OpIdx != 3; ++OpIdx)
	    if (FMAdd.getOperand(OpIdx) != FMSub.getOperand(OpIdx))
	      return SDValue();

	  // Check for correct shuffle mask.
	  bool Op0Even;
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  if (!isAddSubOrSubAddMask(Mask, Op0Even))
	    return SDValue();

	  // The result is an FMSUBADD when the even lanes come from the FMA node,
	  // otherwise an FMADDSUB.
	  SDLoc DL(N);
	  SDValue EvenSrc = Op0Even ? Op0 : Op1;
	  const bool IsSubAdd = (EvenSrc == FMAdd);
	  unsigned Opcode = X86ISD::FMADDSUB;
	  if (IsSubAdd)
	    Opcode = X86ISD::FMSUBADD;
	  return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
	                     FMAdd.getOperand(2));
	}

	/// Try to turn a shuffle into a target-specific add-sub node (ADDSUB) or
	/// mul-add-sub node (FMADDSUB / FMSUBADD). Returns an empty SDValue when no
	/// such node can be formed.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  // A shuffle of an FMA node with an FMSUB node is tried first.
	  SDValue Fused = combineShuffleToFMAddSub(N, Subtarget, DAG);
	  if (Fused)
	    return Fused;

	  // Otherwise look for a blend of an FADD and an FSUB of the same operands.
	  SDValue Opnd0, Opnd1;
	  bool IsSubAdd;
	  if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
	    return SDValue();

	  SDLoc DL(N);
	  MVT VT = N->getSimpleValueType(0);

	  // Try to generate X86ISD::FMADDSUB node here.
	  SDValue Opnd2;
	  const bool HasMulOperand =
	      isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2);
	  if (HasMulOperand) {
	    unsigned Opc = X86ISD::FMADDSUB;
	    if (IsSubAdd)
	      Opc = X86ISD::FMSUBADD;
	    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
	  }

	  // Without the fused form only the ADDSUB flavour is emitted. Also do not
	  // generate X86ISD::ADDSUB node for 512-bit types even though the ADDSUB
	  // idiom has been successfully recognized. There are no known X86 targets
	  // with 512-bit ADDSUB instructions!
	  if (IsSubAdd || VT.is512BitVector())
	    return SDValue();

	  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
	}

	// Matches a shuffle whose two sources are each (concat_vectors X, undef),
	// i.e. sources that are half as wide as the output. AVX2 has VPERMD/Q, so the
	// shuffle is rewritten as a single-source shuffle of (concat_vectors X0, X1)
	// when possible. Returns an empty SDValue if the pattern does not match.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  auto *SVOp = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!Subtarget.hasAVX2() || !SVOp)
	    return SDValue();

	  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
	  EVT VT = N->getValueType(0);
	  if (!(VT.is128BitVector() || VT.is256BitVector()))
	    return SDValue();

	  EVT EltVT = VT.getVectorElementType();
	  const bool IsSupportedElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
	                              EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!IsSupportedElt)
	    return SDValue();

	  // Check that both sources are concats with undef.
	  auto IsConcatWithUndef = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2 &&
	           V.getOperand(1).isUndef();
	  };
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  if (!IsConcatWithUndef(N0) || !IsConcatWithUndef(N1))
	    return SDValue();

	  // Construct the new shuffle mask. Elements from the first source retain
	  // their index, but elements from the second source no longer need to skip
	  // an undef.
	  const int NumElts = VT.getVectorNumElements();
	  const int HalfElts = NumElts / 2;
	  SmallVector<int, 8> Mask;
	  for (int Elt : SVOp->getMask()) {
	    if (Elt >= NumElts)
	      Elt -= HalfElts;
	    Mask.push_back(Elt);
	  }

	  SDLoc DL(N);
	  SDValue Lo = N0.getOperand(0);
	  SDValue Hi = N1.getOperand(0);
	  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
	}

	/// Eliminate a redundant shuffle of a horizontal math op.
	///
	/// \p N may be an X86ISD::MOVDDUP, an X86ISD::VBROADCAST, or an
	/// ISD::VECTOR_SHUFFLE whose second operand is undef. If its source is an
	/// HADD/HSUB/FHADD/FHSUB whose operands are identical (or one of them is
	/// undef) and the shuffle merely replicates halves that the horizontal op
	/// already duplicates, the horizontal op itself is returned. Otherwise an
	/// empty SDValue is returned.
	static SDValue foldShuffleOfHorizOp(SDNode *N) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
	    if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
	      return SDValue();

	  // For a broadcast, peek through an extract element of index 0 to find the
	  // horizontal op: broadcast (ext_vec_elt HOp, 0)
	  EVT VT = N->getValueType(0);
	  if (Opcode == X86ISD::VBROADCAST) {
	    SDValue SrcOp = N->getOperand(0);
	    if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	        SrcOp.getValueType() == MVT::f64 &&
	        SrcOp.getOperand(0).getValueType() == VT &&
	        isNullConstant(SrcOp.getOperand(1)))
	      N = SrcOp.getNode();
	  }

	  SDValue HOp = N->getOperand(0);
	  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
	      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
	    return SDValue();

	  // 128-bit horizontal math instructions are defined to operate on adjacent
	  // lanes of each operand as:
	  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
	  // ...similarly for v2f64 and v8i16.
	  if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
	      HOp.getOperand(0) != HOp.getOperand(1))
	    return SDValue();

	  // When the operands of a horizontal math op are identical, the low half of
	  // the result is the same as the high half. If a target shuffle is also
	  // replicating low and high halves, we don't need the shuffle.
	  if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
	    // The horizontal op replaces the shuffle directly, so it must already
	    // produce the shuffle's result type. A VBROADCAST can take a source
	    // vector of a different type than its result (when the extract-element
	    // peek above did not apply); bail out in that case instead of asserting
	    // or returning a value of the wrong type.
	    // NOTE(review): for a 256-bit VBROADCAST the two 128-bit halves of the
	    // h-op may differ while the broadcast repeats element 0 everywhere -
	    // confirm this fold is valid for v4f64 broadcasts.
	    if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
	      // movddup (hadd X, X) --> hadd X, X
	      // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
	      assert((HOp.getValueType() == MVT::v2f64 ||
	              HOp.getValueType() == MVT::v4f64) &&
	             "Unexpected type for h-op");
	      return HOp;
	    }
	    return SDValue();
	  }

	  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
	  // but this should be tied to whatever horizontal op matching and shuffle
	  // canonicalization are producing.
	  if (HOp.getValueSizeInBits() == 128 &&
	      (isTargetShuffleEquivalent(Mask, {0, 0}) ||
	       isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
	       isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
	    return HOp;

	  // For 256-bit ops the replication happens within each 128-bit half.
	  if (HOp.getValueSizeInBits() == 256 &&
	      (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
	       isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
	       isTargetShuffleEquivalent(
	           Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
	    return HOp;

	  return SDValue();
	}

	/// Narrow a 256-bit or 512-bit (AVX/AVX512) shuffle to half its width when it
	/// reads only the low half of each source vector and leaves every element in
	/// the high half of the destination undefined. Returns an empty SDValue when
	/// the shuffle cannot be narrowed.
	static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
	  EVT ResVT = Shuf->getValueType(0);
	  if (!ResVT.isSimple())
	    return SDValue();

	  MVT VT = ResVT.getSimpleVT();
	  if (!(VT.is256BitVector() || VT.is512BitVector()))
	    return SDValue();

	  // See if we can ignore all of the high elements of the shuffle.
	  ArrayRef<int> Mask = Shuf->getMask();
	  if (!isUndefUpperHalf(Mask))
	    return SDValue();

	  // Check if the shuffle mask accesses only the low half of each input vector
	  // (half-index output is 0 or 2).
	  int HalfIdx1, HalfIdx2;
	  SmallVector<int, 8> HalfMask(Mask.size() / 2);
	  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
	    return SDValue();
	  if ((HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
	    return SDValue();

	  // Create a half-width shuffle to replace the unnecessarily wide shuffle.
	  // The trick is knowing that all of the insert/extract are actually free
	  // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
	  // of narrow inputs into a narrow output, and that is always cheaper than
	  // the wide shuffle that we started with.
	  SDValue Src0 = Shuf->getOperand(0);
	  SDValue Src1 = Shuf->getOperand(1);
	  return getShuffleHalfVectors(SDLoc(Shuf), Src0, Src1, HalfMask, HalfIdx1,
	                               HalfIdx2, false, DAG);
	}

	/// Runs the X86 shuffle combines on \p N and returns the replacement value, or
	/// an empty SDValue when none of them applies. The combines are tried in this
	/// order: narrowing of a wide generic shuffle, fusing into
	/// ADDSUB/FMADDSUB/FMSUBADD, removing a shuffle of a horizontal op, moving a
	/// bitcast through a legal binop, combining into consecutive loads, merging
	/// concat-with-undef sources, the target shuffle combines, and finally several
	/// VZEXT_MOVL and PMULUDQ special cases. When SimplifyDemandedVectorElts
	/// reports a change, \p N itself is returned.
	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Narrowing only applies to generic shuffle nodes (ShuffleVectorSDNode).
	if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
	if (SDValue V = narrowShuffle(Shuf, DAG))
	return V;

	// If we have legalized the vector types, look for blends of FADD and FSUB
	// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isTypeLegal(VT)) {
	if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	return AddSub;

	if (SDValue HAddSub = foldShuffleOfHorizOp(N))
	return HAddSub;
	}

	// During Type Legalization, when promoting illegal vector types,
	// the backend might introduce new shuffle dag nodes and bitcasts.
	//
	// This code performs the following transformation:
	// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
	// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
	//
	// We do this only if both the bitcast and the BINOP dag nodes have
	// one use. Also, perform this transformation only if the new binary
	// operation is legal. This is to avoid introducing dag nodes that
	// potentially need to be further expanded (or custom lowered) into a
	// less optimal sequence of dag nodes.
	if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
	N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	SDValue BC0 = N0.getOperand(0);
	EVT SVT = BC0.getValueType();
	unsigned Opcode = BC0.getOpcode();
	unsigned NumElts = VT.getVectorNumElements();

	// The bitcast source must have exactly half as many elements as the shuffle
	// result.
	if (BC0.hasOneUse() && SVT.isVector() &&
	SVT.getVectorNumElements() * 2 == NumElts &&
	TLI.isOperationLegal(Opcode, VT)) {
	bool CanFold = false;
	switch (Opcode) {
	default : break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	// isOperationLegal lies for integer ops on floating point types.
	CanFold = VT.isInteger();
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	// isOperationLegal lies for floating point ops on integer types.
	CanFold = VT.isFloatingPoint();
	break;
	}

	// The mask must select the even elements <0, 2, 4, ...> into the low half
	// of the result and leave every element of the upper half undefined.
	unsigned SVTNumElts = SVT.getVectorNumElements();
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
	for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) < 0;

	if (CanFold) {
	// Rebuild the binop in the shuffle's type and shuffle that instead.
	SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
	SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
	SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
	return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
	}
	}
	}

	// Attempt to combine into a vector load/broadcast.
	if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
	return LD;

	// For AVX2, we sometimes want to combine
	// (vector_shuffle <mask> (concat_vectors t1, undef)
	// (concat_vectors t2, undef))
	// Into:
	// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	// Since the latter can be efficiently lowered with VPERMD/VPERMQ
	if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	return ShufConcat;

	if (isTargetShuffle(N->getOpcode())) {
	SDValue Op(N, 0);
	if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	return Shuffle;

	// Try recursively combining arbitrary sequences of x86 shuffle
	// instructions into higher-order shuffles. We do this after combining
	// specific PSHUF instruction sequences into their minimal form so that we
	// can evaluate how many specialized shuffle instructions are involved in
	// a particular chain.
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;

	// Simplify source operands based on shuffle mask.
	// TODO - merge this into combineX86ShufflesRecursively.
	APInt KnownUndef, KnownZero;
	APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	// Something was simplified: hand back N itself rather than a new node.
	if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
	return SDValue(N, 0);
	}

	// Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
	// in the upper 64 bits.
	// TODO: Can we generalize this using computeKnownBits.
	if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
	(VT == MVT::v2f64 \|\| VT == MVT::v2i64) &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	(N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 \|\|
	N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
	SDValue In = N->getOperand(0).getOperand(0);
	switch (In.getOpcode()) {
	default:
	break;
	case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
	case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
	case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
	case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
	case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
	case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
	case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
	// Only conversions from a two-element 64-bit source vector qualify.
	if (In.getOperand(0).getValueType() == MVT::v2f64 \|\|
	In.getOperand(0).getValueType() == MVT::v2i64)
	return N->getOperand(0); // return the bitcast
	break;
	}
	}

	// Pull subvector inserts into undef through VZEXT_MOVL by making it an
	// insert into a zero vector. This helps get VZEXT_MOVL closer to
	// scalar_to_vectors where 256/512 are canonicalized to an insert and a
	// 128-bit scalar_to_vector. This reduces the number of isel patterns.
	if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
	N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
	N->getOperand(0).hasOneUse() &&
	N->getOperand(0).getOperand(0).isUndef() &&
	isNullConstant(N->getOperand(0).getOperand(2))) {
	SDValue In = N->getOperand(0).getOperand(1);
	SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
	getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
	Movl, N->getOperand(0).getOperand(2));
	}

	// If this a vzmovl of a full vector load, replace it with a vzload, unless
	// the load is volatile.
	if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
	ISD::isNormalLoad(N->getOperand(0).getNode())) {
	LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
	if (!LN->isVolatile()) {
	SDVTList Tys = DAG.getVTList(VT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	// The new node's memory type is a single vector element.
	SDValue VZLoad =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
	VT.getVectorElementType(),
	LN->getPointerInfo(),
	LN->getAlignment(),
	MachineMemOperand::MOLoad);
	// Redirect users of the old load's chain result to the new node's chain.
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	return VZLoad;
	}
	}


	// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
	// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
	// FIXME: This can probably go away once we default to widening legalization.
	if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
	N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
	SDValue BC = N->getOperand(0);
	SDValue MULUDQ = BC.getOperand(0);
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	ArrayRef<int> Mask = SVOp->getMask();
	// The outer mask must be exactly <0, 2, undef, undef>.
	if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
	Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
	SDValue Op0 = MULUDQ.getOperand(0);
	SDValue Op1 = MULUDQ.getOperand(1);
	// Case 1: the first multiplicand is a bitcast of a v4i32 shuffle with mask
	// <0, undef, 1, undef>. Use that shuffle's input directly and apply the
	// outer mask to the other multiplicand.
	if (Op0.getOpcode() == ISD::BITCAST &&
	Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
	Op0.getOperand(0).getValueType() == MVT::v4i32) {
	ShuffleVectorSDNode *SVOp0 =
	cast<ShuffleVectorSDNode>(Op0.getOperand(0));
	ArrayRef<int> Mask2 = SVOp0->getMask();
	if (Mask2[0] == 0 && Mask2[1] == -1 &&
	Mask2[2] == 1 && Mask2[3] == -1) {
	Op0 = SVOp0->getOperand(0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);
	Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
	return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
	}
	}
	// Case 2: the same pattern with the roles of the two multiplicands swapped.
	if (Op1.getOpcode() == ISD::BITCAST &&
	Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
	Op1.getOperand(0).getValueType() == MVT::v4i32) {
	ShuffleVectorSDNode *SVOp1 =
	cast<ShuffleVectorSDNode>(Op1.getOperand(0));
	ArrayRef<int> Mask2 = SVOp1->getMask();
	if (Mask2[0] == 0 && Mask2[1] == -1 &&
	Mask2[2] == 1 && Mask2[3] == -1) {
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
	Op1 = SVOp1->getOperand(0);
	return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
	}
	}
	}
	}

	return SDValue();
	}

	/// Simplifies the X86 target node \p Op given that only the vector elements
	/// set in \p DemandedElts are used.
	///
	/// The function works in three stages: per-opcode handling that recurses into
	/// the operands with the elements they need to supply, narrowing of 256/512-bit
	/// ops whose upper elements are not demanded, and generic simplification of
	/// target shuffles through their decoded mask.
	///
	/// \param Op            the node to simplify.
	/// \param DemandedElts  one bit per result element; a set bit marks it as used.
	/// \param KnownUndef    output: result elements found to be undef.
	/// \param KnownZero     output: result elements found to be zero.
	/// \param TLO           receives any replacement through TLO.CombineTo.
	/// \param Depth         recursion depth, passed on as Depth + 1.
	/// \returns true if a replacement was recorded (here or in a recursive call),
	/// false otherwise.
	bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
	SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
	TargetLoweringOpt &TLO, unsigned Depth) const {
	int NumElts = DemandedElts.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();

	// Handle special case opcodes.
	switch (Opc) {
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: {
	APInt LHSUndef, LHSZero;
	APInt RHSUndef, RHSZero;
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
	Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
	Depth + 1))
	return true;
	// Multiply by zero.
	KnownZero = LHSZero \| RHSZero;
	break;
	}
	case X86ISD::VSHL:
	case X86ISD::VSRL:
	case X86ISD::VSRA: {
	// We only need the bottom 64-bits of the (128-bit) shift amount.
	SDValue Amt = Op.getOperand(1);
	MVT AmtVT = Amt.getSimpleValueType();
	assert(AmtVT.is128BitVector() && "Unexpected value type");

	// If we reuse the shift amount just for sse shift amounts then we know that
	// only the bottom 64-bits are only ever used.
	bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
	unsigned UseOpc = Use->getOpcode();
	return (UseOpc == X86ISD::VSHL \|\| UseOpc == X86ISD::VSRL \|\|
	UseOpc == X86ISD::VSRA) &&
	Use->getOperand(0) != Amt;
	});

	// Demand only the lower half of the shift amount's elements.
	APInt AmtUndef, AmtZero;
	unsigned NumAmtElts = AmtVT.getVectorNumElements();
	APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
	if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
	Depth + 1, AssumeSingleUse))
	return true;
	LLVM_FALLTHROUGH;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI:
	case X86ISD::VSRAI: {
	// The shifted source needs the same elements as the result.
	SDValue Src = Op.getOperand(0);
	APInt SrcUndef;
	if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
	Depth + 1))
	return true;
	// TODO convert SrcUndef to KnownUndef.
	break;
	}
	case X86ISD::CVTSI2P:
	case X86ISD::CVTUI2P: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	APInt SrcUndef, SrcZero;
	// Resize the demanded mask to the source's element count.
	APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
	if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: {
	// Split the demanded elements between the two pack operands.
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
	SrcZero, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
	SrcZero, TLO, Depth + 1))
	return true;
	break;
	}
	case X86ISD::HADD:
	case X86ISD::HSUB:
	case X86ISD::FHADD:
	case X86ISD::FHSUB: {
	// Split the demanded elements between the two horizontal-op operands.
	APInt DemandedLHS, DemandedRHS;
	getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);

	APInt LHSUndef, LHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
	LHSZero, TLO, Depth + 1))
	return true;
	APInt RHSUndef, RHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
	RHSZero, TLO, Depth + 1))
	return true;
	break;
	}
	case X86ISD::VTRUNC:
	case X86ISD::VTRUNCS:
	case X86ISD::VTRUNCUS: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	// Carry the source's known zero/undef elements over, resized to the result.
	KnownZero = SrcZero.zextOrTrunc(NumElts);
	KnownUndef = SrcUndef.zextOrTrunc(NumElts);
	break;
	}
	case X86ISD::BLENDV: {
	// Operand 0 is simplified as the selector, operands 1 and 2 as the two
	// blended values.
	APInt SelUndef, SelZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
	SelZero, TLO, Depth + 1))
	return true;

	// TODO: Use SelZero to adjust LHS/RHS DemandedElts.
	APInt LHSUndef, LHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
	LHSZero, TLO, Depth + 1))
	return true;

	APInt RHSUndef, RHSZero;
	if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
	RHSZero, TLO, Depth + 1))
	return true;

	// An element is known zero/undef only if it is so in both blended values.
	KnownZero = LHSZero & RHSZero;
	KnownUndef = LHSUndef & RHSUndef;
	break;
	}
	case X86ISD::VBROADCAST: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	// Only broadcasts from a vector source are simplified here.
	if (!SrcVT.isVector())
	return false;
	// Don't bother broadcasting if we just need the 0'th element.
	if (DemandedElts == 1) {
	if (Src.getValueType() != VT)
	Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
	SDLoc(Op));
	return TLO.CombineTo(Op, Src);
	}
	// Only element 0 of the source is demanded.
	APInt SrcUndef, SrcZero;
	APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
	if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	- case X86ISD::SUBV_BROADCAST: {
	- // Reduce size of broadcast if we don't need the upper half.
	- unsigned HalfElts = NumElts / 2;
	- if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
	- SDValue Src = Op.getOperand(0);
	- MVT SrcVT = Src.getSimpleValueType();
	-
	- SDValue Half = Src;
	- if (SrcVT.getVectorNumElements() != HalfElts) {
	- MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
	- Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
	- }
	-
	- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
	- TLO.DAG, SDLoc(Op),
	- Half.getValueSizeInBits()));
	- }
	- break;
	- }
	case X86ISD::VPERMV: {
	// The shuffle mask is operand 0 for this opcode.
	SDValue Mask = Op.getOperand(0);
	APInt MaskUndef, MaskZero;
	if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	case X86ISD::PSHUFB:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMILPV: {
	// The shuffle mask is operand 1 for these opcodes.
	SDValue Mask = Op.getOperand(1);
	APInt MaskUndef, MaskZero;
	if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	case X86ISD::VPPERM:
	case X86ISD::VPERMIL2: {
	// The shuffle mask is operand 2 for these opcodes.
	SDValue Mask = Op.getOperand(2);
	APInt MaskUndef, MaskZero;
	if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
	Depth + 1))
	return true;
	break;
	}
	}

	// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
	// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
	// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
	if ((VT.is256BitVector() \|\| VT.is512BitVector()) &&
	DemandedElts.lshr(NumElts / 2) == 0) {
	unsigned SizeInBits = VT.getSizeInBits();
	unsigned ExtSizeInBits = SizeInBits / 2;

	// See if 512-bit ops only use the bottom 128-bits.
	if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
	ExtSizeInBits = SizeInBits / 4;

	switch (Opc) {
	// Zero upper elements.
	case X86ISD::VZEXT_MOVL: {
	SDLoc DL(Op);
	SDValue Ext0 =
	extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
	SDValue ExtOp =
	TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert =
	insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
	return TLO.CombineTo(Op, Insert);
	}
	+ // Subvector broadcast.
	+ case X86ISD::SUBV_BROADCAST: {
	+ SDLoc DL(Op);
	+ SDValue Src = Op.getOperand(0);
	+ if (Src.getValueSizeInBits() > ExtSizeInBits)
	+ Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
	+ else if (Src.getValueSizeInBits() < ExtSizeInBits) {
	+ MVT SrcSVT = Src.getSimpleValueType().getScalarType();
	+ MVT SrcVT =
	+ MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
	+ Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
	+ }
	+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
	+ TLO.DAG, DL, ExtSizeInBits));
	+ }
	// Byte shifts by immediate.
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	// Shift by uniform.
	case X86ISD::VSHL:
	case X86ISD::VSRL:
	case X86ISD::VSRA:
	// Shift by immediate.
	case X86ISD::VSHLI:
	case X86ISD::VSRLI:
	case X86ISD::VSRAI: {
	// Narrow operand 0 only; operand 1 is reused unchanged.
	SDLoc DL(Op);
	SDValue Ext0 =
	extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
	SDValue ExtOp =
	TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert =
	insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
	return TLO.CombineTo(Op, Insert);
	}
	case X86ISD::VPERMI: {
	// Simplify PERMPD/PERMQ to extract_subvector.
	// TODO: This should be done in shuffle combining.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64) {
	SmallVector<int, 4> Mask;
	DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
	// The two demanded (low) elements read source elements 2 and 3.
	if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
	SDLoc DL(Op);
	SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
	return TLO.CombineTo(Op, Insert);
	}
	}
	break;
	}
	// Target Shuffles.
	case X86ISD::PSHUFB:
	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// Saturated Packs.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS:
	// Horizontal Ops.
	case X86ISD::HADD:
	case X86ISD::HSUB:
	case X86ISD::FHADD:
	case X86ISD::FHSUB: {
	// Two-operand ops: narrow both operands and rebuild the op at the
	// narrower type with the same scalar type.
	SDLoc DL(Op);
	MVT ExtVT = VT.getSimpleVT();
	ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
	ExtSizeInBits / ExtVT.getScalarSizeInBits());
	SDValue Ext0 =
	extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
	SDValue Ext1 =
	extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
	SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
	SDValue UndefVec = TLO.DAG.getUNDEF(VT);
	SDValue Insert =
	insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
	return TLO.CombineTo(Op, Insert);
	}
	}
	}

	// Simplify target shuffles.
	if (!isTargetShuffle(Opc) \|\| !VT.isSimple())
	return false;

	// Get target shuffle mask.
	bool IsUnary;
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
	OpMask, IsUnary))
	return false;

	// Shuffle inputs must be the same type as the result.
	if (llvm::any_of(OpInputs,
	[VT](SDValue V) { return VT != V.getValueType(); }))
	return false;

	// Clear known elts that might have been set above.
	KnownZero.clearAllBits();
	KnownUndef.clearAllBits();

	// Check if shuffle mask can be simplified to undef/zero/identity.
	int NumSrcs = OpInputs.size();
	// Mark mask elements undef when they are not demanded or read an undef input.
	for (int i = 0; i != NumElts; ++i) {
	int &M = OpMask[i];
	if (!DemandedElts[i])
	M = SM_SentinelUndef;
	else if (0 <= M && OpInputs[M / NumElts].isUndef())
	M = SM_SentinelUndef;
	}

	if (isUndefInRange(OpMask, 0, NumElts)) {
	KnownUndef.setAllBits();
	return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
	}
	if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
	KnownZero.setAllBits();
	return TLO.CombineTo(
	Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
	}
	// A mask that is an identity of one input lets that input replace the shuffle.
	for (int Src = 0; Src != NumSrcs; ++Src)
	if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
	return TLO.CombineTo(Op, OpInputs[Src]);

	// Attempt to simplify inputs.
	for (int Src = 0; Src != NumSrcs; ++Src) {
	// Collect the elements of this input that demanded result elements read.
	int Lo = Src * NumElts;
	APInt SrcElts = APInt::getNullValue(NumElts);
	for (int i = 0; i != NumElts; ++i)
	if (DemandedElts[i]) {
	int M = OpMask[i] - Lo;
	if (0 <= M && M < NumElts)
	SrcElts.setBit(M);
	}

	APInt SrcUndef, SrcZero;
	if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
	TLO, Depth + 1))
	return true;
	}

	// Extract known zero/undef elements.
	// TODO - Propagate input undef/zero elts.
	for (int i = 0; i != NumElts; ++i) {
	if (OpMask[i] == SM_SentinelUndef)
	KnownUndef.setBit(i);
	if (OpMask[i] == SM_SentinelZero)
	KnownZero.setBit(i);
	}

	return false;
	}

	/// X86 hook for simplifying a target-specific node given the bits and vector
	/// elements of its result that are actually used.
	///
	/// \param Op                    The X86ISD node being simplified.
	/// \param OriginalDemandedBits  Bits of each result element that are demanded.
	/// \param OriginalDemandedElts  Result vector elements that are demanded.
	/// \param Known                 Receives known-zero/one bits for the opcodes
	///                              that compute them below.
	/// \param TLO                   Used to record replacements (TLO.CombineTo) and
	///                              to build new nodes (TLO.DAG).
	/// \param Depth                 Recursion depth; incremented for every
	///                              recursive simplification of an operand.
	/// \returns the result of TLO.CombineTo when Op is rewritten, true when a
	/// recursive operand simplification reported a change, false when a case
	/// finished without rewriting, and otherwise whatever the generic
	/// TargetLowering implementation returns.
	bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
	SDValue Op, const APInt &OriginalDemandedBits,
	const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
	unsigned Depth) const {
	EVT VT = Op.getValueType();
	unsigned BitWidth = OriginalDemandedBits.getBitWidth();
	unsigned Opc = Op.getOpcode();
	switch(Opc) {
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: {
	// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
	KnownBits KnownOp;
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	// FIXME: Can we bound this better?
	APInt DemandedMask = APInt::getLowBitsSet(64, 32);
	if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
	TLO, Depth + 1))
	return true;
	if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
	TLO, Depth + 1))
	return true;
	// KnownOp is discarded; the generic handler after the switch runs next.
	break;
	}
	case X86ISD::VSHLI: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Only constant shift amounts are handled.
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
	// Shift amounts >= the element width are left to the generic handler.
	if (ShiftImm->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = ShiftImm->getZExtValue();
	// Input bits that end up in demanded output positions.
	APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);

	// If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
	// single shift. We can do this if the bottom bits (which are shifted
	// out) are never demanded.
	if (Op0.getOpcode() == X86ISD::VSRLI &&
	OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
	if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
	if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
	// Net shift: positive means left, negative means right, zero means
	// the two shifts cancel and the inner source can be used directly.
	int Diff = ShAmt - Shift2Imm->getZExtValue();
	if (Diff == 0)
	return TLO.CombineTo(Op, Op0.getOperand(0));

	unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
	SDValue NewShift = TLO.DAG.getNode(
	NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
	TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
	return TLO.CombineTo(Op, NewShift);
	}
	}
	}

	if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
	TLO, Depth + 1))
	return true;

	// Translate the input's known bits through the left shift.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;

	// Low bits known zero.
	Known.Zero.setLowBits(ShAmt);
	}
	break;
	}
	case X86ISD::VSRLI: {
	// Only constant shift amounts are handled.
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	// Shift amounts >= the element width are left to the generic handler.
	if (ShiftImm->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = ShiftImm->getZExtValue();
	// Input bits that end up in demanded output positions.
	APInt DemandedMask = OriginalDemandedBits << ShAmt;

	if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
	OriginalDemandedElts, Known, TLO, Depth + 1))
	return true;

	// Translate the input's known bits through the logical right shift.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	// High bits known zero.
	Known.Zero.setHighBits(ShAmt);
	}
	break;
	}
	case X86ISD::VSRAI: {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Only constant shift amounts are handled.
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
	// Shift amounts >= the element width are left to the generic handler.
	if (ShiftImm->getAPIntValue().uge(BitWidth))
	break;

	unsigned ShAmt = ShiftImm->getZExtValue();
	APInt DemandedMask = OriginalDemandedBits << ShAmt;

	// If we just want the sign bit then we don't need to shift it.
	if (OriginalDemandedBits.isSignMask())
	return TLO.CombineTo(Op, Op0);

	// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
	if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
	SDValue Op00 = Op0.getOperand(0);
	unsigned NumSignBits =
	TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
	if (ShAmt < NumSignBits)
	return TLO.CombineTo(Op, Op00);
	}

	// If any of the demanded bits are produced by the sign extension, we also
	// demand the input sign bit.
	if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
	DemandedMask.setSignBit();

	if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
	TLO, Depth + 1))
	return true;

	// Shift the known bits logically first; the sign-fill bits are handled
	// separately below.
	assert(!Known.hasConflict() && "Bits known to be one AND zero?");
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);

	// If the input sign bit is known to be zero, or if none of the top bits
	// are demanded, turn this into an unsigned shift right.
	// (After the shift above, bit BitWidth - ShAmt - 1 holds the input sign bit.)
	if (Known.Zero[BitWidth - ShAmt - 1] \|\|
	OriginalDemandedBits.countLeadingZeros() >= ShAmt)
	return TLO.CombineTo(
	Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));

	// High bits are known one.
	if (Known.One[BitWidth - ShAmt - 1])
	Known.One.setHighBits(ShAmt);
	}
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Vec = Op.getOperand(0);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	MVT VecVT = Vec.getSimpleValueType();
	unsigned NumVecElts = VecVT.getVectorNumElements();

	// Only constant, in-range extraction indices are handled.
	if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
	unsigned Idx = CIdx->getZExtValue();
	unsigned VecBitWidth = VecVT.getScalarSizeInBits();

	// If we demand no bits from the vector then we must have demanded
	// bits from the implicit zext - simplify to zero.
	APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
	if (DemandedVecBits == 0)
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

	// Only the extracted element of the source vector is demanded.
	APInt KnownUndef, KnownZero;
	APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
	if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
	KnownZero, TLO, Depth + 1))
	return true;

	KnownBits KnownVec;
	if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
	KnownVec, TLO, Depth + 1))
	return true;

	// Widen the element's known bits to the result width.
	Known = KnownVec.zext(BitWidth, true);
	return false;
	}
	break;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: {
	SDValue Vec = Op.getOperand(0);
	SDValue Scl = Op.getOperand(1);
	auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	MVT VecVT = Vec.getSimpleValueType();

	// Only constant, in-range insertion indices are handled.
	if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
	unsigned Idx = CIdx->getZExtValue();
	// If the inserted element is never read, the insert is a no-op.
	if (!OriginalDemandedElts[Idx])
	return TLO.CombineTo(Op, Vec);

	// The base vector is demanded everywhere except the overwritten lane.
	KnownBits KnownVec;
	APInt DemandedVecElts(OriginalDemandedElts);
	DemandedVecElts.clearBit(Idx);
	if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
	KnownVec, TLO, Depth + 1))
	return true;

	// The scalar operand may be wider than a vector element; widen the
	// demanded mask to its width.
	KnownBits KnownScl;
	unsigned NumSclBits = Scl.getScalarValueSizeInBits();
	APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
	if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
	return true;

	// A bit is known in the result only if it is known, with the same value,
	// in both the base vector lanes and the inserted scalar.
	KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
	Known.One = KnownVec.One & KnownScl.One;
	Known.Zero = KnownVec.Zero & KnownScl.Zero;
	return false;
	}
	break;
	}
	case X86ISD::PACKSS:
	// PACKSS saturates to MIN/MAX integer values. So if we just want the
	// sign bit then we can just ask for the source operands sign bit.
	// TODO - add known bits handling.
	if (OriginalDemandedBits.isSignMask()) {
	// Split the demanded result elements between the two source operands.
	APInt DemandedLHS, DemandedRHS;
	getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);

	// The sign mask is built at twice the result element width for the
	// source operands.
	KnownBits KnownLHS, KnownRHS;
	APInt SignMask = APInt::getSignMask(BitWidth * 2);
	if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
	KnownLHS, TLO, Depth + 1))
	return true;
	if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
	KnownRHS, TLO, Depth + 1))
	return true;
	}
	// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
	break;
	case X86ISD::PCMPGT:
	// icmp sgt(0, R) == ashr(R, BitWidth-1).
	// iff we only need the sign bit then we can use R directly.
	if (OriginalDemandedBits.isSignMask() &&
	ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
	return TLO.CombineTo(Op, Op.getOperand(1));
	break;
	case X86ISD::MOVMSK: {
	SDValue Src = Op.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	unsigned SrcBits = SrcVT.getScalarSizeInBits();
	unsigned NumElts = SrcVT.getVectorNumElements();

	// If we don't need the sign bits at all just return zero.
	if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
	return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));

	// Only demand the vector elements of the sign bits we need.
	APInt KnownUndef, KnownZero;
	APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
	if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
	TLO, Depth + 1))
	return true;

	// Known-zero source elements give known-zero result bits, and every bit
	// above the low NumElts bits is marked known zero.
	Known.Zero = KnownZero.zextOrSelf(BitWidth);
	Known.Zero.setHighBits(BitWidth - NumElts);

	// MOVMSK only uses the MSB from each vector element.
	KnownBits KnownSrc;
	if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
	KnownSrc, TLO, Depth + 1))
	return true;

	// A known source sign bit fixes all NumElts low result bits at once.
	if (KnownSrc.One[SrcBits - 1])
	Known.One.setLowBits(NumElts);
	else if (KnownSrc.Zero[SrcBits - 1])
	Known.Zero.setLowBits(NumElts);
	return false;
	}
	}

	// Anything not fully handled above goes to the generic implementation.
	return TargetLowering::SimplifyDemandedBitsForTargetNode(
	Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
	}

	/// Check if a vector extract from a target-specific shuffle of a load can be
	/// folded into a single element load.
	/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
	/// shuffles have been custom lowered so we need to handle those here.
	///
	/// \param N   The extract node; operand 0 is the vector and operand 1 the
	///            element index.
	/// \param DAG The DAG in which replacement nodes are created.
	/// \param DCI Combiner state; nothing is done before op legalization.
	/// \returns a replacement value (a zero constant, undef, or an
	/// EXTRACT_VECTOR_ELT of a rebuilt generic vector shuffle), or an empty
	/// SDValue when the pattern does not match.
	static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
	                                                TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue InVec = N->getOperand(0);
	  SDValue EltNo = N->getOperand(1);
	  EVT EltVT = N->getValueType(0);

	  // Only constant extraction indices can be mapped through the shuffle mask.
	  if (!isa<ConstantSDNode>(EltNo))
	    return SDValue();

	  EVT OriginalVT = InVec.getValueType();

	  // Peek through bitcasts, don't duplicate a load with other uses.
	  InVec = peekThroughOneUseBitcasts(InVec);

	  // The bitcast must not have changed the element count, otherwise the
	  // extraction index no longer lines up with the shuffle mask.
	  EVT CurrentVT = InVec.getValueType();
	  if (!CurrentVT.isVector())
	    return SDValue();
	  if (CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
	    return SDValue();

	  if (!isTargetShuffle(InVec.getOpcode()))
	    return SDValue();

	  // Don't duplicate a load with other uses.
	  if (!InVec.hasOneUse())
	    return SDValue();

	  SmallVector<int, 16> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleOps;
	  bool UnaryShuffle;
	  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
	                            ShuffleOps, ShuffleMask, UnaryShuffle))
	    return SDValue();

	  // Select the input vector, guarding against out of range extract vector.
	  // The comparison is done on the unsigned APInt so that an index equal to
	  // NumElems, or one too large to fit in an int, is treated as undef instead
	  // of indexing outside ShuffleMask.
	  unsigned NumElems = CurrentVT.getVectorNumElements();
	  const APInt &EltIdx = cast<ConstantSDNode>(EltNo)->getAPIntValue();
	  int Idx = EltIdx.uge(NumElems) ? SM_SentinelUndef
	                                 : ShuffleMask[EltIdx.getZExtValue()];

	  if (Idx == SM_SentinelZero)
	    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
	                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
	  if (Idx == SM_SentinelUndef)
	    return DAG.getUNDEF(EltVT);

	  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
	  // won't handle it.
	  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
	    return SDValue();

	  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
	  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];

	  // If inputs to shuffle are the same for both ops, then allow 2 uses
	  unsigned AllowedUses =
	      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

	  if (LdNode.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
	      return SDValue();

	    AllowedUses = 1; // only allow 1 load use if we have a bitcast
	    LdNode = LdNode.getOperand(0);
	  }

	  if (!ISD::isNormalLoad(LdNode.getNode()))
	    return SDValue();

	  // isNormalLoad succeeded above, so the cast needs no separate null test.
	  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

	  if (!LN0->hasNUsesOfValue(AllowedUses, 0))
	    return SDValue();
	  if (LN0->isVolatile())
	    return SDValue();

	  // If there's a bitcast before the shuffle, check if the load type and
	  // alignment is valid.
	  unsigned Align = LN0->getAlignment();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      EltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align)
	    return SDValue();
	  if (!TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
	    return SDValue();

	  // All checks match so transform back to vector_shuffle so that DAG combiner
	  // can finish the job
	  SDLoc dl(N);

	  // Create shuffle node taking into account the case that its a unary shuffle
	  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
	  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
	                                 ShuffleMask);
	  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
	                     EltNo);
	}

	// Walk through AND/OR/XOR trees down to their SETCC leaves and report whether
	// every compare found operates on vectors that are exactly Size bits wide.
	// combineBitcastvxi1 uses this to learn what size vector produced a <X x i1>.
	// Any other opcode on the way down makes the answer false.
	static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
	  switch (Src.getOpcode()) {
	  default:
	    break;
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::AND:
	    // Both sides of the logic op have to agree on the source width.
	    if (!checkBitcastSrcVectorSize(Src.getOperand(0), Size))
	      return false;
	    return checkBitcastSrcVectorSize(Src.getOperand(1), Size);
	  case ISD::SETCC: {
	    // The width of a compare is the width of the values being compared.
	    const bool WidthMatches = Src.getOperand(0).getValueSizeInBits() == Size;
	    return WidthMatches;
	  }
	  }
	  return false;
	}

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// VT is the type the final value is bitcast to, Src is the vXi1 source and DL
	// is the debug location used for every node built here. Returns an empty
	// SDValue when Src is not a simple vXi1 type or the subtarget/type
	// combination is not handled.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
	const SDLoc &DL,
	const X86Subtarget &Subtarget) {
	EVT SrcVT = Src.getValueType();
	if (!SrcVT.isSimple() \|\| SrcVT.getScalarType() != MVT::i1)
	return SDValue();

	// If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
	// movmskb even with avx512. This will be better than truncating to vXi1 and
	// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
	// vpcmpeqb/vpcmpgtb.
	bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
	(Src.getOperand(0).getValueType() == MVT::v16i8 \|\|
	Src.getOperand(0).getValueType() == MVT::v32i8 \|\|
	Src.getOperand(0).getValueType() == MVT::v64i8);

	// With AVX512 vxi1 types are legal and we prefer using k-regs.
	// MOVMSK is supported in SSE2 or later.
	if (!Subtarget.hasSSE2() \|\| (Subtarget.hasAVX512() && !IsTruncated))
	return SDValue();

	// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
	// v8f64. So all legal 128-bit and 256-bit vectors are covered except for
	// v8i16 and v16i16.
	// For these two cases, we can shuffle the upper element bytes to a
	// consecutive sequence at the start of the vector and treat the results as
	// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
	// for v16i16 this is not the case, because the shuffle is expensive, so we
	// avoid sign-extending to this type entirely.
	// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	//
	// Pick the vector type that Src is sign-extended to before the mask is read.
	MVT SExtVT;
	switch (SrcVT.getSimpleVT().SimpleTy) {
	default:
	return SDValue();
	case MVT::v2i1:
	SExtVT = MVT::v2i64;
	break;
	case MVT::v4i1:
	SExtVT = MVT::v4i32;
	// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	// sign-extend to a 256-bit operation to avoid truncation.
	if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
	SExtVT = MVT::v4i64;
	break;
	case MVT::v8i1:
	SExtVT = MVT::v8i16;
	// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	// sign-extend to a 256-bit operation to match the compare.
	// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	// 256-bit because the shuffle is cheaper than sign extending the result of
	// the compare.
	// TODO : use checkBitcastSrcVectorSize
	if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	(Src.getOperand(0).getValueType().is256BitVector() \|\|
	Src.getOperand(0).getValueType().is512BitVector())) {
	SExtVT = MVT::v8i32;
	}
	break;
	case MVT::v16i1:
	SExtVT = MVT::v16i8;
	// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	// it is not profitable to sign-extend to 256-bit because this will
	// require an extra cross-lane shuffle which is more expensive than
	// truncating the result of the compare to 128-bits.
	break;
	case MVT::v32i1:
	SExtVT = MVT::v32i8;
	break;
	case MVT::v64i1:
	// If we have AVX512F, but not AVX512BW and the input is truncated from
	// v64i8 checked earlier. Then split the input and make two pmovmskbs.
	if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
	SExtVT = MVT::v64i8;
	break;
	}
	return SDValue();
	};

	SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

	if (SExtVT == MVT::v64i8) {
	// Split into two halves, take a 32-bit mask of each, and glue them into
	// one i64: Lo is zero-extended, Hi is shifted into the upper 32 bits.
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
	Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
	Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
	Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
	// The any-extended upper bits of Hi are shifted out by the SHL below.
	Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
	Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
	DAG.getConstant(32, DL, MVT::i8));
	V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
	} else if (SExtVT == MVT::v16i8 \|\| SExtVT == MVT::v32i8) {
	V = getPMOVMSKB(DL, V, DAG, Subtarget);
	} else {
	// v8i16 has no MOVMSK flavor of its own: pack it down to v16i8 first
	// (the upper half of the pack is undef).
	if (SExtVT == MVT::v8i16)
	V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
	DAG.getUNDEF(MVT::v8i16));
	V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	}

	// Resize the mask to one bit per source element, then bitcast to the
	// requested type.
	EVT IntVT =
	EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
	V = DAG.getZExtOrTrunc(V, DL, IntVT);
	return DAG.getBitcast(VT, V);
	}

	// Fold a constant vXi1 build vector into a scalar integer constant with one
	// bit per vector element. Bit I of the result is set when operand I is a
	// constant whose low bit is 1; undef operands contribute a zero bit.
	static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
	  const EVT MaskVT = Op.getValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 &&
	         "Expected a vXi1 vector");
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         "Expected a constant build vector");

	  const unsigned NumBits = MaskVT.getVectorNumElements();
	  APInt Bits(NumBits, 0);

	  const unsigned NumOps = Op.getNumOperands();
	  for (unsigned I = 0; I < NumOps; ++I) {
	    SDValue Elt = Op.getOperand(I);
	    if (Elt.isUndef())
	      continue;
	    if (cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1)
	      Bits.setBit(I);
	  }

	  // The integer type is exactly as wide as the collected bit pattern.
	  EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), Bits.getBitWidth());
	  return DAG.getConstant(Bits, SDLoc(Op), ScalarVT);
	}

	// For a bitcast N between a vXi1 mask type and a scalar integer whose operand
	// is a single-use AND/OR/XOR, try to perform the logic op directly in the
	// destination type. This is done when one operand is itself a single-use
	// bitcast from the destination type, or when the RHS is a constant build
	// vector. Returns an empty SDValue when nothing applies.
	static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");

	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Only do this if we have k-registers.
	  if (!Subtarget.hasAVX512())
	    return SDValue();

	  SDValue Logic = N->getOperand(0);
	  const EVT DstVT = N->getValueType(0);
	  const EVT SrcVT = Logic.getValueType();

	  if (!Logic.hasOneUse())
	    return SDValue();

	  // Look for logic ops.
	  const unsigned LogicOpc = Logic.getOpcode();
	  switch (LogicOpc) {
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    break;
	  default:
	    return SDValue();
	  }

	  // Make sure we have a bitcast between mask registers and a scalar type.
	  auto IsMaskVT = [](EVT Ty) {
	    return Ty.isVector() && Ty.getVectorElementType() == MVT::i1;
	  };
	  const bool MaskToScalar = IsMaskVT(SrcVT) && DstVT.isScalarInteger();
	  const bool ScalarToMask = IsMaskVT(DstVT) && SrcVT.isScalarInteger();
	  if (!MaskToScalar && !ScalarToMask)
	    return SDValue();

	  SDValue LHS = Logic.getOperand(0);
	  SDValue RHS = Logic.getOperand(1);

	  // LHS already comes from the destination type: cast only the RHS.
	  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
	      LHS.getOperand(0).getValueType() == DstVT) {
	    SDValue CastRHS = DAG.getBitcast(DstVT, RHS);
	    return DAG.getNode(LogicOpc, SDLoc(N), DstVT, LHS.getOperand(0), CastRHS);
	  }

	  // RHS already comes from the destination type: cast only the LHS.
	  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
	      RHS.getOperand(0).getValueType() == DstVT) {
	    SDValue CastLHS = DAG.getBitcast(DstVT, LHS);
	    return DAG.getNode(LogicOpc, SDLoc(N), DstVT, CastLHS, RHS.getOperand(0));
	  }

	  // If the RHS is a vXi1 build vector, this is a good reason to flip too.
	  // Most of these have to move a constant from the scalar domain anyway.
	  if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
	    SDValue ScalarRHS = combinevXi1ConstantToInteger(RHS, DAG);
	    SDValue CastLHS = DAG.getBitcast(DstVT, LHS);
	    return DAG.getNode(LogicOpc, SDLoc(N), DstVT, CastLHS, ScalarRHS);
	  }

	  return SDValue();
	}

	// Build an x86mmx value from the operands of build vector BV. Each operand is
	// first moved into the low bits of its own MMX value; a splat is then
	// broadcast with PSHUFW when SSE1 is available, otherwise the elements are
	// merged pairwise with a tree of PUNPCKL intrinsics.
	static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc DL(BV);
	  const unsigned NumElts = BV->getNumOperands();
	  SDValue Splat = BV->getSplatValue();

	  // Build MMX element from integer GPR or SSE float values.
	  auto ToMMXElement = [&](SDValue Elt) -> SDValue {
	    if (Elt.isUndef())
	      return DAG.getUNDEF(MVT::x86mmx);

	    if (!Elt.getValueType().isFloatingPoint()) {
	      // Integer element: any-extend or truncate to i32 and move it across.
	      Elt = DAG.getAnyExtOrTrunc(Elt, DL, MVT::i32);
	      return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, Elt);
	    }

	    // Non-constant float with SSE1: go through a v4f32 / v2i64 vector.
	    if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(Elt)) {
	      Elt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Elt);
	      Elt = DAG.getBitcast(MVT::v2i64, Elt);
	      return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, Elt);
	    }

	    // Remaining float cases: reinterpret the bits as an i32.
	    Elt = DAG.getBitcast(MVT::i32, Elt);
	    return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, Elt);
	  };

	  // Convert build vector ops to MMX data in the bottom elements.
	  SmallVector<SDValue, 8> Parts;

	  if (!Splat) {
	    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
	      Parts.push_back(ToMMXElement(BV->getOperand(Idx)));
	  } else {
	    // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
	    if (Splat.isUndef())
	      return DAG.getUNDEF(MVT::x86mmx);

	    Splat = ToMMXElement(Splat);

	    if (Subtarget.hasSSE1()) {
	      // Unpack v8i8 to splat i8 elements to lowest 16-bits.
	      if (NumElts == 8) {
	        SDValue UnpackId =
	            DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32);
	        Splat = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, UnpackId,
	                            Splat, Splat);
	      }

	      // Use PSHUFW to repeat 16-bit elements.
	      unsigned ShufMask = 0x44;
	      if (NumElts > 2)
	        ShufMask = 0;
	      SDValue ShufId = DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32);
	      SDValue ShufImm = DAG.getConstant(ShufMask, DL, MVT::i8);
	      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, ShufId,
	                         Splat, ShufImm);
	    }
	    Parts.append(NumElts, Splat);
	  }

	  // Use tree of PUNPCKLs to build up general MMX vector.
	  while (Parts.size() > 1) {
	    const unsigned Count = Parts.size();

	    // The unpack width follows the number of values still to be merged.
	    unsigned UnpackIntrin;
	    switch (Count) {
	    case 2:
	      UnpackIntrin = Intrinsic::x86_mmx_punpckldq;
	      break;
	    case 4:
	      UnpackIntrin = Intrinsic::x86_mmx_punpcklwd;
	      break;
	    default:
	      UnpackIntrin = Intrinsic::x86_mmx_punpcklbw;
	      break;
	    }
	    SDValue UnpackId = DAG.getConstant(UnpackIntrin, DL, MVT::i32);

	    // Merge neighbouring pairs in place, halving the list each round.
	    for (unsigned Lo = 0; Lo != Count; Lo += 2)
	      Parts[Lo / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
	                                  UnpackId, Parts[Lo], Parts[Lo + 1]);
	    Parts.resize(Count / 2);
	  }

	  return Parts[0];
	}

	// Combine for a bitcast node N (combineCastedMaskArithmetic, called below,
	// asserts that N is an ISD::BITCAST). Tries, in order: vXi1 <-> scalar
	// conversions before legalization, bitcasts to x86mmx, constant vXi1/scalar
	// folds, mask arithmetic across casts, and finally turning a bitcasted
	// integer logic op into an X86 floating-point logic op. Returns an empty
	// SDValue when no pattern matches.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SrcVT = N0.getValueType();

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (16i8 sext (v16i1 x)))
	// before the setcc result is scalarized on subtargets that don't have legal
	// vxi1 types.
	if (DCI.isBeforeLegalize()) {
	SDLoc dl(N);
	if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
	return V;

	// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
	// type, widen both sides to avoid a trip through memory.
	// (Scalar -> mask direction: extend to i8, cast to v8i1, take the low
	// subvector.)
	if ((VT == MVT::v4i1 \|\| VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
	Subtarget.hasAVX512()) {
	N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
	N0 = DAG.getBitcast(MVT::v8i1, N0);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
	DAG.getIntPtrConstant(0, dl));
	}

	// Same widening for the opposite direction (mask -> scalar): pad the
	// v4i1/v2i1 source with undef up to v8i1, cast to i8 and truncate.
	if ((SrcVT == MVT::v4i1 \|\| SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
	Subtarget.hasAVX512()) {
	unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
	SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
	Ops[0] = N0;
	N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	N0 = DAG.getBitcast(MVT::i8, N0);
	return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
	}
	}

	// Since MMX types are special and don't usually play with other vector types,
	// it's better to handle them early to be sure we emit efficient code by
	// avoiding store-load conversions.
	if (VT == MVT::x86mmx) {
	// Detect MMX constant vectors.
	APInt UndefElts;
	SmallVector<APInt, 1> EltBits;
	if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
	SDLoc DL(N0);
	// Handle zero-extension of i32 with MOVD.
	if (EltBits[0].countLeadingZeros() >= 32)
	return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
	DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
	// Else, bitcast to a double.
	// TODO - investigate supporting sext 32-bit immediates on x86_64.
	APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
	return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
	}

	// Detect bitcasts to x86mmx low word.
	if (N0.getOpcode() == ISD::BUILD_VECTOR &&
	(SrcVT == MVT::v2i32 \|\| SrcVT == MVT::v4i16 \|\| SrcVT == MVT::v8i8) &&
	N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
	// Scan every operand except the first. LowUndef stays true only if all
	// operands in the lower half (indices below e/2) are undef;
	// AllUndefOrZero stays true only if every one is undef or zero.
	bool LowUndef = true, AllUndefOrZero = true;
	for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
	SDValue Op = N0.getOperand(i);
	LowUndef &= Op.isUndef() \|\| (i >= e/2);
	AllUndefOrZero &= (Op.isUndef() \|\| isNullConstant(Op));
	}
	if (AllUndefOrZero) {
	// Only element 0 carries data. Use an any-extend when the rest of the
	// low half is undef, otherwise a zero-extend.
	SDValue N00 = N0.getOperand(0);
	SDLoc dl(N00);
	N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
	: DAG.getZExtOrTrunc(N00, dl, MVT::i32);
	return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
	}
	}

	// Detect bitcasts of 64-bit build vectors and convert to a
	// MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
	// lowest element.
	if (N0.getOpcode() == ISD::BUILD_VECTOR &&
	(SrcVT == MVT::v2f32 \|\| SrcVT == MVT::v2i32 \|\| SrcVT == MVT::v4i16 \|\|
	SrcVT == MVT::v8i8))
	return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);

	// Detect bitcasts between element or subvector extraction to x86mmx.
	// Only extraction at index 0 from a 128-bit vector is handled.
	if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
	isNullConstant(N0.getOperand(1))) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getValueType().is128BitVector())
	return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
	DAG.getBitcast(MVT::v2i64, N00));
	}

	// Detect bitcasts from FP_TO_SINT to x86mmx.
	// The v2i32 result is widened to v4i32 with an undef upper half first.
	if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
	SDLoc DL(N0);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
	DAG.getUNDEF(MVT::v2i32));
	return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
	DAG.getBitcast(MVT::v2i64, Res));
	}
	}

	// Try to remove a bitcast of constant vXi1 vector. We have to legalize
	// most of these to scalar anyway.
	if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
	SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
	ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
	return combinevXi1ConstantToInteger(N0, DAG);
	}

	// Reverse direction: a scalar constant bitcast to vXi1. Only the all-ones
	// and all-zero constants are folded to a vector constant here.
	if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
	VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	isa<ConstantSDNode>(N0)) {
	auto *C = cast<ConstantSDNode>(N0);
	if (C->isAllOnesValue())
	return DAG.getConstant(1, SDLoc(N0), VT);
	if (C->isNullValue())
	return DAG.getConstant(0, SDLoc(N0), VT);
	}

	// Try to remove bitcasts from input and output of mask arithmetic to
	// remove GPR<->K-register crossings.
	if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
	return V;

	// Convert a bitcasted integer logic operation that has one bitcasted
	// floating-point operand into a floating-point logic operation. This may
	// create a load of a constant, but that is cheaper than materializing the
	// constant in an integer register and transferring it to an SSE register or
	// transferring the SSE operand to integer register and back.
	unsigned FPOpcode;
	switch (N0.getOpcode()) {
	case ISD::AND: FPOpcode = X86ISD::FAND; break;
	case ISD::OR: FPOpcode = X86ISD::FOR; break;
	case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
	default: return SDValue();
	}

	// Only f32 (with SSE1) and f64 (with SSE2) results are handled.
	if (!((Subtarget.hasSSE1() && VT == MVT::f32) \|\|
	(Subtarget.hasSSE2() && VT == MVT::f64)))
	return SDValue();

	SDValue LogicOp0 = N0.getOperand(0);
	SDValue LogicOp1 = N0.getOperand(1);
	SDLoc DL0(N0);

	// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
	LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
	SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
	}
	// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	// (the new node is built with Y as its first operand)
	if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
	LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
	SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
	}

	return SDValue();
	}

	// Given a ABS node, detect the following pattern:
	// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
	// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
	SDValue AbsOp1 = Abs->getOperand(0);
	if (AbsOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = AbsOp1.getOperand(0);
	Op1 = AbsOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
	// to these zexts. The i8 inputs are padded with zero vectors up to at least
	// 128 bits, and the PSADBW is emitted through SplitOpsAndApply.
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL,
	                            const X86Subtarget &Subtarget) {
	  // Find the appropriate width for the PSADBW.
	  EVT NarrowVT = Zext0.getOperand(0).getValueType();
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  const unsigned RegSize = std::max(128u, NarrowBits);
	  const unsigned NumParts = RegSize / NarrowBits;
	  MVT WideVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

	  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
	  // fill in the missing vector elements with 0.
	  SDValue ZeroPart = DAG.getConstant(0, DL, NarrowVT);
	  auto PadWithZeros = [&](SDValue Narrow) {
	    SmallVector<SDValue, 16> Parts(NumParts, ZeroPart);
	    Parts[0] = Narrow;
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Parts);
	  };
	  SDValue SadOp0 = PadWithZeros(Zext0.getOperand(0));
	  SDValue SadOp1 = PadWithZeros(Zext1.getOperand(0));

	  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
	  // Each piece yields one i64 per 64 bits of its input.
	  auto BuildSAD = [](SelectionDAG &G, const SDLoc &Loc,
	                     ArrayRef<SDValue> Pieces) {
	    const unsigned NumI64 = Pieces[0].getValueSizeInBits() / 64;
	    MVT PieceVT = MVT::getVectorVT(MVT::i64, NumI64);
	    return G.getNode(X86ISD::PSADBW, Loc, PieceVT, Pieces);
	  };

	  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
	                          BuildSAD);
	}

	// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
	// PHMINPOSUW.
	//
	// Extract is the candidate extraction node. Returns the replacement scalar
	// value, or an empty SDValue if the pattern does not match or the subtarget
	// lacks SSE4.1.
	static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Bail without SSE41.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// Only i16 and i8 results are handled.
	EVT ExtractVT = Extract->getValueType(0);
	if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
	return SDValue();

	// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
	ISD::NodeType BinOp;
	SDValue Src = DAG.matchBinOpReduction(
	Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
	if (!Src)
	return SDValue();

	// The reduced vector must have the extracted type as its element type and a
	// total size that is a multiple of 128 bits.
	EVT SrcVT = Src.getValueType();
	EVT SrcSVT = SrcVT.getScalarType();
	if (SrcSVT != ExtractVT \|\| (SrcVT.getSizeInBits() % 128) != 0)
	return SDValue();

	SDLoc DL(Extract);
	SDValue MinPos = Src;

	// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
	while (SrcVT.getSizeInBits() > 128) {
	unsigned NumElts = SrcVT.getVectorNumElements();
	unsigned NumSubElts = NumElts / 2;
	SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
	unsigned SubSizeInBits = SrcVT.getSizeInBits();
	SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
	SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
	MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
	}
	assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) \|\|
	(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
	"Unexpected value type");

	// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
	// to flip the value accordingly.
	// Mask stays empty for UMIN, in which case no XOR is emitted below.
	SDValue Mask;
	unsigned MaskEltsBits = ExtractVT.getSizeInBits();
	if (BinOp == ISD::SMAX)
	Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
	else if (BinOp == ISD::SMIN)
	Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
	else if (BinOp == ISD::UMAX)
	Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);

	if (Mask)
	MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

	// For v16i8 cases we need to perform UMIN on pairs of byte elements,
	// shuffling each upper element down and insert zeros. This means that the
	// v16i8 UMIN will leave the upper element as zero, performing zero-extension
	// ready for the PHMINPOS.
	if (ExtractVT == MVT::i8) {
	// Shuffle index 16 selects element 0 of the second (all-zero) operand.
	SDValue Upper = DAG.getVectorShuffle(
	SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
	{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
	MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
	}

	// Perform the PHMINPOS on a v8i16 vector, then cast back to SrcVT.
	MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
	MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
	MinPos = DAG.getBitcast(SrcVT, MinPos);

	// Undo the flip applied before the PHMINPOS (XOR with the same mask).
	if (Mask)
	MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

	// The reduced value is taken from element 0.
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
	DAG.getIntPtrConstant(0, DL));
	}

	// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
	//
	// Extract is the candidate extraction node. Returns the replacement scalar
	// value, or an empty SDValue if the pattern does not match or cannot be
	// handled on this subtarget. XOR (parity) reductions are only matched when
	// the extracted type is i1.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Bail without SSE2.
	if (!Subtarget.hasSSE2())
	return SDValue();

	// Only i64/i32/i16/i8/i1 results are handled.
	EVT ExtractVT = Extract->getValueType(0);
	unsigned BitWidth = ExtractVT.getSizeInBits();
	if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
	ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
	return SDValue();

	// Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
	ISD::NodeType BinOp;
	SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
	if (!Match && ExtractVT == MVT::i1)
	Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
	if (!Match)
	return SDValue();

	// EXTRACT_VECTOR_ELT can require implicit extension of the vector element
	// which we can't support here for now.
	if (Match.getScalarValueSizeInBits() != BitWidth)
	return SDValue();

	// Movmsk receives the per-element bitmask; NumElts is the number of mask
	// bits that are meaningful and is updated by both paths below.
	SDValue Movmsk;
	SDLoc DL(Extract);
	EVT MatchVT = Match.getValueType();
	unsigned NumElts = MatchVT.getVectorNumElements();

	if (ExtractVT == MVT::i1) {
	// Special case for (pre-legalization) vXi1 reductions.
	if (NumElts > 32)
	return SDValue();
	if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
	// If this is a legal AVX512 predicate type then we can just bitcast.
	EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	Movmsk = DAG.getBitcast(MovmskVT, Match);
	} else {
	// Use combineBitcastvxi1 to create the MOVMSK.
	// Without 256-bit integer support, first fold a 32-element vector into
	// 16 elements by applying BinOp to its two halves.
	if (NumElts == 32 && !Subtarget.hasInt256()) {
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
	Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
	NumElts = 16;
	}
	EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
	}
	if (!Movmsk)
	return SDValue();
	// The comparisons below are all performed on i32.
	Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
	} else {
	// Bail with AVX512VL (which uses predicate registers).
	if (Subtarget.hasVLX())
	return SDValue();

	// Only 128-bit vectors, or 256-bit vectors with AVX, are handled.
	unsigned MatchSizeInBits = Match.getValueSizeInBits();
	if (!(MatchSizeInBits == 128 \|\|
	(MatchSizeInBits == 256 && Subtarget.hasAVX())))
	return SDValue();

	// Make sure this isn't a vector of 1 element. The perf win from using
	// MOVMSK diminishes with less elements in the reduction, but it is
	// generally better to get the comparison over to the GPRs as soon as
	// possible to reduce the number of vector ops.
	if (Match.getValueType().getVectorNumElements() < 2)
	return SDValue();

	// Check that we are extracting a reduction of all sign bits.
	if (DAG.ComputeNumSignBits(Match) != BitWidth)
	return SDValue();

	// For 256-bit vectors of sub-32-bit elements without 256-bit integer
	// support, fold the two halves together with BinOp first.
	if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
	SDValue Lo, Hi;
	std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
	Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
	MatchSizeInBits = Match.getValueSizeInBits();
	}

	// For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
	MVT MaskSrcVT;
	if (64 == BitWidth \|\| 32 == BitWidth)
	MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
	MatchSizeInBits / BitWidth);
	else
	MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

	SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
	Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
	// The mask has one bit per element of the type handed to the MOVMSK.
	NumElts = MaskSrcVT.getVectorNumElements();
	}
	assert(NumElts <= 32 && "Not expecting more than 32 elements");

	if (BinOp == ISD::XOR) {
	// parity -> (AND (CTPOP(MOVMSK X)), 1)
	SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
	SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
	Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
	return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
	}

	SDValue CmpC;
	ISD::CondCode CondCode;
	if (BinOp == ISD::OR) {
	// any_of -> MOVMSK != 0
	CmpC = DAG.getConstant(0, DL, MVT::i32);
	CondCode = ISD::CondCode::SETNE;
	} else {
	// all_of -> MOVMSK == ((1 << NumElts) - 1)
	// The shift is done in 64 bits so that NumElts == 32 is well defined.
	CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
	CondCode = ISD::CondCode::SETEQ;
	}

	// The setcc produces an i8 of 0/1, so extend that to the result width and
	// negate to get the final 0/-1 mask value.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT SetccVT =
	TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
	SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
	SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
	SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
	return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
	}

	// Attempt to replace an ADD reduction over the absolute difference of two
	// vectors zero-extended from i8 with a PSADBW, followed by a shuffle+add
	// reduction of the PSADBW result where needed.
	//
	// Extract is the candidate extraction node. Returns the replacement scalar
	// value, or an empty SDValue if the pattern does not match.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// PSADBW is only supported on SSE2 and up.
	if (!Subtarget.hasSSE2())
	return SDValue();

	// Verify the type we're extracting from is any integer type above i16.
	EVT VT = Extract->getOperand(0).getValueType();
	if (!VT.isSimple() \|\| !(VT.getVectorElementType().getSizeInBits() > 16))
	return SDValue();

	// Widest register considered: 512 bits with BWI registers, 256 with AVX,
	// otherwise 128.
	unsigned RegSize = 128;
	if (Subtarget.useBWIRegs())
	RegSize = 512;
	else if (Subtarget.hasAVX())
	RegSize = 256;

	// We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.
	if (RegSize / VT.getVectorNumElements() < 8)
	return SDValue();

	// Match shuffle + add pyramid.
	ISD::NodeType BinOp;
	SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});

	// The operand is expected to be zero extended from i8
	// (verified in detectZextAbsDiff).
	// In order to convert to i64 and above, additional any/zero/sign
	// extend is expected.
	// The zero extend from 32 bit has no mathematical effect on the result.
	// Also the sign extend is basically zero extend
	// (extends the sign bit which is zero).
	// So it is correct to skip the sign/zero extend instruction.
	if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND \|\|
	Root.getOpcode() == ISD::ZERO_EXTEND \|\|
	Root.getOpcode() == ISD::ANY_EXTEND))
	Root = Root.getOperand(0);

	// If there was a match, we want Root to be an ABS node that is the root of
	// an abs-diff pattern.
	if (!Root \|\| Root.getOpcode() != ISD::ABS)
	return SDValue();

	// Check whether we have an abs-diff pattern feeding into the ABS.
	SDValue Zext0, Zext1;
	if (!detectZextAbsDiff(Root, Zext0, Zext1))
	return SDValue();

	// Create the SAD instruction.
	SDLoc DL(Extract);
	SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);

	// If the original vector was wider than 8 elements, sum over the results
	// in the SAD vector.
	unsigned Stages = Log2_32(VT.getVectorNumElements());
	MVT SadVT = SAD.getSimpleValueType();
	if (Stages > 3) {
	unsigned SadElems = SadVT.getVectorNumElements();

	// Each iteration shuffles elements [MaskEnd, 2*MaskEnd) down to
	// [0, MaskEnd) (all other lanes undef) and adds them onto SAD, halving the
	// number of live partial sums.
	for(unsigned i = Stages - 3; i > 0; --i) {
	SmallVector<int, 16> Mask(SadElems, -1);
	for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
	Mask[j] = MaskEnd + j;

	SDValue Shuffle =
	DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	}
	}

	MVT Type = Extract->getSimpleValueType(0);
	unsigned TypeSizeInBits = Type.getSizeInBits();
	// Return the lowest TypeSizeInBits bits.
	// The SAD vector is reinterpreted with the extracted type as element type
	// and the original extraction index is reused.
	MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
	SAD = DAG.getBitcast(ResVT, SAD);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
	Extract->getOperand(1));
	}

	// Attempt to peek through a target shuffle and extract the scalar from the
	// source.
	//
	// N is the extraction node (operand 0 is the vector, operand 1 the index).
	// Returns the replacement value, or an empty SDValue when N is left alone:
	// before op legalization, for i1 source elements, for non-constant or
	// out-of-range extraction indices, when the shuffle inputs/mask cannot be
	// resolved to one mask entry per source element, or when none of the
	// extraction forms at the end applies.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  SDValue Idx = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Don't attempt this for boolean mask vectors or unknown extraction indices.
	  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Reject out-of-range indices: the index is used below to subscript a
	  // shuffle mask with exactly NumSrcElts entries, so anything larger would
	  // read past the end of the mask. combineExtractVectorElt folds such
	  // extracts itself, but only after it has called this function.
	  const APInt &IdxC = cast<ConstantSDNode>(Idx)->getAPIntValue();
	  if (IdxC.uge(NumSrcElts))
	    return SDValue();
	  const int ExtractIdx = static_cast<int>(IdxC.getZExtValue());

	  SDValue SrcBC = peekThroughBitcasts(Src);

	  // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
	  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
	    SDValue SrcOp = SrcBC.getOperand(0);
	    if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
	      return DAG.getBitcast(VT, SrcOp);
	  }

	  // Resolve the target shuffle inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
	    return SDValue();

	  // Attempt to narrow/widen the shuffle mask to the correct size.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      // Fewer mask entries than source elements: scale the mask up.
	      SmallVector<int, 16> ScaledMask;
	      int Scale = NumSrcElts / Mask.size();
	      scaleShuffleMask<int>(Scale, Mask, ScaledMask);
	      Mask = std::move(ScaledMask);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      // Simplify Mask based on demanded element: every mask entry outside the
	      // range covering the extracted element is marked undef.
	      int Scale = Mask.size() / NumSrcElts;
	      int Lo = Scale * ExtractIdx;
	      int Hi = Scale * (ExtractIdx + 1);
	      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
	        if (i < Lo || Hi <= i)
	          Mask[i] = SM_SentinelUndef;

	      // Repeatedly widen the mask elements while that is still possible.
	      SmallVector<int, 16> WidenedMask;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, WidenedMask))
	        Mask = std::move(WidenedMask);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Check if narrowing/widening failed.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  // In range: ExtractIdx < NumSrcElts == Mask.size().
	  int SrcIdx = Mask[ExtractIdx];
	  SDLoc dl(N);

	  // If the shuffle source element is undef/zero then we can just accept it.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero)
	    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
	                                : DAG.getConstant(0, dl, VT);

	  // Split the mask value into the shuffle input and the element within it.
	  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
	      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  // i16/i8 elements are extracted with PEXTRW/PEXTRB as an i32, which is then
	  // zero-extended or truncated to the requested type.
	  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
	           "Unexpected extraction type");
	    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
	    SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
	  }

	  return SDValue();
	}

	/// Extracting a scalar FP value from vector element 0 is free, so extract each
	/// operand first, then perform the math as a scalar op.
	///
	/// Handles FP compares producing i1, FP vector selects on an i1 setcc
	/// condition, and a fixed list of FP math opcodes. Returns an empty SDValue
	/// when ExtElt does not fit any of those forms.
	static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
	  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
	  SDValue Vec = ExtElt->getOperand(0);
	  SDValue Index = ExtElt->getOperand(1);
	  EVT VT = ExtElt->getValueType(0);
	  EVT VecVT = Vec.getValueType();

	  // TODO: If this is a unary/expensive/expand op, allow extraction from a
	  // non-zero element because the shuffle+scalar op will be cheaper?
	  const bool IsCandidate =
	      Vec.hasOneUse() && isNullConstant(Index) && VecVT.getScalarType() == VT;
	  if (!IsCandidate)
	    return SDValue();

	  const unsigned VecOpc = Vec.getOpcode();
	  SDLoc DL(ExtElt);

	  // Extracts element `Index` of Op as a scalar of type EltVT.
	  auto ExtractScalar = [&](EVT EltVT, SDValue Op) {
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op, Index);
	  };

	  // Vector FP compares don't fit the pattern of FP math ops (propagate, not
	  // extract, the condition code), so deal with those as a special-case.
	  if (VecOpc == ISD::SETCC && VT == MVT::i1) {
	    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
	    if (OpVT != MVT::f32 && OpVT != MVT::f64)
	      return SDValue();

	    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
	    SDValue ScalarX = ExtractScalar(OpVT, Vec.getOperand(0));
	    SDValue ScalarY = ExtractScalar(OpVT, Vec.getOperand(1));
	    return DAG.getNode(VecOpc, DL, VT, ScalarX, ScalarY, Vec.getOperand(2));
	  }

	  // Everything below produces an f32 or f64 scalar.
	  if (VT != MVT::f32 && VT != MVT::f64)
	    return SDValue();

	  // Vector FP selects don't fit the pattern of FP math ops (because the
	  // condition has a different type and we have to change the opcode), so deal
	  // with those here.
	  // FIXME: This is restricted to pre type legalization by ensuring the setcc
	  // has i1 elements. If we loosen this we need to convert vector bool to a
	  // scalar bool.
	  if (VecOpc == ISD::VSELECT) {
	    SDValue Cond = Vec.getOperand(0);
	    if (Cond.getOpcode() == ISD::SETCC &&
	        Cond.getValueType().getScalarType() == MVT::i1 &&
	        Cond.getOperand(0).getValueType() == VecVT) {
	      // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
	      SDValue ScalarCond =
	          ExtractScalar(Cond.getValueType().getScalarType(), Cond);
	      SDValue ScalarT = ExtractScalar(VT, Vec.getOperand(1));
	      SDValue ScalarF = ExtractScalar(VT, Vec.getOperand(2));
	      return DAG.getNode(ISD::SELECT, DL, VT, ScalarCond, ScalarT, ScalarF);
	    }
	  }

	  // TODO: This switch could include FNEG and the x86-specific FP logic ops
	  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
	  // missed load folding and fma+fneg combining.
	  switch (VecOpc) {
	  default:
	    // Not an opcode we know how to scalarize.
	    return SDValue();
	  case ISD::FMA: // Begin 3 operands
	  case ISD::FMAD:
	  case ISD::FADD: // Begin 2 operands
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FDIV:
	  case ISD::FREM:
	  case ISD::FCOPYSIGN:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FMINNUM_IEEE:
	  case ISD::FMAXNUM_IEEE:
	  case ISD::FMAXIMUM:
	  case ISD::FMINIMUM:
	  case X86ISD::FMAX:
	  case X86ISD::FMIN:
	  case ISD::FABS: // Begin 1 operand
	  case ISD::FSQRT:
	  case ISD::FRINT:
	  case ISD::FCEIL:
	  case ISD::FTRUNC:
	  case ISD::FNEARBYINT:
	  case ISD::FROUND:
	  case ISD::FFLOOR:
	  case X86ISD::FRCP:
	  case X86ISD::FRSQRT:
	    break;
	  }

	  // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
	  SmallVector<SDValue, 4> ScalarOps;
	  for (SDValue Op : Vec->ops())
	    ScalarOps.push_back(ExtractScalar(VT, Op));
	  return DAG.getNode(VecOpc, DL, VT, ScalarOps);
	}

	/// Try to convert a vector reduction sequence composed of binops and shuffles
	/// into horizontal ops.
	///
	/// Only ADD reductions whose result is extracted from element 0 are matched,
	/// and only when the subtarget has fast horizontal ops or the function is
	/// optimized for size. Returns an empty SDValue otherwise.
	static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

	  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
	  if (!(Subtarget.hasFastHorizontalOps() || OptForSize))
	    return SDValue();

	  // The reduction result must be read from the lowest element.
	  SDValue Index = ExtElt->getOperand(1);
	  if (!isNullConstant(Index))
	    return SDValue();

	  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
	  ISD::NodeType Opc;
	  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
	  if (!Rdx)
	    return SDValue();

	  EVT VT = ExtElt->getValueType(0);
	  EVT VecVT = ExtElt->getOperand(0).getValueType();
	  if (VecVT.getScalarType() != VT)
	    return SDValue();

	  unsigned HorizOpcode = X86ISD::FHADD;
	  if (Opc == ISD::ADD)
	    HorizOpcode = X86ISD::HADD;
	  SDLoc DL(ExtElt);

	  // Integer horizontal adds are gated on SSSE3, FP horizontal adds on SSE3.
	  const bool HasIntHop = Subtarget.hasSSSE3();
	  const bool HasFPHop = Subtarget.hasSSE3();

	  // 256-bit horizontal instructions operate on 128-bit chunks rather than
	  // across the whole vector, so we need an extract + hop preliminary stage.
	  // This is the only step where the operands of the hop are not the same value.
	  // TODO: We could extend this to handle 512-bit or even longer vectors.
	  const bool Is256Int = VecVT == MVT::v16i16 || VecVT == MVT::v8i32;
	  const bool Is256FP = VecVT == MVT::v8f32 || VecVT == MVT::v4f64;
	  if ((Is256Int && HasIntHop) || (Is256FP && HasFPHop)) {
	    unsigned HalfNumElts = VecVT.getVectorNumElements() / 2;
	    SDValue Hi = extract128BitVector(Rdx, HalfNumElts, DAG, DL);
	    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
	    VecVT = EVT::getVectorVT(*DAG.getContext(), VT, HalfNumElts);
	    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
	  }

	  // From here on only the 128-bit forms are supported.
	  const bool Is128Int = VecVT == MVT::v8i16 || VecVT == MVT::v4i32;
	  const bool Is128FP = VecVT == MVT::v4f32 || VecVT == MVT::v2f64;
	  if (!(Is128Int && HasIntHop) && !(Is128FP && HasFPHop))
	    return SDValue();

	  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
	  assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
	  // One hop per halving of the element count: log2(NumElts) in total.
	  for (unsigned StepsLeft = Log2_32(VecVT.getVectorNumElements());
	       StepsLeft != 0; --StepsLeft)
	    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
	}

	/// Top-level combine for a vector element extraction node N. Tries, in order:
	/// peeking through a target shuffle, folding out-of-range and constant
	/// extractions, the MMX bitcast special cases, the SAD / horizontal predicate
	/// / horizontal min-max / horizontal-op reduction combines, FP scalarization,
	/// and finally rewriting a group of constant-index i1 extractions as tests on
	/// a single MOVMSK-style bitmask.
	/// N is either an ISD::EXTRACT_VECTOR_ELT or another extraction opcode
	/// (presumably X86ISD::PEXTRW/PEXTRB, going by the TODO below); for the
	/// latter only constant folding and SimplifyDemandedBits are attempted.
	/// Returns the replacement value, SDValue(N, 0) when N was updated in place,
	/// or an empty SDValue when nothing changed.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	return NewOp;

	SDValue InputVector = N->getOperand(0);
	SDValue EltIdx = N->getOperand(1);
	// CIdx is null when the extraction index is not a constant.
	auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);

	EVT SrcVT = InputVector.getValueType();
	EVT VT = N->getValueType(0);
	SDLoc dl(InputVector);
	bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;

	// An out-of-range constant index folds to zero for the pextr forms and to
	// undef for a generic extract.
	if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
	return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

	// Integer Constant Folding.
	if (CIdx && VT.isInteger()) {
	APInt UndefVecElts;
	SmallVector<APInt, 16> EltBits;
	unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
	if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
	EltBits, true, false)) {
	uint64_t Idx = CIdx->getZExtValue();
	if (UndefVecElts[Idx])
	return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
	// Zero-extend the element bits to the width of the result type.
	return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
	dl, VT);
	}
	}

	// TODO - Remove this once we can handle the implicit zero-extension of
	// X86ISD::PEXTRW/X86ISD::PEXTRB in:
	// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
	// combineBasicSADPattern.
	if (IsPextr) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.SimplifyDemandedBits(
	SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
	return SDValue(N, 0);
	return SDValue();
	}

	if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
	return NewOp;

	// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	SDValue MMXSrc = InputVector.getOperand(0);

	// The bitcast source is a direct mmx result.
	if (MMXSrc.getValueType() == MVT::x86mmx)
	return DAG.getBitcast(VT, InputVector);
	}

	// Detect mmx to i32 conversion through a v2i32 elt extract.
	if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	SDValue MMXSrc = InputVector.getOperand(0);

	// The bitcast source is a direct mmx result.
	if (MMXSrc.getValueType() == MVT::x86mmx)
	return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	}

	// Check whether this extract is the root of a sum of absolute differences
	// pattern. This has to be done here because we really want it to happen
	// pre-legalization.
	if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	return SAD;

	// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	return Cmp;

	// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
	if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
	return MinMax;

	// Attempt to turn a shuffle+binop reduction into horizontal ops.
	if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
	return V;

	// Attempt to perform FP math on the extracted scalars instead.
	if (SDValue V = scalarizeExtEltFP(N, DAG))
	return V;

	// Attempt to extract a i1 element by using MOVMSK to extract the signbits
	// and then testing the relevant element.
	if (CIdx && SrcVT.getScalarType() == MVT::i1) {
	// Collects every user of InputVector while checking that each one is a
	// constant-index extraction producing i1.
	SmallVector<SDNode *, 16> BoolExtracts;
	auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
	if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(Use->getOperand(1)) &&
	Use->getValueType(0) == MVT::i1) {
	BoolExtracts.push_back(Use);
	return true;
	}
	return false;
	};
	// Only worthwhile when all users qualify and there is more than one.
	if (all_of(InputVector->uses(), IsBoolExtract) &&
	BoolExtracts.size() > 1) {
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
	if (SDValue BC =
	combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
	for (SDNode *Use : BoolExtracts) {
	// extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
	unsigned MaskIdx = Use->getConstantOperandVal(1);
	APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
	SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
	SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
	Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
	DCI.CombineTo(Use, Res);
	}
	// All users (including N) were replaced through CombineTo above.
	return SDValue(N, 0);
	}
	}
	}

	return SDValue();
	}

	/// If a vector select has an operand that is -1 or 0, try to simplify the
	/// select to a bitwise logic operation.
	/// Only ISD::VSELECT nodes are handled. Returns the replacement value, or an
	/// empty SDValue if no simplification applies.
	/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Operand 0 is the condition, operand 1 the true value (LHS) and operand 2
	// the false value (RHS).
	SDValue Cond = N->getOperand(0);
	SDValue LHS = N->getOperand(1);
	SDValue RHS = N->getOperand(2);
	EVT VT = LHS.getValueType();
	EVT CondVT = Cond.getValueType();
	SDLoc DL(N);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	if (N->getOpcode() != ISD::VSELECT)
	return SDValue();

	assert(CondVT.isVector() && "Vector select expects a vector selector!");

	// Check if the first operand is all zeros and Cond type is vXi1.
	// This situation only applies to avx512.
	// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
	// TODO: Can we assert that both operands are not zeros (because that should
	// get simplified at node creation time)?
	bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
	bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
	if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
	Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
	// Invert the cond to not(cond) : xor(op,allones)=not(op)
	SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
	// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
	return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
	}

	// To use the condition operand as a bitwise mask, it must have elements that
	// are the same size as the select elements. Ie, the condition operand must
	// have already been promoted from the IR select condition type <N x i1>.
	// Don't check if the types themselves are equal because that excludes
	// vector floating-point selects.
	if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
	return SDValue();

	// Try to invert the condition if true value is not all 1s and false value is
	// not all 0s. Only do this if the condition has one use.
	bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
	if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
	// Check if the selector will be produced by CMPP/PCMP.
	Cond.getOpcode() == ISD::SETCC &&
	// Check if SETCC has already been promoted.
	TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
	CondVT) {
	bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

	if (TValIsAllZeros \|\| FValIsAllOnes) {
	// Rebuild the setcc with the inverse condition code and swap the select
	// arms, so the flags below describe the swapped true/false values.
	SDValue CC = Cond.getOperand(2);
	ISD::CondCode NewCC =
	ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	Cond.getOperand(0).getValueType().isInteger());
	Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
	NewCC);
	std::swap(LHS, RHS);
	TValIsAllOnes = FValIsAllOnes;
	FValIsAllZeros = TValIsAllZeros;
	}
	}

	// Cond value must be 'sign splat' to be converted to a logical op.
	if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
	return SDValue();

	// vselect Cond, 111..., 000... -> Cond
	if (TValIsAllOnes && FValIsAllZeros)
	return DAG.getBitcast(VT, Cond);

	// Once legalization has started, the logic ops below are only created for a
	// legal condition type.
	if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	return SDValue();

	// vselect Cond, 111..., X -> or Cond, X
	if (TValIsAllOnes) {
	SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
	SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
	return DAG.getBitcast(VT, Or);
	}

	// vselect Cond, X, 000... -> and Cond, X
	if (FValIsAllZeros) {
	SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
	SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
	return DAG.getBitcast(VT, And);
	}

	// vselect Cond, 000..., X -> andn Cond, X
	if (TValIsAllZeros) {
	// The ANDNP is built on a vector of i64 of the same total size.
	MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
	SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
	SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
	SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
	return DAG.getBitcast(VT, AndN);
	}

	return SDValue();
	}

	/// If both arms of a vector select are concatenated vectors, split the select,
	/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
	/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
	/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
	///
	/// Applies to ISD::VSELECT and X86ISD::BLENDV nodes with a 256-bit result
	/// whose true and false operands each have a single use. Returns an empty
	/// SDValue otherwise.
	static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
	    return SDValue();

	  // TODO: Split 512-bit vectors too?
	  EVT VT = N->getValueType(0);
	  if (!VT.is256BitVector())
	    return SDValue();

	  // TODO: Split as long as any 2 of the 3 operands are concatenated?
	  SDValue Cond = N->getOperand(0);
	  SDValue TVal = N->getOperand(1);
	  SDValue FVal = N->getOperand(2);
	  // The collected concat operands are only used to confirm that both arms
	  // are concatenations; the actual split is done by SplitOpsAndApply.
	  SmallVector<SDValue, 4> CatOpsT, CatOpsF;
	  if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
	      !collectConcatOps(TVal.getNode(), CatOpsT) ||
	      !collectConcatOps(FVal.getNode(), CatOpsF))
	    return SDValue();

	  // Rebuilds the same opcode on each narrower piece; the result type is
	  // taken from the (split) true operand.
	  auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
	                            ArrayRef<SDValue> Ops) {
	    return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
	  };
	  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
	                          makeBlend, /*CheckBWI*/ false);
	}

	/// Fold a select between two integer constants into arithmetic on the
	/// zero-extended i1 condition:
	///   select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	/// Only done when the constants differ by a power of 2 (or by 3, 5 or 9 for
	/// i32/i64), the result type is legal and the condition is an i1. Returns an
	/// empty SDValue otherwise.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  SDLoc DL(N);

	  // Both select arms have to be scalar integer constants.
	  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
	  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
	  if (!(TrueC && FalseC))
	    return SDValue();

	  // Don't do this for crazy integer types.
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // We're going to use the condition bit in math or logic ops. We could allow
	  // this with a wider condition value (post-legalization it becomes an i8),
	  // but if nothing is creating selects that late, it doesn't matter.
	  if (Cond.getValueType() != MVT::i1)
	    return SDValue();

	  // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
	  // 3, 5, or 9 with i32/i64, so those get transformed too.
	  // TODO: For constants that overflow or do not differ by power-of-2 or small
	  // multiplier, convert to 'and' + 'add'.
	  const APInt &TrueVal = TrueC->getAPIntValue();
	  const APInt &FalseVal = FalseC->getAPIntValue();
	  bool Overflowed;
	  APInt Diff = TrueVal.ssub_ov(FalseVal, Overflowed);
	  if (Overflowed)
	    return SDValue();

	  APInt Multiplier = Diff.abs();
	  const bool IsLEAMultiplier =
	      (VT == MVT::i32 || VT == MVT::i64) &&
	      (Multiplier == 3 || Multiplier == 5 || Multiplier == 9);
	  if (!Multiplier.isPowerOf2() && !IsLEAMultiplier)
	    return SDValue();

	  // We need a positive multiplier constant for shift/LEA codegen. The 'not'
	  // of the condition can usually be folded into a compare predicate, but even
	  // without that, the sequence should be cheaper than a CMOV alternative.
	  // Inverting the condition exchanges the roles of the two constants, so the
	  // base to add becomes the original true constant.
	  const bool Inverted = TrueVal.slt(FalseVal);
	  if (Inverted)
	    Cond = DAG.getNOT(DL, Cond, MVT::i1);
	  ConstantSDNode *BaseC = Inverted ? TrueC : FalseC;

	  // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

	  // Multiply condition by the difference if non-one.
	  if (!Multiplier.isOneValue())
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result,
	                         DAG.getConstant(Multiplier, DL, VT));

	  // Add the base if non-zero.
	  if (!BaseC->isNullValue())
	    Result = DAG.getNode(ISD::ADD, DL, VT, Result, SDValue(BaseC, 0));

	  return Result;
	}

	/// If this is a dynamic select (non-constant condition) and we can match
	/// this node with one of the variable blend instructions, restructure the
	/// condition so that blends can use the high (sign) bit of each element.
	/// This function will also call SimplifyDemandedBits on already created
	/// BLENDV to perform additional simplifications.
	///
	/// \param N         The ISD::VSELECT or X86ISD::BLENDV node to inspect; operand
	///                  0 is the condition.
	/// \param DAG       The DAG that owns \p N.
	/// \param DCI       Combiner state; used to query the legalization phase, to
	///                  queue rewritten users and to commit the simplification.
	/// \param Subtarget Used to check which blend instructions are available.
	/// \returns SDValue(N, 0) after the condition has been simplified and every
	///          non-BLENDV user of the condition has been replaced by an
	///          X86ISD::BLENDV node; an empty SDValue if nothing was changed.
	static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI,
	                                      const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  if ((N->getOpcode() != ISD::VSELECT &&
	       N->getOpcode() != X86ISD::BLENDV) ||
	      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return SDValue();

	  // Don't optimize before the condition has been transformed to a legal type
	  // and don't ever optimize vector selects that map to AVX512 mask-registers.
	  unsigned BitWidth = Cond.getScalarValueSizeInBits();
	  if (BitWidth < 8 || BitWidth > 64)
	    return SDValue();

	  // We can only handle the cases where VSELECT is directly legal on the
	  // subtarget. We custom lower VSELECT nodes with constant conditions and
	  // this makes it hard to see whether a dynamic VSELECT will correctly
	  // lower, so we both check the operation's status and explicitly handle the
	  // cases where a dynamic blend will fail even though a constant-condition
	  // blend could be custom lowered.
	  // FIXME: We should find a better way to handle this class of problems.
	  // Potentially, we should combine constant-condition vselect nodes
	  // pre-legalization into shuffles and not mark as many types as custom
	  // lowered.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = N->getValueType(0);
	  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	    return SDValue();
	  // FIXME: We don't support i16-element blends currently. We could and
	  // should support them by making all the bits in the condition be set
	  // rather than just the high bit and using an i8-element blend.
	  if (VT.getVectorElementType() == MVT::i16)
	    return SDValue();
	  // Dynamic blending was only available from SSE4.1 onward.
	  if (VT.is128BitVector() && !Subtarget.hasSSE41())
	    return SDValue();
	  // Byte blends are only available in AVX2
	  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	    return SDValue();
	  // There are no 512-bit blend instructions that use sign bits.
	  if (VT.is512BitVector())
	    return SDValue();

	  // Every user of the condition must consume it as the condition operand
	  // (operand 0) of a VSELECT/BLENDV; otherwise narrowing the demanded bits
	  // to the sign bit below would not be valid for that user.
	  // TODO: Add other opcodes eventually lowered into BLEND.
	  for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
	       UI != UE; ++UI)
	    if ((UI->getOpcode() != ISD::VSELECT &&
	         UI->getOpcode() != X86ISD::BLENDV) ||
	        UI.getOperandNo() != 0)
	      return SDValue();

	  // Only the sign bit of each condition element is demanded.
	  APInt DemandedMask(APInt::getSignMask(BitWidth));
	  KnownBits Known;
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
	    return SDValue();

	  // If we changed the computation somewhere in the DAG, this change will
	  // affect all users of Cond. Update all the nodes so that we do not use
	  // the generic VSELECT anymore. Otherwise, we may perform wrong
	  // optimizations as we messed with the actual expectation for the vector
	  // boolean values.
	  for (SDNode *U : Cond->uses()) {
	    if (U->getOpcode() == X86ISD::BLENDV)
	      continue;

	    SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
	                             Cond, U->getOperand(1), U->getOperand(2));
	    DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	    DCI.AddToWorklist(U);
	  }
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	///
	/// The node's operands are read as (condition, LHS, RHS). The combines below
	/// are tried in order and the first one that applies produces the result:
	///  - generic select simplification (also reached for BLENDV nodes),
	///  - constant-condition vselect -> vector shuffle (before op legalization),
	///  - FP select of a compare of its own operands -> X86ISD::FMIN / FMAX,
	///  - AVX512 scalar FP select on "(X & 1) == 0" -> select on "X & 1",
	///  - AVX512 without BWI: sign-extend an i1 vector condition for i8/i16
	///    element selects,
	///  - AVX512: widen a select-with-zero so it can merge with a target shuffle,
	///  - select of two constants,
	///  - canonicalize "x > y ? x : y" / "x < y ? x : y" to the >= / <= form,
	///  - i8/i16 vector selects -> ISD::USUBSAT / ISD::UADDSAT,
	///  - all-ones/zeros, BLENDV and narrowing combines (legal types only),
	///  - x86mmx select performed as an i64 select.
	///
	/// \param N         The select-like node being combined.
	/// \param DAG       The DAG that owns \p N.
	/// \param DCI       Combiner state (legalization phase, worklist).
	/// \param Subtarget The X86 subtarget, used for feature checks.
	/// \returns the replacement value, or an empty SDValue if no combine applied.
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);

	  // Try simplification again because we use this function to optimize
	  // BLENDV nodes that are not handled by the generic combiner.
	  if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
	    return V;

	  EVT VT = LHS.getValueType();
	  EVT CondVT = Cond.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Convert vselects with constant condition into shuffles.
	  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
	      DCI.isBeforeLegalizeOps()) {
	    SmallVector<int, 64> Mask;
	    if (createShuffleMaskFromVSELECT(Mask, Cond))
	      return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
	  }

	  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
	  // instructions match the semantics of the common C idiom x<y?x:y but not
	  // x<=y?x:y, because of how they handle negative zero (which can be
	  // ignored in unsafe-math mode).
	  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	      VT != MVT::f80 && VT != MVT::f128 &&
	      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
	      (Subtarget.hasSSE2() ||
	       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Stays 0 unless one of the cases below proves the transform is safe.
	    unsigned Opcode = 0;
	    // Check for x CC y ? x : y.
	    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETULT:
	        // Converting this to a min would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZeroFloat(LHS) ||
	                DAG.isKnownNeverZeroFloat(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETULE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETOGE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a max would handle NaNs incorrectly, and swapping
	        // the operands would cause it to handle comparisons between positive
	        // and negative zero incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
	          if (!DAG.getTarget().Options.UnsafeFPMath &&
	              !(DAG.isKnownNeverZeroFloat(LHS) ||
	                DAG.isKnownNeverZeroFloat(RHS)))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    // Check for x CC y ? y : x -- a min/max with reversed arms.
	    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	      switch (CC) {
	      default: break;
	      case ISD::SETOGE:
	        // Converting this to a min would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !(DAG.isKnownNeverZeroFloat(LHS) ||
	              DAG.isKnownNeverZeroFloat(RHS))) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGT:
	        // Converting this to a min would handle NaNs incorrectly.
	        // NOTE(review): unlike the SETULT case of this switch, this check is
	        // skipped under UnsafeFPMath -- confirm the asymmetry is intended.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
	          break;
	        Opcode = X86ISD::FMIN;
	        break;
	      case ISD::SETUGE:
	        // Converting this to a min would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOGT:
	      case ISD::SETGT:
	      case ISD::SETGE:
	        Opcode = X86ISD::FMIN;
	        break;

	      case ISD::SETULT:
	        // Converting this to a max would handle NaNs incorrectly.
	        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	          break;
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETOLE:
	        // Converting this to a max would handle comparisons between positive
	        // and negative zero incorrectly, and swapping the operands would
	        // cause it to handle NaNs incorrectly.
	        if (!DAG.getTarget().Options.UnsafeFPMath &&
	            !DAG.isKnownNeverZeroFloat(LHS) &&
	            !DAG.isKnownNeverZeroFloat(RHS)) {
	          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
	            break;
	          std::swap(LHS, RHS);
	        }
	        Opcode = X86ISD::FMAX;
	        break;
	      case ISD::SETULE:
	        // Converting this to a max would handle both negative zeros and NaNs
	        // incorrectly, but we can swap the operands to fix both.
	        std::swap(LHS, RHS);
	        LLVM_FALLTHROUGH;
	      case ISD::SETOLT:
	      case ISD::SETLT:
	      case ISD::SETLE:
	        Opcode = X86ISD::FMAX;
	        break;
	      }
	    }

	    if (Opcode)
	      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	  }

	  // Some mask scalar intrinsics rely on checking if only one bit is set
	  // and implement it in C code like this:
	  // A[0] = (U & 1) ? A[0] : W[0];
	  // This creates some redundant instructions that break pattern matching.
	  // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
	  if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
	      Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    SDValue AndNode = Cond.getOperand(0);
	    if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        isOneConstant(AndNode.getOperand(1))) {
	      // LHS and RHS swapped due to
	      // setcc outputting 1 when AND resulted in 0 and vice versa.
	      AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
	      return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
	    }
	  }

	  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	  // lowering on KNL. In this case we convert it to
	  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	  // The same situation applies to all vectors of i8 and i16 without BWI.
	  // Make sure we extend these even before type legalization gets a chance to
	  // split wide vectors.
	  // Since SKX these selects have a proper lowering.
	  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1 &&
	      (ExperimentalVectorWideningLegalization ||
	       VT.getVectorNumElements() > 4) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	  }

	  // AVX512 - Extend select with zero to merge with target shuffle.
	  // select(mask, extract_subvector(shuffle(x)), zero) -->
	  // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
	  // TODO - support non target shuffles as well.
	  if (Subtarget.hasAVX512() && CondVT.isVector() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    // An arm qualifies if it is the low subvector (index 0) of a legally
	    // typed target shuffle and neither it nor the shuffle has other users.
	    auto SelectableOp = [&TLI](SDValue Op) {
	      return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	             isTargetShuffle(Op.getOperand(0).getOpcode()) &&
	             isNullConstant(Op.getOperand(1)) &&
	             TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
	             Op.hasOneUse() && Op.getOperand(0).hasOneUse();
	    };

	    bool SelectableLHS = SelectableOp(LHS);
	    bool SelectableRHS = SelectableOp(RHS);
	    bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
	    bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());

	    if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
	      EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
	                                : RHS.getOperand(0).getValueType();
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
	      // Widen both arms and the mask to the shuffle's type, select there,
	      // then extract the original-width low part of the result.
	      LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
	                            VT.getSizeInBits());
	      RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
	                            VT.getSizeInBits());
	      Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
	                         DAG.getUNDEF(SrcCondVT), Cond,
	                         DAG.getIntPtrConstant(0, DL));
	      SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
	      return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
	    }
	  }

	  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	    return V;

	  // Canonicalize max and min:
	  // (x > y) ? x : y -> (x >= y) ? x : y
	  // (x < y) ? x : y -> (x <= y) ? x : y
	  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	  // the need for an extra compare
	  // against zero. e.g.
	  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
	  // subl %esi, %edi
	  // testl %edi, %edi
	  // movl $0, %eax
	  // cmovgl %edi, %eax
	  // =>
	  // xorl %eax, %eax
	  // subl %esi, %edi
	  // cmovsl %eax, %edi
	  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	    switch (CC) {
	    default: break;
	    case ISD::SETLT:
	    case ISD::SETGT: {
	      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
	      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	    }
	    }
	  }

	  // Match VSELECTs into subs with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // psubus is available in SSE2 for i8 and i16 vectors.
	      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
	      isPowerOf2_32(VT.getVectorNumElements()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
	    // left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other->getNumOperands() == 2 &&
	        Other->getOperand(0) == Cond.getOperand(0)) {
	      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	      SDValue CondRHS = Cond->getOperand(1);

	      // Look for a general sub with unsigned saturation first.
	      // x >= y ? x-y : 0 --> subus x, y
	      // x > y ? x-y : 0 --> subus x, y
	      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
	          Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
	        return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);

	      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
	        if (isa<BuildVectorSDNode>(CondRHS)) {
	          // If the RHS is a constant we have to reverse the const
	          // canonicalization.
	          // x > C-1 ? x+-C : 0 --> subus x, C
	          // Both pointers null means both lanes are undef, which matches.
	          auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
	            return (!Op && !Cond) ||
	                   (Op && Cond &&
	                    Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
	          };
	          if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
	                                        /*AllowUndefs*/ true)) {
	            OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	                                OpRHS);
	            return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
	          }

	          // Another special case: If C was a sign bit, the sub has been
	          // canonicalized into a xor.
	          // FIXME: Would it be better to use computeKnownBits to determine
	          // whether it's safe to decanonicalize the xor?
	          // x s< 0 ? x^C : 0 --> subus x, C
	          if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	            if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
	                ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	                OpRHSConst->getAPIntValue().isSignMask()) {
	              // Note that we have to rebuild the RHS constant here to ensure we
	              // don't rely on particular values of undef lanes.
	              OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
	              return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
	            }
	          }
	        }
	      }
	    }
	  }

	  // Match VSELECTs into add with unsigned saturation.
	  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	      // paddus is available in SSE2 for i8 and i16 vectors.
	      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
	      isPowerOf2_32(VT.getVectorNumElements()) &&
	      (VT.getVectorElementType() == MVT::i8 ||
	       VT.getVectorElementType() == MVT::i16)) {
	    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	    SDValue CondLHS = Cond->getOperand(0);
	    SDValue CondRHS = Cond->getOperand(1);

	    // Check if one of the arms of the VSELECT is vector with all bits set.
	    // If it's on the left side invert the predicate to simplify logic below.
	    SDValue Other;
	    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
	      Other = RHS;
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
	      Other = LHS;
	    }

	    if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
	      SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);

	      // Canonicalize condition operands.
	      if (CC == ISD::SETUGE) {
	        std::swap(CondLHS, CondRHS);
	        CC = ISD::SETULE;
	      }

	      // We can test against either of the addition operands.
	      // x <= x+y ? x+y : ~0 --> addus x, y
	      // x+y >= x ? x+y : ~0 --> addus x, y
	      if (CC == ISD::SETULE && Other == CondRHS &&
	          (OpLHS == CondLHS || OpRHS == CondLHS))
	        return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);

	      if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
	          CondLHS == OpLHS) {
	        // If the RHS is a constant we have to reverse the const
	        // canonicalization.
	        // x <= ~C ? x+C : ~0 --> addus x, C
	        auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
	          return Cond->getAPIntValue() == ~Op->getAPIntValue();
	        };
	        if (CC == ISD::SETULE &&
	            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
	          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
	      }
	    }
	  }

	  // Early exit check: the remaining combines only run on legal types.
	  if (!TLI.isTypeLegal(VT))
	    return SDValue();

	  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
	    return V;

	  if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
	    return V;

	  // Custom action for SELECT MMX: select on the i64 bit pattern and cast back.
	  if (VT == MVT::x86mmx) {
	    LHS = DAG.getBitcast(MVT::i64, LHS);
	    RHS = DAG.getBitcast(MVT::i64, RHS);
	    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
	    return DAG.getBitcast(VT, newSelect);
	  }

	  return SDValue();
	}

	/// Combine:
	/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp       The flag-producing compare: an X86ISD::CMP, or an
	///                  X86ISD::SUB whose result value 0 is unused.
	/// \param CC        In/out condition code under which \p Cmp is consumed; it
	///                  is rewritten only on the compare-with-zero path.
	/// \param DAG       The DAG that owns \p Cmp.
	/// \param Subtarget Forwarded to lowerAtomicArithWithLOCK.
	/// \returns the value produced by lowerAtomicArithWithLOCK to use in place of
	///          \p Cmp, or an empty SDValue if the pattern does not match. On
	///          success, uses of the atomic's loaded value are replaced with
	///          UNDEF and uses of its chain with the LOCKed op's result 1.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - LHS != 1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  // The loaded value is replaced with UNDEF below, so the compare must be
	  // its only user.
	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  unsigned Opc = CmpLHS.getOpcode();
	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalize to an addend so that add and sub are handled uniformly.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
	  if (!CmpRHSC)
	    return SDValue();

	  APInt Comparison = CmpRHSC->getAPIntValue();

	  // If the addend is the negation of the comparison value, then we can do
	  // a full comparison by emitting the atomic arithmetic as a locked sub.
	  if (Comparison == -Addend) {
	    // The CC is fine, but we need to rewrite the LHS of the comparison as an
	    // atomic sub.
	    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
	    auto AtomicSub = DAG.getAtomic(
	        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
	        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
	        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
	        AN->getMemOperand());
	    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                  DAG.getUNDEF(CmpLHS.getValueType()));
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	    return LockOp;
	  }

	  // We can handle comparisons with zero in a number of cases by manipulating
	  // the CC used.
	  if (!Comparison.isNullValue())
	    return SDValue();

	  // Translate a test of the old value against zero into a test of the flags
	  // left by the arithmetic (see the icmp table above).
	  if (CC == X86::COND_S && Addend == 1)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && Addend == 1)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && Addend == -1)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && Addend == -1)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                DAG.getUNDEF(CmpLHS.getValueType()));
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	  return LockOp;
	}

	// Check whether a boolean test is testing a boolean value generated by
	// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
	// code.
	//
	// Simplify the following patterns:
	// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	// to (Op EFLAGS Cond)
	//
	// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	// to (Op EFLAGS !Cond)
	//
	// where Op could be BRCOND or CMOV.
	//
	// Cmp is the compare feeding Op and CC the condition code Op tests it with.
	// On success CC is overwritten with the condition to apply to the returned
	// flags value; on failure an empty SDValue is returned and CC is untouched.
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // This combine only operates on CMP-like nodes.
	  const bool IsCmpLike =
	      Cmp.getOpcode() == X86ISD::CMP ||
	      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0));
	  if (!IsCmpLike)
	    return SDValue();

	  // Quit if not used as a boolean value.
	  if (CC != X86::COND_E && CC != X86::COND_NE)
	    return SDValue();

	  // Check CMP operands. One of them should be 0 or 1 and the other should be
	  // an SetCC or extended from it.
	  SDValue Op1 = Cmp.getOperand(0);
	  SDValue Op2 = Cmp.getOperand(1);

	  SDValue SetCC;
	  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1);
	  if (C) {
	    SetCC = Op2;
	  } else {
	    C = dyn_cast<ConstantSDNode>(Op2);
	    // Quit if neither operand is a constant.
	    if (!C)
	      return SDValue();
	    SetCC = Op1;
	  }

	  // Quit if the constant is neither 0 or 1.
	  const uint64_t CmpConst = C->getZExtValue();
	  if (CmpConst != 0 && CmpConst != 1)
	    return SDValue();

	  // Is it a comparison against 1?
	  const bool AgainstTrue = (CmpConst == 1);
	  // "== 0" and "!= 1" test the negation of the boolean; "== 1" and "!= 0"
	  // test the boolean itself.
	  bool NeedOpposite = (CC == X86::COND_E) != AgainstTrue;

	  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
	  bool MaskedWithOne = false;
	  for (;;) {
	    const unsigned Opc = SetCC.getOpcode();
	    if (Opc == ISD::ZERO_EXTEND || Opc == ISD::TRUNCATE) {
	      SetCC = SetCC.getOperand(0);
	      continue;
	    }
	    if (Opc != ISD::AND)
	      break;

	    // Continue with the operand that is not the constant 1; an AND with no
	    // constant-1 operand ends the walk.
	    if (isOneConstant(SetCC.getOperand(1)))
	      SetCC = SetCC.getOperand(0);
	    else if (isOneConstant(SetCC.getOperand(0)))
	      SetCC = SetCC.getOperand(1);
	    else
	      break;
	    MaskedWithOne = true;
	  }

	  switch (SetCC.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
	    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
	    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
	    // truncated to i1 using 'and'.
	    if (AgainstTrue && !MaskedWithOne)
	      break;
	    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
	           "Invalid use of SETCC_CARRY!");
	    LLVM_FALLTHROUGH;
	  case X86ISD::SETCC:
	    // Set the condition code or opposite one if necessary.
	    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
	    if (NeedOpposite)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(1);
	  case X86ISD::CMOV: {
	    // Check whether false/true value has canonical one, i.e. 0 or 1.
	    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
	    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
	    // Quit if true value is not a constant.
	    if (!TVal)
	      return SDValue();
	    // Quit if false value is not a constant.
	    if (!FVal) {
	      SDValue Op = SetCC.getOperand(0);
	      // Skip 'zext' or 'trunc' node.
	      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
	          Op.getOpcode() == ISD::TRUNCATE)
	        Op = Op.getOperand(0);
	      // A special case for rdrand/rdseed, where 0 is set if false cond is
	      // found.
	      const bool IsRandomResult = (Op.getOpcode() == X86ISD::RDRAND ||
	                                   Op.getOpcode() == X86ISD::RDSEED) &&
	                                  Op.getResNo() == 0;
	      if (!IsRandomResult)
	        return SDValue();
	    }
	    // Quit if false value is not the constant 0 or 1. A non-constant false
	    // value (the rdrand/rdseed case) is treated as 0.
	    uint64_t ExpectedTVal = 1;
	    if (FVal && FVal->getZExtValue() != 0) {
	      if (FVal->getZExtValue() != 1)
	        return SDValue();
	      // If FVal is 1, opposite cond is needed.
	      NeedOpposite = !NeedOpposite;
	      ExpectedTVal = 0;
	    }
	    // Quit if TVal is not the constant opposite of FVal.
	    if (TVal->getZExtValue() != ExpectedTVal)
	      return SDValue();
	    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
	    if (NeedOpposite)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(3);
	  }
	  }

	  return SDValue();
	}

	/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
	/// Match:
	/// (X86or (X86setcc) (X86setcc))
	/// (X86cmp (and (X86setcc) (X86setcc)), 0)
	///
	/// \param Cond        The value to inspect; an X86ISD::CMP against zero is
	///                    looked through first.
	/// \param[out] CC0    Condition code of the first SETCC (set on success).
	/// \param[out] CC1    Condition code of the second SETCC (set on success).
	/// \param[out] Flags  The flags value both SETCCs read (set on success).
	/// \param[out] isAnd  True for an AND, false otherwise. This is written as
	///                    soon as the opcode is examined, so it may be modified
	///                    even when the function returns false.
	/// \returns true if the pattern matched.
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  // Look through a compare with zero; any other compare is not a match.
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;

	    Cond = Cond->getOperand(0);
	  }

	  const unsigned Opc = Cond->getOpcode();
	  const bool IsAndNode = Opc == ISD::AND || Opc == X86ISD::AND;
	  const bool IsOrNode = Opc == ISD::OR || Opc == X86ISD::OR;

	  isAnd = IsAndNode;
	  if (!IsAndNode && !IsOrNode)
	    return false;

	  SDValue SetCC0 = Cond->getOperand(0);
	  SDValue SetCC1 = Cond->getOperand(1);

	  // Make sure we have SETCC nodes, using the same flags value.
	  if (SetCC0.getOpcode() != X86ISD::SETCC ||
	      SetCC1.getOpcode() != X86ISD::SETCC ||
	      SetCC0->getOperand(1) != SetCC1->getOperand(1))
	    return false;

	  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
	  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
	  Flags = SetCC0->getOperand(1);
	  return true;
	}

	// When legalizing carry, we create carries via add X, -1
	// If that comes from an actual carry, via setcc, we use the
	// carry directly.
	//
	// EFLAGS is the flag-producing node to inspect. Returns the flags value that
	// can be used in its place, or an empty SDValue if the pattern does not
	// match. A new X86ISD::SUB node is created only for the COND_A case below.
	static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
	  if (EFLAGS.getOpcode() != X86ISD::ADD ||
	      !isAllOnesConstant(EFLAGS.getOperand(1)))
	    return SDValue();

	  // Truncations, extensions and "and X, 1" are looked through to find the
	  // node that produced the carry value.
	  auto IsLookThrough = [](SDValue V) {
	    switch (V.getOpcode()) {
	    case ISD::TRUNCATE:
	    case ISD::ZERO_EXTEND:
	    case ISD::SIGN_EXTEND:
	    case ISD::ANY_EXTEND:
	      return true;
	    case ISD::AND:
	      return isOneConstant(V.getOperand(1));
	    default:
	      return false;
	    }
	  };

	  SDValue Carry = EFLAGS.getOperand(0);
	  while (IsLookThrough(Carry))
	    Carry = Carry.getOperand(0);

	  if (Carry.getOpcode() != X86ISD::SETCC &&
	      Carry.getOpcode() != X86ISD::SETCC_CARRY)
	    return SDValue();

	  // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
	  uint64_t CarryCC = Carry.getConstantOperandVal(0);
	  SDValue CarryOp1 = Carry.getOperand(1);
	  if (CarryCC == X86::COND_B)
	    return CarryOp1;

	  if (CarryCC == X86::COND_A) {
	    // Try to convert COND_A into COND_B in an attempt to facilitate
	    // materializing "setb reg".
	    //
	    // Do not flip "e > c", where "c" is a constant, because Cmp
	    // instruction cannot take an immediate as its first operand.
	    //
	    if (CarryOp1.getOpcode() == X86ISD::SUB &&
	        CarryOp1.getNode()->hasOneUse() &&
	        CarryOp1.getValueType().isInteger() &&
	        !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
	      SDValue SubCommute =
	          DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
	                      CarryOp1.getOperand(1), CarryOp1.getOperand(0));
	      return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
	    }
	  }

	  // If this is a check of the z flag of an add with 1, switch to the
	  // C flag.
	  if (CarryCC == X86::COND_E &&
	      CarryOp1.getOpcode() == X86ISD::ADD &&
	      isOneConstant(CarryOp1.getOperand(1)))
	    return CarryOp1;

	  return SDValue();
	}

	/// Simplify the flags value \p EFLAGS as it is consumed under condition code
	/// \p CC. Three rewrites are attempted in order -- carry-through-ADD (only
	/// for COND_B), boolean test of a SETCC, and reuse of LOCKed atomic
	/// arithmetic flags -- and the first that succeeds is returned. \p CC may be
	/// updated to match the returned flags, and uses of chain values may be
	/// replaced. Returns an empty SDValue if none of them applies.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  if (CC == X86::COND_B) {
	    SDValue CarryFlags = combineCarryThroughADD(EFLAGS, DAG);
	    if (CarryFlags)
	      return CarryFlags;
	  }

	  SDValue BoolTestFlags = checkBoolTestSetCCCombine(EFLAGS, CC);
	  if (BoolTestFlags)
	    return BoolTestFlags;

	  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);

	SDValue FalseOp = N->getOperand(0);
	SDValue TrueOp = N->getOperand(1);
	X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	SDValue Cond = N->getOperand(3);

	// cmov X, X, ?, ? --> X
	if (TrueOp == FalseOp)
	return TrueOp;

	// Try to simplify the EFLAGS and condition code operands.
	// We can't always do this as FCMOV only supports a subset of X86 cond.
	if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
	if (FalseOp.getValueType() != MVT::f80 \|\| hasFPCMov(CC)) {
	SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
	Flags};
	return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	}
	}

	// If this is a select between two integer constants, try to do some
	// optimizations. Note that the operands are ordered the opposite of SELECT
	// operands.
	if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	// larger than FalseC (the false value).
	if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	CC = X86::GetOppositeBranchCondition(CC);
	std::swap(TrueC, FalseC);
	std::swap(TrueOp, FalseOp);
	}

	// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	// This is efficient for any integer data type (including i8/i16) and
	// shift amount.
	if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	Cond = getSETCC(CC, Cond, DL, DAG);

	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	DAG.getConstant(ShAmt, DL, MVT::i8));
	return Cond;
	}

	// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
	// for any integer data type, including i8/i16.
	if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
	Cond = getSETCC(CC, Cond, DL, DAG);

	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	FalseC->getValueType(0), Cond);
	Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	SDValue(FalseC, 0));
	return Cond;
	}

	// Optimize cases that will turn into an LEA instruction. This requires
	// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	if (N->getValueType(0) == MVT::i32 \|\| N->getValueType(0) == MVT::i64) {
	uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
	if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

	bool isFastMultiplier = false;
	if (Diff < 10) {
	switch ((unsigned char)Diff) {
	default: break;
	case 1: // result = add base, cond
	case 2: // result = lea base( , cond*2)
	case 3: // result = lea base(cond, cond*2)
	case 4: // result = lea base( , cond*4)
	case 5: // result = lea base(cond, cond*4)
	case 8: // result = lea base( , cond*8)
	case 9: // result = lea base(cond, cond*8)
	isFastMultiplier = true;
	break;
	}
	}

	if (isFastMultiplier) {
	APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
	Cond = getSETCC(CC, Cond, DL ,DAG);
	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	Cond);
	// Scale the condition by the difference.
	if (Diff != 1)
	Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	DAG.getConstant(Diff, DL, Cond.getValueType()));

	// Add the base if non-zero.
	if (FalseC->getAPIntValue() != 0)
	Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	SDValue(FalseC, 0));
	return Cond;
	}
	}
	}
	}

	// Handle these cases:
	// (select (x != c), e, c) -> (select (x != c), e, x),
	// (select (x == c), c, e) -> (select (x == c), x, e)
	// where the c is an integer constant, and the "select" is the combination
	// of CMOV and CMP.
	//
	// The rationale for this change is that the conditional-move from a constant
	// needs two instructions, however, conditional-move from a register needs
	// only one instruction.
	//
	// CAVEAT: By replacing a constant with a symbolic value, it may obscure
	// some instruction-combining opportunities. This opt needs to be
	// postponed as late as possible.
	//
	if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	// the DCI.xxxx conditions are provided to postpone the optimization as
	// late as possible.

	ConstantSDNode *CmpAgainst = nullptr;
	if ((Cond.getOpcode() == X86ISD::CMP \|\| Cond.getOpcode() == X86ISD::SUB) &&
	(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	!isa<ConstantSDNode>(Cond.getOperand(0))) {

	if (CC == X86::COND_NE &&
	CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	CC = X86::GetOppositeBranchCondition(CC);
	std::swap(TrueOp, FalseOp);
	}

	if (CC == X86::COND_E &&
	CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	SDValue Ops[] = { FalseOp, Cond.getOperand(0),
	DAG.getConstant(CC, DL, MVT::i8), Cond };
	return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	}
	}
	}

	// Fold and/or of setcc's to double CMOV:
	// (CMOV F, T, ((cc1 \| cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	//
	// This combine lets us generate:
	// cmovcc1 (jcc1 if we don't have CMOV)
	// cmovcc2 (same)
	// instead of:
	// setcc1
	// setcc2
	// and/or
	// cmovne (jne if we don't have CMOV)
	// When we can't use the CMOV instruction, it might increase branch
	// mispredicts.
	// When we can use CMOV, or when there is no mispredict, this improves
	// throughput and reduces register pressure.
	//
	if (CC == X86::COND_NE) {
	SDValue Flags;
	X86::CondCode CC0, CC1;
	bool isAndSetCC;
	if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	if (isAndSetCC) {
	std::swap(FalseOp, TrueOp);
	CC0 = X86::GetOppositeBranchCondition(CC0);
	CC1 = X86::GetOppositeBranchCondition(CC1);
	}

	SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
	Flags};
	SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
	SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
	SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	return CMOV;
	}
	}

	// Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
	// (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
	// Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
	// (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
	if ((CC == X86::COND_NE \|\| CC == X86::COND_E) &&
	Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
	SDValue Add = TrueOp;
	SDValue Const = FalseOp;
	// Canonicalize the condition code for easier matching and output.
	if (CC == X86::COND_E)
	std::swap(Add, Const);

	// We might have replaced the constant in the cmov with the LHS of the
	// compare. If so change it to the RHS of the compare.
	if (Const == Cond.getOperand(0))
	Const = Cond.getOperand(1);

	// Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
	if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
	Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
	(Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF \|\|
	Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
	Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
	EVT VT = N->getValueType(0);
	// This should constant fold.
	SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
	SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
	DAG.getConstant(X86::COND_NE, DL, MVT::i8),
	Cond);
	return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
	}
	}

	return SDValue();
	}

	/// Strategies for shrinking a vector multiply with 32-bit elements, named
	/// after the signedness and width that both operands are known to fit in.
	enum ShrinkMode {
	  MULS8,  // operands fit in a signed 8-bit value
	  MULU8,  // operands fit in an unsigned 8-bit value
	  MULS16, // operands fit in a signed 16-bit value
	  MULU16  // operands fit in an unsigned 16-bit value
	};

	/// Decide whether the two-operand multiply \p N, whose first operand has
	/// 32-bit scalar elements, can be carried out on narrower elements.
	///
	/// On success \p Mode receives the shrinking mode and true is returned. On
	/// failure \p Mode is left untouched and false is returned.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  // Only multiplies on 32-bit elements are candidates.
	  if (N->getOperand(0).getValueType().getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

	  // Gather sign information for operand 0, then operand 1.
	  SDValue LHS = N->getOperand(0);
	  unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
	  bool LHSNonNegative = DAG.SignBitIsZero(LHS);

	  SDValue RHS = N->getOperand(1);
	  unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
	  bool RHSNonNegative = DAG.SignBitIsZero(RHS);

	  bool BothNonNegative = LHSNonNegative && RHSNonNegative;
	  unsigned CommonSignBits = std::min(LHSSignBits, RHSSignBits);

	  // At least 25 sign bits out of 32: values lie in -128 ~ 127.
	  if (CommonSignBits >= 25) {
	    Mode = MULS8;
	    return true;
	  }
	  // Non-negative with at least 24 sign bits: values lie in 0 ~ 255.
	  if (BothNonNegative && CommonSignBits >= 24) {
	    Mode = MULU8;
	    return true;
	  }
	  // At least 17 sign bits: values lie in -32768 ~ 32767.
	  if (CommonSignBits >= 17) {
	    Mode = MULS16;
	    return true;
	  }
	  // Non-negative with at least 16 sign bits: values lie in 0 ~ 65535.
	  if (BothNonNegative && CommonSignBits >= 16) {
	    Mode = MULU16;
	    return true;
	  }
	  return false;
	}

	/// When the operands of a vector mul are extended from smaller values, such
	/// as i8 or i16, the multiply can be shrunk to i16 elements to generate more
	/// efficient code. Two typical patterns are handled:
	/// Pattern1:
	///     %2 = sext/zext <N x i8> %1 to <N x i32>
	///     %4 = sext/zext <N x i8> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	///     %2 = zext/sext <N x i16> %1 to <N x i32>
	///     %4 = zext/sext <N x i16> %3 to <N x i32>
	///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///     %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// - MULS8:  both operands are in the range -128 to 127;
	///           generate pmullw+sext32.
	/// - MULU8:  both operands are in the range 0 to 255;
	///           generate pmullw+zext32.
	/// - MULS16: both operands are in the range -32768 to 32767;
	///           generate pmullw+pmulhw.
	/// - MULU16: both operands are in the range 0 to 65535;
	///           generate pmullw+pmulhuw.
	///
	/// Returns the replacement value, or an empty SDValue when the transform is
	/// not legal, not profitable, or not applicable.
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Legality: pmullw/pmulhw are not supported by plain SSE.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Profitability: pmulld exists since SSE4.1 and is preferable to
	  // pmullw+pmulhw, except on subtargets where pmulld is slower than the
	  // expansion (and even there when optimizing for minimum size).
	  const bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
	  if (Subtarget.hasSSE41() && (MinSize || !Subtarget.isPMULLDSlow()))
	    return SDValue();

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(N, DAG, Mode))
	    return SDValue();
	  const bool Is8BitMode = (Mode == MULU8 || Mode == MULS8);
	  // Only meaningful for the 16-bit modes, where the high half is needed.
	  const unsigned MulHiOpc = (Mode == MULS16) ? ISD::MULHS : ISD::MULHU;

	  SDLoc DL(N);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  if (NumElts % 2 != 0)
	    return SDValue();

	  const unsigned RegSize = 128;
	  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

	  // Shrink both multiply operands to i16 elements.
	  SDValue NarrowLHS = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, LHS);
	  SDValue NarrowRHS = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, RHS);

	  if (ExperimentalVectorWideningLegalization ||
	      NumElts >= OpsVT.getVectorNumElements()) {
	    // Low half of the product (pmullw). The 8-bit modes need nothing more
	    // than an extension of it.
	    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NarrowLHS, NarrowRHS);
	    if (Is8BitMode) {
	      unsigned ExtOpc = (Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	      return DAG.getNode(ExtOpc, DL, VT, MulLo);
	    }

	    MVT HalfResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
	    // High half of the product (pmulhw/pmulhuw), needed by the 16-bit modes.
	    SDValue MulHi =
	        DAG.getNode(MulHiOpc, DL, ReducedVT, NarrowLHS, NarrowRHS);

	    // Interleave NumElts/2 consecutive elements of MulLo and MulHi starting
	    // at FirstElt, then view each lo/hi pair as one i32. FirstElt == 0 acts
	    // as punpcklwd and FirstElt == NumElts/2 as punpckhwd.
	    auto InterleaveHalf = [&](unsigned FirstElt) {
	      SmallVector<int, 16> Mask(NumElts);
	      for (unsigned I = 0, E = NumElts / 2; I != E; ++I) {
	        Mask[2 * I] = FirstElt + I;
	        Mask[2 * I + 1] = FirstElt + I + NumElts;
	      }
	      SDValue Shuffled =
	          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, Mask);
	      return DAG.getBitcast(HalfResVT, Shuffled);
	    };

	    SDValue ResLo = InterleaveHalf(0);
	    SDValue ResHi = InterleaveHalf(NumElts / 2);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	  }

	  // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
	  // to legalize the mul explicitly because implicit legalization for type
	  // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
	  // instructions which will not exist when we explicitly legalize it by
	  // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
	  // <4 x i16> undef).
	  //
	  // FIXME: We may be able to handle non-concatenated vectors by insertion.
	  unsigned ReducedBits = ReducedVT.getSizeInBits();
	  if (RegSize % ReducedBits != 0)
	    return SDValue();

	  // Pad each narrowed operand with undef up to a full 128-bit register.
	  SmallVector<SDValue, 16> Parts(RegSize / ReducedBits,
	                                 DAG.getUNDEF(ReducedVT));
	  Parts[0] = NarrowLHS;
	  SDValue WideLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Parts);
	  Parts[0] = NarrowRHS;
	  SDValue WideRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Parts);

	  MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);

	  // Low half of the product (pmullw) is needed in every mode.
	  SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, WideLHS, WideRHS);

	  SDValue WideRes;
	  if (Is8BitMode) {
	    // The low half is the whole product; extend it in-register to i32.
	    unsigned ExtOpc = (Mode == MULU8) ? ISD::ZERO_EXTEND_VECTOR_INREG
	                                      : ISD::SIGN_EXTEND_VECTOR_INREG;
	    WideRes = DAG.getNode(ExtOpc, DL, ResVT, MulLo);
	  } else {
	    // The 16-bit modes also need the high half (pmulhw/pmulhuw); repack
	    // low and high halves into i32 elements.
	    SDValue MulHi = DAG.getNode(MulHiOpc, DL, OpsVT, WideLHS, WideRHS);
	    SDValue Unpacked = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
	    WideRes = DAG.getBitcast(ResVT, Unpacked);
	  }

	  // Make sure the type of the result is VT.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Expand a multiply of N's first operand by one of a handful of special
	/// constants into X86ISD::MUL_IMM nodes (by 3, 5 or 9), shifts and
	/// adds/subs.
	///
	/// All created nodes have type \p VT and location \p DL. Returns an empty
	/// SDValue when \p MulAmt is not one of the handled constants.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, const SDLoc &DL) {
	  SDValue X = N->getOperand(0);

	  // Finish a scaled value by adding or subtracting X once.
	  auto AddOrSubX = [&](SDValue Scaled, bool IsAdd) {
	    return DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, DL, VT, Scaled, X);
	  };

	  // ((X * Mult) << Shift) +/- X
	  auto MulShlAddOrSub = [&](int Mult, int Shift, bool IsAdd) {
	    SDValue Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, X,
	                                 DAG.getConstant(Mult, DL, VT));
	    Scaled = DAG.getNode(ISD::SHL, DL, VT, Scaled,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    return AddOrSubX(Scaled, IsAdd);
	  };

	  // ((X * Mul1) * Mul2) +/- X
	  auto MulMulAddOrSub = [&](int Mul1, int Mul2, bool IsAdd) {
	    SDValue Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, X,
	                                 DAG.getConstant(Mul1, DL, VT));
	    Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Scaled,
	                         DAG.getConstant(Mul2, DL, VT));
	    return AddOrSubX(Scaled, IsAdd);
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11: // 11 = (5 << 1) + 1
	    return MulShlAddOrSub(5, 1, /*IsAdd=*/true);
	  case 21: // 21 = (5 << 2) + 1
	    return MulShlAddOrSub(5, 2, /*IsAdd=*/true);
	  case 41: // 41 = (5 << 3) + 1
	    return MulShlAddOrSub(5, 3, /*IsAdd=*/true);
	  case 22: { // 22 = ((5 << 2) + 1) + 1
	    SDValue Times21 = MulShlAddOrSub(5, 2, /*IsAdd=*/true);
	    return DAG.getNode(ISD::ADD, DL, VT, X, Times21);
	  }
	  case 19: // 19 = (9 << 1) + 1
	    return MulShlAddOrSub(9, 1, /*IsAdd=*/true);
	  case 37: // 37 = (9 << 2) + 1
	    return MulShlAddOrSub(9, 2, /*IsAdd=*/true);
	  case 73: // 73 = (9 << 3) + 1
	    return MulShlAddOrSub(9, 3, /*IsAdd=*/true);
	  case 13: // 13 = (3 << 2) + 1
	    return MulShlAddOrSub(3, 2, /*IsAdd=*/true);
	  case 23: // 23 = (3 << 3) - 1
	    return MulShlAddOrSub(3, 3, /*IsAdd=*/false);
	  case 26: // 26 = (5 * 5) + 1
	    return MulMulAddOrSub(5, 5, /*IsAdd=*/true);
	  case 28: // 28 = (9 * 3) + 1
	    return MulMulAddOrSub(9, 3, /*IsAdd=*/true);
	  case 29: { // 29 = ((9 * 3) + 1) + 1
	    SDValue Times28 = MulMulAddOrSub(9, 3, /*IsAdd=*/true);
	    return DAG.getNode(ISD::ADD, DL, VT, X, Times28);
	  }
	  }

	  // Another trick: if the constant is a power of 2 plus 2/4/8, use a shift
	  // followed by a single LEA. Clearing the lowest set bit leaves a power of
	  // two exactly when the constant is a sum of two powers of two.
	  // TODO: We can do this even without LEA at a cost of two shifts and an add.
	  uint64_t WithoutLowBit = MulAmt & (MulAmt - 1);
	  if (!isPowerOf2_64(WithoutLowBit))
	    return SDValue();

	  // The smaller power must be 2, 4 or 8.
	  unsigned ScaleShift = countTrailingZeros(MulAmt);
	  if (ScaleShift < 1 || ScaleShift >= 4)
	    return SDValue();

	  unsigned ShiftAmt = Log2_64(WithoutLowBit);
	  SDValue BigPart = DAG.getNode(ISD::SHL, DL, VT, X,
	                                DAG.getConstant(ShiftAmt, DL, MVT::i8));
	  SDValue SmallPart = DAG.getNode(ISD::SHL, DL, VT, X,
	                                  DAG.getConstant(ScaleShift, DL, MVT::i8));
	  return DAG.getNode(ISD::ADD, DL, VT, BigPart, SmallPart);
	}

	/// Turn a vXi32 multiply into X86ISD::VPMADDWD on the operands bitcast to
	/// i16 elements, when the upper 17 bits of every element of both operands
	/// are known to be zero.
	///
	/// PMADDWD is always at least as quick as PMULLD, except on KNL. Returns an
	/// empty SDValue when the combine does not apply.
	static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2() || Subtarget.isPMADDWDSlow())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Only vXi32 vectors are supported.
	  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  // The vXi16 type with twice as many elements must be legal; this covers
	  // AVX512 without BWI. v2i32 is also accepted if it will be widened.
	  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
	  bool WillBeWidened =
	      ExperimentalVectorWideningLegalization && VT == MVT::v2i32;
	  if (!WillBeWidened && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // If both operands are zero extended by two steps (from 8 bits or fewer)
	  // and SSE4.1 is unavailable, reducing the vmul width is the better choice.
	  auto IsZExtFromByteOrLess = [](SDValue V) {
	    return V.getOpcode() == ISD::ZERO_EXTEND &&
	           V.getOperand(0).getScalarValueSizeInBits() <= 8;
	  };
	  if (!Subtarget.hasSSE41() && IsZExtFromByteOrLess(LHS) &&
	      IsZExtFromByteOrLess(RHS))
	    return SDValue();

	  // Require the top 17 bits of every element of both operands to be zero.
	  APInt HighMask = APInt::getHighBitsSet(32, 17);
	  if (!DAG.MaskedValueIsZero(RHS, HighMask) ||
	      !DAG.MaskedValueIsZero(LHS, HighMask))
	    return SDValue();

	  // Use SplitOpsAndApply to handle AVX splitting. Each piece yields one i32
	  // element per 32 bits of its i16 input.
	  auto BuildPMADDWD = [](SelectionDAG &DAG, const SDLoc &DL,
	                         ArrayRef<SDValue> Ops) {
	    MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
	    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops);
	  };
	  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
	                          { DAG.getBitcast(WVT, LHS), DAG.getBitcast(WVT, RHS) },
	                          BuildPMADDWD);
	}

	/// Turn a vXi64 multiply into X86ISD::PMULDQ or X86ISD::PMULUDQ when both
	/// operands are known to be sign- or zero-extended from their low 32 bits.
	///
	/// Returns an empty SDValue when neither form applies.
	static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Only vXi64 vectors with a power-of-two element count of at least 2.
	  bool IsSupportedVT = VT.isVector() &&
	                       VT.getVectorElementType() == MVT::i64 &&
	                       VT.getVectorNumElements() >= 2 &&
	                       isPowerOf2_32(VT.getVectorNumElements());
	  if (!IsSupportedVT)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Emit the given opcode on {LHS, RHS}, letting SplitOpsAndApply split the
	  // operands as needed; every piece keeps the type of its first operand.
	  auto EmitSplitMul = [&](unsigned Opc) {
	    auto Builder = [Opc](SelectionDAG &DAG, const SDLoc &DL,
	                         ArrayRef<SDValue> Ops) {
	      return DAG.getNode(Opc, DL, Ops[0].getValueType(), Ops);
	    };
	    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { LHS, RHS },
	                            Builder, /*CheckBWI=*/false);
	  };

	  // PMULDQ returns the 64-bit result of the signed multiplication of the
	  // lower 32 bits. It is usable if the sign bits stretch that far.
	  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(LHS) > 32 &&
	      DAG.ComputeNumSignBits(RHS) > 32)
	    return EmitSplitMul(X86ISD::PMULDQ);

	  // If the upper 32 bits of both operands are zero, a single pmuludq works.
	  APInt UpperHalf = APInt::getHighBitsSet(64, 32);
	  if (DAG.MaskedValueIsZero(LHS, UpperHalf) &&
	      DAG.MaskedValueIsZero(RHS, UpperHalf))
	    return EmitSplitMul(X86ISD::PMULUDQ);

	  return SDValue();
	}

	/// DAG combine for multiply nodes.
	///
	/// First tries the vector rewrites (PMADDWD, PMULDQ/PMULUDQ, and width
	/// reduction before legalization). Otherwise, for a scalar i32/i64 multiply
	/// by a constant, tries to replace it with cheaper operations, e.g.
	/// LEA + SHL or LEA + LEA. Returns an empty SDValue when nothing applies.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);

	  if (SDValue PMAdd = combineMulToPMADDWD(N, DAG, Subtarget))
	    return PMAdd;

	  if (SDValue PMul = combineMulToPMULDQ(N, DAG, Subtarget))
	    return PMul;

	  if (DCI.isBeforeLegalize() && VT.isVector())
	    return reduceVMULWidth(N, DAG, Subtarget);

	  if (!MulConstantOptimization)
	    return SDValue();

	  // An imul is usually smaller than the alternative sequence.
	  if (DAG.getMachineFunction().getFunction().hasMinSize())
	    return SDValue();

	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  if (VT != MVT::i64 && VT != MVT::i32)
	    return SDValue();

	  // Only multiplies by a constant that is not a power of two are handled.
	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!C || isPowerOf2_64(C->getZExtValue()))
	    return SDValue();

	  int64_t SignMulAmt = C->getSExtValue();
	  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
	  const bool IsNegative = SignMulAmt < 0;
	  uint64_t AbsMulAmt = IsNegative ? -SignMulAmt : SignMulAmt;

	  SDLoc DL(N);
	  SDValue X = N->getOperand(0);

	  // 0 - V
	  auto Negate = [&](SDValue V) {
	    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), V);
	  };
	  // X << log2(Pow2)
	  auto ShlXByLog2 = [&](uint64_t Pow2) {
	    return DAG.getNode(ISD::SHL, DL, VT, X,
	                       DAG.getConstant(Log2_64(Pow2), DL, MVT::i8));
	  };
	  // V * Amt, as a shift when Amt is a power of two and as MUL_IMM otherwise.
	  auto ScaleBy = [&](SDValue V, uint64_t Amt) {
	    if (isPowerOf2_64(Amt))
	      return DAG.getNode(ISD::SHL, DL, VT, V,
	                         DAG.getConstant(Log2_64(Amt), DL, MVT::i8));
	    return DAG.getNode(X86ISD::MUL_IMM, DL, VT, V,
	                       DAG.getConstant(Amt, DL, VT));
	  };

	  // Magnitudes 3, 5 and 9 need a single MUL_IMM, negated if required.
	  if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
	    SDValue Scaled = DAG.getNode(X86ISD::MUL_IMM, DL, VT, X,
	                                 DAG.getConstant(AbsMulAmt, DL, VT));
	    return IsNegative ? Negate(Scaled) : Scaled;
	  }

	  // Try to split the magnitude into (9, 5 or 3) * MulAmt2, preferring the
	  // largest of the three factors.
	  uint64_t MulAmt1 = 0;
	  uint64_t MulAmt2 = 0;
	  for (uint64_t Factor : {uint64_t(9), uint64_t(5), uint64_t(3)}) {
	    if (AbsMulAmt % Factor != 0)
	      continue;
	    MulAmt1 = Factor;
	    MulAmt2 = AbsMulAmt / Factor;
	    break;
	  }

	  SDValue NewMul;
	  // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
	  bool SecondIsSmallFactor =
	      !IsNegative && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9);
	  if (MulAmt2 && (isPowerOf2_64(MulAmt2) || SecondIsSmallFactor)) {
	    // If the second multiplier is a power of two, issue it first. We want
	    // the multiply by 3, 5, or 9 to be folded into the addressing mode
	    // unless the lone use is an add. The exception applies only to positive
	    // amounts, since a negate would prevent address-mode use anyway.
	    bool LoneUseIsAdd = !IsNegative && N->hasOneUse() &&
	                        N->use_begin()->getOpcode() == ISD::ADD;
	    uint64_t FirstAmt = MulAmt1;
	    uint64_t SecondAmt = MulAmt2;
	    if (isPowerOf2_64(MulAmt2) && !LoneUseIsAdd)
	      std::swap(FirstAmt, SecondAmt);

	    NewMul = ScaleBy(X, FirstAmt);
	    NewMul = ScaleBy(NewMul, SecondAmt);

	    // Negate the result.
	    if (IsNegative)
	      NewMul = Negate(NewMul);
	  } else if (!Subtarget.slowLEA()) {
	    NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
	  }

	  if (NewMul)
	    return NewMul;

	  // Last resort: constants within 1 or 2 of a power of two.
	  assert(C->getZExtValue() != 0 &&
	         C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	         "Both cases that could cause potential overflows should have "
	         "already been handled.");

	  if (isPowerOf2_64(AbsMulAmt - 1)) {
	    // (mul x, 2^N + 1) => (add (shl x, N), x)
	    NewMul = DAG.getNode(ISD::ADD, DL, VT, X, ShlXByLog2(AbsMulAmt - 1));
	    // To negate, subtract the number from zero.
	    if (IsNegative)
	      NewMul = Negate(NewMul);
	  } else if (isPowerOf2_64(AbsMulAmt + 1)) {
	    // (mul x, 2^N - 1) => (sub (shl x, N), x)
	    SDValue Shifted = ShlXByLog2(AbsMulAmt + 1);
	    // To negate, reverse the operands of the subtract.
	    NewMul = IsNegative ? DAG.getNode(ISD::SUB, DL, VT, X, Shifted)
	                        : DAG.getNode(ISD::SUB, DL, VT, Shifted, X);
	  } else if (!IsNegative && isPowerOf2_64(AbsMulAmt - 2)) {
	    // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
	    NewMul = ShlXByLog2(AbsMulAmt - 2);
	    NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, X);
	    NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, X);
	  } else if (!IsNegative && isPowerOf2_64(AbsMulAmt + 2)) {
	    // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
	    NewMul = ShlXByLog2(AbsMulAmt + 2);
	    NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, X);
	    NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, X);
	  }

	  return NewMul;
	}

	/// DAG combine for SHL nodes. Two folds are tried:
	///  - (shl (and setcc_c, c1), c2) -> (and setcc_c, (c1 << c2)) for scalars;
	///  - (shl V, splat 1) -> (add V, V) for vectors.
	/// Returns an empty SDValue when neither applies.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(N1);
	  EVT VT = N0.getValueType();

	  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	  // since the result of setcc_c is all zeros or all ones.
	  if (VT.isInteger() && !VT.isVector() &&
	      ShAmtC && N0.getOpcode() == ISD::AND &&
	      N0.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue Masked = N0.getOperand(0);
	    APInt ShiftedMask =
	        N0.getConstantOperandAPInt(1).shl(ShAmtC->getAPIntValue());

	    auto IsSetCCCarry = [](SDValue V) {
	      return V.getOpcode() == X86ISD::SETCC_CARRY;
	    };

	    // Bit-widened setcc_c values can be handled too, provided the mask is
	    // checked carefully so that semantics are preserved. For a zero (or
	    // any) extended setcc_c the transform is not safe if C1 << C2 exceeds
	    // the bit width of the underlying setcc_c. Consider:
	    //   zext(setcc_c)                 -> i32 0x0000FFFF
	    //   c1                            -> i32 0x0000FFFF
	    //   c2                            -> i32 0x00000001
	    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
	    bool MaskOK = false;
	    if (IsSetCCCarry(Masked)) {
	      MaskOK = true;
	    } else {
	      switch (Masked.getOpcode()) {
	      default:
	        break;
	      case ISD::SIGN_EXTEND:
	        MaskOK = IsSetCCCarry(Masked.getOperand(0));
	        break;
	      case ISD::ZERO_EXTEND:
	      case ISD::ANY_EXTEND:
	        MaskOK =
	            IsSetCCCarry(Masked.getOperand(0)) &&
	            ShiftedMask.isIntN(Masked.getOperand(0).getValueSizeInBits());
	        break;
	      }
	    }

	    if (MaskOK && ShiftedMask != 0) {
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT, Masked,
	                         DAG.getConstant(ShiftedMask, DL, VT));
	    }
	  }

	  // Hardware support for vector shifts is sparse which makes us scalarize
	  // the vector operations in many cases. Also, on sandybridge ADD is faster
	  // than shl.
	  // (shl V, 1) -> add V,V
	  auto *ShAmtBV = dyn_cast<BuildVectorSDNode>(N1);
	  auto *SplatC = ShAmtBV ? ShAmtBV->getConstantSplatNode() : nullptr;
	  if (!SplatC)
	    return SDValue();

	  assert(N0.getValueType().isVector() && "Invalid vector shift type");
	  // Every element is shifted by one, which is better expressed as an ADD of
	  // the value with itself.
	  if (SplatC->getAPIntValue() == 1)
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);

	  return SDValue();
	}

	/// DAG combine for scalar SRA nodes.
	///
	/// Folds (sra (shl a, ShlC), SarC), where ShlC is Size-8, Size-16 or Size-32
	/// (i.e. one of [56,48,32,24,16]) and the shl has a single use, into
	///   (sext_inreg a)                         if SarC == ShlC,
	///   (shl (sext_inreg a), ShlC - SarC)      if SarC <  ShlC,
	///   (sra (sext_inreg a), SarC - ShlC)      if SarC >  ShlC.
	///
	/// sexts in X86 are MOVs. The MOVs have the same code size as the shifts
	/// (only a shift by 1 is smaller), but have 2 advantages over a shift:
	///   1. MOVs can write to a register that differs from the source.
	///   2. MOVs accept memory operands.
	static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned Size = VT.getSizeInBits();

	  // Need a scalar shift by a constant amount ...
	  if (VT.isVector() || N1.getOpcode() != ISD::Constant)
	    return SDValue();
	  // ... of a single-use SHL that itself shifts by a constant amount.
	  bool IsOneUseConstShl = N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
	                          N0.getOperand(1).getOpcode() == ISD::Constant;
	  if (!IsOneUseConstShl)
	    return SDValue();

	  SDValue Src = N0.getOperand(0);
	  APInt ShlAmt = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	  APInt SarAmt = cast<ConstantSDNode>(N1)->getAPIntValue();
	  EVT ShAmtVT = N1.getValueType();

	  if (SarAmt.isNegative())
	    return SDValue();

	  for (MVT NarrowVT : { MVT::i8, MVT::i16, MVT::i32 }) {
	    unsigned NarrowBits = NarrowVT.getSizeInBits();
	    // Skip types that are not narrower than VT (no sext exists for them).
	    if (NarrowBits >= Size)
	      continue;
	    // The SHL must move exactly the low NarrowBits to the top of the value.
	    unsigned HighBits = Size - NarrowBits;
	    if (ShlAmt != HighBits)
	      continue;

	    SDLoc DL(N);
	    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Src,
	                               DAG.getValueType(NarrowVT));
	    // What remains of the right shift once the SHL has been cancelled.
	    APInt Residual = SarAmt - HighBits;
	    if (Residual == 0)
	      return SExt;
	    if (Residual.isNegative())
	      return DAG.getNode(ISD::SHL, DL, VT, SExt,
	                         DAG.getConstant(-Residual, DL, ShAmtVT));
	    return DAG.getNode(ISD::SRA, DL, VT, SExt,
	                       DAG.getConstant(Residual, DL, ShAmtVT));
	  }
	  return SDValue();
	}

	/// DAG combine for SRL nodes, run only after DAG legalization.
	///
	/// Rewrites srl (and X, AndC), ShiftC into and (srl X, ShiftC),
	/// (AndC >> ShiftC) when that shrinks the mask constant to fit in 8 or
	/// 32 bits. Returns an empty SDValue otherwise.
	static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // Only do this on the last DAG combine as it can interfere with other
	  // combines.
	  if (!DCI.isAfterLegalizeDAG())
	    return SDValue();

	  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
	  // TODO: This is a generic DAG combine that became an x86-only combine to
	  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
	  // and-not ('andn').
	  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
	    return SDValue();

	  auto *ShAmtC = dyn_cast<ConstantSDNode>(N1);
	  auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (!ShAmtC || !MaskC)
	    return SDValue();

	  APInt OldMask = MaskC->getAPIntValue();

	  // If this can be matched by a zero extend, don't optimize.
	  if (OldMask.isMask()) {
	    unsigned LowOnes = OldMask.countTrailingOnes();
	    if (LowOnes >= 8 && isPowerOf2_32(LowOnes))
	      return SDValue();
	  }

	  // If the constant mask can be shrunk to fit in 8 bits or 32 bits, this
	  // transform should reduce code size. It may also enable secondary
	  // transforms from improved known-bits analysis or instruction selection.
	  APInt NewMask = OldMask.lshr(ShAmtC->getAPIntValue());
	  unsigned OldBits = OldMask.getMinSignedBits();
	  unsigned NewBits = NewMask.getMinSignedBits();
	  bool NowFits8 = OldBits > 8 && NewBits <= 8;
	  bool NowFits32 = OldBits > 32 && NewBits <= 32;
	  if (!NowFits8 && !NowFits32)
	    return SDValue();

	  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	  SDLoc DL(N);
	  SDValue NewMaskNode = DAG.getConstant(NewMask, DL, VT);
	  SDValue ShiftedX = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
	  return DAG.getNode(ISD::AND, DL, VT, ShiftedX, NewMaskNode);
	}

	/// DAG combine for X86ISD::PACKSS / X86ISD::PACKUS nodes.
	///
	/// Tries, in order:
	///  1. constant folding when both inputs are constant (or undef) and used
	///     only by this node;
	///  2. on AVX512, merging a pack of (truncate v8i32), undef into one larger
	///     truncate;
	///  3. the generic recursive shuffle combine.
	/// Returns an empty SDValue when nothing applies. \p DCI is unused.
	static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
	         "Unexpected pack opcode");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
	  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
	  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         "Unexpected PACKSS/PACKUS input type");

	  bool IsSigned = (X86ISD::PACKSS == Opcode);

	  // Constant Folding.
	  APInt UndefElts0, UndefElts1;
	  SmallVector<APInt, 32> EltBits0, EltBits1;
	  if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
	      (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
	      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
	      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
	    unsigned NumLanes = VT.getSizeInBits() / 128;
	    unsigned NumDstElts = VT.getVectorNumElements();
	    unsigned NumSrcElts = NumDstElts / 2;
	    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;

	    APInt Undefs(NumDstElts, 0);
	    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
	    // Within each 128-bit lane, the first half of the destination elements
	    // comes from N0 and the second half from N1.
	    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	        unsigned DstIdx = Lane * NumDstEltsPerLane + Elt;
	        bool FromN1 = Elt >= NumSrcEltsPerLane;
	        auto &UndefElts = (FromN1 ? UndefElts1 : UndefElts0);
	        auto &EltBits = (FromN1 ? EltBits1 : EltBits0);

	        if (UndefElts[SrcIdx]) {
	          Undefs.setBit(DstIdx);
	          continue;
	        }

	        APInt &Val = EltBits[SrcIdx];
	        if (IsSigned) {
	          // PACKSS: Truncate signed value with signed saturation.
	          // Source values less than dst minint are saturated to minint.
	          // Source values greater than dst maxint are saturated to maxint.
	          if (Val.isSignedIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getSignedMinValue(DstBitsPerElt);
	          else
	            Val = APInt::getSignedMaxValue(DstBitsPerElt);
	        } else {
	          // PACKUS: Truncate signed value with unsigned saturation.
	          // Source values less than zero are saturated to zero.
	          // Source values greater than dst maxuint are saturated to maxuint.
	          if (Val.isIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getNullValue(DstBitsPerElt);
	          else
	            Val = APInt::getAllOnesValue(DstBitsPerElt);
	        }
	        Bits[DstIdx] = Val;
	      }
	    }

	    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
	  }

	  // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
	  // truncate to create a larger truncate.
	  if (Subtarget.hasAVX512() &&
	      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
	      N0.getOperand(0).getValueType() == MVT::v8i32) {
	    // The pack must not saturate: the i16 elements need more than 8 sign
	    // bits (PACKSS) or a zero upper byte (PACKUS).
	    if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
	        (!IsSigned &&
	         DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
	      SDLoc dl(N);
	      if (Subtarget.hasVLX())
	        return DAG.getNode(X86ISD::VTRUNC, dl, VT, N0.getOperand(0));

	      // Widen input to v16i32 so we can truncate that.
	      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
	                                   N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Concat);
	    }
	  }

	  // Attempt to combine as shuffle.
	  SDValue Op(N, 0);
	  if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	    return Res;

	  return SDValue();
	}

	/// Combine an X86ISD::VSHL / VSRA / VSRL node, i.e. a vector shift whose
	/// amount is supplied by the second operand.
	///
	/// Returns a replacement value, SDValue(N, 0) when
	/// SimplifyDemandedVectorElts reports a change, or an empty SDValue when
	/// nothing applies.
	static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     const X86Subtarget &Subtarget) {
	  const unsigned ShiftOpc = N->getOpcode();
	  assert((ShiftOpc == X86ISD::VSHL || ShiftOpc == X86ISD::VSRA ||
	          ShiftOpc == X86ISD::VSRL) &&
	         "Unexpected shift opcode");
	  const EVT ResVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  SDValue Amt = N->getOperand(1);
	  SDLoc DL(N);

	  // Shifting an all-zeros vector yields all zeros.
	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return DAG.getConstant(0, DL, ResVT);

	  // When the amount operand is a constant, re-emit the node through the
	  // shift-by-constant helper using the low 64-bit element as the amount.
	  APInt AmtUndefElts;
	  SmallVector<APInt, 32> AmtEltBits;
	  const bool HasConstAmt = getTargetConstantBitsFromNode(
	      Amt, 64, AmtUndefElts, AmtEltBits, true, false);
	  if (HasConstAmt) {
	    const unsigned UniformOpc = getTargetVShiftUniformOpcode(ShiftOpc, false);
	    return getTargetVShiftByConstNode(UniformOpc, DL, ResVT.getSimpleVT(), Src,
	                                      AmtEltBits[0].getZExtValue(), DAG);
	  }

	  // Otherwise try to simplify with every result element demanded.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt KnownUndef, KnownZero;
	  APInt AllElts = APInt::getAllOnesValue(ResVT.getVectorNumElements());
	  if (!TLI.SimplifyDemandedVectorElts(SDValue(N, 0), AllElts, KnownUndef,
	                                      KnownZero, DCI))
	    return SDValue();

	  return SDValue(N, 0);
	}

	/// Combine a vector shift by an immediate amount (X86ISD::VSHLI, VSRAI or
	/// VSRLI). Operand 0 is the value being shifted and operand 1 is the i8
	/// constant shift amount.
	///
	/// Returns a replacement value, SDValue(N, 0) when SimplifyDemandedBits
	/// reports a change, or an empty SDValue when no combine applies.
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::VSHLI == Opcode \|\| X86ISD::VSRAI == Opcode \|\|
	X86ISD::VSRLI == Opcode) &&
	"Unexpected shift opcode");
	// VSHLI and VSRLI are the logical shifts; VSRAI is the arithmetic one.
	bool LogicalShift = X86ISD::VSHLI == Opcode \|\| X86ISD::VSRLI == Opcode;
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
	"Unexpected value type");
	assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");

	// Out of range logical bit shifts are guaranteed to be zero.
	// Out of range arithmetic bit shifts splat the sign bit.
	unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
	if (ShiftVal >= NumBitsPerElt) {
	if (LogicalShift)
	return DAG.getConstant(0, SDLoc(N), VT);
	else
	// Clamp so the remaining folds below see an in-range arithmetic shift.
	ShiftVal = NumBitsPerElt - 1;
	}

	// Shift N0 by zero -> N0.
	if (!ShiftVal)
	return N0;

	// Shift zero -> zero.
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return DAG.getConstant(0, SDLoc(N), VT);

	// Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
	// clamped to (NumBitsPerElt - 1).
	if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
	unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
	unsigned NewShiftVal = ShiftVal + ShiftVal2;
	if (NewShiftVal >= NumBitsPerElt)
	NewShiftVal = NumBitsPerElt - 1;
	return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
	DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
	}

	// We can decode 'whole byte' logical bit shifts as shuffles.
	if (LogicalShift && (ShiftVal % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;
	}

	// Constant Folding.
	// Only attempted when this node is the sole user of N0; each constant
	// element is then shifted in place according to the opcode.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	if (N->isOnlyUserOf(N0.getNode()) &&
	getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
	assert(EltBits.size() == VT.getVectorNumElements() &&
	"Unexpected shift value type");
	for (APInt &Elt : EltBits) {
	if (X86ISD::VSHLI == Opcode)
	Elt <<= ShiftVal;
	else if (X86ISD::VSRAI == Opcode)
	Elt.ashrInPlace(ShiftVal);
	else
	Elt.lshrInPlace(ShiftVal);
	}
	return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	// Last resort: generic demanded-bits simplification with all bits of each
	// element demanded.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.SimplifyDemandedBits(SDValue(N, 0),
	APInt::getAllOnesValue(NumBitsPerElt), DCI))
	return SDValue(N, 0);

	return SDValue();
	}

	/// Combine an X86ISD::PINSRB (v16i8) or X86ISD::PINSRW (v8i16) node.
	///
	/// First tries a demanded-bits simplification with every bit of an element
	/// demanded, then tries to fold the insertion into a shuffle. Returns an
	/// empty SDValue when neither applies.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  const EVT ResVT = N->getValueType(0);
	  assert(((N->getOpcode() == X86ISD::PINSRB && ResVT == MVT::v16i8) ||
	          (N->getOpcode() == X86ISD::PINSRW && ResVT == MVT::v8i16)) &&
	         "Unexpected vector insertion");

	  SDValue Root(N, 0);

	  const APInt AllEltBits = APInt::getAllOnesValue(ResVT.getScalarSizeInBits());
	  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(Root, AllEltBits, DCI))
	    return Root;

	  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
	  if (SDValue Shuffled = combineX86ShufflesRecursively(Root, DAG, Subtarget))
	    return Shuffled;

	  return SDValue();
	}

	/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
	/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
	/// OR -> CMPNEQSS.
	///
	/// Returns the replacement value, or an empty SDValue when the pattern does
	/// not match or a user of \p N still expects flags.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Filled in by isAndOrOfSetCCs; not otherwise read in this function.
	unsigned opcode;

	// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
	// we're requiring SSE2 for both.
	if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	// Operand 1 of each setcc is the flag-producing compare.
	SDValue CMP0 = N0.getOperand(1);
	SDValue CMP1 = N1.getOperand(1);
	SDLoc DL(N);

	// The SETCCs should both refer to the same CMP.
	if (CMP0.getOpcode() != X86ISD::CMP \|\| CMP0 != CMP1)
	return SDValue();

	SDValue CMP00 = CMP0->getOperand(0);
	SDValue CMP01 = CMP0->getOperand(1);
	EVT VT = CMP00.getValueType();

	if (VT == MVT::f32 \|\| VT == MVT::f64) {
	bool ExpectingFlags = false;
	// Check for any users that want flags:
	// Any user opcode not explicitly listed as harmless below lands on
	// 'default' and is conservatively treated as wanting flags.
	for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	!ExpectingFlags && UI != UE; ++UI)
	switch (UI->getOpcode()) {
	default:
	case ISD::BR_CC:
	case ISD::BRCOND:
	case ISD::SELECT:
	ExpectingFlags = true;
	break;
	case ISD::CopyToReg:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	break;
	}

	if (!ExpectingFlags) {
	// Operand 0 of each setcc is its X86 condition code.
	enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
	enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

	// Canonicalize so that the E/NE condition, if any, ends up in cc0.
	if (cc1 == X86::COND_E \|\| cc1 == X86::COND_NE) {
	X86::CondCode tmp = cc0;
	cc0 = cc1;
	cc1 = tmp;
	}

	if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) \|\|
	(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
	// FIXME: need symbolic constants for these magic numbers.
	// See X86ATTInstPrinter.cpp:printSSECC().
	unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
	if (Subtarget.hasAVX512()) {
	SDValue FSetCC =
	DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
	DAG.getConstant(x86cc, DL, MVT::i8));
	// Need to fill with zeros to ensure the bitcast will produce zeroes
	// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
	SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
	DAG.getConstant(0, DL, MVT::v16i1),
	FSetCC, DAG.getIntPtrConstant(0, DL));
	return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
	N->getSimpleValueType(0));
	}
	SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
	CMP00.getValueType(), CMP00, CMP01,
	DAG.getConstant(x86cc, DL,
	MVT::i8));

	bool is64BitFP = (CMP00.getValueType() == MVT::f64);
	MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

	if (is64BitFP && !Subtarget.is64Bit()) {
	// On a 32-bit target, we cannot bitcast the 64-bit float to a
	// 64-bit integer, since that's not a legal type. Since
	// OnesOrZeroesF is all ones or all zeroes, we don't need all the
	// bits, but can do this little dance to extract the lowest 32 bits
	// and work with those going forward.
	SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	OnesOrZeroesF);
	SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
	Vector32, DAG.getIntPtrConstant(0, DL));
	IntVT = MVT::i32;
	}

	// Reinterpret the FP mask as an integer, keep only bit 0 and narrow the
	// result to i8.
	SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	DAG.getConstant(1, DL, IntVT));
	SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	ANDed);
	return OneBitOfTruth;
	}
	}
	}
	}
	return SDValue();
	}

	// Match (xor X, -1) -> X.
	// Match extract_subvector(xor X, -1) -> extract_subvector(X).
	// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
	//
	// Bitcasts on V are looked through first. Returns the un-negated value, or an
	// empty SDValue if V is not recognized as a NOT. The extract_subvector and
	// concat_vectors cases build new nodes in DAG to express the result.
	static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
	V = peekThroughBitcasts(V);
	// Direct case: xor with an all-ones build vector as operand 1.
	if (V.getOpcode() == ISD::XOR &&
	ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
	return V.getOperand(0);
	// Subvector case: only when extracting at index 0 or when the source vector
	// has a single use.
	if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	(isNullConstant(V.getOperand(1)) \|\| V.getOperand(0).hasOneUse())) {
	if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
	// Cast back to the source vector type before re-extracting.
	Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
	Not, V.getOperand(1));
	}
	}
	// Concatenation case: every concatenated piece must itself be a NOT; each
	// piece is replaced in place by its un-negated value.
	SmallVector<SDValue, 2> CatOps;
	if (collectConcatOps(V.getNode(), CatOps)) {
	for (SDValue &CatOp : CatOps) {
	SDValue NotCat = IsNOT(CatOp, DAG);
	if (!NotCat) return SDValue();
	CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
	}
	return SDValue();
	}

	/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
	///
	/// Only 128-, 256- and 512-bit vector results are handled. Either operand of
	/// the AND may be the inverted one; operand 0 is checked first.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  MVT VT = N->getSimpleValueType(0);
	  const bool IsSupportedWidth =
	      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
	  if (!IsSupportedWidth)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Find which side is the NOT; the other side is passed through unchanged.
	  SDValue Inverted = IsNOT(LHS, DAG);
	  SDValue Other = RHS;
	  if (!Inverted) {
	    Inverted = IsNOT(RHS, DAG);
	    Other = LHS;
	  }
	  if (!Inverted)
	    return SDValue();

	  SDValue AndnLHS = DAG.getBitcast(VT, Inverted);
	  SDValue AndnRHS = DAG.getBitcast(VT, Other);
	  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, AndnLHS, AndnRHS);
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
	// register, while compares and selects usually operate on YMM-sized
	// registers. Mixing the two types creates horrible code, so this method
	// optimizes some of the transition sequences: an extend of a bitwise logic op
	// whose inputs were truncated from the wide type is rewritten as the same
	// logic op performed directly in the wide type.
	// Even with AVX-512 this is still useful for removing casts around logical
	// operations on vXi1 mask types.
	static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  const unsigned ExtOpc = N->getOpcode();
	  EVT WideVT = N->getValueType(0);
	  assert(WideVT.isVector() && "Expected vector type");

	  assert((ExtOpc == ISD::ANY_EXTEND || ExtOpc == ISD::ZERO_EXTEND ||
	          ExtOpc == ISD::SIGN_EXTEND) &&
	         "Invalid Node");

	  SDValue Narrow = N->getOperand(0);
	  EVT NarrowVT = Narrow.getValueType();

	  // Only XOR / AND / OR are candidates for widening.
	  const unsigned LogicOpc = Narrow->getOpcode();
	  switch (LogicOpc) {
	  case ISD::XOR:
	  case ISD::AND:
	  case ISD::OR:
	    break;
	  default:
	    return SDValue();
	  }

	  SDValue LHS = Narrow->getOperand(0);
	  SDValue RHS = Narrow->getOperand(1);
	  SDLoc DL(Narrow);

	  // The left side has to be a trunc.
	  if (LHS.getOpcode() != ISD::TRUNCATE)
	    return SDValue();

	  // ... and its untruncated input must already have the wide result type.
	  if (LHS.getOperand(0).getValueType() != WideVT)
	    return SDValue();

	  // The right side has to be a matching 'trunc' or a constant vector.
	  const bool RHSIsTrunc = RHS.getOpcode() == ISD::TRUNCATE &&
	                          RHS.getOperand(0).getValueType() == WideVT;
	  if (!RHSIsTrunc && !ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
	    return SDValue();

	  if (!DAG.getTargetLoweringInfo().isOperationLegalOrPromote(LogicOpc, WideVT))
	    return SDValue();

	  // Inputs to the new wide operation: strip the truncs, or zero-extend the
	  // constant vector.
	  SDValue WideLHS = LHS.getOperand(0);
	  SDValue WideRHS = RHSIsTrunc
	                        ? RHS.getOperand(0)
	                        : DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, RHS);

	  // Generate the wide operation, then re-apply the extension semantics.
	  SDValue WideOp = DAG.getNode(LogicOpc, DL, WideVT, WideLHS, WideRHS);
	  if (ExtOpc == ISD::ANY_EXTEND)
	    return WideOp;
	  if (ExtOpc == ISD::ZERO_EXTEND)
	    return DAG.getZeroExtendInReg(WideOp, DL, NarrowVT.getScalarType());
	  if (ExtOpc == ISD::SIGN_EXTEND)
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, WideVT, WideOp,
	                       DAG.getValueType(NarrowVT));
	  llvm_unreachable("Unexpected opcode");
	}

	/// If both input operands of a logic op are being cast from floating point
	/// types, try to convert this into a floating point logic node to avoid
	/// unnecessary moves from SSE to integer registers.
	///
	/// Requires both operands to be bitcasts from the same type, which must be
	/// f32 (with SSE1) or f64 (with SSE2). Returns an empty SDValue otherwise.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  if (LHS.getOpcode() != ISD::BITCAST || RHS.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue FPLHS = LHS.getOperand(0);
	  SDValue FPRHS = RHS.getOperand(0);
	  EVT FPVT = FPLHS.getValueType();

	  // Ensure that both types are the same and are legal scalar fp types.
	  if (FPVT != FPRHS.getValueType())
	    return SDValue();
	  const bool LegalF32 = Subtarget.hasSSE1() && FPVT == MVT::f32;
	  const bool LegalF64 = Subtarget.hasSSE2() && FPVT == MVT::f64;
	  if (!LegalF32 && !LegalF64)
	    return SDValue();

	  // Map the integer logic opcode onto its X86 FP counterpart.
	  const unsigned IntOpc = N->getOpcode();
	  unsigned FPOpcode;
	  if (IntOpc == ISD::AND)
	    FPOpcode = X86ISD::FAND;
	  else if (IntOpc == ISD::OR)
	    FPOpcode = X86ISD::FOR;
	  else if (IntOpc == ISD::XOR)
	    FPOpcode = X86ISD::FXOR;
	  else
	    llvm_unreachable("Unexpected input node for FP logic conversion");

	  SDLoc DL(N);
	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, FPVT, FPLHS, FPRHS);
	  return DAG.getBitcast(N->getValueType(0), FPLogic);
	}

	/// If this is a zero/all-bits result that is bitwise-anded with a low bits
	/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
	/// with a shift-right to eliminate loading the vector constant mask value.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDValue Src = peekThroughBitcasts(N->getOperand(0));
	  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));
	  EVT SrcVT = Src.getValueType();
	  EVT MaskVT = MaskOp.getValueType();

	  // Both sides must share one simple integer type once bitcasts are removed.
	  if (SrcVT != MaskVT || !SrcVT.isSimple() || !SrcVT.isInteger())
	    return SDValue();

	  // The second operand must be a constant splat whose value is a low-bits
	  // mask.
	  APInt LowMask;
	  if (!ISD::isConstantSplatVector(MaskOp.getNode(), LowMask))
	    return SDValue();
	  if (!LowMask.isMask())
	    return SDValue();

	  // Don't prevent creation of ANDN.
	  if (isBitwiseNot(Src))
	    return SDValue();

	  if (!SupportedVectorShiftWithImm(SrcVT.getSimpleVT(), Subtarget, ISD::SRL))
	    return SDValue();

	  // Every bit of each element must be a copy of the sign bit, i.e. the
	  // element is zero or all-bits.
	  const unsigned EltBits = SrcVT.getScalarSizeInBits();
	  if (DAG.ComputeNumSignBits(Src) != EltBits)
	    return SDValue();

	  // Shift right so that exactly the masked low bits remain.
	  SDLoc DL(N);
	  const unsigned KeptBits = LowMask.countTrailingOnes();
	  SDValue ShAmt = DAG.getConstant(EltBits - KeptBits, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, SrcVT, Src, ShAmt);
	  return DAG.getBitcast(N->getValueType(0), Shifted);
	}

	// Get the index node from the lowered DAG of a GEP IR instruction with one
	// indexing dimension.
	//
	// Matches an unindexed load whose base pointer is (add (shl Index, ...), ...)
	// and returns Index; returns an empty SDValue for any other shape.
	static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
	  if (!Ld->isIndexed()) {
	    SDValue Addr = Ld->getBasePtr();
	    if (Addr.getOpcode() == ISD::ADD) {
	      SDValue Scaled = Addr.getOperand(0);
	      if (Scaled.getOpcode() == ISD::SHL)
	        return Scaled.getOperand(0);
	    }
	  }
	  return SDValue();
	}

	// Returns true if the subtarget has BMI2 and VT is a scalar integer type of a
	// width handled here: 32 bits always, 64 bits only in 64-bit mode.
	static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
	  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
	    return false;

	  const unsigned Bits = VT.getSizeInBits();
	  if (Bits == 32)
	    return true;
	  if (Bits == 64)
	    return Subtarget.is64Bit();
	  return false;
	}

	// This function recognizes cases where the X86 bzhi instruction can replace
	// an 'and-load' sequence.
	// When an integer value is loaded from an array of constants which is defined
	// as follows:
	//
	// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
	//
	// and the loaded value is then bitwise-anded with another input, the result
	// is equivalent to performing bzhi (zero high bits) on that input, with the
	// same index as the load.
	//
	// Both operands of the AND are tried as the load; an operand that does not
	// match is skipped so the other one still gets a chance. Returns the
	// replacement node, or an empty SDValue if neither operand matches.
	static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  // Check if subtarget has BZHI instruction for the node's type
	  if (!hasBZHI(Subtarget, VT))
	    return SDValue();

	  // Try matching the pattern for both operands.
	  for (unsigned i = 0; i < 2; i++) {
	    SDValue N = Node->getOperand(i);
	    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

	    // Skip this operand if it is not a load instruction; the other operand
	    // may still be the table load.
	    if (!Ld)
	      continue;

	    // The IR value behind the load is needed to find the constant table.
	    const Value *MemOp = Ld->getMemOperand()->getValue();
	    if (!MemOp)
	      continue;

	    // The address must be a GEP directly into a constant global with a
	    // definitive initializer.
	    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
	    if (!GEP)
	      continue;
	    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
	    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
	      continue;

	    // The initializer must be an integer data array whose element width
	    // equals the AND's width and which has no more entries than that width.
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (!isa<ConstantDataArray>(Init) ||
	        !Ty->getArrayElementType()->isIntegerTy() ||
	        Ty->getArrayElementType()->getScalarSizeInBits() !=
	            VT.getSizeInBits() ||
	        Ty->getArrayNumElements() >
	            Ty->getArrayElementType()->getScalarSizeInBits())
	      continue;

	    // Check if the array's constant elements are suitable to our case:
	    // element j must be the mask with exactly the low j bits set.
	    uint64_t ArrayElementCount = Ty->getArrayNumElements();
	    bool ConstantsMatch = true;
	    for (uint64_t j = 0; j < ArrayElementCount; j++) {
	      ConstantInt *Elem =
	          dyn_cast_or_null<ConstantInt>(Init->getAggregateElement(j));
	      // A missing or non-integer element is a mismatch, not a crash.
	      if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
	        ConstantsMatch = false;
	        break;
	      }
	    }
	    if (!ConstantsMatch)
	      continue;

	    // Get the Node which indexes into the array. Done before creating any
	    // new nodes so that a failed match leaves the DAG untouched.
	    SDValue Index = getIndexFromUnindexedLoad(Ld);
	    if (!Index)
	      continue;
	    Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);

	    // Do the transformation (For 32-bit type):
	    // -> (and (load arr[idx]), inp)
	    // <- (and (srl 0xFFFFFFFF, (sub 32, idx)), inp)
	    // that will be replaced with one bzhi instruction.
	    // NOTE(review): for idx == 0 the shift amount equals the bit width;
	    // presumably this is only ever selected as BZHI - confirm.
	    SDValue Inp = Node->getOperand(1 - i);
	    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);

	    SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
	    Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);

	    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
	    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

	    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
	  }
	  return SDValue();
	}

	// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
	// Turn it into series of XORs and a setnp.
	//
	// N is the AND node. Returns the replacement value, or an empty SDValue when
	// the pattern does not match or CTPOP is already legal for the type.
	static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// We only support 64-bit and 32-bit. 64-bit requires special handling
	// unless the 64-bit popcnt instruction is legal.
	if (VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// Nothing to do when the type is legal and CTPOP is legal for it.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
	return SDValue();

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	// LHS needs to be a single use CTPOP.
	if (N0.getOpcode() != ISD::CTPOP \|\| !N0.hasOneUse())
	return SDValue();

	// RHS needs to be 1.
	if (!isOneConstant(N1))
	return SDValue();

	SDLoc DL(N);
	// X is the value whose parity is being computed.
	SDValue X = N0.getOperand(0);

	// If this is 64-bit, its always best to xor the two 32-bit pieces together
	// even if we have popcnt.
	if (VT == MVT::i64) {
	SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
	DAG.getNode(ISD::SRL, DL, VT, X,
	DAG.getConstant(32, DL, MVT::i8)));
	SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
	X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
	// Generate a 32-bit parity idiom. This will bring us back here if we need
	// to expand it too.
	SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
	DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
	DAG.getConstant(1, DL, MVT::i32));
	return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
	}
	assert(VT == MVT::i32 && "Unexpected VT!");

	// Xor the high and low 16-bits together using a 32-bit operation.
	SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
	DAG.getConstant(16, DL, MVT::i8));
	X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);

	// Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
	// This should allow an h-reg to be used to save a shift.
	// FIXME: We only get an h-reg in 32-bit mode.
	SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	DAG.getNode(ISD::SRL, DL, VT, X,
	DAG.getConstant(8, DL, MVT::i8)));
	SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
	// The X86ISD::XOR node has two results (i8, i32); result 1 is used as the
	// flags input of the setcc below.
	SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
	SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);

	// Copy the inverse of the parity flag into a register with setcc.
	SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
	// Zero extend to original type.
	return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
	}

	/// Top-level combine for an AND node.
	///
	/// Tries a sequence of folds in order and returns the first replacement
	/// found, or an empty SDValue if none applies. The folds placed before the
	/// DCI.isBeforeLegalizeOps() check are attempted at every stage; the
	/// remaining ones are skipped while that check returns true.
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FAND to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(
	MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	}

	// Use a 32-bit and+zext if upper bits known zero.
	// Not done when operand 1 is a constant.
	if (VT == MVT::i64 && Subtarget.is64Bit() &&
	!isa<ConstantSDNode>(N->getOperand(1))) {
	APInt HiMask = APInt::getHighBitsSet(64, 32);
	if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) \|\|
	DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
	SDLoc dl(N);
	SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
	SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
	DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
	}
	}

	// This must be done before legalization has expanded the ctpop.
	if (SDValue V = combineParity(N, DAG, Subtarget))
	return V;

	// Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
	// TODO: Support multiple SrcOps.
	if (VT == MVT::i1) {
	SmallVector<SDValue, 2> SrcOps;
	if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
	SrcOps.size() == 1) {
	SDLoc dl(N);
	unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
	// One mask bit per source vector element.
	EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
	if (Mask) {
	// The reduction is true exactly when every mask bit is set.
	APInt AllBits = APInt::getAllOnesValue(NumElts);
	return DAG.getSetCC(dl, MVT::i1, Mask,
	DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
	}
	}
	}

	// The remaining combines are not attempted before operation legalization.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine a bitmask AND with shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;
	}

	// Attempt to combine a scalar bitmask AND with an extracted shuffle.
	if ((VT.getScalarSizeInBits() % 8) == 0 &&
	N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
	SDValue BitMask = N->getOperand(1);
	SDValue SrcVec = N->getOperand(0).getOperand(0);
	EVT SrcVecVT = SrcVec.getValueType();

	// Check that the constant bitmask masks whole bytes.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (VT == SrcVecVT.getScalarType() &&
	N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
	getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
	llvm::all_of(EltBits, [](APInt M) {
	return M.isNullValue() \|\| M.isAllOnesValue();
	})) {
	unsigned NumElts = SrcVecVT.getVectorNumElements();
	// Scale is the number of bytes per source vector element.
	unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
	unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

	// Create a root shuffle mask from the byte mask and the extracted index.
	// Only the bytes of the extracted element are specified; every other
	// byte stays undef.
	SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i) {
	if (UndefElts[i])
	continue;
	int VecIdx = Scale * Idx + i;
	ShuffleMask[VecIdx] =
	EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
	}

	if (SDValue Shuffle = combineX86ShufflesRecursively(
	{SrcVec}, 0, SrcVec, ShuffleMask, {}, /Depth/ 2,
	/HasVarMask/ false, /AllowVarMask/ true, DAG, Subtarget))
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
	N->getOperand(0).getOperand(1));
	}
	}

	return SDValue();
	}

	// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
	//
	// N must be an OR node. Only vector types whose element size is a multiple of
	// 8 bits are handled, and both AND masks must be constants that are exact
	// byte-wise complements of each other with no undef bytes. Returns the new OR
	// node, or an empty SDValue when the pattern does not match.
	static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	EVT VT = N->getValueType(0);
	if (!VT.isVector() \|\| (VT.getScalarSizeInBits() % 8) != 0)
	return SDValue();

	SDValue N0 = peekThroughBitcasts(N->getOperand(0));
	SDValue N1 = peekThroughBitcasts(N->getOperand(1));
	if (N0.getOpcode() != ISD::AND \|\| N1.getOpcode() != ISD::AND)
	return SDValue();

	// On XOP we'll lower to PCMOV so accept one use, otherwise only
	// do this if either mask has multiple uses already.
	if (!(Subtarget.hasXOP() \|\| !N0.getOperand(1).hasOneUse() \|\|
	!N1.getOperand(1).hasOneUse()))
	return SDValue();

	// Attempt to extract constant byte masks.
	APInt UndefElts0, UndefElts1;
	SmallVector<APInt, 32> EltBits0, EltBits1;
	if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
	false, false))
	return SDValue();
	if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
	false, false))
	return SDValue();

	// Every byte of the first mask must be the bitwise complement of the
	// corresponding byte of the second mask.
	for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
	// TODO - add UNDEF elts support.
	if (UndefElts0[i] \|\| UndefElts1[i])
	return SDValue();
	if (EltBits0[i] != ~EltBits1[i])
	return SDValue();
	}

	SDLoc DL(N);
	// Operand 0 of the OR is reused unchanged; the second AND is rebuilt as an
	// ANDNP of the first AND's mask with the second AND's non-mask operand.
	SDValue X = N->getOperand(0);
	SDValue Y =
	DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
	DAG.getBitcast(VT, N1.getOperand(0)));
	return DAG.getNode(ISD::OR, DL, VT, X, Y);
	}

	// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
	//
	// The form actually recognized is OR(AND(M,Y),ANDNP(M,X)), with the two OR
	// operands and the two AND operands accepted in either order. On success
	// returns true with X, Y and Mask set. Mask and X are already written once
	// the AND/ANDNP shape has been seen, even if the mask then fails to match.
	static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
	  if (N->getOpcode() != ISD::OR)
	    return false;

	  SDValue AndSide = N->getOperand(0);
	  SDValue AndnSide = N->getOperand(1);

	  // Canonicalize AND to LHS.
	  if (AndnSide.getOpcode() == ISD::AND)
	    std::swap(AndSide, AndnSide);

	  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
	  const bool ShapeMatches = AndSide.getOpcode() == ISD::AND &&
	                            AndnSide.getOpcode() == X86ISD::ANDNP;
	  if (!ShapeMatches)
	    return false;

	  Mask = AndnSide.getOperand(0);
	  X = AndnSide.getOperand(1);

	  // Check to see if the mask appeared in both the AND and ANDNP; Y is the
	  // AND operand that is not the mask.
	  for (unsigned MaskIdx = 0; MaskIdx != 2; ++MaskIdx) {
	    if (AndSide.getOperand(MaskIdx) == Mask) {
	      Y = AndSide.getOperand(1 - MaskIdx);
	      // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
	      // ANDNP combine allows other combines to happen that prevent matching.
	      return true;
	    }
	  }
	  return false;
	}

	// Try to match:
	// (or (and (M, (sub 0, X)), (pandn M, X)))
	// which is a special case of vselect:
	// (vselect M, (sub 0, X), X)
	// Per:
	// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	// We know that, if fNegate is 0 or 1:
	// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	//
	// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	// ( M ? -X : X) == ((X ^ M ) + (M & 1))
	// This lets us transform our vselect to:
	// (add (xor X, M), (and M, 1))
	// And further to:
	// (sub (xor X, M), M)
	//
	// VT is the type the result is bitcast to. Mask, X and Y are the already
	// matched blend operands; Mask must be an integer zero/all-bits value
	// (asserted). Returns an empty SDValue if X and Y do not have the mask's
	// type, if SUB is not legal for that type, or if neither of X and Y is the
	// negation of the other.
	static SDValue combineLogicBlendIntoConditionalNegate(
	EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
	SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	EVT MaskVT = Mask.getValueType();
	assert(MaskVT.isInteger() &&
	DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
	"Mask must be zero/all-bits");

	if (X.getValueType() != MaskVT \|\| Y.getValueType() != MaskVT)
	return SDValue();
	if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
	return SDValue();

	// True if N is (sub <all-zeros build vector>, V), i.e. the negation of V.
	auto IsNegV = [](SDNode *N, SDValue V) {
	return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	};

	// V is whichever of X and Y is the plain (non-negated) value.
	SDValue V;
	if (IsNegV(Y.getNode(), X))
	V = X;
	else if (IsNegV(X.getNode(), Y))
	V = Y;
	else
	return SDValue();

	SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	SDValue SubOp2 = Mask;

	// If the negate was on the false side of the select, then
	// the operands of the SUB need to be swapped. PR 27251.
	// This is because the pattern being matched above is
	// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
	// but if the pattern matched was
	// (vselect M, X, (sub (0, X))), that is really negation of the pattern
	// above, -(vselect M, (sub 0, X), X), and therefore the replacement
	// pattern also needs to be a negation of the replacement pattern above.
	// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
	// sub accomplishes the negation of the replacement pattern.
	if (V == Y)
	std::swap(SubOp1, SubOp2);

	SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	return DAG.getBitcast(VT, Res);
	}

	// Try to fold:
	// (or (and (m, y), (pandn m, x)))
	// into:
	// (vselect m, x, y)
	// As a special case, try to fold:
	// (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	// (sub (xor X, M), M)
	//
	// Handles 128-bit vectors with SSE2 and 256-bit vectors with Int256. The
	// conditional-negate special case is tried first; the byte-blend select
	// additionally requires SSE4.1.
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	  EVT VT = N->getValueType(0);
	  const bool Is128WithSSE2 = VT.is128BitVector() && Subtarget.hasSSE2();
	  const bool Is256WithInt256 = VT.is256BitVector() && Subtarget.hasInt256();
	  if (!Is128WithSSE2 && !Is256WithInt256)
	    return SDValue();

	  SDValue FalseVal, TrueVal, Cond;
	  if (!matchLogicBlend(N, FalseVal, TrueVal, Cond))
	    return SDValue();

	  // Validate that X, Y, and Mask are bitcasts, and see through them.
	  Cond = peekThroughBitcasts(Cond);
	  FalseVal = peekThroughBitcasts(FalseVal);
	  TrueVal = peekThroughBitcasts(TrueVal);

	  EVT CondVT = Cond.getValueType();
	  const unsigned CondEltBits = CondVT.getScalarSizeInBits();

	  // The mask must be an integer value whose elements are zero or all-bits.
	  // TODO: Attempt to handle floating point cases as well?
	  if (!CondVT.isInteger() || DAG.ComputeNumSignBits(Cond) != CondEltBits)
	    return SDValue();

	  SDLoc DL(N);

	  // Attempt to combine to conditional negate: (sub (xor X, M), M)
	  if (SDValue Negated = combineLogicBlendIntoConditionalNegate(
	          VT, Cond, FalseVal, TrueVal, DL, DAG, Subtarget))
	    return Negated;

	  // PBLENDVB is only available on SSE 4.1.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // Blend at byte granularity, then cast back to the original type.
	  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;

	  FalseVal = DAG.getBitcast(BlendVT, FalseVal);
	  TrueVal = DAG.getBitcast(BlendVT, TrueVal);
	  Cond = DAG.getBitcast(BlendVT, Cond);
	  SDValue Blend = DAG.getSelect(DL, BlendVT, Cond, TrueVal, FalseVal);
	  return DAG.getBitcast(VT, Blend);
	}

	// Helper for combineOrCmpEqZeroToCtlzSrl.
	// Rewrites
	//   seteq(cmp x, 0)
	// as
	//   srl(ctlz x), log2(bitsize(x))
	// and zero-extends or truncates the result to ExtTy. The caller is
	// responsible for having checked that Op has the expected shape: operand 1 of
	// Op is the compare and operand 0 of that compare is x.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(1).getOperand(0);
	  EVT SrcVT = Src.getValueType();

	  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, SrcVT, Src);

	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Narrowed = DAG.getZExtOrTrunc(LeadingZeros, DL, MVT::i32);
	  SDValue ShiftAmt =
	      DAG.getConstant(Log2_32(SrcVT.getSizeInBits()), DL, MVT::i8);
	  SDValue IsZero = DAG.getNode(ISD::SRL, DL, MVT::i32, Narrowed, ShiftAmt);
	  return DAG.getZExtOrTrunc(IsZero, DL, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// N is the zero-extend node. Returns an empty SDValue when the combine does
	// not apply.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR is only a candidate when it has a single use.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// The setcc must have a single use and the compare result type must be at
	// least as wide as i32.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Walk down the chain of ORs, remembering each outer OR so that it can be
	// rebuilt (innermost first) once the leaf OR has been lowered.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// ORNodes is consumed from the back, i.e. from the innermost saved OR
	// outwards, OR-ing each lowered setcc into the accumulated result.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Ret was checked to be non-null above and is only reassigned to freshly
	// built OR nodes, so this re-check is purely defensive.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	// DAG combine for an OR node N. The individual combines are tried in the
	// order they appear below; the first one that produces a value wins. The
	// final part of the function attempts to turn an OR of opposing shifts on
	// i16/i32/i64 into a funnel shift (ISD::FSHL / ISD::FSHR). Returns an empty
	// SDValue when nothing applies.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FOR to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(MVT::v4i32,
	DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N0),
	DAG.getBitcast(MVT::v4f32, N1)));
	}

	// Everything below only runs once operations have been legalized.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
	return R;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine an OR of shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
	return Res;
	}

	// The funnel-shift matching below only handles these scalar integer types.
	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// fold (or (x << c) \| (y >> (64 - c))) ==> (shld64 x, y, c)
	bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
	unsigned Bits = VT.getScalarSizeInBits();

	// SHLD/SHRD instructions have lower register pressure, but on some
	// platforms they have higher latency than the equivalent
	// series of shifts/or that would otherwise be generated.
	// Don't fold (or (x << c) \| (y >> (64 - c))) if SHLD/SHRD instructions
	// have higher latencies and we are not optimizing for size.
	if (!OptForSize && Subtarget.isSHLDSlow())
	return SDValue();

	// Canonicalize so that N0 is the SHL and N1 is the SRL; both must be
	// single-use.
	if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	std::swap(N0, N1);
	if (N0.getOpcode() != ISD::SHL \|\| N1.getOpcode() != ISD::SRL)
	return SDValue();
	if (!N0.hasOneUse() \|\| !N1.hasOneUse())
	return SDValue();

	// Both shift amounts must be i8.
	SDValue ShAmt0 = N0.getOperand(1);
	if (ShAmt0.getValueType() != MVT::i8)
	return SDValue();
	SDValue ShAmt1 = N1.getOperand(1);
	if (ShAmt1.getValueType() != MVT::i8)
	return SDValue();

	// Peek through any modulo shift masks.
	// ShMskN remembers the AND node (amount & (Bits - 1)) that was stripped, if
	// any; it stays null otherwise.
	SDValue ShMsk0;
	if (ShAmt0.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
	ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
	ShMsk0 = ShAmt0;
	ShAmt0 = ShAmt0.getOperand(0);
	}
	SDValue ShMsk1;
	if (ShAmt1.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
	ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
	ShMsk1 = ShAmt1;
	ShAmt1 = ShAmt1.getOperand(0);
	}

	// Look through truncates of the shift amounts.
	if (ShAmt0.getOpcode() == ISD::TRUNCATE)
	ShAmt0 = ShAmt0.getOperand(0);
	if (ShAmt1.getOpcode() == ISD::TRUNCATE)
	ShAmt1 = ShAmt1.getOperand(0);

	SDLoc DL(N);
	unsigned Opc = ISD::FSHL;
	SDValue Op0 = N0.getOperand(0);
	SDValue Op1 = N1.getOperand(0);
	// If the SHL amount is the "derived" one (a SUB or XOR), treat this as the
	// FSHR form and swap the roles so that ShAmt0 is always the plain amount
	// and ShAmt1 the derived one.
	if (ShAmt0.getOpcode() == ISD::SUB \|\| ShAmt0.getOpcode() == ISD::XOR) {
	Opc = ISD::FSHR;
	std::swap(Op0, Op1);
	std::swap(ShAmt0, ShAmt1);
	std::swap(ShMsk0, ShMsk1);
	}

	// Builds the funnel shift node. For FSHR the value operands are swapped back
	// (they were swapped above); the amount is always truncated to i8.
	auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
	SDValue Amt) {
	if (Opc == ISD::FSHR)
	std::swap(Op0, Op1);
	return DAG.getNode(Opc, DL, VT, Op0, Op1,
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
	};

	// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
	// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
	// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
	// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
	// OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
	// OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
	if (ShAmt1.getOpcode() == ISD::SUB) {
	// Case: ShAmt1 == (K - ShAmt0) with constant K.
	SDValue Sum = ShAmt1.getOperand(0);
	if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
	SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
	if (ShAmt1Op1.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
	ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
	ShMsk1 = ShAmt1Op1;
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	}
	if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	// K must be Bits, or 0 when a modulo mask was seen on the ShAmt1 side.
	if ((SumC->getAPIntValue() == Bits \|\|
	(SumC->getAPIntValue() == 0 && ShMsk1)) &&
	ShAmt1Op1 == ShAmt0)
	return GetFunnelShift(Op0, Op1, ShAmt0);
	}
	} else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
	// Case: both amounts are constants that add up to the bit width.
	auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
	if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
	return GetFunnelShift(Op0, Op1, ShAmt0);
	} else if (ShAmt1.getOpcode() == ISD::XOR) {
	// Case: ShAmt1 == (ShAmt0 ^ (Bits - 1)) combined with an extra shift by one
	// on the other value operand.
	SDValue Mask = ShAmt1.getOperand(1);
	if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
	unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
	SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
	if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op0 = ShAmt1Op0.getOperand(0);
	if (MaskC->getSExtValue() == (Bits - 1) &&
	(ShAmt1Op0 == ShAmt0 \|\| ShAmt1Op0 == ShMsk0)) {
	if (Op1.getOpcode() == InnerShift &&
	isa<ConstantSDNode>(Op1.getOperand(1)) &&
	Op1.getConstantOperandAPInt(1) == 1) {
	return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
	}
	// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
	if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
	Op1.getOperand(0) == Op1.getOperand(1)) {
	return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
	}
	}
	}
	}

	return SDValue();
	}

	/// Try to turn tests against the signbit in the form of:
	/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// into:
	/// SETGT(X, -1)
	///
	/// \p N is the XOR node. Returns an empty SDValue when the pattern does not
	/// match. The truncate and the shift must each have a single use.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	// This is only worth doing if the output type is i8 or i1.
	EVT ResultType = N->getValueType(0);
	if (ResultType != MVT::i8 && ResultType != MVT::i1)
	return SDValue();

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	// We should be performing an xor against a truncated shift.
	if (N0.getOpcode() != ISD::TRUNCATE \|\| !N0.hasOneUse())
	return SDValue();

	// Make sure we are performing an xor against one.
	if (!isOneConstant(N1))
	return SDValue();

	// SetCC on x86 zero extends so only act on this if it's a logical shift.
	SDValue Shift = N0.getOperand(0);
	if (Shift.getOpcode() != ISD::SRL \|\| !Shift.hasOneUse())
	return SDValue();

	// Make sure we are truncating from one of i16, i32 or i64.
	EVT ShiftTy = Shift.getValueType();
	if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
	return SDValue();

	// Make sure the shift amount extracts the sign bit.
	if (!isa<ConstantSDNode>(Shift.getOperand(1)) \|\|
	Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
	return SDValue();

	// Create a greater-than comparison against -1.
	// N.B. Using SETGE against 0 works but we want a canonical looking
	// comparison, using SETGT matches up with what TranslateX86CC.
	SDLoc DL(N);
	SDValue ShiftOp = Shift.getOperand(0);
	EVT ShiftOpTy = ShiftOp.getValueType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), ResultType);
	SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
	DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
	// If the setcc result type differs from the XOR's type, zero-extend the
	// comparison result to the type the XOR produced.
	if (SetCCResultType != ResultType)
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
	return Cond;
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   pcmpgt X, -1
	///
	/// This should be called before type legalization because the pattern may not
	/// persist after that.
	///
	/// \p N is the XOR node. Returns an empty SDValue when the type is not one of
	/// the handled simple vector types, when the subtarget lacks the feature
	/// required for that type, or when the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // Each handled vector type needs a specific subtarget feature.
	  switch (VT.getSimpleVT().SimpleTy) {
	  default:
	    return SDValue();
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	    if (!Subtarget.hasSSE2())
	      return SDValue();
	    break;
	  case MVT::v2i64:
	    if (!Subtarget.hasSSE42())
	      return SDValue();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    if (!Subtarget.hasAVX2())
	      return SDValue();
	    break;
	  }

	  // There must be a shift right algebraic before the xor, and the xor must be a
	  // 'not' operation.
	  SDValue Shift = N->getOperand(0);
	  SDValue Ones = N->getOperand(1);
	  if (Shift.getOpcode() != ISD::SRA)
	    return SDValue();
	  if (!Shift.hasOneUse())
	    return SDValue();
	  if (!ISD::isBuildVectorAllOnes(Ones.getNode()))
	    return SDValue();

	  // The shift should be smearing the sign bit across each vector element.
	  auto *ShiftAmt =
	      isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
	  if (!ShiftAmt)
	    return SDValue();
	  if (ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
	    return SDValue();

	  // Create a greater-than comparison against -1. We don't use the more obvious
	  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
	  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
	}

	/// Check if truncation with saturation from type \p SrcVT to \p DstVT
	/// is valid for the given \p Subtarget.
	///
	/// Requires AVX512, a vector source, and an i8/i16/i32 destination element.
	/// The source must additionally be a 512-bit vector unless VLX is available,
	/// and source elements narrower than 32 bits need BWI.
	static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
	                                        const X86Subtarget &Subtarget) {
	  // FIXME: Scalar type may be supported if we move it to vector register.
	  if (!(Subtarget.hasAVX512() && SrcVT.isVector()))
	    return false;

	  EVT DstEltVT = DstVT.getScalarType();
	  if (DstEltVT != MVT::i8 && DstEltVT != MVT::i16 && DstEltVT != MVT::i32)
	    return false;

	  // Narrower-than-512-bit sources are only accepted with VLX.
	  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
	    return false;

	  // Source elements of 32 bits or more are always fine; smaller ones need BWI.
	  if (SrcVT.getScalarType().getSizeInBits() >= 32)
	    return true;
	  return Subtarget.hasBWI();
	}

	/// Detect patterns of truncation with unsigned saturation:
	///
	/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
	///    Return the source value x to be truncated or SDValue() if the pattern
	///    was not matched.
	///
	/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
	///    where C1 >= 0 and C2 is unsigned max of destination type.
	///
	///    (truncate (smax (smin (x, C2), C1)) to dest_type)
	///    where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
	///
	///    These two patterns are equivalent to:
	///    (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
	///    So return the smax(x, C1) value to be truncated or SDValue() if the
	///    pattern was not matched. For the second form a new smax(x, C1) node is
	///    created.
	static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	                                 const SDLoc &DL) {
	  EVT InVT = In.getValueType();

	  // Saturation with truncation. We truncate from InVT to VT.
	  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  // If V is an 'Opcode' node whose second operand is a constant splat, store
	  // the splat value in Limit and return V's first operand; otherwise return an
	  // empty SDValue.
	  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
	    if (V.getOpcode() != Opcode)
	      return SDValue();
	    if (!ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
	      return SDValue();
	    return V.getOperand(0);
	  };

	  const unsigned DstBits = VT.getScalarSizeInBits();
	  APInt LowerLimit, UpperLimit;

	  // Form 1: umin(x, UpperLimit).
	  // UpperLimit should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX
	  // according the element size of the destination type.
	  if (SDValue Src = MatchMinMax(In, ISD::UMIN, UpperLimit)) {
	    if (UpperLimit.isMask(DstBits))
	      return Src;
	  }

	  // Form 2a: smin(smax(x, LowerLimit), UpperLimit). The existing smax node is
	  // returned as-is.
	  if (SDValue ExistingSMax = MatchMinMax(In, ISD::SMIN, UpperLimit)) {
	    if (MatchMinMax(ExistingSMax, ISD::SMAX, LowerLimit)) {
	      if (LowerLimit.isNonNegative() && UpperLimit.isMask(DstBits))
	        return ExistingSMax;
	    }
	  }

	  // Form 2b: smax(smin(x, UpperLimit), LowerLimit). Build smax(x, LowerLimit)
	  // reusing the original constant operand of In.
	  if (SDValue InnerSMin = MatchMinMax(In, ISD::SMAX, LowerLimit)) {
	    if (SDValue Src = MatchMinMax(InnerSMin, ISD::SMIN, UpperLimit)) {
	      if (LowerLimit.isNonNegative() && UpperLimit.isMask(DstBits) &&
	          UpperLimit.uge(LowerLimit))
	        return DAG.getNode(ISD::SMAX, DL, InVT, Src, In.getOperand(1));
	    }
	  }

	  return SDValue();
	}

	/// Detect patterns of truncation with signed saturation:
	///   (truncate (smin ((smax (x, signed_min_of_dest_type)),
	///                    signed_max_of_dest_type)) to dest_type)
	/// or:
	///   (truncate (smax ((smin (x, signed_max_of_dest_type)),
	///                    signed_min_of_dest_type)) to dest_type).
	/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
	/// Return the source value to be truncated or SDValue() if the pattern was not
	/// matched.
	static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
	  unsigned NumDstBits = VT.getScalarSizeInBits();
	  unsigned NumSrcBits = In.getScalarValueSizeInBits();
	  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

	  // If V is an 'Opcode' node whose second operand is a constant splat equal to
	  // Bound, return V's first operand; otherwise return an empty SDValue.
	  auto StripClamp = [](SDValue V, unsigned Opcode,
	                       const APInt &Bound) -> SDValue {
	    if (V.getOpcode() != Opcode)
	      return SDValue();
	    APInt Splat;
	    if (!ISD::isConstantSplatVector(V.getOperand(1).getNode(), Splat))
	      return SDValue();
	    if (Splat == Bound)
	      return V.getOperand(0);
	    return SDValue();
	  };

	  // Clamp bounds, expressed at the source element width.
	  const APInt UpperBound =
	      MatchPackUS ? APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits)
	                  : APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
	  const APInt LowerBound =
	      MatchPackUS ? APInt(NumSrcBits, 0)
	                  : APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);

	  // smin(smax(x, Lower), Upper)
	  if (SDValue Inner = StripClamp(In, ISD::SMIN, UpperBound))
	    if (SDValue Src = StripClamp(Inner, ISD::SMAX, LowerBound))
	      return Src;

	  // smax(smin(x, Upper), Lower)
	  if (SDValue Inner = StripClamp(In, ISD::SMAX, LowerBound))
	    if (SDValue Src = StripClamp(Inner, ISD::SMIN, UpperBound))
	      return Src;

	  return SDValue();
	}

	/// Detect a pattern of truncation with signed saturation.
	/// The types should allow to use VPMOVSS* instruction on AVX512.
	/// Return the source value to be truncated or SDValue() if the pattern was not
	/// matched (or the input type is not legal / not valid for the subtarget).
	static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
	                                       const X86Subtarget &Subtarget,
	                                       const TargetLowering &TLI) {
	  EVT InVT = In.getValueType();
	  if (TLI.isTypeLegal(InVT) &&
	      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget))
	    return detectSSatPattern(In, VT);
	  return SDValue();
	}

	/// Detect a pattern of truncation with saturation:
	/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
	/// The types should allow to use VPMOVUS* instruction on AVX512.
	/// Return the source value to be truncated or SDValue() if the pattern was not
	/// matched (or the input type is not legal / not valid for the subtarget).
	static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	                                       const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       const TargetLowering &TLI) {
	  EVT InVT = In.getValueType();
	  if (TLI.isTypeLegal(InVT) &&
	      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget))
	    return detectUSatPattern(In, VT, DAG, DL);
	  return SDValue();
	}

	// Try to lower a truncate of In to VT that is preceded by a saturation
	// pattern. Two strategies are attempted:
	//  - when both types are legal and isSATValidOnAVX512Subtarget accepts them,
	//    emit X86ISD::VTRUNCS / X86ISD::VTRUNCUS;
	//  - otherwise, without AVX512, for i16/i32 -> i8/i16 element truncations of
	//    power-of-two-sized vectors, emit PACKSS / PACKUS via
	//    truncateVectorWithPACK.
	// Returns an empty SDValue when no saturation pattern is recognised.
	static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	EVT SVT = VT.getScalarType();
	EVT InVT = In.getValueType();
	EVT InSVT = InVT.getScalarType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
	isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
	// The signed pattern is tried before the unsigned one.
	if (auto SSatVal = detectSSatPattern(In, VT))
	return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
	if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
	return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
	}
	if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
	!Subtarget.hasAVX512() &&
	(SVT == MVT::i8 \|\| SVT == MVT::i16) &&
	(InSVT == MVT::i16 \|\| InSVT == MVT::i32)) {
	// MatchPackUS = true: look for a clamp to [0, unsigned max of dest type].
	if (auto USatVal = detectSSatPattern(In, VT, true)) {
	// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
	if (SVT == MVT::i8 && InSVT == MVT::i32) {
	EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	VT.getVectorNumElements());
	SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
	DAG, Subtarget);
	// If the intermediate pack fails, fall through to the signed attempt
	// below.
	if (Mid)
	return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
	Subtarget);
	} else if (SVT == MVT::i8 \|\| Subtarget.hasSSE41())
	// A direct PACKUS is used for i8 results, or for i16 results when
	// SSE4.1 is available.
	return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
	Subtarget);
	}
	if (auto SSatVal = detectSSatPattern(In, VT))
	return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
	Subtarget);
	}
	return SDValue();
	}

	/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
	/// X86ISD::AVG instruction.
	///
	/// \p In is the (wider) value being truncated and \p VT the truncated result
	/// type. Returns an empty SDValue when the pattern is not matched.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector())
	return SDValue();
	EVT InVT = In.getValueType();
	unsigned NumElems = VT.getVectorNumElements();

	// The result must be a vector of i8/i16 with a power-of-two element count of
	// at least 2.
	EVT ScalarVT = VT.getVectorElementType();
	if (!((ScalarVT == MVT::i8 \|\| ScalarVT == MVT::i16) &&
	NumElems >= 2 && isPowerOf2_32(NumElems)))
	return SDValue();

	// InScalarVT is the intermediate type in AVG pattern and it should be greater
	// than the original input type (i8/i16).
	EVT InScalarVT = InVT.getVectorElementType();
	if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	return SDValue();

	if (!Subtarget.hasSSE2())
	return SDValue();

	// Detect the following pattern:
	//
	// %1 = zext <N x i8> %a to <N x i32>
	// %2 = zext <N x i8> %b to <N x i32>
	// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	// %4 = add nuw nsw <N x i32> %3, %2
	// %5 = lshr <N x i32> %4, <i32 1 x N>
	// %6 = trunc <N x i32> %5 to <N x i8>
	//
	// In AVX512, the last instruction can also be a trunc store.
	if (In.getOpcode() != ISD::SRL)
	return SDValue();

	// A lambda checking the given SDValue is a constant vector and each element
	// is in the range [Min, Max].
	auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV \|\| !BV->isConstant())
	return false;
	for (SDValue Op : V->ops()) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return false;
	const APInt &Val = C->getAPIntValue();
	if (Val.ult(Min) \|\| Val.ugt(Max))
	return false;
	}
	return true;
	};

	// Check if each element of the vector is logically right-shifted by one.
	auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();
	if (LHS.getOpcode() != ISD::ADD)
	return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);

	// Callback passed to SplitOpsAndApply to build the X86ISD::AVG node(s).
	auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
	};

	// Take care of the case when one of the operands is a constant vector whose
	// element is in the range [1, 256] for i8 results or [1, 65536] for i16
	// results.
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return SplitOpsAndApply(DAG, Subtarget, DL, VT,
	{ Operands[0].getOperand(0), Operands[1] },
	AVGBuilder);
	}

	// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
	// Match the or case only if its 'add-like' - can be replaced by an add.
	// On success Op0/Op1 receive the two addends and true is returned.
	auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
	if (ISD::ADD == V.getOpcode()) {
	Op0 = V.getOperand(0);
	Op1 = V.getOperand(1);
	return true;
	}
	if (ISD::ZERO_EXTEND != V.getOpcode())
	return false;
	V = V.getOperand(0);
	if (V.getValueType() != VT \|\| ISD::OR != V.getOpcode() \|\|
	!DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
	return false;
	Op0 = V.getOperand(0);
	Op1 = V.getOperand(1);
	return true;
	};

	// One of the outer add's operands must itself be add-like. After this step
	// Operands[0] holds the operand that was not decomposed and Operands[1..2]
	// hold the two inner addends.
	SDValue Op0, Op1;
	if (FindAddLike(Operands[0], Op0, Op1))
	std::swap(Operands[0], Operands[1]);
	else if (!FindAddLike(Operands[1], Op0, Op1))
	return SDValue();
	Operands[2] = Op0;
	Operands[1] = Op1;

	// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two can be promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;
	// Move the all-ones constant to slot 2 so that slots 0 and 1 hold a and b.
	std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)
	if (Operands[j].getValueType() != VT) {
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();
	Operands[j] = Operands[j].getOperand(0);
	}

	// The pattern is detected, emit X86ISD::AVG instruction(s).
	return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
	AVGBuilder);
	}

	return SDValue();
	}

	// DAG combine for a load node N. Two rewrites are attempted:
	//  - split certain 256-bit non-extending vector loads into two half-width
	//    loads joined by CONCAT_VECTORS;
	//  - before legalization and without AVX512, turn a load of a vXi1 vector
	//    into an integer load plus bitcast when that integer type is legal.
	// When a rewrite applies, N is replaced through DCI.CombineTo; otherwise an
	// empty SDValue is returned.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	LoadSDNode *Ld = cast<LoadSDNode>(N);
	EVT RegVT = Ld->getValueType(0);
	EVT MemVT = Ld->getMemoryVT();
	SDLoc dl(Ld);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// For chips with slow 32-byte unaligned loads, break the 32-byte operation
	// into two 16-byte operations. Also split non-temporal aligned loads on
	// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
	ISD::LoadExtType Ext = Ld->getExtensionType();
	// 'Fast' is only written by allowsMemoryAccess; the short-circuit evaluation
	// below guarantees it is not read unless that call was made.
	bool Fast;
	unsigned Alignment = Ld->getAlignment();
	if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
	Ext == ISD::NON_EXTLOAD &&
	((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) \|\|
	(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	*Ld->getMemOperand(), &Fast) &&
	!Fast))) {
	unsigned NumElems = RegVT.getVectorNumElements();
	if (NumElems < 2)
	return SDValue();

	// HalfAlign is used both as the byte offset of the second half and as the
	// upper bound on the second load's alignment.
	unsigned HalfAlign = 16;
	SDValue Ptr1 = Ld->getBasePtr();
	SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
	EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	NumElems / 2);
	SDValue Load1 =
	DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
	Alignment, Ld->getMemOperand()->getFlags());
	SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
	Ld->getPointerInfo().getWithOffset(HalfAlign),
	MinAlign(Alignment, HalfAlign),
	Ld->getMemOperand()->getFlags());
	// Merge the output chains of the two half loads.
	SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	Load1.getValue(1), Load2.getValue(1));

	SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
	return DCI.CombineTo(N, NewVec, TF, true);
	}

	// Bool vector load - attempt to cast to an integer, as we have good
	// (vXiY *ext(vXi1 bitcast(iX))) handling.
	if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
	RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
	unsigned NumElts = RegVT.getVectorNumElements();
	// Integer type with one bit per vector element.
	EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
	if (TLI.isTypeLegal(IntVT)) {
	SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Alignment,
	Ld->getMemOperand()->getFlags());
	SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
	return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
	}
	}

	return SDValue();
	}

	/// If V is a build vector of boolean constants and exactly one of those
	/// constants is true, return the operand index of that true element.
	/// Otherwise, return -1. Undef elements are skipped; any other non-constant
	/// element makes the function return -1.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *BV = dyn_cast<BuildVectorSDNode>(V);
	  if (!BV)
	    return -1;
	  EVT MaskVT = BV->getValueType(0);
	  if (MaskVT.getVectorElementType() != MVT::i1)
	    return -1;

	  int TrueIndex = -1;
	  for (unsigned Idx = 0, E = MaskVT.getVectorNumElements(); Idx != E; ++Idx) {
	    const SDValue &Elt = BV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;
	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;
	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;
	    // If we already found a one, this is too many.
	    if (TrueIndex >= 0)
	      return -1;
	    TrueIndex = Idx;
	  }
	  return TrueIndex;
	}

	/// Given a masked memory load/store operation, return true if it has one mask
	/// bit set. If it has one mask bit set, then also return the memory address of
	/// the scalar element to load/store (\p Addr), the vector index to
	/// insert/extract that scalar element (\p Index), and the alignment for the
	/// scalar memory access (\p Alignment). The outputs are left untouched when
	/// false is returned.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  const int EltIdx = getOneTrueElt(MaskedOp->getMask());
	  if (EltIdx < 0)
	    return false;

	  const unsigned EltBytes =
	      MaskedOp->getMemoryVT().getVectorElementType().getStoreSize();

	  // Get the address of the one scalar element that is specified by the mask
	  // using the appropriate offset from the base pointer.
	  SDValue Base = MaskedOp->getBasePtr();
	  if (EltIdx == 0) {
	    Addr = Base;
	  } else {
	    unsigned ByteOffset = EltIdx * EltBytes;
	    Addr = DAG.getMemBasePlusOffset(Base, ByteOffset, SDLoc(MaskedOp));
	  }

	  Index = DAG.getIntPtrConstant(EltIdx, SDLoc(MaskedOp));
	  Alignment = MinAlign(MaskedOp->getAlignment(), EltBytes);
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// it is a scalar load and vector insert.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// On success \p ML is replaced (via DCI.CombineTo) by an INSERT_VECTOR_ELT of
	/// the scalar load into the pass-through vector. Returns an empty SDValue if
	/// the mask does not have exactly one true element.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Addr points at element VecIndex, i.e. at an offset from the masked load's
	  // base pointer. The pointer info must describe that same location, so
	  // offset it by the element's byte position instead of reusing the
	  // base-relative info. VecIndex is built with getIntPtrConstant by
	  // getParamsForOneTrueMaskedElt, hence the cast to ConstantSDNode.
	  int64_t ByteOffset =
	      cast<ConstantSDNode>(VecIndex)->getZExtValue() *
	      ML->getMemoryVT().getVectorElementType().getStoreSize();

	  // Load the one scalar element that is specified by the mask using the
	  // appropriate offset from the base pointer.
	  SDValue Load =
	      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
	                  ML->getPointerInfo().getWithOffset(ByteOffset), Alignment,
	                  ML->getMemOperand()->getFlags());

	  // Insert the loaded element into the appropriate place in the vector.
	  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
	                               ML->getPassThru(), Load, VecIndex);
	  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
	}

	// Simplify a masked load whose mask is a build vector of constants.
	//  - If both the first and the last element are loaded, replace the masked
	//    load with a full vector load followed by a select against the
	//    pass-through value.
	//  - Otherwise, unless the pass-through is undef or all-zeros, re-emit the
	//    masked load with an undef pass-through and select against the original
	//    pass-through.
	// The node is replaced through DCI.CombineTo; an empty SDValue is returned
	// when nothing is done.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  SDValue PassThru = ML->getPassThru();

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
	  const unsigned LastIdx = VT.getVectorNumElements() - 1;
	  if (!isNullConstant(MaskBV->getOperand(0)) &&
	      !isNullConstant(MaskBV->getOperand(LastIdx))) {
	    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
	                                   ML->getMemOperand());
	    SDValue Selected = DAG.getSelect(DL, VT, Mask, FullLoad, PassThru);
	    return DCI.CombineTo(ML, Selected, FullLoad.getValue(1), true);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // Don't try this if the pass-through operand is already undefined. That would
	  // cause an infinite loop because that's what we're about to create.
	  if (PassThru.isUndef())
	    return SDValue();

	  if (ISD::isBuildVectorAllZeros(PassThru.getNode()))
	    return SDValue();

	  // The new masked load has an undef pass-through operand. The select uses the
	  // original pass-through operand.
	  SDValue UndefLoad = DAG.getMaskedLoad(
	      VT, DL, ML->getChain(), ML->getBasePtr(), Mask, DAG.getUNDEF(VT),
	      ML->getMemoryVT(), ML->getMemOperand(), ML->getExtensionType());
	  SDValue Selected = DAG.getSelect(DL, VT, Mask, UndefLoad, PassThru);

	  return DCI.CombineTo(ML, Selected, UndefLoad.getValue(1), true);
	}

	/// Target-specific DAG combine for masked loads.
	///
	/// Expanding loads are not handled. A non-extending load is first offered to
	/// reduceMaskedLoadToScalarLoad and then, on subtargets without AVX512, to
	/// combineMaskedLoadConstantMask. An any-extending load (ISD::EXTLOAD) is
	/// rewritten as a non-extending masked load of a vector that has the memory
	/// element type and the same total bit width as the result; the loaded
	/// elements are then shuffled to their final lanes and the vector is bitcast
	/// back to the result type.
	///
	/// Returns an empty SDValue when no rewrite is performed.
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  auto *MaskedLd = cast<MaskedLoadSDNode>(N);

	  // TODO: Expanding load with constant mask may be optimized as well.
	  if (MaskedLd->isExpandingLoad())
	    return SDValue();

	  if (MaskedLd->getExtensionType() == ISD::NON_EXTLOAD) {
	    if (SDValue Scalar = reduceMaskedLoadToScalarLoad(MaskedLd, DAG, DCI))
	      return Scalar;
	    // TODO: Do some AVX512 subsets benefit from this transform?
	    if (!Subtarget.hasAVX512())
	      if (SDValue Select = combineMaskedLoadConstantMask(MaskedLd, DAG, DCI))
	        return Select;
	  }

	  // Only any-extending loads are rewritten below.
	  if (MaskedLd->getExtensionType() != ISD::EXTLOAD)
	    return SDValue();

	  EVT ResVT = MaskedLd->getValueType(0);
	  const unsigned EltCount = ResVT.getVectorNumElements();
	  EVT MemVT = MaskedLd->getMemoryVT();
	  SDLoc Loc(MaskedLd);

	  assert(MemVT != ResVT && "Cannot extend to the same type");
	  const unsigned DstBits = ResVT.getScalarSizeInBits();
	  const unsigned SrcBits = MemVT.getScalarSizeInBits();
	  // Element count and both element widths must all be powers of two.
	  assert(isPowerOf2_32(EltCount * SrcBits * DstBits) &&
	         "Unexpected size for extending masked load");

	  const unsigned Ratio = DstBits / SrcBits;
	  assert(Ratio * EltCount * SrcBits == ResVT.getSizeInBits());
	  const unsigned WideEltCount = EltCount * Ratio;

	  // Vector of memory-sized elements spanning the whole result register; all
	  // shuffles below operate on this type.
	  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                WideEltCount);
	  assert(WideVT.getSizeInBits() == ResVT.getSizeInBits());

	  // Re-express the pass-through value in the wide type. When it is defined,
	  // gather narrow element (Lane * Ratio) into lane Lane.
	  SDValue OrigPassThru = MaskedLd->getPassThru();
	  SDValue WidePassThru = DAG.getBitcast(WideVT, OrigPassThru);
	  if (!OrigPassThru.isUndef()) {
	    SmallVector<int, 16> PackIdx(WideEltCount, -1);
	    for (unsigned Lane = 0; Lane != EltCount; ++Lane)
	      PackIdx[Lane] = Lane * Ratio;

	    // Can't shuffle using an illegal type.
	    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVT) &&
	           "WideVecVT should be legal");
	    SDValue UndefWide = DAG.getUNDEF(WideVT);
	    WidePassThru =
	        DAG.getVectorShuffle(WideVT, Loc, WidePassThru, UndefWide, PackIdx);
	  }

	  // Build a mask that matches the wide type.
	  SDValue WideMask;
	  SDValue OrigMask = MaskedLd->getMask();
	  if (OrigMask.getValueType() == ResVT) {
	    // The mask has the result type: gather its elements the same way and
	    // take every remaining lane from element 0 of an all-zeros vector.
	    WideMask = DAG.getBitcast(WideVT, OrigMask);
	    SmallVector<int, 16> PackIdx(WideEltCount, -1);
	    for (unsigned Lane = 0; Lane != EltCount; ++Lane)
	      PackIdx[Lane] = Lane * Ratio;
	    for (unsigned Lane = EltCount; Lane != WideEltCount; ++Lane)
	      PackIdx[Lane] = WideEltCount;
	    SDValue ZeroWide = DAG.getConstant(0, Loc, WideVT);
	    WideMask = DAG.getVectorShuffle(WideVT, Loc, WideMask, ZeroWide, PackIdx);
	  } else {
	    // The mask is a vXi1 vector: append all-zeros vectors of the same type
	    // until it has WideEltCount elements.
	    assert(OrigMask.getValueType().getVectorElementType() == MVT::i1);
	    EVT WideMaskVT =
	        EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideEltCount);

	    unsigned NumParts = WideEltCount / ResVT.getVectorNumElements();
	    SDValue ZeroPart = DAG.getConstant(0, Loc, OrigMask.getValueType());
	    SmallVector<SDValue, 16> Parts(NumParts, ZeroPart);
	    Parts[0] = OrigMask;
	    WideMask = DAG.getNode(ISD::CONCAT_VECTORS, Loc, WideMaskVT, Parts);
	  }

	  SDValue WideLoad = DAG.getMaskedLoad(
	      WideVT, Loc, MaskedLd->getChain(), MaskedLd->getBasePtr(), WideMask,
	      WidePassThru, MaskedLd->getMemoryVT(), MaskedLd->getMemOperand(),
	      ISD::NON_EXTLOAD);

	  // Move loaded element Lane to wide lane (Lane * Ratio); all other lanes are
	  // left undefined.
	  SDValue Spread = DAG.getBitcast(WideVT, WideLoad);
	  SmallVector<int, 16> SpreadIdx(WideEltCount, -1);
	  for (unsigned Lane = 0; Lane != EltCount; ++Lane)
	    SpreadIdx[Lane * Ratio] = Lane;

	  // Can't shuffle using an illegal type.
	  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVT) &&
	         "WideVecVT should be legal");
	  SDValue UndefWide = DAG.getUNDEF(WideVT);
	  Spread = DAG.getVectorShuffle(WideVT, Loc, Spread, UndefWide, SpreadIdx);
	  Spread = DAG.getBitcast(ResVT, Spread);

	  return DCI.CombineTo(N, Spread, WideLoad.getValue(1), true);
	}

	/// A non-truncating masked store with exactly one mask element set is just a
	/// vector element extract followed by a scalar store of that element.
	/// Note: All-zeros and all-ones masks are expected to have been optimized in
	/// IR already, so those degenerate cases are not handled here.
	///
	/// Returns the scalar store, or an empty SDValue if
	/// getParamsForOneTrueMaskedElt rejects the node.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue EltAddr, EltIndex;
	  unsigned EltAlign;
	  if (!getParamsForOneTrueMaskedElt(MS, DAG, EltAddr, EltIndex, EltAlign))
	    return SDValue();

	  // Pull out the single element that is really written.
	  SDLoc DL(MS);
	  SDValue StoredVec = MS->getValue();
	  EVT ScalarVT = StoredVec.getValueType().getVectorElementType();
	  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, StoredVec,
	                            EltIndex);

	  // Write it at the address computed for that element.
	  return DAG.getStore(MS->getChain(), DL, Elt, EltAddr, MS->getPointerInfo(),
	                      EltAlign, MS->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine for masked stores.
	///
	/// Compressing stores are not handled. For a non-truncating store this tries,
	/// in order: reducing to a scalar store, simplifying a non-i1 mask given that
	/// only the sign bit of each lane is demanded, and folding a single-use
	/// TRUNCATE of the stored value into a truncating masked store when that is
	/// legal. A truncating store that the target does not report as legal is
	/// rewritten as a shuffle that packs the truncated elements into the low
	/// lanes, followed by a non-truncating masked store with a widened mask.
	///
	/// Returns an empty SDValue when no rewrite is performed.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  const X86Subtarget &Subtarget) {
	  auto *MaskedSt = cast<MaskedStoreSDNode>(N);
	  if (MaskedSt->isCompressingStore())
	    return SDValue();

	  EVT ValVT = MaskedSt->getValue().getValueType();
	  EVT MemVT = MaskedSt->getMemoryVT();
	  SDLoc Loc(MaskedSt);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (!MaskedSt->isTruncatingStore()) {
	    if (SDValue Scalar = reduceMaskedStoreToScalarStore(MaskedSt, DAG))
	      return Scalar;

	    // If the mask value has been legalized to a non-boolean vector, try to
	    // simplify ops leading up to it. We only demand the MSB of each lane.
	    SDValue StMask = MaskedSt->getMask();
	    if (StMask.getScalarValueSizeInBits() != 1) {
	      APInt SignBitOnly(APInt::getSignMask(ValVT.getScalarSizeInBits()));
	      if (TLI.SimplifyDemandedBits(StMask, SignBitOnly, DCI))
	        return SDValue(N, 0);
	    }

	    // TODO: AVX512 targets should also be able to simplify something like the
	    // pattern above, but that pattern will be different. It will either need to
	    // match setcc more generally or match PCMPGTM later (in tablegen?).

	    // Fold a single-use truncate of the stored value into the store itself.
	    SDValue StVal = MaskedSt->getValue();
	    bool CanFoldTrunc =
	        StVal.getOpcode() == ISD::TRUNCATE && StVal.getNode()->hasOneUse() &&
	        TLI.isTruncStoreLegal(StVal.getOperand(0).getValueType(),
	                              MaskedSt->getMemoryVT());
	    if (!CanFoldTrunc)
	      return SDValue();

	    return DAG.getMaskedStore(MaskedSt->getChain(), SDLoc(N),
	                              StVal.getOperand(0), MaskedSt->getBasePtr(),
	                              StMask, MaskedSt->getMemoryVT(),
	                              MaskedSt->getMemOperand(), true);
	  }

	  // From here on the node is a truncating masked store.
	  const unsigned EltCount = ValVT.getVectorNumElements();

	  assert(MemVT != ValVT && "Cannot truncate to the same type");
	  const unsigned SrcBits = ValVT.getScalarSizeInBits();
	  const unsigned DstBits = MemVT.getScalarSizeInBits();

	  // The truncating store is legal in some cases. For example
	  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	  // are designated for truncate store.
	  // In this case we don't need any further transformations.
	  if (TLI.isTruncStoreLegal(ValVT, MemVT))
	    return SDValue();

	  // Element count and both element widths must all be powers of two.
	  assert(isPowerOf2_32(EltCount * SrcBits * DstBits) &&
	         "Unexpected size for truncating masked store");
	  // The original register is reused for the store, so its total width has
	  // to be a multiple of the stored element width.
	  assert(((EltCount * SrcBits) % DstBits) == 0 &&
	         "Unexpected ratio for truncating masked store");

	  const unsigned Ratio = SrcBits / DstBits;
	  assert(Ratio * EltCount * DstBits == ValVT.getSizeInBits());
	  const unsigned WideEltCount = EltCount * Ratio;

	  // Vector of memory-sized elements spanning the whole source register; all
	  // shuffles below operate on this type.
	  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                WideEltCount);

	  assert(WideVT.getSizeInBits() == ValVT.getSizeInBits());

	  // Gather narrow element (Lane * Ratio) of the value into lane Lane.
	  SDValue WideVal = DAG.getBitcast(WideVT, MaskedSt->getValue());
	  SmallVector<int, 16> PackIdx(WideEltCount, -1);
	  for (unsigned Lane = 0; Lane != EltCount; ++Lane)
	    PackIdx[Lane] = Lane * Ratio;

	  // Can't shuffle using an illegal type.
	  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVT) &&
	         "WideVecVT should be legal");

	  SDValue UndefWide = DAG.getUNDEF(WideVT);
	  SDValue PackedVal =
	      DAG.getVectorShuffle(WideVT, Loc, WideVal, UndefWide, PackIdx);

	  SDValue WideMask;
	  SDValue OrigMask = MaskedSt->getMask();
	  if (OrigMask.getValueType() == ValVT) {
	    // The mask has the value type: gather its elements the same way (the
	    // low PackIdx entries are still the ones computed above) and take every
	    // remaining lane from element 0 of an all-zeros vector.
	    WideMask = DAG.getBitcast(WideVT, OrigMask);
	    for (unsigned Lane = EltCount; Lane != WideEltCount; ++Lane)
	      PackIdx[Lane] = WideEltCount;
	    SDValue ZeroWide = DAG.getConstant(0, Loc, WideVT);
	    WideMask = DAG.getVectorShuffle(WideVT, Loc, WideMask, ZeroWide, PackIdx);
	  } else {
	    // The mask is a vXi1 vector: append all-zeros vectors of the same type
	    // until it has WideEltCount elements.
	    assert(OrigMask.getValueType().getVectorElementType() == MVT::i1);
	    EVT WideMaskVT =
	        EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideEltCount);

	    unsigned NumParts = WideEltCount / ValVT.getVectorNumElements();
	    SDValue ZeroPart = DAG.getConstant(0, Loc, OrigMask.getValueType());
	    SmallVector<SDValue, 16> Parts(NumParts, ZeroPart);
	    Parts[0] = OrigMask;
	    WideMask = DAG.getNode(ISD::CONCAT_VECTORS, Loc, WideMaskVT, Parts);
	  }

	  return DAG.getMaskedStore(MaskedSt->getChain(), Loc, PackedVal,
	                            MaskedSt->getBasePtr(), WideMask, MemVT,
	                            MaskedSt->getMemOperand(), false);
	}

	/// Target-specific DAG combine for ISD::STORE nodes.
	///
	/// The following rewrites are attempted in order; the first one that applies
	/// produces the return value:
	///  - vXi1 store without AVX512: bitcast to an integer and store that.
	///  - v1i1 store of a SCALAR_TO_VECTOR of an i8 (AVX512): store the scalar.
	///  - v2i1/v4i1 store (AVX512): widen to v8i1 by concatenating undef.
	///  - Legal v8i1..v64i1 store of constants: store the equivalent integer
	///    (two 32-bit halves for v64i1 on a non-64-bit subtarget).
	///  - 256-bit store that the target reports as allowed but not fast: split.
	///  - Under-aligned non-temporal vector store: split or scalarize.
	///  - v16i8 store of a truncate from v16i16 without BWI: any-extend to
	///    v16i32 and emit a truncating store.
	///  - Truncating vector store: AVG / saturation patterns, otherwise pack the
	///    truncated elements with a shuffle and store them in large chunks.
	///  - 64-bit load->store pair of an integer vector (MMX) type, or of i64 on a
	///    32-bit subtarget: re-emit as i64/f64 or as two i32 load/store pairs.
	///  - i64 store of an EXTRACT_VECTOR_ELT on a 32-bit subtarget with legal
	///    f64: extract and store an f64 instead.
	///
	/// Returns an empty SDValue when no rewrite is performed.
	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  StoreSDNode *St = cast<StoreSDNode>(N);
	  EVT VT = St->getValue().getValueType();
	  EVT StVT = St->getMemoryVT();
	  SDLoc dl(St);
	  unsigned Alignment = St->getAlignment();
	  SDValue StoredVal = St->getOperand(1);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Convert a store of vXi1 into a store of iX and a bitcast.
	  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
	      VT.getVectorElementType() == MVT::i1) {

	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
	    StoredVal = DAG.getBitcast(NewVT, StoredVal);

	    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
	  // This will avoid a copy to k-register.
	  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
	      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	      StoredVal.getOperand(0).getValueType() == MVT::i8) {
	    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
	                        St->getBasePtr(), St->getPointerInfo(),
	                        St->getAlignment(), St->getMemOperand()->getFlags());
	  }

	  // Widen v2i1/v4i1 stores to v8i1.
	  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
	      Subtarget.hasAVX512()) {
	    unsigned NumConcats = 8 / VT.getVectorNumElements();
	    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
	    Ops[0] = StoredVal;
	    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  // Turn vXi1 stores of constants into a scalar store.
	  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
	       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
	      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
	    // If its a v64i1 store without 64-bit support, we need two stores.
	    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
	                                      StoredVal->ops().slice(0, 32));
	      Lo = combinevXi1ConstantToInteger(Lo, DAG);
	      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
	                                      StoredVal->ops().slice(32, 32));
	      Hi = combinevXi1ConstantToInteger(Hi, DAG);

	      SDValue Ptr0 = St->getBasePtr();
	      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);

	      SDValue Ch0 =
	          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
	                       Alignment, St->getMemOperand()->getFlags());
	      SDValue Ch1 =
	          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
	                       St->getPointerInfo().getWithOffset(4),
	                       MinAlign(Alignment, 4U),
	                       St->getMemOperand()->getFlags());
	      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	    }

	    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
	    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  // If we are saving a concatenation of two XMM registers and 32-byte stores
	  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
	  // Fast is only read after allowsMemoryAccess has returned true.
	  bool Fast;
	  if (VT.is256BitVector() && StVT == VT &&
	      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	                             *St->getMemOperand(), &Fast) &&
	      !Fast) {
	    unsigned NumElems = VT.getVectorNumElements();
	    if (NumElems < 2)
	      return SDValue();

	    return splitVectorStore(St, DAG);
	  }

	  // Split under-aligned vector non-temporal stores.
	  if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
	    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
	    // vectors or the legalizer can scalarize it to use MOVNTI.
	    if (VT.is256BitVector() || VT.is512BitVector()) {
	      unsigned NumElems = VT.getVectorNumElements();
	      if (NumElems < 2)
	        return SDValue();
	      return splitVectorStore(St, DAG);
	    }

	    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
	    // to use MOVNTI.
	    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
	      MVT NTVT = Subtarget.hasSSE4A()
	                     ? MVT::v2f64
	                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
	      return scalarizeVectorStore(St, NTVT, DAG);
	    }
	  }

	  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
	  // supported, but avx512f is by extending to v16i32 and truncating.
	  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
	      St->getValue().getOpcode() == ISD::TRUNCATE &&
	      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
	      TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
	      !DCI.isBeforeLegalizeOps()) {
	    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
	    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
	                             MVT::v16i8, St->getMemOperand());
	  }

	  // Optimize trunc store (of multiple scalars) to shuffle and store.
	  // First, pack all of the elements in one place. Next, store to memory
	  // in fewer chunks.
	  if (St->isTruncatingStore() && VT.isVector()) {
	    // Check if we can detect an AVG pattern from the truncation. If yes,
	    // replace the trunc store by a normal store with the result of X86ISD::AVG
	    // instruction.
	    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	                                       Subtarget, dl))
	      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	                          St->getPointerInfo(), St->getAlignment(),
	                          St->getMemOperand()->getFlags());

	    if (SDValue Val =
	            detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
	                                    TLI))
	      return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
	                             dl, Val, St->getBasePtr(),
	                             St->getMemoryVT(), St->getMemOperand(), DAG);
	    if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
	                                              DAG, dl, Subtarget, TLI))
	      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	                             dl, Val, St->getBasePtr(),
	                             St->getMemoryVT(), St->getMemOperand(), DAG);

	    unsigned NumElems = VT.getVectorNumElements();
	    assert(StVT != VT && "Cannot truncate to the same type");
	    unsigned FromSz = VT.getScalarSizeInBits();
	    unsigned ToSz = StVT.getScalarSizeInBits();

	    // The truncating store is legal in some cases. For example
	    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	    // are designated for truncate store.
	    // In this case we don't need any further transformations.
	    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
	      return SDValue();

	    // From, To sizes and ElemCount must be pow of two
	    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
	    // We are going to use the original vector elt for storing.
	    // Accumulated smaller vector elements must be a multiple of the store size.
	    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

	    unsigned SizeRatio = FromSz / ToSz;

	    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	    // Create a type on which we perform the shuffle
	    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                     StVT.getScalarType(), NumElems*SizeRatio);

	    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
	    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	    for (unsigned i = 0; i != NumElems; ++i)
	      ShuffleVec[i] = i * SizeRatio;

	    // Can't shuffle using an illegal type.
	    if (!TLI.isTypeLegal(WideVecVT))
	      return SDValue();

	    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	                                         DAG.getUNDEF(WideVecVT),
	                                         ShuffleVec);
	    // At this point all of the data is stored at the bottom of the
	    // register. We now need to save it to mem.

	    // Find the largest store unit
	    MVT StoreType = MVT::i8;
	    for (MVT Tp : MVT::integer_valuetypes()) {
	      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
	        StoreType = Tp;
	    }

	    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
	        (64 <= NumElems * ToSz))
	      StoreType = MVT::f64;

	    // Bitcast the original vector into a vector of store-size units
	    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
	    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
	    SmallVector<SDValue, 8> Chains;
	    SDValue Ptr = St->getBasePtr();

	    // Perform one or more big stores into memory. Chunk i is written at byte
	    // offset i * StoreBytes from the base pointer, so its pointer info and
	    // alignment are derived from that offset, as the other multi-store
	    // rewrites in this function do. For offset 0, MinAlign returns the
	    // original alignment unchanged.
	    const unsigned StoreBytes = StoreType.getStoreSize();
	    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
	      const unsigned Offset = i * StoreBytes;
	      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                                   StoreType, ShuffWide,
	                                   DAG.getIntPtrConstant(i, dl));
	      SDValue Ch =
	          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
	                       St->getPointerInfo().getWithOffset(Offset),
	                       MinAlign(Alignment, Offset),
	                       St->getMemOperand()->getFlags());
	      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
	      Chains.push_back(Ch);
	    }

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	  }

	  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	  // the FP state in cases where an emms may be missing.
	  // A preferable solution to the general problem is to figure out the right
	  // places to insert EMMS. This qualifies as a quick hack.

	  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	  if (VT.getSizeInBits() != 64)
	    return SDValue();

	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool F64IsLegal =
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	  if (((VT.isVector() && !VT.isFloatingPoint()) ||
	       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
	      isa<LoadSDNode>(St->getValue()) &&
	      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
	      St->getChain().hasOneUse() && !St->isVolatile()) {
	    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

	    if (!ISD::isNormalLoad(Ld))
	      return SDValue();

	    // If this is not the MMX case, i.e. we are just turning i64 load/store
	    // into f64 load/store, avoid the transformation if there are multiple
	    // uses of the loaded value.
	    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
	      return SDValue();

	    SDLoc LdDL(Ld);
	    SDLoc StDL(N);
	    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
	    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
	    // pair instead.
	    if (Subtarget.is64Bit() || F64IsLegal) {
	      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
	      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
	                                  Ld->getMemOperand());

	      // Make sure new load is placed in same chain order.
	      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
	                          St->getMemOperand());
	    }

	    // Otherwise, lower to two pairs of 32-bit loads / stores.
	    SDValue LoAddr = Ld->getBasePtr();
	    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

	    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
	                               Ld->getPointerInfo(), Ld->getAlignment(),
	                               Ld->getMemOperand()->getFlags());
	    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
	                               Ld->getPointerInfo().getWithOffset(4),
	                               MinAlign(Ld->getAlignment(), 4),
	                               Ld->getMemOperand()->getFlags());
	    // Make sure new loads are placed in same chain order.
	    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
	    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

	    LoAddr = St->getBasePtr();
	    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

	    SDValue LoSt =
	        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
	                     St->getAlignment(), St->getMemOperand()->getFlags());
	    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
	                                St->getPointerInfo().getWithOffset(4),
	                                MinAlign(St->getAlignment(), 4),
	                                St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
	  }

	  // This is similar to the above case, but here we handle a scalar 64-bit
	  // integer store that is extracted from a vector on a 32-bit target.
	  // If we have SSE2, then we can treat it like a floating-point double
	  // to get past legalization. The execution dependencies fixup pass will
	  // choose the optimal machine instruction for the store if this really is
	  // an integer or v2f32 rather than an f64.
	  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    SDValue OldExtract = St->getOperand(1);
	    SDValue ExtOp0 = OldExtract.getOperand(0);
	    unsigned VecSize = ExtOp0.getValueSizeInBits();
	    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                                     BitCast, OldExtract.getOperand(1));
	    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  return SDValue();
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	/// A = < float a0, float a1, float a2, float a3 >
	/// and
	/// B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	///
	/// LHS and RHS are in/out parameters: on a 'true' return they hold A and B
	/// bitcast to the original type of LHS. They may also have been overwritten
	/// when 'false' is returned because shouldUseHorizontalOp rejected the match.
	/// When IsCommutative is set, each pair of elements may appear in either order
	/// across the two shuffle masks.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	bool IsCommutative) {
	// If either operand is undef, bail out. The binop should be simplified.
	if (LHS.isUndef() \|\| RHS.isUndef())
	return false;

	// Look for the following pattern:
	// A = < float a0, float a1, float a2, float a3 >
	// B = < float b0, float b1, float b2, float b3 >
	// and
	// LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	// RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	// which is A horizontal-op B.

	MVT VT = LHS.getSimpleValueType();
	assert((VT.is128BitVector() \|\| VT.is256BitVector()) &&
	"Unsupported vector type for horizontal add/sub");
	unsigned NumElts = VT.getVectorNumElements();

	// Decompose Op into shuffle inputs N0/N1 and a shuffle mask. Nothing is
	// written to N0, N1 or ShuffleMask when Op is not recognized as a shuffle,
	// so callers detect failure through an empty mask.
	// TODO - can we make a general helper method that does all of this for us?
	auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
	SmallVectorImpl<int> &ShuffleMask) {
	// Generic shuffle: take the mask as-is; undef inputs leave N0/N1 untouched.
	if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
	if (!Op.getOperand(0).isUndef())
	N0 = Op.getOperand(0);
	if (!Op.getOperand(1).isUndef())
	N1 = Op.getOperand(1);
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
	ShuffleMask.append(Mask.begin(), Mask.end());
	return;
	}
	// Look through an extract of the low subvector (index 0) of a 256-bit
	// vector so that the shuffle feeding it can be inspected.
	bool UseSubVector = false;
	if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	Op.getOperand(0).getValueType().is256BitVector() &&
	llvm::isNullConstant(Op.getOperand(1))) {
	Op = Op.getOperand(0);
	UseSubVector = true;
	}
	bool IsUnary;
	SmallVector<SDValue, 2> SrcOps;
	SmallVector<int, 16> SrcShuffleMask;
	SDValue BC = peekThroughBitcasts(Op);
	if (isTargetShuffle(BC.getOpcode()) &&
	getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
	SrcOps, SrcShuffleMask, IsUnary)) {
	// Target shuffle of the same width: use its (at most two) inputs and
	// its mask directly.
	if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
	SrcOps.size() <= 2) {
	N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
	N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
	ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
	}
	// Single-input target shuffle of twice the width seen through the
	// subvector extract: treat the two 128-bit extracts of its input as the
	// two shuffle inputs and keep only the first NumElts mask entries.
	if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
	SrcOps.size() == 1) {
	N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
	N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
	ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
	ShuffleMask.append(Mask.begin(), Mask.end());
	}
	}
	};

	// View LHS in the form
	// LHS = VECTOR_SHUFFLE A, B, LMask
	// If LHS is not a shuffle, then pretend it is the identity shuffle:
	// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
	// NOTE: A default initialized SDValue represents an UNDEF of type VT.
	SDValue A, B;
	SmallVector<int, 16> LMask;
	GetShuffle(LHS, A, B, LMask);

	// Likewise, view RHS in the form
	// RHS = VECTOR_SHUFFLE C, D, RMask
	SDValue C, D;
	SmallVector<int, 16> RMask;
	GetShuffle(RHS, C, D, RMask);

	// At least one of the operands should be a vector shuffle.
	unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
	if (NumShuffles == 0)
	return false;

	// An operand that was not recognized as a shuffle becomes the identity
	// shuffle of itself.
	if (LMask.empty()) {
	A = LHS;
	for (unsigned i = 0; i != NumElts; ++i)
	LMask.push_back(i);
	}

	if (RMask.empty()) {
	C = RHS;
	for (unsigned i = 0; i != NumElts; ++i)
	RMask.push_back(i);
	}

	// If A and B occur in reverse order in RHS, then canonicalize by commuting
	// RHS operands and shuffle mask.
	if (A != C) {
	std::swap(C, D);
	ShuffleVectorSDNode::commuteMask(RMask);
	}
	// Check that the shuffles are both shuffling the same vectors.
	if (!(A == C && B == D))
	return false;

	// LHS and RHS are now:
	// LHS = shuffle A, B, LMask
	// RHS = shuffle A, B, RMask
	// Check that the masks correspond to performing a horizontal operation.
	// AVX defines horizontal add/sub to operate independently on 128-bit lanes,
	// so we just repeat the inner loop if this is a 256-bit op.
	unsigned Num128BitChunks = VT.getSizeInBits() / 128;
	unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
	assert((NumEltsPer128BitChunk % 2 == 0) &&
	"Vector type should have an even number of elements in each lane");
	// j is the index of the first element of the current 128-bit chunk; i is the
	// element position inside that chunk.
	for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
	for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
	// Ignore undefined components.
	// A component is also skipped when either index refers to an input (A for
	// indices below NumElts, B otherwise) that is not set.
	int LIdx = LMask[i + j], RIdx = RMask[i + j];
	if (LIdx < 0 \|\| RIdx < 0 \|\|
	(!A.getNode() && (LIdx < (int)NumElts \|\| RIdx < (int)NumElts)) \|\|
	(!B.getNode() && (LIdx >= (int)NumElts \|\| RIdx >= (int)NumElts)))
	continue;

	// The low half of the 128-bit result must choose from A.
	// The high half of the 128-bit result must choose from B,
	// unless B is undef. In that case, we are always choosing from A.
	unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
	unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;

	// Check that successive elements are being operated on. If not, this is
	// not a horizontal operation.
	// Index is the expected LMask entry; the RMask entry must be Index + 1
	// (or the two may be swapped when IsCommutative is set).
	int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
	if (!(LIdx == Index && RIdx == Index + 1) &&
	!(IsCommutative && LIdx == Index + 1 && RIdx == Index))
	return false;
	}
	}

	LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

	// Let shouldUseHorizontalOp decide whether to form the op; its first
	// argument is true when both inputs are the same value and fewer than two
	// shuffles were matched.
	if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
	return false;

	// The matched inputs may have a different type than VT (bitcasts were
	// looked through above), so cast them back.
	LHS = DAG.getBitcast(VT, LHS);
	RHS = DAG.getBitcast(VT, RHS);
	return true;
	}

	/// Target-specific DAG combine for floating-point FADD/FSUB nodes.
	///
	/// For v4f32/v2f64 with SSE3, or v8f32/v4f64 with AVX, an add/sub whose
	/// operands form a horizontal pattern (see isHorizontalBinOp) is replaced by
	/// X86ISD::FHADD / X86ISD::FHSUB. Returns an empty SDValue otherwise.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  const unsigned Opc = N->getOpcode();
	  const bool IsFadd = Opc == ISD::FADD;
	  assert((IsFadd || Opc == ISD::FSUB) && "Wrong opcode");

	  // Only vector types with a matching horizontal instruction qualify.
	  EVT VT = N->getValueType(0);
	  const bool Has128BitOp =
	      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
	  const bool Has256BitOp =
	      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
	  if (!Has128BitOp && !Has256BitOp)
	    return SDValue();

	  // Try to synthesize horizontal add/sub from adds/subs of shuffles. Only
	  // FADD is treated as commutative.
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  if (!isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsFadd))
	    return SDValue();

	  auto HorizOpc = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	  return DAG.getNode(HorizOpc, SDLoc(N), VT, Op0, Op1);
	}

	/// Push a vector truncate through its single-use arithmetic source when that
	/// is expected to simplify codegen:
	///   TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	/// Handles AND/XOR/OR/MUL/ADD/SUB sources; returns an empty SDValue when the
	/// transform is not applied.
	/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
	/// anything that is guaranteed to be transformed by DAGCombiner.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          const SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  const unsigned BinOpc = Src.getOpcode();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // True if truncating Op to VT is expected to cost nothing.
	  auto TruncIsFree = [VT](SDValue Op) {
	    // An extend from a type no wider than the truncated element type lets
	    // the truncation combine with the extend.
	    switch (Op.getOpcode()) {
	    case ISD::ANY_EXTEND:
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	      if (Op.getOperand(0).getScalarValueSizeInBits() <=
	          VT.getScalarSizeInBits())
	        return true;
	      break;
	    default:
	      break;
	    }

	    // A build_vector of constants can be constant folded.
	    // NOTE: We don't peek through bitcasts here because there is currently
	    // no support for constant folding truncate+bitcast+vector_of_constants.
	    // We would just end up with a truncate on both operands, which gets
	    // turned back into (truncate (binop)) and causes an infinite loop.
	    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
	  };

	  // Emit BINOP(TRUNC(X), TRUNC(Y)) in the truncated type.
	  auto EmitNarrowBinOp = [&](SDValue X, SDValue Y) {
	    SDValue NarrowX = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
	    SDValue NarrowY = DAG.getNode(ISD::TRUNCATE, DL, VT, Y);
	    return DAG.getNode(BinOpc, DL, VT, NarrowX, NarrowY);
	  };

	  // True if at most one real truncation is needed: the operands are the same
	  // value, or at least one of them truncates for free.
	  auto OneTruncSuffices = [&](SDValue X, SDValue Y) {
	    return X == Y || TruncIsFree(X) || TruncIsFree(Y);
	  };

	  // Don't combine if the operation has other uses.
	  if (!Src.hasOneUse())
	    return SDValue();

	  // Only support vector truncation for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!VT.isVector())
	    return SDValue();

	  // In most cases its only worth pre-truncating if we're only facing the cost
	  // of one truncation.
	  // i.e. if one of the inputs will constant fold or the input is repeated.
	  switch (BinOpc) {
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR: {
	    SDValue X = Src.getOperand(0);
	    SDValue Y = Src.getOperand(1);
	    if (TLI.isOperationLegalOrPromote(BinOpc, VT) && OneTruncSuffices(X, Y))
	      return EmitNarrowBinOp(X, Y);
	    break;
	  }

	  case ISD::MUL:
	    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
	    // better to truncate if we have the chance.
	    if (SrcVT.getScalarType() == MVT::i64 &&
	        TLI.isOperationLegal(BinOpc, VT) &&
	        !TLI.isOperationLegal(BinOpc, SrcVT))
	      return EmitNarrowBinOp(Src.getOperand(0), Src.getOperand(1));
	    LLVM_FALLTHROUGH;
	  case ISD::ADD: {
	    SDValue X = Src.getOperand(0);
	    SDValue Y = Src.getOperand(1);
	    if (TLI.isOperationLegal(BinOpc, VT) && OneTruncSuffices(X, Y))
	      return EmitNarrowBinOp(X, Y);
	    break;
	  }
	  case ISD::SUB: {
	    // TODO: ISD::SUB We are conservative and require both sides to be freely
	    // truncatable to avoid interfering with combineSubToSubus.
	    SDValue X = Src.getOperand(0);
	    SDValue Y = Src.getOperand(1);
	    if (TLI.isOperationLegal(BinOpc, VT) &&
	        (X == Y || (TruncIsFree(X) && TruncIsFree(Y))))
	      return EmitNarrowBinOp(X, Y);
	    break;
	  }
	  }

	  return SDValue();
	}

	/// Lower a vector truncate as an ISD::AND mask followed by X86ISD::PACKUS.
	/// Each source element is first masked down to the destination element width
	/// so that the high bits are clear and the pack cannot saturate. For example,
	///   trunc <8 x i32> X to <8 x i16>
	/// becomes
	///   Masked = X & 0xffff
	///   packus (extract_subv Masked, 0), (extract_subv Masked, 1)
	static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
	                                                 const X86Subtarget &Subtarget,
	                                                 SelectionDAG &DAG) {
	  const EVT DstVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  const EVT SrcVT = Src.getValueType();

	  // Build a per-element mask that keeps only the bits surviving the truncate.
	  const unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
	  const unsigned DstEltBits = DstVT.getScalarSizeInBits();
	  SDValue LowBitsMask =
	      DAG.getConstant(APInt::getLowBitsSet(SrcEltBits, DstEltBits), DL, SrcVT);
	  SDValue Masked = DAG.getNode(ISD::AND, DL, SrcVT, Src, LowBitsMask);

	  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, Masked, DL, DAG,
	                                Subtarget);
	}

	/// Lower a vector truncate with X86ISD::PACKSS (e.g. a group of v4i32 packed
	/// into v8i16). The source is sign-extended in-register from the destination
	/// type first, then handed to truncateVectorWithPACK.
	static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
	                                                 const X86Subtarget &Subtarget,
	                                                 SelectionDAG &DAG) {
	  const EVT DstVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  const EVT SrcVT = Src.getValueType();

	  SDValue SExtInReg = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, Src,
	                                  DAG.getValueType(DstVT));
	  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, SExtInReg, DL, DAG,
	                                Subtarget);
	}

	/// Turn a vector truncate from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS / X86ISD::PACKSS operations. This is done here because,
	/// after type legalization, the truncate is translated into a BUILD_VECTOR of
	/// individually extracted-and-truncated elements, which is difficult to
	/// optimize. Returns an empty SDValue when the transform does not apply.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  const EVT DstVT = N->getValueType(0);
	  if (!DstVT.isVector())
	    return SDValue();

	  const EVT SrcVT = N->getOperand(0).getValueType();
	  if (!SrcVT.isSimple())
	    return SDValue();

	  const unsigned NumElts = DstVT.getVectorNumElements();

	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  const bool IsSSE2WithoutAVX2 = Subtarget.hasSSE2() && !Subtarget.hasAVX2();
	  if (!IsSSE2WithoutAVX2)
	    return SDValue();

	  const EVT DstEltVT = DstVT.getVectorElementType();
	  const EVT SrcEltVT = SrcVT.getVectorElementType();
	  const bool SrcIsI32 = SrcEltVT == MVT::i32;
	  const bool SrcIsI64 = SrcEltVT == MVT::i64;
	  const bool DstIsI8 = DstEltVT == MVT::i8;
	  const bool DstIsI16 = DstEltVT == MVT::i16;

	  // Only i32/i64 -> i8/i16 with a power-of-2 element count of at least 8.
	  if (!(SrcIsI32 || SrcIsI64))
	    return SDValue();
	  if (!(DstIsI8 || DstIsI16))
	    return SDValue();
	  if (!isPowerOf2_32(NumElts) || NumElts < 8)
	    return SDValue();

	  // SSSE3's pshufb results in fewer instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumElts == 8) {
	    if ((DstIsI8 && !SrcIsI64) || (SrcIsI32 && DstIsI16))
	      return SDValue();
	  }

	  SDLoc DL(N);
	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  const bool CanUsePACKUS = Subtarget.hasSSE41() || DstIsI8;
	  if (CanUsePACKUS)
	    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
	  if (SrcIsI32)
	    return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);

	  return SDValue();
	}

	/// Lower a vector truncate whose source elements consist of 'extended
	/// sign-bits' or 'extended zero-bits' (vXi16/vXi32/vXi64 to
	/// vXi8/vXi16/vXi32) into X86ISD::PACKSS / X86ISD::PACKUS operations.
	/// Returns an empty SDValue when neither pack form is applicable.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  const bool WantPack = Subtarget.hasSSE2() && !Subtarget.hasAVX512();
	  if (!WantPack)
	    return SDValue();

	  const EVT ResVT = N->getValueType(0);
	  if (!(ResVT.isVector() && ResVT.isSimple()))
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  const EVT SrcEVT = Src.getValueType();
	  if (!SrcEVT.isSimple())
	    return SDValue();

	  const MVT DstVT = ResVT.getSimpleVT();
	  const MVT DstEltVT = DstVT.getScalarType();
	  const MVT SrcEltVT = SrcEVT.getSimpleVT().getScalarType();

	  // Check we have a truncation suited for PACKSS/PACKUS.
	  if (!(DstVT.is128BitVector() || DstVT.is256BitVector()))
	    return SDValue();
	  const bool DstEltOK = DstEltVT == MVT::i8 || DstEltVT == MVT::i16 ||
	                        DstEltVT == MVT::i32;
	  if (!DstEltOK)
	    return SDValue();
	  const bool SrcEltOK = SrcEltVT == MVT::i16 || SrcEltVT == MVT::i32 ||
	                        SrcEltVT == MVT::i64;
	  if (!SrcEltOK)
	    return SDValue();

	  const unsigned SrcEltBits = SrcEltVT.getSizeInBits();
	  const unsigned DstEltBits = DstEltVT.getSizeInBits();
	  // At most 16 bits per element are packed; without SSE4.1 only 8 zero bits
	  // are assumed to be packable.
	  const unsigned PackedSignBits = DstEltBits < 16 ? DstEltBits : 16;
	  const unsigned PackedZeroBits = Subtarget.hasSSE41() ? PackedSignBits : 8;

	  // Use PACKUS if the input has zero-bits that extend all the way to the
	  // packed/truncated value. e.g. masks, zext_in_reg, etc.
	  const KnownBits SrcKnown = DAG.computeKnownBits(Src);
	  const unsigned LeadingZeros = SrcKnown.countMinLeadingZeros();
	  if (LeadingZeros >= (SrcEltBits - PackedZeroBits))
	    return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, Src, DL, DAG,
	                                  Subtarget);

	  // Use PACKSS if the input has sign-bits that extend all the way to the
	  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
	  const unsigned SignBits = DAG.ComputeNumSignBits(Src);
	  if (SignBits > (SrcEltBits - PackedSignBits))
	    return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, Src, DL, DAG,
	                                  Subtarget);

	  return SDValue();
	}

	// Try to form a MULHU or MULHS node by looking for
	//   (trunc (srl (mul ext, ext), 16))
	// where both multiply operands are the same kind of extend (sign or zero)
	// of values that already have the truncated type \p VT.
	// TODO: This is X86 specific because we want to be able to handle wide types
	// before type legalization. But we can only do it if the vector will be
	// legalized via widening/splitting. Type legalization can't handle promotion
	// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
	// combiner.
	static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
	                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
	  // The truncated value must be a logical right shift ...
	  if (Src.getOpcode() != ISD::SRL)
	    return SDValue();
	  // ... of a multiply.
	  SDValue Mul = Src.getOperand(0);
	  if (Mul.getOpcode() != ISD::MUL)
	    return SDValue();

	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Only handle vXi16 types that are at least 128-bits unless they will be
	  // widened.
	  if (!VT.isVector())
	    return SDValue();
	  if (VT.getVectorElementType() != MVT::i16)
	    return SDValue();
	  if (!ExperimentalVectorWideningLegalization && VT.getVectorNumElements() < 8)
	    return SDValue();

	  // Input type should be vXi32.
	  const EVT WideVT = Src.getValueType();
	  if (WideVT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  // Need a shift by 16.
	  APInt SplatShift;
	  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), SplatShift))
	    return SDValue();
	  if (SplatShift != 16)
	    return SDValue();

	  SDValue MulLHS = Mul.getOperand(0);
	  SDValue MulRHS = Mul.getOperand(1);

	  // Both multiply operands must be extends of the same signedness.
	  const unsigned ExtKind = MulLHS.getOpcode();
	  const bool IsSigned = ExtKind == ISD::SIGN_EXTEND;
	  if (!IsSigned && ExtKind != ISD::ZERO_EXTEND)
	    return SDValue();
	  if (MulRHS.getOpcode() != ExtKind)
	    return SDValue();

	  // Peek through the extends.
	  SDValue NarrowLHS = MulLHS.getOperand(0);
	  SDValue NarrowRHS = MulRHS.getOperand(0);

	  // Ensure the input types match.
	  if (NarrowLHS.getValueType() != VT)
	    return SDValue();
	  if (NarrowRHS.getValueType() != VT)
	    return SDValue();

	  return DAG.getNode(IsSigned ? ISD::MULHS : ISD::MULHU, DL, VT, NarrowLHS,
	                     NarrowRHS);
	}

	// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
	// from one vector with signed bytes from another vector, adds together
	// adjacent pairs of 16-bit products, and saturates the result before
	// truncating to 16-bits.
	//
	// Which looks something like this:
	// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
	// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
	//
	// \p In is the value being truncated and \p VT is the truncated result type.
	// On a match, returns the value produced by SplitOpsAndApply using
	// X86ISD::VPMADDUBSW nodes; otherwise returns an empty SDValue.
	static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	// The pattern is only matched for vector results on SSSE3 targets.
	if (!VT.isVector() \|\| !Subtarget.hasSSSE3())
	return SDValue();

	// Only vXi16 results with a power-of-2 element count of at least 8.
	unsigned NumElems = VT.getVectorNumElements();
	EVT ScalarVT = VT.getVectorElementType();
	if (ScalarVT != MVT::i16 \|\| NumElems < 8 \|\| !isPowerOf2_32(NumElems))
	return SDValue();

	// The truncated value must be a signed saturation whose input is an ADD.
	SDValue SSatVal = detectSSatPattern(In, VT);
	if (!SSatVal \|\| SSatVal.getOpcode() != ISD::ADD)
	return SDValue();

	// Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
	// of multiplies from even/odd elements.
	SDValue N0 = SSatVal.getOperand(0);
	SDValue N1 = SSatVal.getOperand(1);

	if (N0.getOpcode() != ISD::MUL \|\| N1.getOpcode() != ISD::MUL)
	return SDValue();

	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);

	// TODO: Handle constant vectors and use knownbits/computenumsignbits?
	// Canonicalize zero_extend to LHS.
	if (N01.getOpcode() == ISD::ZERO_EXTEND)
	std::swap(N00, N01);
	if (N11.getOpcode() == ISD::ZERO_EXTEND)
	std::swap(N10, N11);

	// Ensure we have a zero_extend and a sign_extend.
	if (N00.getOpcode() != ISD::ZERO_EXTEND \|\|
	N01.getOpcode() != ISD::SIGN_EXTEND \|\|
	N10.getOpcode() != ISD::ZERO_EXTEND \|\|
	N11.getOpcode() != ISD::SIGN_EXTEND)
	return SDValue();

	// Peek through the extends.
	N00 = N00.getOperand(0);
	N01 = N01.getOperand(0);
	N10 = N10.getOperand(0);
	N11 = N11.getOperand(0);

	// Ensure the extend is from vXi8.
	if (N00.getValueType().getVectorElementType() != MVT::i8 \|\|
	N01.getValueType().getVectorElementType() != MVT::i8 \|\|
	N10.getValueType().getVectorElementType() != MVT::i8 \|\|
	N11.getValueType().getVectorElementType() != MVT::i8)
	return SDValue();

	// All inputs should be build_vectors.
	if (N00.getOpcode() != ISD::BUILD_VECTOR \|\|
	N01.getOpcode() != ISD::BUILD_VECTOR \|\|
	N10.getOpcode() != ISD::BUILD_VECTOR \|\|
	N11.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	// N00/N10 are zero extended. N01/N11 are sign extended.

	// For each element, we need to ensure we have an odd element from one vector
	// multiplied by the odd element of another vector and the even element from
	// one of the same vectors being multiplied by the even element from the
	// other vector. So we need to make sure for each element i, this operator
	// is being performed:
	// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
	// ZExtIn/SExtIn record the two source vectors (A and B above) once found.
	SDValue ZExtIn, SExtIn;
	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue N00Elt = N00.getOperand(i);
	SDValue N01Elt = N01.getOperand(i);
	SDValue N10Elt = N10.getOperand(i);
	SDValue N11Elt = N11.getOperand(i);
	// TODO: Be more tolerant to undefs.
	if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();
	// Every extract must use a constant element index.
	auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
	auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
	auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
	auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
	if (!ConstN00Elt \|\| !ConstN01Elt \|\| !ConstN10Elt \|\| !ConstN11Elt)
	return SDValue();
	unsigned IdxN00 = ConstN00Elt->getZExtValue();
	unsigned IdxN01 = ConstN01Elt->getZExtValue();
	unsigned IdxN10 = ConstN10Elt->getZExtValue();
	unsigned IdxN11 = ConstN11Elt->getZExtValue();
	// Add is commutative so indices can be reordered.
	if (IdxN00 > IdxN10) {
	std::swap(IdxN00, IdxN10);
	std::swap(IdxN01, IdxN11);
	}
	// N0 indices must be the even element. N1 indices must be the next odd
	// element.
	if (IdxN00 != 2 * i \|\| IdxN10 != 2 * i + 1 \|\|
	IdxN01 != 2 * i \|\| IdxN11 != 2 * i + 1)
	return SDValue();
	SDValue N00In = N00Elt.getOperand(0);
	SDValue N01In = N01Elt.getOperand(0);
	SDValue N10In = N10Elt.getOperand(0);
	SDValue N11In = N11Elt.getOperand(0);
	// First time we find an input capture it.
	if (!ZExtIn) {
	ZExtIn = N00In;
	SExtIn = N01In;
	}
	// All extracts must read from the same pair of source vectors.
	if (ZExtIn != N00In \|\| SExtIn != N01In \|\|
	ZExtIn != N10In \|\| SExtIn != N11In)
	return SDValue();
	}

	// Builds one X86ISD::VPMADDUBSW node from two vXi8 operands of identical
	// type; the result is a vector of i16 with half as many elements.
	auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	// NOTE(review): the original comment here spoke of adding truncate nodes,
	// but no truncate is created in this lambda - it looks stale.
	EVT InVT = Ops[0].getValueType();
	assert(InVT.getScalarType() == MVT::i8 &&
	"Unexpected scalar element type");
	assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
	EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	InVT.getVectorNumElements() / 2);
	return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
	};
	return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
	PMADDBuilder);
	}

	/// Target-specific DAG combine for the truncate node \p N. The individual
	/// folds are attempted in a fixed order and the first one that yields a value
	/// is returned; an empty SDValue means nothing applied.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  const EVT DstVT = N->getValueType(0);
	  SDValue Input = N->getOperand(0);
	  SDLoc Loc(N);

	  // Attempt to pre-truncate inputs to arithmetic ops instead.
	  if (SDValue Narrowed = combineTruncatedArithmetic(N, DAG, Subtarget, Loc))
	    return Narrowed;

	  // Try to detect AVG pattern first.
	  if (SDValue AvgNode = detectAVGPattern(Input, DstVT, DAG, Subtarget, Loc))
	    return AvgNode;

	  // Try to detect PMADD
	  if (SDValue PMAddNode = detectPMADDUBSW(Input, DstVT, DAG, Subtarget, Loc))
	    return PMAddNode;

	  // Try to combine truncation with signed/unsigned saturation.
	  if (SDValue SatNode =
	          combineTruncateWithSat(Input, DstVT, Loc, DAG, Subtarget))
	    return SatNode;

	  // Try to combine PMULHUW/PMULHW for vXi16.
	  if (SDValue MulHiNode = combinePMULH(Input, DstVT, Loc, DAG, Subtarget))
	    return MulHiNode;

	  // A truncate to i32 of a bitcast whose source is an x86mmx value is turned
	  // into an MMX_MOVD2W of that value.
	  const bool IsI32OfBitcast =
	      Input.getOpcode() == ISD::BITCAST && DstVT == MVT::i32;
	  if (IsI32OfBitcast) {
	    SDValue MMXSrc = Input.getOperand(0);
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, Loc, MVT::i32, MMXSrc);
	  }

	  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
	  if (SDValue Packed = combineVectorSignBitsTruncation(N, Loc, DAG, Subtarget))
	    return Packed;

	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// Returns the negated value if the node \p N flips sign of FP value.
	///
	/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
	/// or FSUB(0, x)
	/// AVX512F does not have FXOR, so FNEG is lowered as
	/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
	/// In this case we go through all bitcasts.
	/// This also recognizes splat of a negated value and returns the splat of that
	/// value.
	///
	/// Returns an empty SDValue when \p N is not recognized as a negation. Note
	/// that the shuffle and insert-element cases may create new nodes in \p DAG.
	static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
	// Plain FNEG: the negated value is simply the operand.
	if (N->getOpcode() == ISD::FNEG)
	return N->getOperand(0);

	unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();

	SDValue Op = peekThroughBitcasts(SDValue(N, 0));
	EVT VT = Op->getValueType(0);
	// Make sure the element size doesn't change.
	if (VT.getScalarSizeInBits() != ScalarSize)
	return SDValue();

	if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
	// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
	// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
	if (!SVOp->getOperand(1).isUndef())
	return SDValue();
	// Recurse on the shuffled vector; only accept it when the recovered value
	// has the same type as the shuffle.
	if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
	if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
	return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
	SVOp->getMask());
	return SDValue();
	}
	unsigned Opc = Op.getOpcode();
	if (Opc == ISD::INSERT_VECTOR_ELT) {
	// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
	// -V, INDEX).
	SDValue InsVector = Op.getOperand(0);
	SDValue InsVal = Op.getOperand(1);
	if (!InsVector.isUndef())
	return SDValue();
	// Recurse on the inserted scalar; its recovered value must have the
	// vector's element type.
	if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
	if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
	return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
	NegInsVal, Op.getOperand(2));
	return SDValue();
	}

	// The remaining recognized forms are (F)XOR with a sign mask and FSUB from
	// a sign-mask constant.
	if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
	return SDValue();

	SDValue Op1 = Op.getOperand(1);
	SDValue Op0 = Op.getOperand(0);

	// For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
	// masks. For FSUB, we have to check if constant bits of Op0 are sign bit
	// masks and hence we swap the operands.
	if (Opc == ISD::FSUB)
	std::swap(Op0, Op1);

	APInt UndefElts;
	SmallVector<APInt, 16> EltBits;
	// Extract constant bits and see if they are all sign bit masks. Ignore the
	// undef elements.
	if (getTargetConstantBitsFromNode(Op1, ScalarSize,
	UndefElts, EltBits,
	/* AllowWholeUndefs */ true,
	/* AllowPartialUndefs */ false)) {
	for (unsigned I = 0, E = EltBits.size(); I < E; I++)
	if (!UndefElts[I] && !EltBits[I].isSignMask())
	return SDValue();

	// Every defined element is a sign mask: the other operand (looking through
	// bitcasts) is the value being negated.
	return peekThroughBitcasts(Op0);
	}

	return SDValue();
	}

	/// Target-specific DAG combine for floating point negations. If \p N is
	/// recognized by isFNEG, the negation is folded into an FMA-family node
	/// where possible; otherwise an empty SDValue is returned.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  const EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(DAG, N);
	  if (!Arg)
	    return SDValue();

	  const EVT VT = Arg.getValueType();
	  const EVT EltVT = VT.getScalarType();
	  SDLoc DL(N);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // Every fold below needs some form of FMA support.
	  if (!Subtarget.hasAnyFMA())
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  const bool IsF32OrF64 = EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (Arg.getOpcode() == ISD::FMUL && IsF32OrF64 &&
	      Arg->getFlags().hasNoSignedZeros()) {
	    SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NegMul = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                 Arg.getOperand(1), FPZero);
	    return DAG.getBitcast(OrigVT, NegMul);
	  }

	  // If we're negating an FMA node, then we can adjust the
	  // instruction to include the extra negation. Only done when the FMA node
	  // has a single use.
	  if (!Arg.hasOneUse())
	    return SDValue();

	  unsigned NegatedOpc;
	  switch (Arg.getOpcode()) {
	  default:
	    // We can't handle scalar intrinsic node here because it would only
	    // invert one element and not the whole vector. But we could try to handle
	    // a negation of the lower element only.
	    return SDValue();
	  case ISD::FMA:
	    NegatedOpc = X86ISD::FNMSUB;
	    break;
	  case X86ISD::FMSUB:
	    NegatedOpc = X86ISD::FNMADD;
	    break;
	  case X86ISD::FNMADD:
	    NegatedOpc = X86ISD::FMSUB;
	    break;
	  case X86ISD::FNMSUB:
	    NegatedOpc = ISD::FMA;
	    break;
	  case X86ISD::FMADD_RND:
	    NegatedOpc = X86ISD::FNMSUB_RND;
	    break;
	  case X86ISD::FMSUB_RND:
	    NegatedOpc = X86ISD::FNMADD_RND;
	    break;
	  case X86ISD::FNMADD_RND:
	    NegatedOpc = X86ISD::FMSUB_RND;
	    break;
	  case X86ISD::FNMSUB_RND:
	    NegatedOpc = X86ISD::FMADD_RND;
	    break;
	  }

	  SDValue Negated = DAG.getNode(NegatedOpc, DL, VT, Arg.getNode()->ops());
	  return DAG.getBitcast(OrigVT, Negated);
	}

	/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the corresponding
	/// integer logic op (OR/XOR/AND/ANDNP) on a same-sized integer vector,
	/// bitcasting the operands in and the result back out. Only done for vector
	/// types when SSE2 is available; otherwise returns an empty SDValue.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  const MVT FpVT = N->getSimpleValueType(0);
	  // If we have integer vector types available, use the integer opcodes.
	  if (!VTIsUsable(FpVT, Subtarget))
	    return SDValue();

	  SDLoc Loc(N);

	  // Integer vector type with the same element width and total size.
	  const unsigned EltBits = FpVT.getScalarSizeInBits();
	  const MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits),
	                                     FpVT.getSizeInBits() / EltBits);

	  SDValue IntLHS = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue IntRHS = DAG.getBitcast(IntVT, N->getOperand(1));

	  unsigned LogicOpc;
	  switch (N->getOpcode()) {
	  case X86ISD::FOR:
	    LogicOpc = ISD::OR;
	    break;
	  case X86ISD::FXOR:
	    LogicOpc = ISD::XOR;
	    break;
	  case X86ISD::FAND:
	    LogicOpc = ISD::AND;
	    break;
	  case X86ISD::FANDN:
	    LogicOpc = X86ISD::ANDNP;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP logic op");
	  }

	  return DAG.getBitcast(
	      FpVT, DAG.getNode(LogicOpc, Loc, IntVT, IntLHS, IntRHS));
	}


	/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val).
	/// Applies only when \p N is an ISD::XOR of an X86ISD::SETCC with the
	/// constant 1; returns an empty SDValue otherwise.
	static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();

	  // The right-hand operand has to be the constant 1.
	  auto *XorConst = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!XorConst)
	    return SDValue();
	  if (XorConst->getZExtValue() != 1)
	    return SDValue();

	  // The left-hand operand has to be an X86 SETCC.
	  SDValue SetCC = N->getOperand(0);
	  if (SetCC->getOpcode() != X86ISD::SETCC)
	    return SDValue();

	  // Operand 0 of the SETCC holds the condition code; invert it and rebuild
	  // the SETCC on the same second operand.
	  const X86::CondCode OldCC = X86::CondCode(SetCC->getConstantOperandVal(0));
	  const X86::CondCode InvertedCC = X86::GetOppositeBranchCondition(OldCC);
	  SDLoc DL(N);
	  return getSETCC(InvertedCC, SetCC->getOperand(1), DL, DAG);
	}

	/// Target-specific DAG combine for XOR nodes. Folds are tried in order and
	/// the first success is returned; the later folds are only attempted once
	/// the combiner is past the "before legalize ops" stage.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  // If this is SSE1 only convert to FXOR to avoid scalarization.
	  const bool IsSSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (IsSSE1Only && N->getValueType(0) == MVT::v4i32) {
	    SDValue FpLHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
	    SDValue FpRHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
	    SDValue FpXor =
	        DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, FpLHS, FpRHS);
	    return DAG.getBitcast(MVT::v4i32, FpXor);
	  }

	  if (SDValue CmpFold = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
	    return CmpFold;

	  // Everything below is skipped while still before op legalization.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue InvertedSetCC = foldXor1SetCC(N, DAG))
	    return InvertedSetCC;

	  if (SDValue TruncShiftFold = foldXorTruncShiftIntoCmp(N, DAG))
	    return TruncShiftFold;

	  if (SDValue FpLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	    return FpLogic;

	  return combineFneg(N, DAG, Subtarget);
	}

	/// Target-specific DAG combine for X86ISD::BEXTR. Only the bottom 16 bits
	/// of the control operand (operand 1) are needed, so a constant control is
	/// masked down to 16 bits and a non-constant one is simplified via
	/// SimplifyDemandedBits.
	static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  SDValue Source = N->getOperand(0);
	  SDValue Control = N->getOperand(1);
	  const EVT VT = N->getValueType(0);
	  const unsigned BitWidth = VT.getSizeInBits();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // TODO - Constant Folding.
	  if (auto *ControlC = dyn_cast<ConstantSDNode>(Control)) {
	    // Reduce the constant control to the bottom 16-bits.
	    // NOTE: SimplifyDemandedBits won't do this for constants.
	    const APInt &FullControl = ControlC->getAPIntValue();
	    const APInt Low16Control = FullControl & 0xFFFF;
	    if (Low16Control != FullControl) {
	      SDValue NewControl = DAG.getConstant(Low16Control, SDLoc(N), VT);
	      return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Source, NewControl);
	    }
	  }

	  // Only bottom 16-bits of the control bits are required.
	  const APInt DemandedControlBits = APInt::getLowBitsSet(BitWidth, 16);
	  if (TLI.SimplifyDemandedBits(Control, DemandedControlBits, DCI))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Return true if \p V is a null FP constant (per isNullFPConstant) or a
	/// build vector of all zeros (per ISD::isBuildVectorAllZeros).
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  if (isNullFPConstant(V))
	    return true;
	  return ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// If \p V is a scalar FP zero or a vector FP zero (potentially including
	/// undefined elements), return a zero constant that may be used to fold away
	/// that value; otherwise return an empty SDValue. For a vector, the returned
	/// constant will not contain undefined elements even if \p V does, which
	/// makes it suitable as a replacement operand for operations (eg,
	/// bitwise-and) where an undef should not propagate.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  if (!isNullFPScalarOrVectorConst(V))
	    return SDValue();

	  // A scalar zero can be reused as-is; a vector gets a fresh zero vector.
	  if (!V.getValueType().isVector())
	    return V;

	  return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
	}

	/// Fold an FP-logic AND with an inverted operand into X86ISD::FANDN:
	///   fand (fxor X, -1), Y --> fandn X, Y
	///   fand X, (fxor Y, -1) --> fandn Y, X
	/// Only handles f32 (SSE1), f64 (SSE2) and v4f32 on SSE1-only targets.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  const bool IsF32WithSSE1 = VT == MVT::f32 && Subtarget.hasSSE1();
	  const bool IsF64WithSSE2 = VT == MVT::f64 && Subtarget.hasSSE2();
	  const bool IsV4F32SSE1Only =
	      VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (!IsF32WithSSE1 && !IsF64WithSSE2 && !IsV4F32SSE1Only)
	    return SDValue();

	  // True if V is an all-ones build vector or an all-ones FP constant.
	  auto IsAllOnesFP = [](SDValue V) {
	    if (V.getSimpleValueType().isVector())
	      return ISD::isBuildVectorAllOnes(V.getNode());
	    auto *FPC = dyn_cast<ConstantFPSDNode>(V);
	    return FPC && FPC->getConstantFPValue()->isAllOnesValue();
	  };

	  // True if V is (fxor X, -1), i.e. a bitwise NOT expressed as FXOR.
	  auto IsFNot = [&](SDValue V) {
	    return V.getOpcode() == X86ISD::FXOR && IsAllOnesFP(V.getOperand(1));
	  };

	  // fand (fxor X, -1), Y --> fandn X, Y
	  if (IsFNot(LHS))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, LHS.getOperand(0), RHS);

	  // fand X, (fxor Y, -1) --> fandn Y, X
	  if (IsFNot(RHS))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, RHS.getOperand(0), LHS);

	  return SDValue();
	}

	/// Target-specific DAG combine for X86ISD::FAND nodes: fold away a zero
	/// operand, form FANDN from an inverted operand, and otherwise fall back to
	/// lowerX86FPLogicOp.
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0, checking operand 0 first.
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx)
	    if (SDValue Zero =
	            getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget))
	      return Zero;

	  if (SDValue AndN = combineFAndFNotToFAndn(N, DAG, Subtarget))
	    return AndN;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Target-specific DAG combine for X86ISD::FANDN nodes: fold away a zero
	/// operand on either side, otherwise fall back to lowerX86FPLogicOp.
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);

	  // FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Inverted))
	    return Other;

	  // FANDN(x, 0.0) -> 0.0
	  if (SDValue Zero = getNullFPConstForNullVal(Other, DAG, Subtarget))
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Target-specific DAG combine for X86ISD::FOR and X86ISD::FXOR nodes: drop
	/// a zero operand, try the FP-negation combine, and otherwise fall back to
	/// lowerX86FPLogicOp.
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(LHS))
	    return RHS;

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(RHS))
	    return LHS;

	  if (SDValue Negated = combineFneg(N, DAG, Subtarget))
	    return Negated;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Target-specific DAG combine for X86ISD::FMIN and X86ISD::FMAX nodes.
	/// When UnsafeFPMath is enabled, the node is replaced by its commutative
	/// counterpart (FMINC / FMAXC) with the same operands.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

	  // Only perform optimizations if UnsafeMath is used.
	  if (!DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  // Pick the commutative opcode matching the original node.
	  unsigned CommutativeOpc = 0;
	  switch (N->getOpcode()) {
	  case X86ISD::FMIN:
	    CommutativeOpc = X86ISD::FMINC;
	    break;
	  case X86ISD::FMAX:
	    CommutativeOpc = X86ISD::FMAXC;
	    break;
	  default:
	    llvm_unreachable("unknown opcode");
	  }

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(CommutativeOpc, SDLoc(N), N->getValueType(0), LHS, RHS);
	}

	/// Combine an FMAXNUM/FMINNUM node into X86ISD::FMAX/X86ISD::FMIN. When NaNs
	/// can be ignored, or one operand is known never to be NaN, a single
	/// X86 min/max node is emitted; otherwise a min/max plus an unordered
	/// compare and select is built so that a NaN in operand 0 selects operand 1.
	/// Returns an empty SDValue when soft-float is in use, the type is not
	/// handled, or a scalar is being compiled for minimum size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (Subtarget.useSoftFloat())
	return SDValue();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// Handle f32 with SSE1, f64 with SSE2, and any legal vector type.
	EVT VT = N->getValueType(0);
	if (!((Subtarget.hasSSE1() && VT == MVT::f32) \|\|
	(Subtarget.hasSSE2() && VT == MVT::f64) \|\|
	(VT.isVector() && TLI.isTypeLegal(VT))))
	return SDValue();

	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	SDLoc DL(N);
	// Any opcode other than FMAXNUM is treated as a min.
	auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

	// If we don't have to respect NaN inputs, this is a direct translation to x86
	// min/max instructions.
	if (DAG.getTarget().Options.NoNaNsFPMath \|\| N->getFlags().hasNoNaNs())
	return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());

	// If one of the operands is known non-NaN use the native min/max instructions
	// with the non-NaN input as second operand.
	if (DAG.isKnownNeverNaN(Op1))
	return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
	if (DAG.isKnownNeverNaN(Op0))
	return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());

	// If we have to respect NaN inputs, this takes at least 3 instructions.
	// Favor a library call when operating on a scalar and minimizing code size.
	if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
	return SDValue();

	EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	VT);

	// There are 4 possibilities involving NaN inputs, and these are the required
	// outputs:
	//                   Op1
	//               Num     NaN
	//            ----------------
	//       Num  |  Max  |  Op0 |
	// Op0        ----------------
	//       NaN  |  Op1  |  NaN |
	//            ----------------
	//
	// The SSE FP max/min instructions were not designed for this case, but rather
	// to implement:
	// Min = Op1 < Op0 ? Op1 : Op0
	// Max = Op1 > Op0 ? Op1 : Op0
	//
	// So they always return Op0 if either input is a NaN. However, we can still
	// use those instructions for fmaxnum by selecting away a NaN input.

	// If either operand is NaN, the 2nd source operand (Op0) is passed through.
	// Note the operands are deliberately passed as (Op1, Op0) here.
	SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
	// SETUO of Op0 with itself is true exactly when Op0 is a NaN.
	SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
	// are NaN, the NaN value of Op1 is the result.
	return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	/// DAG combine for an X86 vector int-to-fp conversion node \p N. First tries
	/// to simplify the node based on its demanded vector elements; then, if the
	/// result has fewer elements than the (128-bit) input and the input is a
	/// single-use, non-volatile normal load, narrows that load to an
	/// X86ISD::VZEXT_LOAD covering only the elements that are converted.
	/// Returns SDValue(N, 0) when the DAG was updated in place, or an empty
	/// SDValue when nothing changed.
	static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI) {
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// All result elements are demanded; let the generic helper simplify the
	// operands accordingly.
	APInt KnownUndef, KnownZero;
	APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
	if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
	KnownZero, DCI))
	return SDValue(N, 0);

	// Convert a full vector load into vzload when not all bits are needed.
	SDValue In = N->getOperand(0);
	MVT InVT = In.getSimpleValueType();
	if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
	ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
	assert(InVT.is128BitVector() && "Expected 128-bit input vector");
	LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
	// Unless the load is volatile.
	if (!LN->isVolatile()) {
	SDLoc dl(N);
	// Number of input bits actually consumed by the conversion.
	unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
	// The memory type is a single integer of that width; the loaded value is
	// a 128-bit vector of such integers.
	MVT MemVT = MVT::getIntegerVT(NumBits);
	MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
	SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue VZLoad =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
	LN->getPointerInfo(),
	LN->getAlignment(),
	LN->getMemOperand()->getFlags());
	// Re-issue the same conversion opcode on the narrowed load, bitcast back
	// to the original input type.
	SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
	DAG.getBitcast(InVT, VZLoad));
	DCI.CombineTo(N, Convert);
	// Redirect users of the old load's result #1 to the new node's result #1
	// (typed MVT::Other above).
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	return SDValue(N, 0);
	}
	}

	return SDValue();
	}

	/// DAG combine for an X86 vector fp-to-int conversion node \p N
	/// (presumably the CVTP2I/CVTTP2I family, going by the function name -
	/// confirm against the caller). If the result has fewer elements than the
	/// (128-bit) input and the input is a single-use, non-volatile normal load,
	/// the load is narrowed to an X86ISD::VZEXT_LOAD covering only the elements
	/// that are converted. Returns SDValue(N, 0) when the DAG was updated in
	/// place, or an empty SDValue when nothing changed.
	static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI) {
	EVT VT = N->getValueType(0);

	// Convert a full vector load into vzload when not all bits are needed.
	SDValue In = N->getOperand(0);
	MVT InVT = In.getSimpleValueType();
	if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
	ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
	assert(InVT.is128BitVector() && "Expected 128-bit input vector");
	LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
	// Unless the load is volatile.
	if (!LN->isVolatile()) {
	SDLoc dl(N);
	// Number of input bits actually consumed by the conversion.
	unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
	// The memory type is a single floating-point type of that width; the
	// loaded value is a 128-bit vector of that type.
	MVT MemVT = MVT::getFloatingPointVT(NumBits);
	MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
	SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
	SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
	SDValue VZLoad =
	DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
	LN->getPointerInfo(),
	LN->getAlignment(),
	LN->getMemOperand()->getFlags());
	// Re-issue the same conversion opcode on the narrowed load, bitcast back
	// to the original input type.
	SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
	DAG.getBitcast(InVT, VZLoad));
	DCI.CombineTo(N, Convert);
	// Redirect users of the old load's result #1 to the new node's result #1
	// (typed MVT::Other above).
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
	return SDValue(N, 0);
	}
	}

	return SDValue();
	}

	/// Target-specific DAG combine for X86ISD::ANDNP nodes: fold away all-zero
	/// operands, turn ANDNP of an inverted value back into a plain AND, and
	/// finally try the recursive shuffle combiner on byte-multiple element types.
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  const MVT VT = N->getSimpleValueType(0);
	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);

	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(Inverted.getNode()))
	    return Other;

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(Other.getNode()))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // Turn ANDNP back to AND if input is inverted.
	  if (SDValue NotSrc = IsNOT(Inverted, DAG))
	    return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, NotSrc),
	                       Other);

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  const bool HasByteSizedElts =
	      VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0;
	  if (!HasByteSizedElts)
	    return SDValue();

	  SDValue Root(N, 0);
	  if (SDValue Shuffled = combineX86ShufflesRecursively(Root, DAG, Subtarget))
	    return Shuffled;

	  return SDValue();
	}

	/// DAG combine for X86ISD::BT.
	///
	/// Only the low Log2(width) bits of the bit-index operand are demanded, so
	/// the index is simplified under that mask. If a simpler index is found a new
	/// BT node using it is returned; otherwise an empty SDValue.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Src = N->getOperand(0);
	  SDValue BitIdx = N->getOperand(1);

	  // Mask covering the index bits that BT actually looks at.
	  unsigned IdxWidth = BitIdx.getValueSizeInBits();
	  APInt UsedIdxBits = APInt::getLowBitsSet(IdxWidth, Log2_32(IdxWidth));

	  SDValue SimplerIdx = DAG.GetDemandedBits(BitIdx, UsedIdxBits);
	  if (!SimplerIdx)
	    return SDValue();

	  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, SimplerIdx);
	}

	// Fold a SIGN_EXTEND_INREG (from i8 or i16) of a single-use X86ISD::CMOV whose
	// two value operands are constants by applying the extension to the constants
	// instead. A single-use any_extend/truncate between the two nodes is looked
	// through. Returns the replacement value, or an empty SDValue if the pattern
	// does not match.
	static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

	  EVT ResVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  SDValue VTOp = N->getOperand(1);
	  EVT FromVT = cast<VTSDNode>(VTOp)->getVT();

	  // Only extensions from i8 / i16 are handled.
	  if (!(FromVT == MVT::i8 || FromVT == MVT::i16))
	    return SDValue();

	  // Step over a single-use any_extend / truncate, remembering it so that it
	  // can be re-applied to the constants below.
	  SDValue Resize;
	  const bool IsResize = Src.getOpcode() == ISD::ANY_EXTEND ||
	                        Src.getOpcode() == ISD::TRUNCATE;
	  if (IsResize && Src.hasOneUse()) {
	    Resize = Src;
	    Src = Src.getOperand(0);
	  }

	  // What remains has to be a CMOV that nobody else uses.
	  if (!(Src.getOpcode() == X86ISD::CMOV && Src.hasOneUse()))
	    return SDValue();

	  // Both selectable values must be constants.
	  SDValue Val0 = Src.getOperand(0);
	  SDValue Val1 = Src.getOperand(1);
	  if (!isa<ConstantSDNode>(Val0.getNode()) ||
	      !isa<ConstantSDNode>(Val1.getNode()))
	    return SDValue();

	  SDLoc DL(N);

	  // Re-apply the any_extend / truncate that was skipped, now on the constants.
	  if (Resize) {
	    unsigned ResizeOpc = Resize.getOpcode();
	    Val0 = DAG.getNode(ResizeOpc, DL, ResVT, Val0);
	    Val1 = DAG.getNode(ResizeOpc, DL, ResVT, Val1);
	  }

	  Val0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ResVT, Val0, VTOp);
	  Val1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ResVT, Val1, VTOp);

	  // i16 CMOVs are undesirable: build the CMOV in i32 and truncate the result.
	  const bool PromoteToI32 = (ResVT == MVT::i16);
	  EVT CMovVT = ResVT;
	  if (PromoteToI32) {
	    CMovVT = MVT::i32;
	    Val0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, Val0);
	    Val1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, Val1);
	  }

	  SDValue NewCMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, Val0, Val1,
	                                Src.getOperand(2), Src.getOperand(3));
	  if (PromoteToI32)
	    return DAG.getNode(ISD::TRUNCATE, DL, ResVT, NewCMov);
	  return NewCMov;
	}

	/// Target combine for ISD::SIGN_EXTEND_INREG.
	///
	/// Tries combineSextInRegCmov first. Otherwise rewrites
	///   (sext_in_reg (v4i64 {any,sign}_extend (v4i32 x)), ExtraVT)
	/// into
	///   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
	/// because the in-register extension is expensive on 64-bit vector elements.
	/// Returns an empty SDValue when no rewrite applies.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

	  if (SDValue CMovFold = combineSextInRegCmov(N, DAG))
	    return CMovFold;

	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  SDValue VTOp = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(VTOp)->getVT();
	  SDLoc dl(N);

	  // Only the v4i64 result type is handled.
	  if (VT != MVT::v4i64)
	    return SDValue();

	  // The operand has to be an any/sign extension.
	  const unsigned SrcOpc = Src.getOpcode();
	  if (SrcOpc != ISD::ANY_EXTEND && SrcOpc != ISD::SIGN_EXTEND)
	    return SDValue();

	  SDValue Narrow = Src.getOperand(0);

	  // With AVX2, an extending load has a better lowering of its own
	  // (X86ISD::VSEXT), so leave anything that is not a normal load alone.
	  if (Narrow.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
	      !ISD::isNormalLoad(Narrow.getNode()))
	    return SDValue();

	  if (Narrow.getValueType() != MVT::v4i32 || ExtraVT.getSizeInBits() >= 128)
	    return SDValue();

	  // Do the in-register extension at 32 bits per element, then widen.
	  SDValue NarrowExt =
	      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Narrow, VTOp);
	  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, NarrowExt);
	}

	/// Hoist a sign/zero extension above a non-wrapping add with a constant:
	///   sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	///   zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	/// Doing the extension first exposes opportunities to merge math ops, form an
	/// LEA, or use a complex addressing mode, which can remove extend, add and
	/// shift instructions.
	///
	/// Only i64 results are handled, and only when some user of the extension is
	/// an ADD or SHL. Returns the new add, or an empty SDValue if not applicable.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  const unsigned ExtOpc = Ext->getOpcode();
	  const bool IsSext = ExtOpc == ISD::SIGN_EXTEND;
	  if (!IsSext && ExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  // A sext needs an 'add nsw' underneath it, a zext needs an 'add nuw'.
	  const bool NSW = Add->getFlags().hasNoSignedWrap();
	  const bool NUW = Add->getFlags().hasNoUnsignedWrap();
	  const bool HasRequiredFlag = IsSext ? NSW : NUW;
	  if (!HasRequiredFlag)
	    return SDValue();

	  // Requiring a constant addend keeps the instruction count from growing (the
	  // constant is extended for free below) and the constant can end up as the
	  // displacement of an LEA.
	  auto *AddC = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddC)
	    return SDValue();

	  // Widening the 'add' only pays off if it can be merged with another 'add'
	  // or a 'shl' that uses the extended value.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  auto IsAddOrShl = [](const SDNode *U) {
	    return U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SHL;
	  };
	  bool HasLEAPotential = false;
	  for (auto *User : Ext->uses()) {
	    if (!IsAddOrShl(User))
	      continue;
	    HasLEAPotential = true;
	    break;
	  }
	  if (!HasLEAPotential)
	    return SDValue();

	  // All checks passed: move the '{s|z}ext' in front of the 'add'.
	  int64_t WideC = IsSext ? AddC->getSExtValue() : AddC->getZExtValue();
	  SDValue NarrowOp = Add.getOperand(0);
	  SDValue WideOp = DAG.getNode(ExtOpc, SDLoc(Ext), VT, NarrowOp);
	  SDValue WideCNode = DAG.getConstant(WideC, SDLoc(Add), VT);

	  // The wrap flags of the narrow add are carried over to the wide add.
	  SDNodeFlags WideFlags;
	  WideFlags.setNoSignedWrap(NSW);
	  WideFlags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, WideOp, WideCNode, WideFlags);
	}

	// Push an {ANY,SIGN,ZERO}_EXTEND through a single-use X86ISD::CMOV whose value
	// operands are both constants, i.e. promote the CMOV itself rather than its
	// result. Reasons this can help:
	//  1) X86TargetLowering::EmitLoweredSelect can only merge pseudo-CMOVs that
	//     directly follow one another; removing the extension code after the CMOV
	//     helps with that.
	//  2) Extending constant CMOV operands is free, so the extend disappears.
	//  3) A 16-bit CMOV encodes in 4 bytes and a 32-bit CMOV in 3, so promotion
	//     also saves code size. (A 64-bit CMOV is 4 bytes, which is why 32-bit
	//     CMOVs are not promoted to 64-bit.)
	// Returns the replacement value or an empty SDValue.
	static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
	  SDValue CMov = Extend->getOperand(0);
	  if (!(CMov.getOpcode() == X86ISD::CMOV && CMov.hasOneUse()))
	    return SDValue();

	  EVT DstVT = Extend->getValueType(0);
	  unsigned ExtOpc = Extend->getOpcode();
	  SDLoc DL(Extend);

	  EVT SrcVT = CMov.getValueType();
	  SDValue Val0 = CMov.getOperand(0);
	  SDValue Val1 = CMov.getOperand(1);

	  // Both selectable values must be constants.
	  if (!isa<ConstantSDNode>(Val0.getNode()) ||
	      !isa<ConstantSDNode>(Val1.getNode()))
	    return SDValue();

	  // The extension result must be i32 or i64.
	  if (!(DstVT == MVT::i32 || DstVT == MVT::i64))
	    return SDValue();

	  // Accept i16 sources, plus i32 sources for sign_extend only: zext/aext from
	  // i32 are free anyway.
	  const bool FromI16 = SrcVT == MVT::i16;
	  const bool SextFromI32 = ExtOpc == ISD::SIGN_EXTEND && SrcVT == MVT::i32;
	  if (!FromI16 && !SextFromI32)
	    return SDValue();

	  // For a non-sign extension to i64, build the CMOV in i32 and let a (free)
	  // extension to i64 finish the job.
	  EVT CMovVT = DstVT;
	  if (DstVT == MVT::i64 && ExtOpc != ISD::SIGN_EXTEND)
	    CMovVT = MVT::i32;

	  Val0 = DAG.getNode(ExtOpc, DL, CMovVT, Val0);
	  Val1 = DAG.getNode(ExtOpc, DL, CMovVT, Val1);

	  SDValue Promoted = DAG.getNode(X86ISD::CMOV, DL, CMovVT, Val0, Val1,
	                                 CMov.getOperand(2), CMov.getOperand(3));

	  // Nothing more to do when the CMOV already has the requested type.
	  if (CMovVT == DstVT)
	    return Promoted;

	  return DAG.getNode(ExtOpc, DL, DstVT, Promoted);
	}

	// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
	// This is more or less the reverse of combineBitcastvxi1.
	//
	// The combine only fires for SIGN_/ZERO_/ANY_EXTEND nodes, before operation
	// legalization, on subtargets that have SSE2 but not AVX512. The result
	// elements must be i8/i16/i32/i64 and the operand must be a bitcast from a
	// scalar integer to a vector of i1.
	//
	// Strategy: broadcast the scalar so that every result element contains the
	// bit it is supposed to test, AND each element with a single-bit mask,
	// compare against the mask for equality and sign extend the comparison. For
	// anything other than SIGN_EXTEND the all-ones result is shifted down to 1.
	//
	// Returns the replacement value, or an empty SDValue if the pattern does not
	// match.
	static SDValue
	combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
	Opcode != ISD::ANY_EXTEND)
	return SDValue();
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();
	if (!Subtarget.hasSSE2() \|\| Subtarget.hasAVX512())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT InSVT = N0.getValueType().getScalarType();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	// Input type must be extending a bool vector (bit-casted from a scalar
	// integer) to legal integer types.
	if (!VT.isVector())
	return SDValue();
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
	return SDValue();
	if (InSVT != MVT::i1 \|\| N0.getOpcode() != ISD::BITCAST)
	return SDValue();

	// N00 is the scalar that was bitcast to the i1 vector.
	SDValue N00 = N0.getOperand(0);
	EVT SclVT = N0.getOperand(0).getValueType();
	if (!SclVT.isScalarInteger())
	return SDValue();

	SDLoc DL(N);
	SDValue Vec;
	SmallVector<int, 32> ShuffleMask;
	unsigned NumElts = VT.getVectorNumElements();
	// One scalar bit per result element.
	assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

	// Broadcast the scalar integer to the vector elements.
	if (NumElts > EltSizeInBits) {
	// If the scalar integer is greater than the vector element size, then we
	// must split it down into sub-sections for broadcasting. For example:
	// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
	// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
	assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
	unsigned Scale = NumElts / EltSizeInBits;
	EVT BroadcastVT =
	EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
	Vec = DAG.getBitcast(VT, Vec);

	// Sub-section i of the scalar is replicated into EltSizeInBits consecutive
	// result elements.
	for (unsigned i = 0; i != Scale; ++i)
	ShuffleMask.append(EltSizeInBits, i);
	} else {
	// For smaller scalar integers, we can simply any-extend it to the vector
	// element size (we don't care about the upper bits) and broadcast it to all
	// elements.
	SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	ShuffleMask.append(NumElts, 0);
	}
	Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

	// Now, mask the relevant bit in each element.
	// Element i tests bit (i % EltSizeInBits) of the value broadcast into it.
	SmallVector<SDValue, 32> Bits;
	for (unsigned i = 0; i != NumElts; ++i) {
	int BitIdx = (i % EltSizeInBits);
	APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
	Bits.push_back(DAG.getConstant(Bit, DL, SVT));
	}
	SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
	Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

	// Compare against the bitmask and extend the result.
	EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
	Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

	// For SEXT, this is now done, otherwise shift the result down for
	// zero-extension.
	// (ANY_EXTEND takes the same shift path as ZERO_EXTEND.)
	if (Opcode == ISD::SIGN_EXTEND)
	return Vec;
	return DAG.getNode(ISD::SRL, DL, VT, Vec,
	DAG.getConstant(EltSizeInBits - 1, DL, VT));
	}

	/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
	/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
	/// with UNDEFs) of the input to vectors of the same size as the target type
	/// which then extends the lowest elements.
	///
	/// The combine is skipped entirely when ExperimentalVectorWideningLegalization
	/// is set, after operation legalization, without SSE2, when the operand is a
	/// SETCC, or when both the input and result types are already legal.
	///
	/// \returns the replacement value, or an empty SDValue if nothing was done.
	static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (ExperimentalVectorWideningLegalization)
	return SDValue();

	unsigned Opcode = N->getOpcode();
	// TODO - add ANY_EXTEND support.
	if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
	return SDValue();
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();
	if (!Subtarget.hasSSE2())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT InVT = N0.getValueType();
	EVT InSVT = InVT.getScalarType();

	// FIXME: Generic DAGCombiner previously had a bug that would cause a
	// sign_extend of setcc to sometimes return the original node and tricked it
	// into thinking CombineTo was used which prevented the target combines from
	// running.
	// Earlying out here to avoid regressions like this
	// (v4i32 (sext (v4i1 (setcc (v4i16)))))
	// Becomes
	// (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
	// Type legalized to
	// (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
	// Leading to a packssdw+pmovsxwd
	// We could write a DAG combine to fix this, but really we shouldn't be
	// creating sext_invec that's forcing v8i16 into the DAG.
	if (N0.getOpcode() == ISD::SETCC)
	return SDValue();

	// Input type must be a vector and we must be extending legal integer types.
	if (!VT.isVector() \|\| VT.getVectorNumElements() < 2)
	return SDValue();
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();

	// If the input/output types are both legal then we have at least AVX1 and
	// we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
	if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	DAG.getTargetLoweringInfo().isTypeLegal(InVT))
	return SDValue();

	SDLoc DL(N);

	// Widen vector N to Size bits by concatenating it with UNDEF vectors of its
	// own type; N ends up in the lowest position.
	auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
	EVT SrcVT = N.getValueType();
	EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
	Size / SrcVT.getScalarSizeInBits());
	SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
	DAG.getUNDEF(SrcVT));
	Opnds[0] = N;
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
	};

	// If target-size is less than 128-bits, extend to a type that would extend
	// to 128 bits, extend that and extract the original target vector.
	if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
	unsigned Scale = 128 / VT.getSizeInBits();
	EVT ExVT =
	EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
	SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
	// Despite its name, SExt holds a sign or zero extension depending on Opcode.
	SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
	DAG.getIntPtrConstant(0, DL));
	}

	// If target-size is 128-bits (or 256-bits on AVX target), then convert to
	// ISD::_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::VEXT.
	// Also use this if we don't have SSE41 to allow the legalizer do its job.
	if (!Subtarget.hasSSE41() \|\| VT.is128BitVector() \|\|
	(VT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(VT.is512BitVector() && Subtarget.useAVX512Regs())) {
	SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
	Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
	return DAG.getNode(Opcode, DL, VT, ExOp);
	}

	// Split the extension into SplitSize-bit pieces: each piece extracts the
	// matching sub-vector of the input, widens it to SplitSize bits with UNDEFs,
	// applies the *_EXTEND_VECTOR_INREG form of Opcode, and the pieces are then
	// concatenated back into a VT-sized result.
	auto SplitAndExtendInReg = [&](unsigned SplitSize) {
	unsigned NumVecs = VT.getSizeInBits() / SplitSize;
	unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
	EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

	unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
	SmallVector<SDValue, 8> Opnds;
	for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
	SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
	DAG.getIntPtrConstant(Offset, DL));
	SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
	SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
	Opnds.push_back(SrcVec);
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
	};

	// On pre-AVX targets, split into 128-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
	return SplitAndExtendInReg(128);

	// On pre-AVX512 targets, split into 256-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
	return SplitAndExtendInReg(256);

	return SDValue();
	}

	// Try to turn (sext/zext (setcc)) into a setcc that directly produces the
	// extended (xmm/ymm sized) vector type. For zext, the wide setcc result is
	// additionally zero-extended in-register from the original setcc element type.
	// Returns an empty SDValue if the combine does not apply.
	static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Cmp = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);

	  // Restricted to AVX512 subtargets, vector results and SETCC operands.
	  if (!(Subtarget.hasAVX512() && VT.isVector() &&
	        Cmp.getOpcode() == ISD::SETCC))
	    return SDValue();

	  // The result element type has to be one of the listed types.
	  EVT EltVT = VT.getVectorElementType();
	  const bool EltOk = EltVT == MVT::i8 || EltVT == MVT::i16 ||
	                     EltVT == MVT::i32 || EltVT == MVT::i64 ||
	                     EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!EltOk)
	    return SDValue();

	  // Results wider than 256 bits are not handled.
	  unsigned ResBits = VT.getSizeInBits();
	  if (ResBits > 256)
	    return SDValue();

	  // Unsigned predicates are rejected: PCMPEQ/PCMPGT are the only integer
	  // compares available here.
	  ISD::CondCode CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
	  if (ISD::isUnsignedIntSetCC(CC))
	    return SDValue();

	  // The compared vectors must be exactly as wide as the extended result, so
	  // the setcc fully absorbs the extension.
	  EVT CmpOpVT = Cmp.getOperand(0).getValueType();
	  EVT CmpOpIntVT = CmpOpVT.changeVectorElementTypeToInteger();
	  if (CmpOpIntVT.getSizeInBits() != ResBits)
	    return SDValue();

	  SDValue Wide = DAG.getSetCC(dl, VT, Cmp.getOperand(0), Cmp.getOperand(1), CC);
	  if (N->getOpcode() != ISD::ZERO_EXTEND)
	    return Wide;

	  return DAG.getZeroExtendInReg(Wide, dl, Cmp.getValueType().getScalarType());
	}

	/// Target combines for ISD::SIGN_EXTEND.
	///
	/// Tries a sequence of independent folds and returns the first that succeeds.
	/// Everything except the CMOV promotion is attempted only before operation
	/// legalization. Returns an empty SDValue if no fold applies.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc DL(N);

	  if (SDValue CMovFold = combineToExtendCMOV(N, DAG))
	    return CMovFold;

	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  if (SDValue SetccFold = combineExtSetcc(N, DAG, Subtarget))
	    return SetccFold;

	  // sext (xor Bool, -1) --> sub (zext Bool), 1
	  // Inverting and sign-extending a boolean equals zero-extending it and
	  // subtracting 1 (0 becomes -1, 1 becomes 0); this is the same as
	  // "select Bool, 0, -1". The subtract lowers efficiently to an LEA or a DEC.
	  const bool IsSingleUseNotOfBool =
	      SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
	      isAllOnesConstant(Src.getOperand(1)) && Src.hasOneUse();
	  if (IsSingleUseNotOfBool) {
	    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, DL, VT);
	    return DAG.getNode(ISD::SUB, DL, VT, Wide, One);
	  }

	  if (SDValue InRegFold = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InRegFold;

	  if (SDValue BoolFold = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return BoolFold;

	  if (VT.isVector()) {
	    if (SDValue MaskFold = PromoteMaskArithmetic(N, DAG, Subtarget))
	      return MaskFold;
	  }

	  // Last attempt; its (possibly empty) result is the final answer.
	  return promoteExtBeforeAdd(N, DAG, Subtarget);
	}

	/// Map an FMA-family opcode to the opcode that computes the same expression
	/// with the product negated (\p NegMul) and/or the accumulator negated
	/// (\p NegAcc). Both adjustments may be requested at once; the product is
	/// handled first and the accumulator adjustment is applied to that result.
	/// Any opcode outside the FMA family handled below is a programming error.
	static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
	  // Toggle between the plain and the negated-multiply variant.
	  auto FlipProductSign = [](unsigned Opc) -> unsigned {
	    switch (Opc) {
	    case ISD::FMA:
	      return X86ISD::FNMADD;
	    case X86ISD::FMADD_RND:
	      return X86ISD::FNMADD_RND;
	    case X86ISD::FMSUB:
	      return X86ISD::FNMSUB;
	    case X86ISD::FMSUB_RND:
	      return X86ISD::FNMSUB_RND;
	    case X86ISD::FNMADD:
	      return ISD::FMA;
	    case X86ISD::FNMADD_RND:
	      return X86ISD::FMADD_RND;
	    case X86ISD::FNMSUB:
	      return X86ISD::FMSUB;
	    case X86ISD::FNMSUB_RND:
	      return X86ISD::FMSUB_RND;
	    default:
	      llvm_unreachable("Unexpected opcode");
	    }
	  };

	  // Toggle between the add and the subtract variant.
	  auto FlipAccumulatorSign = [](unsigned Opc) -> unsigned {
	    switch (Opc) {
	    case ISD::FMA:
	      return X86ISD::FMSUB;
	    case X86ISD::FMADD_RND:
	      return X86ISD::FMSUB_RND;
	    case X86ISD::FMSUB:
	      return ISD::FMA;
	    case X86ISD::FMSUB_RND:
	      return X86ISD::FMADD_RND;
	    case X86ISD::FNMADD:
	      return X86ISD::FNMSUB;
	    case X86ISD::FNMADD_RND:
	      return X86ISD::FNMSUB_RND;
	    case X86ISD::FNMSUB:
	      return X86ISD::FNMADD;
	    case X86ISD::FNMSUB_RND:
	      return X86ISD::FNMADD_RND;
	    default:
	      llvm_unreachable("Unexpected opcode");
	    }
	  };

	  unsigned Result = Opcode;
	  if (NegMul)
	    Result = FlipProductSign(Result);
	  if (NegAcc)
	    Result = FlipAccumulatorSign(Result);
	  return Result;
	}

	/// Fold negated operands of an FMA-family node into its opcode.
	///
	/// Each of the three value operands is checked for a negation (directly, or
	/// underneath an extract of vector element 0). Negations are stripped and the
	/// opcode is adjusted via negateFMAOpcode. Only legal f32/f64 based types on
	/// subtargets with some form of FMA are handled. A fourth operand, if
	/// present, is passed through unchanged.
	///
	/// \returns the rewritten node, or an empty SDValue if nothing was negated.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Illegal types are left for the legalizer to expand first.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  EVT EltVT = VT.getScalarType();
	  const bool IsF32OrF64 = EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!IsF32OrF64 || !Subtarget.hasAnyFMA())
	    return SDValue();

	  SDValue MulLHS = N->getOperand(0);
	  SDValue MulRHS = N->getOperand(1);
	  SDValue Acc = N->getOperand(2);

	  // If Op is a negation, replace it by the un-negated value and report true.
	  auto StripNegation = [&DAG](SDValue &Op) -> bool {
	    if (SDValue Inner = isFNEG(DAG, Op.getNode())) {
	      Op = DAG.getBitcast(Op.getValueType(), Inner);
	      return true;
	    }

	    // Also look through an extract of element 0: if the vector is an FNEG,
	    // extract from the FNEG's input instead.
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        !isNullConstant(Op.getOperand(1)))
	      return false;

	    SDValue Vec = Op.getOperand(0);
	    SDValue Inner = isFNEG(DAG, Vec.getNode());
	    if (!Inner)
	      return false;

	    Inner = DAG.getBitcast(Vec.getValueType(), Inner);
	    Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op.getValueType(),
	                     Inner, Op.getOperand(1));
	    return true;
	  };

	  // FIXME: We could allow negations of the lower element only.
	  const bool NegLHS = StripNegation(MulLHS);
	  const bool NegRHS = StripNegation(MulRHS);
	  const bool NegAcc = StripNegation(Acc);

	  if (!(NegLHS || NegRHS || NegAcc))
	    return SDValue();

	  // The product changes sign only if exactly one multiplicand was negated.
	  const bool NegMul = NegLHS != NegRHS;
	  unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegMul, NegAcc);

	  if (N->getNumOperands() != 4)
	    return DAG.getNode(NewOpcode, dl, VT, MulLHS, MulRHS, Acc);
	  return DAG.getNode(NewOpcode, dl, VT, MulLHS, MulRHS, Acc,
	                     N->getOperand(3));
	}

	// FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C), and the reverse direction;
	// the _RND forms are handled the same way. Nothing is done unless the third
	// operand is a negation whose source already has the node's result type.
	// A fourth operand, if present, is forwarded unchanged.
	static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  SDValue Unnegated = isFNEG(DAG, N->getOperand(2).getNode());
	  // FIXME: Should we bitcast instead of giving up on a type mismatch?
	  if (!Unnegated || Unnegated.getValueType() != VT)
	    return SDValue();

	  // Swap between the add-sub and sub-add flavour.
	  unsigned SwappedOpc;
	  switch (N->getOpcode()) {
	  case X86ISD::FMADDSUB:
	    SwappedOpc = X86ISD::FMSUBADD;
	    break;
	  case X86ISD::FMADDSUB_RND:
	    SwappedOpc = X86ISD::FMSUBADD_RND;
	    break;
	  case X86ISD::FMSUBADD:
	    SwappedOpc = X86ISD::FMADDSUB;
	    break;
	  case X86ISD::FMSUBADD_RND:
	    SwappedOpc = X86ISD::FMADDSUB_RND;
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  if (N->getNumOperands() != 4)
	    return DAG.getNode(SwappedOpc, dl, VT, Op0, Op1, Unnegated);
	  return DAG.getNode(SwappedOpc, dl, VT, Op0, Op1, Unnegated,
	                     N->getOperand(3));
	}

	/// Target combines for ISD::ZERO_EXTEND.
	///
	/// Tries a sequence of independent folds and returns the first that succeeds;
	/// returns an empty SDValue if none applies.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // Re-create a SETCC_CARRY directly in the wide type and keep only bit 0.
	  auto WidenCarry = [&](SDValue Carry) {
	    SDValue WideCarry = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
	                                    Carry.getOperand(0), Carry.getOperand(1));
	    return DAG.getNode(ISD::AND, dl, VT, WideCarry,
	                       DAG.getConstant(1, dl, VT));
	  };

	  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
	  //   (and (i32 x86isd::setcc_carry), 1)
	  // and likewise for (zext (truncate (x86isd::setcc_carry))).
	  // This removes the zext; it is needed because ISD::SETCC is always
	  // legalized to i8.
	  const unsigned SrcOpc = Src.getOpcode();
	  if ((SrcOpc == ISD::AND || SrcOpc == ISD::TRUNCATE) && Src.hasOneUse() &&
	      Src.getOperand(0).hasOneUse()) {
	    SDValue Inner = Src.getOperand(0);
	    if (Inner.getOpcode() == X86ISD::SETCC_CARRY) {
	      // An AND of the carry with anything but 1 ends the whole combine.
	      if (SrcOpc == ISD::AND && !isOneConstant(Src.getOperand(1)))
	        return SDValue();
	      return WidenCarry(Inner);
	    }
	  }

	  if (SDValue CMovFold = combineToExtendCMOV(N, DAG))
	    return CMovFold;

	  if (DCI.isBeforeLegalizeOps()) {
	    if (SDValue SetccFold = combineExtSetcc(N, DAG, Subtarget))
	      return SetccFold;
	  }

	  if (SDValue InRegFold = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InRegFold;

	  if (SDValue BoolFold = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
	    return BoolFold;

	  if (VT.isVector()) {
	    if (SDValue MaskFold = PromoteMaskArithmetic(N, DAG, Subtarget))
	      return MaskFold;
	  }

	  if (SDValue AddFold = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return AddFold;

	  if (SDValue CtlzFold = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
	    return CtlzFold;

	  // zext of a 128-bit PACKUS whose inputs already have zero upper halves in
	  // every element: the result is just the concatenation of the two inputs.
	  // TODO: Combine with any target/faux shuffle.
	  if (Src.getOpcode() != X86ISD::PACKUS || Src.getValueSizeInBits() != 128 ||
	      VT.getScalarSizeInBits() != Src.getOperand(0).getScalarValueSizeInBits())
	    return SDValue();

	  SDValue PackOp0 = Src.getOperand(0);
	  SDValue PackOp1 = Src.getOperand(1);
	  unsigned NumSrcElts = PackOp0.getValueType().getVectorNumElements();
	  unsigned NumSrcEltBits = PackOp0.getScalarValueSizeInBits();
	  APInt UpperHalf = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);

	  auto UpperHalfIsZero = [&](SDValue V) {
	    return V.isUndef() || DAG.MaskedValueIsZero(V, UpperHalf);
	  };
	  if (!UpperHalfIsZero(PackOp0) || !UpperHalfIsZero(PackOp1))
	    return SDValue();

	  return concatSubVectors(PackOp0, PackOp1, VT, NumSrcElts * 2, DAG, dl, 128);
	}

	/// Try to map a 128-bit or larger integer comparison to vector instructions
	/// before type legalization splits it up into chunks.
	///
	/// \p SetCC must be an ISD::SETCC with predicate SETEQ or SETNE (asserted).
	/// Scalar integer operands of 128, 256 or 512 bits are handled when the
	/// subtarget has SSE2, AVX2 or usable 512-bit AVX512 registers respectively.
	///
	/// \returns a setcc on the result of a vector compare (via MOVMSK for 128/256
	/// bits, via an i16 bitcast of the v16i1 compare for 512 bits), or an empty
	/// SDValue if the combine does not apply.
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	assert((CC == ISD::SETNE \|\| CC == ISD::SETEQ) && "Bad comparison predicate");

	// We're looking for an oversized integer equality comparison.
	SDValue X = SetCC->getOperand(0);
	SDValue Y = SetCC->getOperand(1);
	EVT OpVT = X.getValueType();
	unsigned OpSize = OpVT.getSizeInBits();
	if (!OpVT.isScalarInteger() \|\| OpSize < 128)
	return SDValue();

	// Ignore a comparison with zero because that gets special treatment in
	// EmitTest(). But make an exception for the special case of a pair of
	// logically-combined vector-sized operands compared to zero. This pattern may
	// be generated by the memcmp expansion pass with oversized integer compares
	// (see PR33325).
	bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
	X.getOperand(0).getOpcode() == ISD::XOR &&
	X.getOperand(1).getOpcode() == ISD::XOR;
	if (isNullConstant(Y) && !IsOrXorXorCCZero)
	return SDValue();

	// Don't perform this combine if constructing the vector will be expensive.
	// "Cheap" here means: a constant, something that is already a vector under
	// its bitcasts, or a load.
	auto IsVectorBitCastCheap = [](SDValue X) {
	X = peekThroughBitcasts(X);
	return isa<ConstantSDNode>(X) \|\| X.getValueType().isVector() \|\|
	X.getOpcode() == ISD::LOAD;
	};
	if ((!IsVectorBitCastCheap(X) \|\| !IsVectorBitCastCheap(Y)) &&
	!IsOrXorXorCCZero)
	return SDValue();

	// TODO: Use PXOR + PTEST for SSE4.1 or later?
	EVT VT = SetCC->getValueType(0);
	SDLoc DL(SetCC);
	if ((OpSize == 128 && Subtarget.hasSSE2()) \|\|
	(OpSize == 256 && Subtarget.hasAVX2()) \|\|
	(OpSize == 512 && Subtarget.useAVX512Regs())) {
	// Compare bytes for 128/256 bits; for 512 bits compare i32 elements and
	// produce a v16i1 mask.
	EVT VecVT = OpSize == 512 ? MVT::v16i32 :
	OpSize == 256 ? MVT::v32i8 :
	MVT::v16i8;
	EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
	SDValue Cmp;
	if (IsOrXorXorCCZero) {
	// This is a bitwise-combined equality comparison of 2 pairs of vectors:
	// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
	// Use 2 vector equality compares and 'and' the results before doing a
	// MOVMSK.
	SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
	SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
	SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
	SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
	SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
	SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
	Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
	} else {
	SDValue VecX = DAG.getBitcast(VecVT, X);
	SDValue VecY = DAG.getBitcast(VecVT, Y);
	Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
	}
	// For 512-bits we want to emit a setcc that will lower to kortest.
	if (OpSize == 512)
	return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
	DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
	// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
	// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
	SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
	SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
	MVT::i32);
	return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
	}

	return SDValue();
	}

	/// Target combines for ISD::SETCC.
	///
	/// Folds attempted, in order:
	///  - For eq/ne: move a negation across the comparison
	///    (0-x == y --> x+y == 0, x == 0-y --> x+y == 0) and try
	///    combineVectorSizedSetCCEquality.
	///  - For vXi1 results with an eq/ne/signed predicate: simplify a compare of
	///    (sext vXi1) against an all-zeros build_vector to a constant, the
	///    operand, or its inverse.
	///  - With AVX512 but without BWI: emit a vXi8/vXi16 compare in the operand
	///    type and truncate it to the vXi1 result.
	///  - With SSE1 but without SSE2: lower a v4f32 compare producing v4i32 right
	///    away through LowerVSETCC.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  EVT OpVT = LHS.getValueType();
	  SDLoc DL(N);

	  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
	    // 0-x == y --> x+y == 0
	    // 0-x != y --> x+y != 0
	    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
	        LHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }
	    // x == 0-y --> x+y == 0
	    // x != 0-y --> x+y != 0
	    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	        RHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }

	    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	      return V;
	  }

	  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
	    // Put build_vectors on the right.
	    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
	      std::swap(LHS, RHS);
	      CC = ISD::getSetCCSwappedOperands(CC);
	    }

	    bool IsSEXT0 =
	        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
	        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
	    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

	    // Every element of (sext vXi1) is either 0 or -1, so comparing it with
	    // zero is decided by the original i1 value alone.
	    if (IsSEXT0 && IsVZero1) {
	      assert(VT == LHS.getOperand(0).getValueType() &&
	             "Unexpected operand type");
	      // Never greater than zero.
	      if (CC == ISD::SETGT)
	        return DAG.getConstant(0, DL, VT);
	      // Always less than or equal to zero.
	      if (CC == ISD::SETLE)
	        return DAG.getConstant(1, DL, VT);
	      // Equal to / not below zero exactly when the i1 is clear.
	      if (CC == ISD::SETEQ || CC == ISD::SETGE)
	        return DAG.getNOT(DL, LHS.getOperand(0), VT);

	      // Non-zero / negative exactly when the i1 is set.
	      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
	             "Unexpected condition code!");
	      return LHS.getOperand(0);
	    }
	  }

	  // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
	  // pre-promote its result type since vXi1 vectors don't get promoted
	  // during type legalization.
	  // NOTE: The element count check is to ignore operand types that need to
	  // go through type promotion to a 128-bit vector.
	  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
	      VT.getVectorElementType() == MVT::i1 &&
	      (ExperimentalVectorWideningLegalization ||
	       VT.getVectorNumElements() > 4) &&
	      (OpVT.getVectorElementType() == MVT::i8 ||
	       OpVT.getVectorElementType() == MVT::i16)) {
	    SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
	                                N->getOperand(2));
	    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
	  }

	  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	  // to avoid scalarization via legalization because v4i32 is not a legal type.
	  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
	      LHS.getValueType() == MVT::v4f32)
	    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	  return SDValue();
	}

	/// DAG combine for an X86ISD::MOVMSK node \p N. Tries, in order: constant
	/// folding a constant build-vector source, looking through a bitcast that
	/// keeps the element width, rewriting movmsk(not(x)) as xor(movmsk(x), mask),
	/// and finally a demanded-bits simplification of the node itself.
	/// Returns the replacement value, or an empty SDValue when nothing changed.
	static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Input = N->getOperand(0);
	  MVT InputVT = Input.getSimpleValueType();
	  MVT VT = N->getSimpleValueType(0);
	  unsigned ResultBits = VT.getScalarSizeInBits();
	  unsigned InputElts = InputVT.getVectorNumElements();

	  // Constant source: collect the sign bit of every defined element into the
	  // matching bit of a 32-bit immediate.
	  if (ISD::isBuildVectorOfConstantSDNodes(Input.getNode())) {
	    assert(VT == MVT::i32 && "Unexpected result type");
	    APInt SignBits(32, 0);
	    unsigned OpCount = Input.getNumOperands();
	    for (unsigned I = 0; I != OpCount; ++I) {
	      if (Input.getOperand(I).isUndef())
	        continue;
	      if (Input.getConstantOperandAPInt(I).isNegative())
	        SignBits.setBit(I);
	    }
	    return DAG.getConstant(SignBits, SDLoc(N), VT);
	  }

	  // A bitcast that preserves the element width does not move the sign bits,
	  // so MOVMSK can read straight from the bitcast's source.
	  if (Input.getOpcode() == ISD::BITCAST) {
	    SDValue Inner = Input.getOperand(0);
	    if (Inner.getScalarValueSizeInBits() == InputVT.getScalarSizeInBits())
	      return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Inner);
	  }

	  // movmsk(not(x)) -> xor(movmsk(x), low InputElts bits set). Moving the NOT
	  // to the scalar side improves folding with scalar comparisons.
	  if (SDValue Inverted = IsNOT(Input, DAG)) {
	    SDLoc DL(N);
	    APInt LowMask = APInt::getLowBitsSet(ResultBits, InputElts);
	    SDValue Recast = DAG.getBitcast(InputVT, Inverted);
	    SDValue Msk = DAG.getNode(X86ISD::MOVMSK, DL, VT, Recast);
	    SDValue MaskC = DAG.getConstant(LowMask, DL, VT);
	    return DAG.getNode(ISD::XOR, DL, VT, Msk, MaskC);
	  }

	  // Last resort: let demanded-bits analysis simplify the inputs, with every
	  // result bit demanded.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  APInt AllBits = APInt::getAllOnesValue(ResultBits);
	  if (!TLI.SimplifyDemandedBits(SDValue(N, 0), AllBits, DCI))
	    return SDValue();
	  return SDValue(N, 0);
	}

	/// DAG combine for a gather/scatter node \p N.
	///
	/// Before op legalization it canonicalizes the index operand (operand 4):
	/// strips sign extends from <=32-bit to >32-bit elements, forces the index
	/// element type to i32 or i64, and strips 32->64 zero extends whose input has
	/// a known-zero sign bit. On subtargets without AVX512 it additionally runs a
	/// demanded-bits simplification on the mask operand (operand 2) with only the
	/// sign bit demanded.
	///
	/// Returns the (possibly updated in place) node on change, or an empty
	/// SDValue when nothing was done.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);

	if (DCI.isBeforeLegalizeOps()) {
	SDValue Index = N->getOperand(4);
	// Remove any sign extends from 32 or smaller to larger than 32.
	// Only do this before LegalizeOps in case we need the sign extend for
	// legalization.
	if (Index.getOpcode() == ISD::SIGN_EXTEND) {
	if (Index.getScalarValueSizeInBits() > 32 &&
	Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
	SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
	NewOps[4] = Index.getOperand(0);
	SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
	if (Res == N) {
	// The original sign extend has fewer users, add back to worklist in
	// case it needs to be removed
	DCI.AddToWorklist(Index.getNode());
	DCI.AddToWorklist(N);
	}
	return SDValue(Res, 0);
	}
	}

	// Make sure the index is either i32 or i64
	unsigned ScalarSize = Index.getScalarValueSizeInBits();
	if (ScalarSize != 32 && ScalarSize != 64) {
	// Elements wider than 32 bits go to i64; anything narrower goes to i32.
	MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
	EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	Index.getValueType().getVectorNumElements());
	Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
	SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
	NewOps[4] = Index;
	SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
	if (Res == N)
	DCI.AddToWorklist(N);
	return SDValue(Res, 0);
	}

	// Try to remove zero extends from 32->64 if we know the sign bit of
	// the input is zero.
	if (Index.getOpcode() == ISD::ZERO_EXTEND &&
	Index.getScalarValueSizeInBits() == 64 &&
	Index.getOperand(0).getScalarValueSizeInBits() == 32) {
	if (DAG.SignBitIsZero(Index.getOperand(0))) {
	SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
	NewOps[4] = Index.getOperand(0);
	SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
	if (Res == N) {
	// The original zero extend has fewer users, add back to worklist in
	// case it needs to be removed
	DCI.AddToWorklist(Index.getNode());
	DCI.AddToWorklist(N);
	}
	return SDValue(Res, 0);
	}
	}
	}

	// With AVX2 we only demand the upper bit of the mask.
	if (!Subtarget.hasAVX512()) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Mask = N->getOperand(2);
	APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
	if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
	return SDValue(N, 0);
	}

	return SDValue();
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Operand 0 of N is the condition code and operand 1 is the EFLAGS input.
	// Both are handed to combineSetCCEFLAGS; when it yields new flags, a fresh
	// SETCC is built from them. The condition code is passed as a named variable
	// and re-read afterwards (presumably the helper may rewrite it - confirm
	// against its declaration).
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc Loc(N);
	  X86::CondCode Cond =
	      static_cast<X86::CondCode>(N->getConstantOperandVal(0));

	  SDValue NewFlags =
	      combineSetCCEFLAGS(N->getOperand(1), Cond, DAG, Subtarget);
	  if (!NewFlags)
	    return SDValue();

	  return getSETCC(Cond, NewFlags, Loc, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// For an X86ISD::BRCOND node \p N, operand 2 is the condition code and
	/// operand 3 is the EFLAGS input. If combineSetCCEFLAGS produces simplified
	/// flags, a new BRCOND is built with the same result types and the first two
	/// operands of \p N, the (possibly updated) condition code, and the new flags.
	/// Returns an empty SDValue when no simplification was found.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	SDValue EFLAGS = N->getOperand(3);
	X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

	// Try to simplify the EFLAGS and condition code operands.
	// Make sure to not keep references to operands, as combineSetCCEFLAGS can
	// RAUW them under us.
	if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
	// CC is re-materialized as an i8 constant after the call; operands 0 and 1
	// are deliberately re-read from N here rather than cached before the call.
	SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
	return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
	N->getOperand(1), Cond, Flags);
	}

	return SDValue();
	}

	/// Fold a unary op applied to a masked vector compare into the mask itself.
	///
	/// Vector comparisons produce 0 or -1 per lane, so
	///   UNARYOP(AND(VECTOR_CMP(x,y), constant))
	/// becomes
	///   AND(VECTOR_CMP(x,y), constant2)  with  constant2 = UNARYOP(constant).
	///
	/// Returns the replacement value, or an empty SDValue when \p N does not
	/// match the pattern.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  // The result must be a vector.
	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector())
	    return SDValue();

	  // The operand must be a bitwise AND ...
	  SDValue MaskedCmp = N->getOperand(0);
	  if (MaskedCmp.getOpcode() != ISD::AND)
	    return SDValue();

	  // ... whose first operand is a SETCC ...
	  SDValue Cmp = MaskedCmp.getOperand(0);
	  if (Cmp.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // ... and whose total size matches the size of the result.
	  if (ResVT.getSizeInBits() != MaskedCmp.getValueSizeInBits())
	    return SDValue();

	  // The other AND operand has to be a constant build vector. Non-constant
	  // splats could be handled too, but that would not remove any operation; it
	  // would only add a scalar step ahead of the vector unit.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(MaskedCmp.getOperand(1));
	  if (!MaskBV || !MaskBV->isConstant())
	    return SDValue();

	  SDLoc DL(N);
	  EVT MaskVT = MaskBV->getValueType(0);

	  // Apply the unary op to the constant to obtain the new mask value.
	  SDValue Converted =
	      DAG.getNode(N->getOpcode(), DL, ResVT, SDValue(MaskBV, 0));

	  // The AND is done in the integer vector type, so bitcast into it and back
	  // out to the result type.
	  SDValue NewMask = DAG.getBitcast(MaskVT, Converted);
	  SDValue Masked = DAG.getNode(ISD::AND, DL, MaskVT, Cmp, NewMask);
	  return DAG.getBitcast(ResVT, Masked);
	}

	/// When converting to floating-point, try to turn a scalar truncate of an
	/// extracted vector element into an extract from a bitcast vector:
	///   inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
	/// The intent is to keep the sequence on XMM registers instead of bouncing
	/// between vector registers and GPRs.
	///
	/// Returns the new conversion node, or an empty SDValue if the pattern does
	/// not match.
	static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
	  // TODO: This is currently only used by combineSIntToFP, but it is generalized
	  //       to allow being called by any similar cast opcode.
	  // TODO: Consider merging this into lowering: vectorizeExtractedCast().

	  // The conversion input must be a single-use truncate.
	  SDValue Narrowed = N->getOperand(0);
	  bool IsOneUseTrunc =
	      Narrowed.hasOneUse() && Narrowed.getOpcode() == ISD::TRUNCATE;
	  if (!IsOneUseTrunc)
	    return SDValue();

	  // The truncate input must be a single-use extract of element 0.
	  SDValue Extract = Narrowed.getOperand(0);
	  bool IsOneUseExtractOfElt0 =
	      Extract.hasOneUse() &&
	      Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	      isNullConstant(Extract.getOperand(1));
	  if (!IsOneUseExtractOfElt0)
	    return SDValue();

	  // The wide element must split evenly into narrow elements.
	  EVT NarrowVT = Narrowed.getValueType();
	  unsigned NarrowBits = NarrowVT.getSizeInBits();
	  unsigned WideBits = Extract.getValueType().getSizeInBits();
	  if (WideBits % NarrowBits != 0)
	    return SDValue();

	  // Reinterpret the source vector with narrow elements and extract at the
	  // same (zero) index.
	  SDValue Vec = Extract.getOperand(0);
	  unsigned LaneCount = Vec.getValueType().getSizeInBits() / NarrowBits;
	  EVT NarrowVecVT =
	      EVT::getVectorVT(*DAG.getContext(), NarrowVT, LaneCount);
	  SDValue Recast = DAG.getBitcast(NarrowVecVT, Vec);

	  SDLoc DL(N);
	  SDValue NarrowExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, NarrowVT,
	                                      Recast, Extract.getOperand(1));
	  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NarrowExtract);
	}

	/// DAG combine for ISD::UINT_TO_FP. Rewrites the node as SINT_TO_FP either
	/// after zero-extending a vector input with sub-32-bit elements to i32
	/// elements, or directly when the input's sign bit is known to be zero.
	/// Returns an empty SDValue when neither case applies.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  SDLoc DL(N);

	  // UINT_TO_FP(vXi1)  -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
	  // UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  bool IsNarrowVector = SrcVT.isVector() && SrcVT.getScalarSizeInBits() < 32;
	  if (IsNarrowVector) {
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Src);

	    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
	    return DAG.getNode(ISD::SINT_TO_FP, DL, ResVT, Widened);
	  }

	  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
	  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
	  // the optimization here.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();

	  return DAG.getNode(ISD::SINT_TO_FP, DL, ResVT, Src);
	}

	/// DAG combine for ISD::SINT_TO_FP. Tries, in order:
	///  1. folding the conversion into the constant of a masked vector compare;
	///  2. sign-extending vector inputs with sub-32-bit elements to i32 elements;
	///  3. truncating inputs wider than 32 bits to i32 when the upper bits are
	///     known copies of the sign bit (only when the subtarget lacks DQI);
	///  4. building an x87 FILD from a simple one-use i64 load on 32-bit targets;
	///  5. replacing a truncated element extract with a bitcast + extract.
	/// Returns the replacement value, or an empty SDValue if nothing applied.
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// First try to optimize away the conversion entirely when it's
	// conditionally from a constant. Vectors only.
	if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
	return Res;

	// Now move on to more general possibilities.
	SDValue Op0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT InVT = Op0.getValueType();

	// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
	// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
	SDLoc dl(N);
	EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	InVT.getVectorNumElements());
	SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
	return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
	}

	// Without AVX512DQ we only support i64 to float scalar conversion. For both
	// vectors and scalars, see if we know that the upper bits are all the sign
	// bit, in which case we can truncate the input to i32 and convert from that.
	if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	unsigned BitWidth = InVT.getScalarSizeInBits();
	unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
	// At least BitWidth - 31 sign bits means bits [31, BitWidth) are all equal,
	// so truncating to 32 bits keeps the signed value.
	if (NumSignBits >= (BitWidth - 31)) {
	EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
	if (InVT.isVector())
	TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
	InVT.getVectorNumElements());
	SDLoc dl(N);
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
	return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
	}
	}

	// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	// a 32-bit target where SSE doesn't support i64->FP operations.
	if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
	Op0.getOpcode() == ISD::LOAD) {
	LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
	EVT LdVT = Ld->getValueType(0);

	// This transformation is not supported if the result type is f16 or f128.
	if (VT == MVT::f16 \|\| VT == MVT::f128)
	return SDValue();

	// If we have AVX512DQ we can use packed conversion instructions unless
	// the VT is f80.
	if (Subtarget.hasDQI() && VT != MVT::f80)
	return SDValue();

	// Only a non-volatile, non-extending, single-use scalar i64 load on a
	// 32-bit target qualifies.
	if (!Ld->isVolatile() && !VT.isVector() &&
	ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
	!Subtarget.is64Bit() && LdVT == MVT::i64) {
	SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
	SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
	// Redirect users of the load's result 1 to result 1 of the new node
	// (the chain, going by the variable naming).
	DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
	return FILDChain;
	}
	}

	if (SDValue V = combineToFPTruncExtElt(N, DAG))
	return V;

	return SDValue();
	}

	/// Returns true if any user of \p Flags might read the carry or overflow
	/// flag.
	///
	/// Only X86ISD::SETCC, SETCC_CARRY, BRCOND and CMOV users are understood; any
	/// other user makes the answer conservatively true. For understood users the
	/// condition code is inspected, and the A/AE/B/BE, O/NO and G/GE/L/LE codes
	/// count as needing carry or overflow.
	///
	/// \param Flags an i32 flags value (asserted).
	static bool needCarryOrOverflowFlag(SDValue Flags) {
	  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    // Dereference the use iterator to get the using node; it has to be held
	    // by pointer for the User-> accesses below.
	    SDNode *User = *UI;

	    X86::CondCode CC;
	    switch (User->getOpcode()) {
	    default:
	      // Be conservative.
	      return true;
	    case X86ISD::SETCC:
	    case X86ISD::SETCC_CARRY:
	      // Condition code is operand 0.
	      CC = static_cast<X86::CondCode>(User->getConstantOperandVal(0));
	      break;
	    case X86ISD::BRCOND:
	    case X86ISD::CMOV:
	      // Condition code is operand 2.
	      CC = static_cast<X86::CondCode>(User->getConstantOperandVal(2));
	      break;
	    }

	    switch (CC) {
	    default:
	      break;
	    case X86::COND_A:
	    case X86::COND_AE:
	    case X86::COND_B:
	    case X86::COND_BE:
	    case X86::COND_O:
	    case X86::COND_NO:
	    case X86::COND_G:
	    case X86::COND_GE:
	    case X86::COND_L:
	    case X86::COND_LE:
	      return true;
	    }
	  }

	  return false;
	}

	/// Returns true if every user of \p Flags is an X86ISD::SETCC, SETCC_CARRY,
	/// BRCOND or CMOV whose condition code is COND_E or COND_NE. Any other kind
	/// of user, or any other condition code, yields false.
	///
	/// \param Flags an i32 flags value (asserted).
	static bool onlyZeroFlagUsed(SDValue Flags) {
	  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

	  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
	       UI != UE; ++UI) {
	    // Dereference the use iterator to get the using node; it has to be held
	    // by pointer for the User-> accesses below.
	    SDNode *User = *UI;

	    // Index of the condition-code operand for each understood user kind.
	    unsigned CCOpNo;
	    switch (User->getOpcode()) {
	    default:
	      // Be conservative.
	      return false;
	    case X86ISD::SETCC:
	    case X86ISD::SETCC_CARRY:
	      CCOpNo = 0;
	      break;
	    case X86ISD::BRCOND:
	    case X86ISD::CMOV:
	      CCOpNo = 2;
	      break;
	    }

	    X86::CondCode CC =
	        static_cast<X86::CondCode>(User->getConstantOperandVal(CCOpNo));
	    if (CC != X86::COND_E && CC != X86::COND_NE)
	      return false;
	  }

	  return true;
	}

	/// DAG combine for a compare node \p N whose second operand is zero.
	///
	/// Two rewrites are attempted:
	///  - a one-use logical shift by a constant, whose flags are only tested with
	///    COND_E/COND_NE, becomes an AND with the mask of source bits that survive
	///    the shift, compared against zero;
	///  - a one-use truncate of a one-use AND/OR/XOR/ADD/SUB becomes the narrow
	///    X86ISD form of that operation on truncated inputs, and its flag result
	///    is returned (for AND, a CMP of the narrow result against zero instead).
	/// Returns an empty SDValue when neither applies.
	static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
	// Only handle test patterns.
	if (!isNullConstant(N->getOperand(1)))
	return SDValue();

	// If we have a CMP of a truncated binop, see if we can make a smaller binop
	// and use its flags directly.
	// TODO: Maybe we should try promoting compares that only use the zero flag
	// first if we can prove the upper bits with computeKnownBits?
	SDLoc dl(N);
	SDValue Op = N->getOperand(0);
	EVT VT = Op.getValueType();

	// If we have a constant logical shift that's only used in a comparison
	// against zero turn it into an equivalent AND. This allows turning it into
	// a TEST instruction later.
	if ((Op.getOpcode() == ISD::SRL \|\| Op.getOpcode() == ISD::SHL) &&
	Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
	onlyZeroFlagUsed(SDValue(N, 0))) {
	unsigned BitWidth = VT.getSizeInBits();
	const APInt &ShAmt = Op.getConstantOperandAPInt(1);
	if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
	// A right shift keeps the high (BitWidth - ShAmt) source bits, a left
	// shift keeps the low ones; the result is zero iff those bits are zero.
	unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
	APInt Mask = Op.getOpcode() == ISD::SRL
	? APInt::getHighBitsSet(BitWidth, MaskBits)
	: APInt::getLowBitsSet(BitWidth, MaskBits);
	// Only rewrite when the mask is representable as a signed 32-bit value
	// (presumably so it can be encoded as an immediate - confirm).
	if (Mask.isSignedIntN(32)) {
	Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
	DAG.getConstant(Mask, dl, VT));
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, VT));
	}
	}
	}

	// Look for a truncate with a single use.
	if (Op.getOpcode() != ISD::TRUNCATE \|\| !Op.hasOneUse())
	return SDValue();

	// From here on Op is the wide value feeding the truncate; VT remains the
	// narrow (truncated) type.
	Op = Op.getOperand(0);

	// Arithmetic op can only have one use.
	if (!Op.hasOneUse())
	return SDValue();

	unsigned NewOpc;
	switch (Op.getOpcode()) {
	default: return SDValue();
	case ISD::AND:
	// Skip and with constant. We have special handling for and with immediate
	// during isel to generate test instructions.
	if (isa<ConstantSDNode>(Op.getOperand(1)))
	return SDValue();
	NewOpc = X86ISD::AND;
	break;
	case ISD::OR: NewOpc = X86ISD::OR; break;
	case ISD::XOR: NewOpc = X86ISD::XOR; break;
	case ISD::ADD:
	// If the carry or overflow flag is used, we can't truncate.
	if (needCarryOrOverflowFlag(SDValue(N, 0)))
	return SDValue();
	NewOpc = X86ISD::ADD;
	break;
	case ISD::SUB:
	// If the carry or overflow flag is used, we can't truncate.
	if (needCarryOrOverflowFlag(SDValue(N, 0)))
	return SDValue();
	NewOpc = X86ISD::SUB;
	break;
	}

	// We found an op we can narrow. Truncate its inputs.
	SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
	SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));

	// Use a X86 specific opcode to avoid DAG combine messing with it.
	// The node produces two results: the narrow value and i32 flags.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);

	// For AND, keep a CMP so that we can match the test pattern.
	if (NewOpc == X86ISD::AND)
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, VT));

	// Return the flags.
	return Op.getValue(1);
	}

	/// DAG combine for X86ISD::ADD / X86ISD::SUB (asserted).
	///
	/// If result 1 (the flags) has no users, the node is replaced by the generic
	/// ISD::ADD/ISD::SUB merged with a constant-zero i32 in place of the flags.
	/// Otherwise, any already-existing generic ADD/SUB with the same operands (in
	/// either order) is replaced through DCI.CombineTo by this node's value
	/// result, negated for a SUB whose operands are swapped. In that second case
	/// the function itself returns an empty SDValue.
	static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI) {
	assert((X86ISD::ADD == N->getOpcode() \|\| X86ISD::SUB == N->getOpcode()) &&
	"Expected X86ISD::ADD or X86ISD::SUB");

	SDLoc DL(N);
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);
	MVT VT = LHS.getSimpleValueType();
	unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;

	// If we don't use the flag result, simplify back to a generic ADD/SUB.
	if (!N->hasAnyUseOfValue(1)) {
	SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
	return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
	}

	// Fold any similar generic ADD/SUB opcodes to reuse this node.
	// MatchGeneric looks up (without creating) a generic node with operands
	// (N0, N1); if one exists it is replaced by result 0 of N, or by 0 - result 0
	// when Negate is set.
	auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
	SDValue Ops[] = {N0, N1};
	SDVTList VTs = DAG.getVTList(N->getValueType(0));
	if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
	SDValue Op(N, 0);
	if (Negate)
	Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
	DCI.CombineTo(GenericAddSub, Op);
	}
	};
	MatchGeneric(LHS, RHS, false);
	// Swapped operands: identical for ADD, but RHS - LHS == -(LHS - RHS) for SUB.
	MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());

	return SDValue();
	}

	/// DAG combine for X86ISD::SBB. First tries to replace the carry input
	/// (operand 2) with the result of combineCarryThroughADD; failing that, folds
	///   SBB(SUB(X,Y), 0, Carry) -> SBB(X, Y, Carry)
	/// when the flag result of \p N has no users.
	/// Returns an empty SDValue when neither applies.
	static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
	  SDValue NewCarry = combineCarryThroughADD(N->getOperand(2), DAG);
	  if (NewCarry) {
	    // Rebuild the SBB on the same operands with the simplified carry input.
	    MVT ResVT = N->getSimpleValueType(0);
	    SDVTList ResultTys = DAG.getVTList(ResVT, MVT::i32);
	    SDValue LHS = N->getOperand(0);
	    SDValue RHS = N->getOperand(1);
	    return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultTys, LHS, RHS, NewCarry);
	  }

	  // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
	  // iff the flag result is dead.
	  SDValue Inner = N->getOperand(0);
	  bool CanFoldSub = Inner.getOpcode() == ISD::SUB &&
	                    isNullConstant(N->getOperand(1)) &&
	                    !N->hasAnyUseOfValue(1);
	  if (!CanFoldSub)
	    return SDValue();

	  SDValue X = Inner.getOperand(0);
	  SDValue Y = Inner.getOperand(1);
	  return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), X, Y,
	                     N->getOperand(2));
	}

	// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
	//
	// Two rewrites are attempted:
	//  - ADC 0, 0, Carry with an unused flag result becomes
	//    (SETCC_CARRY COND_B, Carry) & 1, installed via DCI.CombineTo together
	//    with a constant-zero replacement for the flag result;
	//  - otherwise the carry input (operand 2) is replaced by the result of
	//    combineCarryThroughADD when that helper returns a value.
	// Returns an empty SDValue when neither applies.
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI) {
	// If the LHS and RHS of the ADC node are zero, then it can't overflow and
	// the result is either zero or one (depending on the input carry bit).
	// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	if (X86::isZeroNode(N->getOperand(0)) &&
	X86::isZeroNode(N->getOperand(1)) &&
	// We don't have a good way to replace an EFLAGS use, so only do this when
	// dead right now.
	SDValue(N, 1).use_empty()) {
	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	// Placeholder for the (unused) flag result.
	SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
	SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL,
	MVT::i8),
	N->getOperand(2)),
	DAG.getConstant(1, DL, VT));
	return DCI.CombineTo(N, Res1, CarryOut);
	}

	if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
	MVT VT = N->getSimpleValueType(0);
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);
	return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
	N->getOperand(0), N->getOperand(1),
	Flags);
	}

	return SDValue();
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// \p N is treated as a subtract when its opcode is ISD::SUB and as an add
	/// otherwise. X names the non-setcc operand and Y the (possibly zero-extended)
	/// X86ISD::SETCC operand. Returns the replacement value, or an empty SDValue
	/// if no pattern matched.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	bool IsSub = N->getOpcode() == ISD::SUB;
	SDValue X = N->getOperand(0);
	SDValue Y = N->getOperand(1);

	// If this is an add, canonicalize a zext operand to the RHS.
	// TODO: Incomplete? What if both sides are zexts?
	if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	Y.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(X, Y);

	// Look through a one-use zext.
	bool PeekedThroughZext = false;
	if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	Y = Y.getOperand(0);
	PeekedThroughZext = true;
	}

	// If this is an add, canonicalize a setcc operand to the RHS.
	// TODO: Incomplete? What if both sides are setcc?
	// TODO: Should we allow peeking through a zext of the other operand?
	if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	Y.getOpcode() != X86ISD::SETCC)
	std::swap(X, Y);

	// Y must now be a single-use SETCC; its operand 0 is the condition code and
	// operand 1 the flags it tests.
	if (Y.getOpcode() != X86ISD::SETCC \|\| !Y.hasOneUse())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	if (ConstantX) {
	if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	// This is a complicated way to get -1 or 0 from the carry flag:
	// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Y.getOperand(1));
	}

	if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	SDValue EFLAGS = Y->getOperand(1);
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	// Swap the operands of a SUB, and we have the same pattern as above.
	// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
	SDValue NewSub = DAG.getNode(
	X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	// Pick the same result number of the new SUB that the SETCC used.
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	NewEFLAGS);
	}
	}
	}

	if (CC == X86::COND_B) {
	// X + SETB Z --> adc X, 0
	// X - SETB Z --> sbb X, 0
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
	DAG.getVTList(VT, MVT::i32), X,
	DAG.getConstant(0, DL, VT), Y.getOperand(1));
	}

	if (CC == X86::COND_A) {
	SDValue EFLAGS = Y->getOperand(1);
	// Try to convert COND_A into COND_B in an attempt to facilitate
	// materializing "setb reg".
	//
	// Do not flip "e > c", where "c" is a constant, because Cmp instruction
	// cannot take an immediate as its first operand.
	//
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
	DAG.getVTList(VT, MVT::i32), X,
	DAG.getConstant(0, DL, VT), NewEFLAGS);
	}
	}

	// Everything below handles only SETE/SETNE of a single-use integer compare
	// against zero.
	if (CC != X86::COND_E && CC != X86::COND_NE)
	return SDValue();

	SDValue Cmp = Y.getOperand(1);
	if (Cmp.getOpcode() != X86ISD::CMP \|\| !Cmp.hasOneUse() \|\|
	!X86::isZeroNode(Cmp.getOperand(1)) \|\|
	!Cmp.getOperand(0).getValueType().isInteger())
	return SDValue();

	// Z is the value being tested against zero.
	SDValue Z = Cmp.getOperand(0);
	EVT ZVT = Z.getValueType();

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	if (ConstantX) {
	// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	// fake operands:
	// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	SDValue Zero = DAG.getConstant(0, DL, ZVT);
	SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	}

	// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	// with fake operands:
	// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
	}
	}

	// (cmp Z, 1) sets the carry flag if Z is 0.
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

	// Add the flags type for ADC/SBB nodes.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	if (CC == X86::COND_NE)
	return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	DAG.getConstant(-1ULL, DL, VT), Cmp1);

	// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	DAG.getConstant(0, DL, VT), Cmp1);
	}

	/// Try to rewrite an add \p N with at least one ISD::MUL operand so that the
	/// multiply is done with X86ISD::VPMADDWD on operands truncated to i16
	/// elements.
	///
	/// Requires SSE2, a vector result with at least 8 elements, and a multiply
	/// that canReduceVMulWidth accepts with a mode other than MULU16. Each
	/// PMADDWD result has half as many i32 elements as \p N's type and is
	/// concatenated with zeros to restore the full width. If the other add operand
	/// is also a reducible multiply it gets the same treatment.
	/// Returns the new ADD, or an empty SDValue if the pattern does not match.
	static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (!Subtarget.hasSSE2())
	return SDValue();

	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);

	EVT VT = N->getValueType(0);

	// Do not use PMADD for non-vector types or for vectors with fewer than 8
	// elements.
	if (!VT.isVector() \|\| VT.getVectorNumElements() < 8)
	return SDValue();

	// Canonicalize so that Op0 is the multiply, if there is one.
	if (Op0.getOpcode() != ISD::MUL)
	std::swap(Op0, Op1);
	if (Op0.getOpcode() != ISD::MUL)
	return SDValue();

	ShrinkMode Mode;
	if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) \|\| Mode == MULU16)
	return SDValue();

	SDLoc DL(N);
	// Same element count as VT but with i16 elements (the multiply inputs).
	EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	VT.getVectorNumElements());
	// Half the element count of VT with i32 elements (the PMADDWD result).
	EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	VT.getVectorNumElements() / 2);

	// Madd vector size is half of the original vector size
	auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
	return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
	};

	auto BuildPMADDWD = [&](SDValue Mul) {
	// Shrink the operands of mul.
	SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
	SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));

	SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
	PMADDWDBuilder);
	// Fill the rest of the output with 0
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
	DAG.getConstant(0, DL, MAddVT));
	};

	Op0 = BuildPMADDWD(Op0);

	// It's possible that Op1 is also a mul we can reduce.
	if (Op1.getOpcode() == ISD::MUL &&
	canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
	Op1 = BuildPMADDWD(Op1);
	}

	return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
	}

	/// Try to rewrite an add \p N with an ISD::ABS operand matching
	/// detectZextAbsDiff into an add of a PSADBW-based value.
	///
	/// Requires SSE2 and a simple vector type with i32 elements whose size is at
	/// most four times the register size chosen below (128, 256 with AVX, 512
	/// with BWI registers). The PSADBW result is bitcast or truncated to i32
	/// elements and, if narrower than \p N's type, inserted into a zero vector of
	/// that type. If the other add operand is also a matching ABS it is rewritten
	/// the same way.
	/// Returns the new ADD, or an empty SDValue if the pattern does not match.
	static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (!Subtarget.hasSSE2())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);

	// TODO: There's nothing special about i32, any integer type above i16 should
	// work just as well.
	if (!VT.isVector() \|\| !VT.isSimple() \|\|
	!(VT.getVectorElementType() == MVT::i32))
	return SDValue();

	unsigned RegSize = 128;
	if (Subtarget.useBWIRegs())
	RegSize = 512;
	else if (Subtarget.hasAVX())
	RegSize = 256;

	// We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
	// TODO: We should be able to handle larger vectors by splitting them before
	// feeding them into several SADs, and then reducing over those.
	if (VT.getSizeInBits() / 4 > RegSize)
	return SDValue();

	// We know N is a reduction add, which means one of its operands is a phi.
	// To match SAD, we need the other operand to be a ABS.
	if (Op0.getOpcode() != ISD::ABS)
	std::swap(Op0, Op1);
	if (Op0.getOpcode() != ISD::ABS)
	return SDValue();

	auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
	// SAD pattern detected. Now build a SAD instruction and an addition for
	// reduction. Note that the number of elements of the result of SAD is less
	// than the number of elements of its input. Therefore, we could only update
	// part of elements in the reduction vector.
	SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);

	// The output of PSADBW is a vector of i64.
	// We need to turn the vector of i64 into a vector of i32.
	// If the reduction vector is at least as wide as the psadbw result, just
	// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
	// anyway.
	MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	if (VT.getSizeInBits() >= ResVT.getSizeInBits())
	Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
	else
	Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

	if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
	// Fill the upper elements with zero to match the add width.
	SDValue Zero = DAG.getConstant(0, DL, VT);
	Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
	DAG.getIntPtrConstant(0, DL));
	}

	return Sad;
	};

	// Check whether the ABS operand matches the abs-diff pattern recognized by
	// detectZextAbsDiff; on success SadOp0/SadOp1 receive the PSADBW inputs.
	SDValue SadOp0, SadOp1;
	if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
	return SDValue();

	Op0 = BuildPSADBW(SadOp0, SadOp1);

	// It's possible we have a sad on the other side too.
	if (Op1.getOpcode() == ISD::ABS &&
	detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
	Op1 = BuildPSADBW(SadOp0, SadOp1);
	}

	return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
	}

	/// Rewrite a vector increment/decrement to use an all-ones constant instead
	/// of a splat of 1:
	///   add X, <1, 1...> --> sub X, <-1, -1...>
	///   sub X, <1, 1...> --> add X, <-1, -1...>
	/// An all-ones vector can be materialized with a pcmpeq instruction, which is
	/// commonly recognized as an idiom (no register dependency), so it is
	/// better/smaller than loading a splat-1 constant.
	///
	/// \p N must be an ISD::ADD or ISD::SUB (asserted). Returns the rewritten
	/// node, or an empty SDValue when the type or second operand does not qualify.
	static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
	  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
	         "Unexpected opcode for increment/decrement transform");

	  // Pseudo-legality check: getOnesVector() expects a 128/256/512-bit vector,
	  // so for any other type bail out and wait for legalization.
	  EVT ResVT = N->getValueType(0);
	  bool HasSupportedWidth = ResVT.is128BitVector() ||
	                           ResVT.is256BitVector() ||
	                           ResVT.is512BitVector();
	  if (!HasSupportedWidth)
	    return SDValue();

	  // The second operand has to be a constant splat of exactly 1.
	  APInt Splat;
	  bool IsSplatOfOne =
	      isConstantSplat(N->getOperand(1), Splat) && Splat.isOneValue();
	  if (!IsSplatOfOne)
	    return SDValue();

	  SDValue MinusOnes = getOnesVector(ResVT, DAG, SDLoc(N));

	  // Flip the operation: adding 1 is subtracting -1 and vice versa.
	  unsigned FlippedOpc = ISD::ADD;
	  if (N->getOpcode() == ISD::ADD)
	    FlippedOpc = ISD::SUB;

	  return DAG.getNode(FlippedOpc, SDLoc(N), ResVT, N->getOperand(0),
	                     MinusOnes);
	}

	/// Try to replace an add of two BUILD_VECTORs, which together pick up the
	/// even/odd element pairs of a single MUL, with an X86ISD::VPMADDWD node.
	///
	/// \p Op0 and \p Op1 are the two operands being added and \p VT is the result
	/// type, which must be a vXi32 vector with a power-of-2 element count of at
	/// least 4. Requires SSE2.
	///
	/// \returns the value produced by SplitOpsAndApply with the VPMADDWD builder,
	/// or an empty SDValue if the pattern is not matched.
	static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
	const SDLoc &DL, EVT VT,
	const X86Subtarget &Subtarget) {
	// Example of pattern we try to detect:
	// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
	//(add (build_vector (extract_elt t, 0),
	// (extract_elt t, 2),
	// (extract_elt t, 4),
	// (extract_elt t, 6)),
	// (build_vector (extract_elt t, 1),
	// (extract_elt t, 3),
	// (extract_elt t, 5),
	// (extract_elt t, 7)))

	if (!Subtarget.hasSSE2())
	return SDValue();

	if (Op0.getOpcode() != ISD::BUILD_VECTOR \|\|
	Op1.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	if (!VT.isVector() \|\| VT.getVectorElementType() != MVT::i32 \|\|
	VT.getVectorNumElements() < 4 \|\|
	!isPowerOf2_32(VT.getVectorNumElements()))
	return SDValue();

	// Check if one of Op0,Op1 is of the form:
	// (build_vector (extract_elt Mul, 0),
	// (extract_elt Mul, 2),
	// (extract_elt Mul, 4),
	// ...
	// the other is of the form:
	// (build_vector (extract_elt Mul, 1),
	// (extract_elt Mul, 3),
	// (extract_elt Mul, 5),
	// ...
	// and identify Mul.
	SDValue Mul;
	// Two result lanes (i and i + 1) are checked per iteration; together they
	// must read the four consecutive Mul elements 2*i .. 2*i + 3.
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
	SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
	Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
	// TODO: Be more tolerant to undefs.
	if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();
	// Every extract index has to be a compile-time constant.
	auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
	auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
	auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
	auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
	if (!Const0L \|\| !Const1L \|\| !Const0H \|\| !Const1H)
	return SDValue();
	unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
	Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
	// Commutativity of mul allows factors of a product to reorder.
	if (Idx0L > Idx1L)
	std::swap(Idx0L, Idx1L);
	if (Idx0H > Idx1H)
	std::swap(Idx0H, Idx1H);
	// Commutativity of add allows pairs of factors to reorder.
	if (Idx0L > Idx0H) {
	std::swap(Idx0L, Idx0H);
	std::swap(Idx1L, Idx1H);
	}
	// After canonicalization the indices must be exactly 2i, 2i+1, 2i+2, 2i+3.
	if (Idx0L != 2 * i \|\| Idx1L != 2 * i + 1 \|\| Idx0H != 2 * i + 2 \|\|
	Idx1H != 2 * i + 3)
	return SDValue();
	if (!Mul) {
	// First time an extract_elt's source vector is visited. Must be a MUL
	// with 2X number of vector elements than the BUILD_VECTOR.
	// Both extracts must be from same MUL.
	Mul = Op0L->getOperand(0);
	if (Mul->getOpcode() != ISD::MUL \|\|
	Mul.getValueType().getVectorNumElements() != 2 * e)
	return SDValue();
	}
	// Check that the extract is from the same MUL previously seen.
	if (Mul != Op0L->getOperand(0) \|\| Mul != Op1L->getOperand(0) \|\|
	Mul != Op0H->getOperand(0) \|\| Mul != Op1H->getOperand(0))
	return SDValue();
	}

	// Check if the Mul source can be safely shrunk.
	// NOTE(review): the MULU16 mode is rejected, presumably because VPMADDWD
	// multiplies signed 16-bit values - confirm against canReduceVMulWidth.
	ShrinkMode Mode;
	if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) \|\| Mode == MULU16)
	return SDValue();

	// Builder invoked by SplitOpsAndApply: truncates both i32-element operands
	// to i16 elements and emits VPMADDWD producing half as many i32 elements.
	auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	// Shrink by adding truncate nodes and let DAGCombine fold with the
	// sources.
	EVT InVT = Ops[0].getValueType();
	assert(InVT.getScalarType() == MVT::i32 &&
	"Unexpected scalar element type");
	assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
	EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	InVT.getVectorNumElements() / 2);
	EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
	InVT.getVectorNumElements());
	return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
	DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
	DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
	};
	return SplitOpsAndApply(DAG, Subtarget, DL, VT,
	{ Mul.getOperand(0), Mul.getOperand(1) },
	PMADDBuilder);
	}

	// Attempt to turn this pattern into PMADDWD.
	// (add (mul (sext (build_vector)), (sext (build_vector))),
	// (mul (sext (build_vector)), (sext (build_vector))))
	//
	// N0 and N1 are the two operands being added; VT is the result type, which
	// must be a vXi32 vector with a power-of-2 element count of at least 4.
	// Requires SSE2. Returns an empty SDValue if the pattern is not matched.
	static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
	const SDLoc &DL, EVT VT,
	const X86Subtarget &Subtarget) {
	if (!Subtarget.hasSSE2())
	return SDValue();

	if (N0.getOpcode() != ISD::MUL \|\| N1.getOpcode() != ISD::MUL)
	return SDValue();

	if (!VT.isVector() \|\| VT.getVectorElementType() != MVT::i32 \|\|
	VT.getVectorNumElements() < 4 \|\|
	!isPowerOf2_32(VT.getVectorNumElements()))
	return SDValue();

	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	SDValue N10 = N1.getOperand(0);
	SDValue N11 = N1.getOperand(1);

	// All inputs need to be sign extends.
	// TODO: Support ZERO_EXTEND from known positive?
	if (N00.getOpcode() != ISD::SIGN_EXTEND \|\|
	N01.getOpcode() != ISD::SIGN_EXTEND \|\|
	N10.getOpcode() != ISD::SIGN_EXTEND \|\|
	N11.getOpcode() != ISD::SIGN_EXTEND)
	return SDValue();

	// Peek through the extends.
	N00 = N00.getOperand(0);
	N01 = N01.getOperand(0);
	N10 = N10.getOperand(0);
	N11 = N11.getOperand(0);

	// Must be extending from vXi16.
	EVT InVT = N00.getValueType();
	if (InVT.getVectorElementType() != MVT::i16 \|\| N01.getValueType() != InVT \|\|
	N10.getValueType() != InVT \|\| N11.getValueType() != InVT)
	return SDValue();

	// All inputs should be build_vectors.
	if (N00.getOpcode() != ISD::BUILD_VECTOR \|\|
	N01.getOpcode() != ISD::BUILD_VECTOR \|\|
	N10.getOpcode() != ISD::BUILD_VECTOR \|\|
	N11.getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	// For each element, we need to ensure we have an odd element from one vector
	// multiplied by the odd element of another vector and the even element from
	// one of the same vectors being multiplied by the even element from the
	// other vector. So we need to make sure for each element i, this operator
	// is being performed:
	// A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
	// In0/In1 are the two source vectors (A and B above); they are captured on
	// the first iteration and every later element must extract from the same
	// pair.
	SDValue In0, In1;
	for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
	SDValue N00Elt = N00.getOperand(i);
	SDValue N01Elt = N01.getOperand(i);
	SDValue N10Elt = N10.getOperand(i);
	SDValue N11Elt = N11.getOperand(i);
	// TODO: Be more tolerant to undefs.
	if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	return SDValue();
	// Every extract index has to be a compile-time constant.
	auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
	auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
	auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
	auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
	if (!ConstN00Elt \|\| !ConstN01Elt \|\| !ConstN10Elt \|\| !ConstN11Elt)
	return SDValue();
	unsigned IdxN00 = ConstN00Elt->getZExtValue();
	unsigned IdxN01 = ConstN01Elt->getZExtValue();
	unsigned IdxN10 = ConstN10Elt->getZExtValue();
	unsigned IdxN11 = ConstN11Elt->getZExtValue();
	// Add is commutative so indices can be reordered.
	if (IdxN00 > IdxN10) {
	std::swap(IdxN00, IdxN10);
	std::swap(IdxN01, IdxN11);
	}
	// N0 indices must be the even element. N1 indices must be the next odd
	// element.
	if (IdxN00 != 2 * i \|\| IdxN10 != 2 * i + 1 \|\|
	IdxN01 != 2 * i \|\| IdxN11 != 2 * i + 1)
	return SDValue();
	SDValue N00In = N00Elt.getOperand(0);
	SDValue N01In = N01Elt.getOperand(0);
	SDValue N10In = N10Elt.getOperand(0);
	SDValue N11In = N11Elt.getOperand(0);
	// First time we find an input capture it.
	if (!In0) {
	In0 = N00In;
	In1 = N01In;
	}
	// Mul is commutative so the input vectors can be in any order.
	// Canonicalize to make the compares easier.
	if (In0 != N00In)
	std::swap(N00In, N01In);
	if (In0 != N10In)
	std::swap(N10In, N11In);
	if (In0 != N00In \|\| In1 != N01In \|\| In0 != N10In \|\| In1 != N11In)
	return SDValue();
	}

	// Builder invoked by SplitOpsAndApply: the operands already have i16
	// elements, so VPMADDWD is emitted on them directly, producing half as many
	// i32 elements.
	// NOTE(review): nothing above compares the width of In0/In1 with VT -
	// confirm SplitOpsAndApply copes when they differ.
	auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	// Shrink by adding truncate nodes and let DAGCombine fold with the
	// sources.
	EVT OpVT = Ops[0].getValueType();
	assert(OpVT.getScalarType() == MVT::i16 &&
	"Unexpected scalar element type");
	assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
	EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	OpVT.getVectorNumElements() / 2);
	return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
	};
	return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
	PMADDBuilder);
	}

	/// Target combine for an add node \p N. The folds below are attempted in
	/// order and the first one that produces a value wins:
	///   1. loop SAD / MADD patterns (only when the node carries the
	///      vector-reduction flag),
	///   2. the two PMADDWD matchers,
	///   3. horizontal add formation,
	///   4. increment-by-splat-one rewrite,
	///   5. add-to-ADC/SBB.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  if (N->getFlags().hasVectorReduction()) {
	    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	      return Sad;
	    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	      return MAdd;
	  }

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  if (SDValue MAdd = matchPMADDWD(DAG, LHS, RHS, DL, VT, Subtarget))
	    return MAdd;
	  if (SDValue MAdd = matchPMADDWD_2(DAG, LHS, RHS, DL, VT, Subtarget))
	    return MAdd;

	  // Try to synthesize horizontal adds from adds of shuffles.
	  const bool IsHAddType = VT == MVT::v8i16 || VT == MVT::v4i32 ||
	                          VT == MVT::v16i16 || VT == MVT::v8i32;
	  if (IsHAddType && Subtarget.hasSSSE3() &&
	      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, true)) {
	    auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                          ArrayRef<SDValue> Ops) {
	      return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
	    };
	    return SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS}, HADDBuilder);
	  }

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Try to rewrite a subtract \p N of the form `umax(a,b) - b` or
	/// `a - umin(a,b)` as an unsigned saturating subtract (ISD::USUBSAT).
	///
	/// For v8i32 / v16i32 / v8i64 the operands are first narrowed to a smaller
	/// element type; that path is only taken when computeKnownBits proves that
	/// the left-hand operand has enough leading zero bits, and the result is then
	/// zero-extended/truncated back to the type of the left-hand operand.
	///
	/// \returns the replacement value, or an empty SDValue if the type/subtarget
	/// combination is not handled or the pattern is not matched.
	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// PSUBUS is supported, starting from SSE2, but truncation for v8i32
	// is only worth it with SSSE3 (PSHUFB).
	if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) &&
	!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 \|\| VT == MVT::v8i64)) &&
	!(Subtarget.hasAVX() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)) &&
	!(Subtarget.useBWIRegs() && (VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\|
	VT == MVT::v16i32 \|\| VT == MVT::v8i64)))
	return SDValue();

	SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).
	// TODO: Need to add IR canonicalization for this code.
	if (Op0.getOpcode() == ISD::UMAX) {
	// umax(a,b) - b: the subtrahend must be one of the umax operands; the
	// other umax operand becomes the saturating-sub LHS.
	SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)
	SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;
	else
	return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {
	// a - umin(a,b): the minuend must be one of the umin operands; the other
	// umin operand becomes the saturating-sub RHS.
	SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)
	SubusRHS = MinRHS;
	else if (MinRHS == Op0)
	SubusRHS = MinLHS;
	else
	return SDValue();
	} else
	return SDValue();

	// Builder invoked by SplitOpsAndApply to emit USUBSAT on the operand type.
	auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	ArrayRef<SDValue> Ops) {
	return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
	};

	// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	// special preprocessing in some cases.
	if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
	return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
	{ SubusLHS, SubusRHS }, USUBSATBuilder);

	// Special preprocessing case can be only applied
	// if the value was zero extended from 16 bit,
	// so we require first 16 bits to be zeros for 32 bit
	// values, or first 48 bits for 64 bit values.
	KnownBits Known = DAG.computeKnownBits(SubusLHS);
	unsigned NumZeros = Known.countMinLeadingZeros();
	if ((VT == MVT::v8i64 && NumZeros < 48) \|\| NumZeros < 16)
	return SDValue();

	// Pick the narrow type to perform the saturating subtract in. Only v16i32
	// reaches the else branch, where 24 or more known leading zeros select the
	// 8-bit element type.
	EVT ExtType = SubusLHS.getValueType();
	EVT ShrinkedType;
	if (VT == MVT::v8i32 \|\| VT == MVT::v8i64)
	ShrinkedType = MVT::v8i16;
	else
	ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

	// If SubusLHS is zero-extended, truncate SubusRHS to its
	// size: SubusRHS = umin(0xFFF.., SubusRHS).
	SDValue SaturationConst =
	DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
	ShrinkedType.getScalarSizeInBits()),
	SDLoc(SubusLHS), ExtType);
	SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
	SaturationConst);
	SDValue NewSubusLHS =
	DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
	SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
	SDValue Psubus =
	SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
	{ NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
	// Zero extend the result, it may be used somewhere as 32 bit,
	// if not zext and following trunc will shrink.
	return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
	}

	/// Target combine for a subtract node \p N. The folds below are attempted in
	/// order and the first one that produces a value wins:
	///   1. constant-LHS sub of a one-use xor-with-constant --> add of an
	///      inverted xor,
	///   2. horizontal sub formation,
	///   3. decrement-by-splat-one rewrite,
	///   4. unsigned saturating subtract formation,
	///   5. sub-to-ADC/SBB.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // X86 can't encode an immediate LHS of a sub. See if we can push the
	  // negation into a preceding instruction.
	  if (auto *LHSC = dyn_cast<ConstantSDNode>(LHS)) {
	    // If the RHS of the sub is a XOR with one use and a constant, invert the
	    // immediate. Then add one to the LHS of the sub so we can turn
	    // X-Y -> X+~Y+1, saving one register.
	    const bool IsOneUseXorWithImm = RHS->hasOneUse() &&
	                                    RHS.getOpcode() == ISD::XOR &&
	                                    isa<ConstantSDNode>(RHS.getOperand(1));
	    if (IsOneUseXorWithImm) {
	      const APInt &XorImm = RHS.getConstantOperandAPInt(1);
	      EVT OpVT = LHS.getValueType();
	      SDValue InvertedImm = DAG.getConstant(~XorImm, SDLoc(RHS), OpVT);
	      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(RHS), OpVT,
	                                   RHS.getOperand(0), InvertedImm);
	      SDValue LHSPlusOne =
	          DAG.getConstant(LHSC->getAPIntValue() + 1, SDLoc(N), OpVT);
	      return DAG.getNode(ISD::ADD, SDLoc(N), OpVT, NewXor, LHSPlusOne);
	    }
	  }

	  // Try to synthesize horizontal subs from subs of shuffles.
	  EVT VT = N->getValueType(0);
	  const bool IsHSubType = VT == MVT::v8i16 || VT == MVT::v4i32 ||
	                          VT == MVT::v16i16 || VT == MVT::v8i32;
	  if (IsHSubType && Subtarget.hasSSSE3() &&
	      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, false)) {
	    auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
	                          ArrayRef<SDValue> Ops) {
	      return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
	    };
	    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {LHS, RHS},
	                            HSUBBuilder);
	  }

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  // Try to create PSUBUS if SUB's argument is max/min
	  if (SDValue Subus = combineSubToSubus(N, DAG, Subtarget))
	    return Subus;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Fold a vector compare of a value against itself to a constant:
	/// X86ISD::PCMPEQ X, X yields the constant -1 and X86ISD::PCMPGT X, X yields
	/// the constant 0, both of the node's result type. Any other node is left
	/// alone (an empty SDValue is returned).
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // Only a compare whose two operands are the same value can be folded.
	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return DAG.getConstant(-1, DL, VT);
	  case X86ISD::PCMPGT:
	    return DAG.getConstant(0, DL, VT);
	  default:
	    return SDValue();
	  }
	}

	/// Helper that combines an array of subvector ops as if they were the operands
	/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
	/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
	///
	/// \p VT is the type of the concatenated result and \p Ops are the subvectors
	/// in order. AVX is asserted. \p DCI is not referenced by this body.
	///
	/// \returns the combined value, or an empty SDValue if no fold applies.
	static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
	ArrayRef<SDValue> Ops, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");

	// Concatenating only undefs gives undef.
	if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
	return DAG.getUNDEF(VT);

	// Concatenating only all-zeros build vectors gives a zero vector.
	if (llvm::all_of(Ops, [](SDValue Op) {
	return ISD::isBuildVectorAllZeros(Op.getNode());
	}))
	return getZeroVector(VT, Subtarget, DAG, DL);

	SDValue Op0 = Ops[0];

	// Fold subvector loads into one.
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
	bool Fast;
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	// Only merge when the target reports the wide access as allowed and fast.
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	*FirstLd->getMemOperand(), &Fast) &&
	Fast) {
	if (SDValue Ld =
	EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
	return Ld;
	}
	}

	// Repeated subvectors.
	if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
	// If this broadcast/subv_broadcast is inserted into both halves, use a
	// larger broadcast/subv_broadcast.
	if (Op0.getOpcode() == X86ISD::VBROADCAST \|\|
	Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
	return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));

	// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
	if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
	(Subtarget.hasAVX2() \|\| MayFoldLoad(Op0.getOperand(0))))
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	Op0.getOperand(0),
	DAG.getIntPtrConstant(0, DL)));

	// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
	if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	(Subtarget.hasAVX2() \|\|
	(VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
	Op0.getOperand(0).getValueType() == VT.getScalarType())
	return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
	}

	// True when every subvector is the same value as the first one (the same
	// predicate as the "Repeated subvectors" test above).
	bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });

	// Repeated opcode.
	// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
	// but it currently struggles with different vector widths.
	if (llvm::all_of(Ops, [Op0](SDValue Op) {
	return Op.getOpcode() == Op0.getOpcode();
	})) {
	unsigned NumOps = Ops.size();
	switch (Op0.getOpcode()) {
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFD:
	// Two 128-bit shuffles sharing the same immediate operand: shuffle the
	// concatenated sources once at 256 bits (needs AVX2 integer support).
	if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
	Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(Ops[i].getOperand(0));
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
	Op0.getOperand(1));
	}
	// Otherwise fall through and try the VPERMILPI form below.
	LLVM_FALLTHROUGH;
	case X86ISD::VPERMILPI:
	// TODO - add support for vXf64/vXi64 shuffles.
	// The sources are bitcast to v4f32, concatenated, permuted as v8f32 with
	// the shared immediate, and the result is bitcast back to VT.
	if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 \|\| VT == MVT::v8i32) &&
	Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
	SmallVector<SDValue, 2> Src;
	for (unsigned i = 0; i != NumOps; ++i)
	Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
	Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
	Op0.getOperand(1));
	return DAG.getBitcast(VT, Res);
	}
	break;
	case X86ISD::PACKUS:
	// Concatenate the two left-hand and the two right-hand inputs separately
	// and pack once at 256 bits.
	if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
	SmallVector<SDValue, 2> LHS, RHS;
	for (unsigned i = 0; i != NumOps; ++i) {
	LHS.push_back(Ops[i].getOperand(0));
	RHS.push_back(Ops[i].getOperand(1));
	}
	MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
	SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
	NumOps * SrcVT.getVectorNumElements());
	return DAG.getNode(Op0.getOpcode(), DL, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
	DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
	}
	break;
	}
	}

	// If we're inserting all zeros into the upper half, change this to
	// an insert into an all zeros vector. We will match this to a move
	// with implicit upper bit zeroing during isel.
	if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
	DAG.getIntPtrConstant(0, DL));

	return SDValue();
	}

	/// Target combine for a concat-vectors node \p N. Vectors of i1 are left
	/// untouched. When AVX is available and both the result type and the type of
	/// the first operand are legal, the operand list is handed to
	/// combineConcatVectorOps and its result (possibly empty) is returned;
	/// otherwise an empty SDValue is returned.
	static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = N->getOperand(0).getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Don't do anything for i1 vectors.
	  if (VT.getVectorElementType() == MVT::i1)
	    return SDValue();

	  // The shared helper requires AVX and works on simple (legal) types only.
	  if (!Subtarget.hasAVX() || !TLI.isTypeLegal(VT) || !TLI.isTypeLegal(SrcVT))
	    return SDValue();

	  SmallVector<SDValue, 4> SubVecs(N->op_begin(), N->op_end());
	  return combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), SubVecs, DAG, DCI,
	                                Subtarget);
	}

	/// Target combine for an insert-subvector node \p N, where operand 0 is the
	/// vector being inserted into (Vec), operand 1 is the inserted subvector
	/// (SubVec) and operand 2 is the constant insertion index.
	///
	/// Nothing is done before operation legalization. For i1 vectors only the
	/// undef/zero folds at the top are attempted.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applies.
	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT OpVT = N->getSimpleValueType(0);

	bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

	SDLoc dl(N);
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);

	uint64_t IdxVal = N->getConstantOperandVal(2);
	MVT SubVecVT = SubVec.getSimpleValueType();

	// Inserting undef into undef is undef.
	if (Vec.isUndef() && SubVec.isUndef())
	return DAG.getUNDEF(OpVT);

	// Inserting undefs/zeros into zeros/undefs is a zero vector.
	if ((Vec.isUndef() \|\| ISD::isBuildVectorAllZeros(Vec.getNode())) &&
	(SubVec.isUndef() \|\| ISD::isBuildVectorAllZeros(SubVec.getNode())))
	return getZeroVector(OpVT, Subtarget, DAG, dl);

	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// If we're inserting into a zero vector and then into a larger zero vector,
	// just insert into the larger zero vector directly.
	if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
	// The two insertion indices are added to get the index in the wide vector.
	uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl),
	SubVec.getOperand(1),
	DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
	}

	// If we're inserting into a zero vector and our input was extracted from an
	// insert into a zero vector of the same type and the extraction was at
	// least as large as the original insertion. Just insert the original
	// subvector into a zero vector.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
	SubVec.getConstantOperandAPInt(1) == 0 &&
	SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
	SDValue Ins = SubVec.getOperand(0);
	if (Ins.getConstantOperandAPInt(2) == 0 &&
	ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
	Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl),
	Ins.getOperand(1), N->getOperand(2));
	}
	}

	// Stop here if this is an i1 vector.
	if (IsI1Vector)
	return SDValue();

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subregister operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\| !Vec.isUndef())) {
	int ExtIdxVal = SubVec.getConstantOperandVal(1);
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion.
	// Mask values >= VecNumElts select from the second shuffle input, i.e. the
	// vector the subvector was extracted from.
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Match concat_vector style patterns.
	SmallVector<SDValue, 2> SubVectorOps;
	if (collectConcatOps(N, SubVectorOps))
	if (SDValue Fold =
	combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
	return Fold;

	// If we are inserting into both halves of the vector, the starting vector
	// should be undef. If it isn't, make it so. Only do this if the early insert
	// has no other uses.
	// The earlier insert must also have placed a subvector of exactly SubVec's
	// size at index 0, so that the two inserts together cover the whole vector.
	// TODO: Should this be a generic DAG combine?
	// TODO: Why doesn't SimplifyDemandedVectorElts catch this?
	if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
	Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
	isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
	+ Vec.getOperand(1).getValueSizeInBits() == SubVecVT.getSizeInBits() &&
	Vec.hasOneUse()) {
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
	Vec.getOperand(1), Vec.getOperand(2));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
	N->getOperand(2));
	}

	// If this is a broadcast insert into an upper undef, use a larger broadcast.
	if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
	return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));

	return SDValue();
	}

	/// If we are extracting a subvector of a vector select and the select condition
	/// is composed of concatenated vectors, try to narrow the select width. This
	/// is a common pattern for AVX1 integer code because 256-bit selects may be
	/// legal, but there is almost no integer math/logic available for 256-bit.
	/// This function should only be called with legal types (otherwise, the calls
	/// to get simple value types will assert).
	///
	/// \p Ext is the extract node: operand 0 is the (possibly bitcast) select and
	/// operand 1 is the constant extraction index. Only extraction to a 128-bit
	/// type from a select with a 256-bit or 512-bit condition is handled.
	///
	/// \returns the narrowed select bitcast to the extract's type, or an empty
	/// SDValue if the pattern is not matched.
	static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
	SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
	SmallVector<SDValue, 4> CatOps;
	if (Sel.getOpcode() != ISD::VSELECT \|\|
	!collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
	return SDValue();

	// Note: We assume simple value types because this should only be called with
	// legal operations/types.
	// TODO: This can be extended to handle extraction to 256-bits.
	MVT VT = Ext->getSimpleValueType(0);
	if (!VT.is128BitVector())
	return SDValue();

	MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
	if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
	return SDValue();

	// WideVT is the type the extract reads from (after any bitcast); SelVT is
	// the type of the select itself.
	MVT WideVT = Ext->getOperand(0).getSimpleValueType();
	MVT SelVT = Sel.getSimpleValueType();
	assert((SelVT.is256BitVector() \|\| SelVT.is512BitVector()) &&
	"Unexpected vector type with legal operations");

	// Rescale the extraction index from WideVT elements to SelVT elements.
	unsigned SelElts = SelVT.getVectorNumElements();
	unsigned CastedElts = WideVT.getVectorNumElements();
	unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
	if (SelElts % CastedElts == 0) {
	// The select has the same or more (narrower) elements than the extract
	// operand. The extraction index gets scaled by that factor.
	ExtIdx *= (SelElts / CastedElts);
	} else if (CastedElts % SelElts == 0) {
	// The select has less (wider) elements than the extract operand. Make sure
	// that the extraction index can be divided evenly.
	unsigned IndexDivisor = CastedElts / SelElts;
	if (ExtIdx % IndexDivisor != 0)
	return SDValue();
	ExtIdx /= IndexDivisor;
	} else {
	llvm_unreachable("Element count of simple vector types are not divisible?");
	}

	// The narrow select keeps SelVT's element type with proportionally fewer
	// elements; condition and both values are extracted at the rescaled index.
	unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
	unsigned NarrowElts = SelElts / NarrowingFactor;
	MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
	SDLoc DL(Ext);
	SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
	SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
	SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
	SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
	return DAG.getBitcast(VT, NarrowSel);
	}

	/// Target combine for an extract-subvector node \p N, where operand 0 is the
	/// source vector and operand 1 is the constant extraction index.
	///
	/// The AVX1-only and+not splitting is attempted first and may run before
	/// operation legalization; every other fold is skipped until after
	/// legalization. Nodes whose result type is not a simple type are ignored.
	///
	/// \returns the replacement value, or an empty SDValue if no fold applies.
	static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// For AVX1 only, if we are extracting from a 256-bit and+not (which will
	// eventually get combined/lowered into ANDNP) with a concatenated operand,
	// split the 'and' into 128-bit ops to avoid the concatenate and extract.
	// We let generic combining take over from there to simplify the
	// insert/extract and 'not'.
	// This pattern emerges during AVX1 legalization. We handle it before lowering
	// to avoid complications like splitting constant vector loads.

	// Capture the original wide type in the likely case that we need to bitcast
	// back to this type.
	if (!N->getValueType(0).isSimple())
	return SDValue();

	MVT VT = N->getSimpleValueType(0);
	EVT WideVecVT = N->getOperand(0).getValueType();
	SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
	TLI.isTypeLegal(WideVecVT) &&
	WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
	// True if V (looking through bitcasts) is a bitwise 'not' whose operand is
	// a CONCAT_VECTORS.
	auto isConcatenatedNot = [] (SDValue V) {
	V = peekThroughBitcasts(V);
	if (!isBitwiseNot(V))
	return false;
	SDValue NotOp = V->getOperand(0);
	return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
	};
	if (isConcatenatedNot(WideVec.getOperand(0)) \|\|
	isConcatenatedNot(WideVec.getOperand(1))) {
	// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
	SDValue Concat = split256IntArith(WideVec, DAG);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
	DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
	}
	}

	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue V = narrowExtractedVectorSelect(N, DAG))
	return V;

	SDValue InVec = N->getOperand(0);
	unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

	// Extracting from an all-zeros vector gives an all-zeros vector.
	if (ISD::isBuildVectorAllZeros(InVec.getNode()))
	return getZeroVector(VT, Subtarget, DAG, SDLoc(N));

	// Extracting from an all-ones vector gives all-ones; i1 element vectors are
	// built with getConstant(1) instead of getOnesVector.
	if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
	if (VT.getScalarType() == MVT::i1)
	return DAG.getConstant(1, SDLoc(N), VT);
	return getOnesVector(VT, DAG, SDLoc(N));
	}

	// Extracting from a BUILD_VECTOR: build a narrower vector from the slice of
	// operands starting at the extraction index.
	if (InVec.getOpcode() == ISD::BUILD_VECTOR)
	return DAG.getBuildVector(
	VT, SDLoc(N),
	InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));

	// Try to move vector bitcast after extract_subv by scaling extraction index:
	// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
	// TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
	if (InVec.getOpcode() == ISD::BITCAST &&
	InVec.getOperand(0).getValueType().isVector()) {
	SDValue SrcOp = InVec.getOperand(0);
	EVT SrcVT = SrcOp.getValueType();
	unsigned SrcNumElts = SrcVT.getVectorNumElements();
	unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
	// Only handled when the bitcast result has a whole multiple of the source
	// element count, and both the extracted element count and the index divide
	// evenly by that ratio.
	if ((DestNumElts % SrcNumElts) == 0) {
	unsigned DestSrcRatio = DestNumElts / SrcNumElts;
	if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
	unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
	EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
	SrcVT.getScalarType(), NewExtNumElts);
	if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
	TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
	unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
	SDLoc DL(N);
	SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
	SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
	SrcOp, NewIndex);
	return DAG.getBitcast(VT, NewExtract);
	}
	}
	}
	}

	// If we're extracting from a broadcast then we're better off just
	// broadcasting to the smaller type directly, assuming this is the only use.
	// As it's a broadcast we don't care about the extraction index.
	if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
	InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
	return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));

	// If we're extracting the lowest subvector and we're the only user,
	// we may be able to perform this with a smaller vector width.
	if (IdxVal == 0 && InVec.hasOneUse()) {
	unsigned InOpcode = InVec.getOpcode();
	if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
	// v2f64 CVTDQ2PD(v4i32).
	if (InOpcode == ISD::SINT_TO_FP &&
	InVec.getOperand(0).getValueType() == MVT::v4i32) {
	return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
	}
	// v2f64 CVTUDQ2PD(v4i32).
	if (InOpcode == ISD::UINT_TO_FP &&
	InVec.getOperand(0).getValueType() == MVT::v4i32) {
	return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
	}
	// v2f64 CVTPS2PD(v4f32).
	if (InOpcode == ISD::FP_EXTEND &&
	InVec.getOperand(0).getValueType() == MVT::v4f32) {
	return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
	}
	}
	// A 128-bit extract of an extend from a 128-bit source becomes the
	// corresponding *_EXTEND_VECTOR_INREG of that source.
	if ((InOpcode == ISD::ANY_EXTEND \|\|
	InOpcode == ISD::ANY_EXTEND_VECTOR_INREG \|\|
	InOpcode == ISD::ZERO_EXTEND \|\|
	InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG \|\|
	InOpcode == ISD::SIGN_EXTEND \|\|
	InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
	VT.is128BitVector() &&
	InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
	unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
	return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
	}
	// A select whose operands are all 256-bit: select on the low 128 bits of
	// each operand instead.
	if (InOpcode == ISD::VSELECT &&
	InVec.getOperand(0).getValueType().is256BitVector() &&
	InVec.getOperand(1).getValueType().is256BitVector() &&
	InVec.getOperand(2).getValueType().is256BitVector()) {
	SDLoc DL(N);
	SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
	SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
	SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
	return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
	}
	}

	return SDValue();
	}

	/// Combine a SCALAR_TO_VECTOR node \p N.
	///
	/// Three folds are attempted, each requiring the scalar operand to have a
	/// single use:
	///  - v1i1 (scalar_to_vector (and X, 1))  -> v1i1 (scalar_to_vector X)
	///  - v1i1 (scalar_to_vector (extract_vector_elt V, 0)) with V a vector of i1
	///    -> (extract_subvector V, 0)
	///  - v2i64 (scalar_to_vector (i64 any_extend X)) with X at most 32 bits wide
	///    -> bitcast of a v4i32 scalar_to_vector.
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
	  SDValue Scalar = N->getOperand(0);

	  // Every fold below needs the scalar to be used only by this node.
	  if (!Scalar.hasOneUse())
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  SDLoc dl(N);
	  const unsigned ScalarOpc = Scalar.getOpcode();

	  if (ResVT == MVT::v1i1) {
	    // Bypass an AND with 1. This occurs frequently in our masked scalar
	    // intrinsic code and our floating point select lowering with AVX512.
	    // TODO: SimplifyDemandedBits instead?
	    if (ScalarOpc == ISD::AND) {
	      auto *Mask = dyn_cast<ConstantSDNode>(Scalar.getOperand(1));
	      if (Mask && Mask->getAPIntValue().isOneValue())
	        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1,
	                           Scalar.getOperand(0));
	    }

	    // scalar_to_vector of element 0 of an i1 vector is an extract_subvector.
	    if (ScalarOpc == ISD::EXTRACT_VECTOR_ELT) {
	      SDValue Vec = Scalar.getOperand(0);
	      EVT VecVT = Vec.getValueType();
	      if (VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1) {
	        auto *Idx = dyn_cast<ConstantSDNode>(Scalar.getOperand(1));
	        if (Idx && Idx->isNullValue())
	          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Vec,
	                             Scalar.getOperand(1));
	      }
	    }
	  }

	  // Reduce v2i64 to v4i32 if we don't need the upper bits.
	  // TODO: Move to DAGCombine?
	  if (ResVT == MVT::v2i64 && ScalarOpc == ISD::ANY_EXTEND &&
	      Scalar.getValueType() == MVT::i64 &&
	      Scalar.getOperand(0).getScalarValueSizeInBits() <= 32) {
	    SDValue Narrow = DAG.getAnyExtOrTrunc(Scalar.getOperand(0), dl, MVT::i32);
	    SDValue NarrowVec =
	        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Narrow);
	    return DAG.getBitcast(ResVT, NarrowVec);
	  }

	  return SDValue();
	}

	/// Simplify PMULDQ and PMULUDQ operations.
	///
	/// Steps, in order: move a constant operand to the right-hand side, fold a
	/// multiply by an all-zeros build vector to that zero vector, rebuild the node
	/// from operands that only need their low 32 bits (of each 64-bit element),
	/// and finally run SimplifyDemandedBits on the node itself.
	///
	/// \returns the replacement value, \p N itself when it was simplified in
	/// place, or an empty SDValue when nothing changed.
	static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Canonicalize constant to RHS.
	  if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
	    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);

	  // Multiply by zero.
	  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
	    return RHS;

	  // Aggressively peek through ops to get at the demanded low bits.
	  APInt DemandedMask = APInt::getLowBitsSet(64, 32);
	  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
	  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
	  if (DemandedLHS || DemandedRHS)
	    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
	                       DemandedLHS ? DemandedLHS : LHS,
	                       DemandedRHS ? DemandedRHS : RHS);

	  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Combine an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG node \p N.
	///
	/// Tries, in order: merging a one-use non-volatile load operand into an
	/// extending load, collapsing a nested extend of the same opcode, and (on AVX,
	/// for non-sign extends) the recursive shuffle combiner. The last two are
	/// skipped entirely when ExperimentalVectorWideningLegalization is set.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
	                               TargetLowering::DAGCombinerInfo &DCI,
	                               const X86Subtarget &Subtarget) {
	  const unsigned ExtOpc = N->getOpcode();
	  EVT ResVT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Try to merge vector loads and extend_inreg to an extload.
	  const bool SrcIsFoldableLoad = !DCI.isBeforeLegalizeOps() &&
	                                 ISD::isNormalLoad(Src.getNode()) &&
	                                 Src.hasOneUse();
	  if (SrcIsFoldableLoad) {
	    auto *SrcLoad = cast<LoadSDNode>(Src);
	    if (!SrcLoad->isVolatile()) {
	      // Sign extends become SEXTLOAD; everything else becomes ZEXTLOAD.
	      ISD::LoadExtType LoadExt = ISD::ZEXTLOAD;
	      if (ExtOpc == ISD::SIGN_EXTEND_VECTOR_INREG)
	        LoadExt = ISD::SEXTLOAD;

	      // Memory type: source element type, result element count.
	      MVT EltVT = Src.getSimpleValueType().getVectorElementType();
	      EVT MemVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                   ResVT.getVectorNumElements());
	      if (TLI.isLoadExtLegal(LoadExt, ResVT, MemVT)) {
	        SDValue ExtLoad = DAG.getExtLoad(
	            LoadExt, SDLoc(N), ResVT, SrcLoad->getChain(),
	            SrcLoad->getBasePtr(), SrcLoad->getPointerInfo(), MemVT,
	            SrcLoad->getAlignment(), SrcLoad->getMemOperand()->getFlags());
	        // Re-route users of the old load's chain result to the new load.
	        DAG.ReplaceAllUsesOfValueWith(SDValue(SrcLoad, 1),
	                                      ExtLoad.getValue(1));
	        return ExtLoad;
	      }
	    }
	  }

	  // Disabling for widening legalization for now. We can enable if we find a
	  // case that needs it. Otherwise it can be deleted when we switch to
	  // widening legalization.
	  if (ExperimentalVectorWideningLegalization)
	    return SDValue();

	  // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
	  if (Src.getOpcode() == ExtOpc && TLI.isTypeLegal(ResVT) &&
	      TLI.isTypeLegal(Src.getOperand(0).getValueType()))
	    return DAG.getNode(ExtOpc, SDLoc(N), ResVT, Src.getOperand(0));

	  // Attempt to combine as a shuffle.
	  // TODO: SSE41 support
	  if (Subtarget.hasAVX() && ExtOpc != ISD::SIGN_EXTEND_VECTOR_INREG) {
	    SDValue Root(N, 0);
	    if (TLI.isTypeLegal(ResVT) && TLI.isTypeLegal(Src.getValueType())) {
	      SDValue Shuffled = combineX86ShufflesRecursively(Root, DAG, Subtarget);
	      if (Shuffled)
	        return Shuffled;
	    }
	  }

	  return SDValue();
	}

	/// Dispatch on the opcode of \p N to the matching X86 combine helper.
	///
	/// Each case simply forwards to one helper and returns its result. Opcodes
	/// without a case hit `default: break;` and fall out of the switch, in which
	/// case an empty SDValue is returned.
	///
	/// \param N   Node to combine.
	/// \param DCI Combiner state; its DAG member is the SelectionDAG passed to the
	///            helpers.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	// Vector construction, element/subvector insertion and extraction.
	case ISD::SCALAR_TO_VECTOR:
	return combineScalarToVector(N, DAG);
	case ISD::EXTRACT_VECTOR_ELT:
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case ISD::CONCAT_VECTORS:
	return combineConcatVectors(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::EXTRACT_SUBVECTOR:
	return combineExtractSubvector(N, DAG, DCI, Subtarget);
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	case X86ISD::CMP: return combineCMP(N, DAG);
	// Integer arithmetic, shifts and logic.
	case ISD::ADD: return combineAdd(N, DAG, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, Subtarget);
	case X86ISD::ADD:
	case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
	case X86ISD::SBB: return combineSBB(N, DAG);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL: return combineShiftLeft(N, DAG);
	case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
	case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
	// Loads and stores, plain and masked.
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
	// Floating point operations and int <-> fp conversions.
	case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
	case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::CVTSI2P:
	case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
	case X86ISD::CVTP2SI:
	case X86ISD::CVTP2UI:
	case X86ISD::CVTTP2SI:
	case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	// Extensions. ANY_EXTEND shares the ZERO_EXTEND helper.
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::ANY_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
	Subtarget);
	// Comparisons and branches.
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	// Target-specific vector pack, shift and insert nodes.
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
	case X86ISD::VSHL:
	case X86ISD::VSRA:
	case X86ISD::VSRL:
	return combineVectorShiftVar(N, DAG, DCI, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VBROADCAST:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::SHUF128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	// Fused multiply-add variants, including the generic ISD::FMA.
	case X86ISD::FMADD_RND:
	case X86ISD::FMSUB:
	case X86ISD::FMSUB_RND:
	case X86ISD::FNMADD:
	case X86ISD::FNMADD_RND:
	case X86ISD::FNMSUB:
	case X86ISD::FNMSUB_RND:
	case ISD::FMA: return combineFMA(N, DAG, Subtarget);
	case X86ISD::FMADDSUB_RND:
	case X86ISD::FMSUBADD_RND:
	case X86ISD::FMADDSUB:
	case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
	case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
	// Gather/scatter, both the generic and the X86-specific nodes.
	case X86ISD::MGATHER:
	case X86ISD::MSCATTER:
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	case X86ISD::PMULDQ:
	case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
	}

	// No helper was selected for this opcode.
	return SDValue();
	}

	/// Return true if performing operation \p Opc on type \p VT is desirable.
	///
	/// Illegal types are never desirable. Among legal types, vXi8 SHL, i8
	/// MUL/SHL, and the i16 operations listed in the switch below are rejected;
	/// every other legal type is accepted.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  if (!isTypeLegal(VT))
	    return false;

	  // There are no vXi8 shifts.
	  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
	    return false;

	  // TODO: Almost no 8-bit ops are desirable because they have no actual
	  // size/speed advantages vs. 32-bit ops, but they do have a major
	  // potential disadvantage by causing partial register stalls.
	  //
	  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
	  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
	  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
	  // check for a constant operand to the multiply.
	  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
	    return false;

	  // i16 instruction encodings are longer and some i16 instructions are slow,
	  // so those are not desirable.
	  if (VT == MVT::i16) {
	    switch (Opc) {
	    default:
	      break;
	    case ISD::LOAD:
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	    case ISD::SUB:
	    case ISD::ADD:
	    case ISD::MUL:
	    case ISD::AND:
	    case ISD::OR:
	    case ISD::XOR:
	      return false;
	    }
	  }

	  // Any legal type not explicitly accounted for above here is desirable.
	  return true;
	}

	/// Expand an indirect jump-table branch.
	///
	/// When the module carries a "cf-protection-branch" flag, an NT_BRIND node is
	/// emitted instead of the generic expansion so that instruction selection can
	/// turn it into a jmp with the NoTrack prefix. Otherwise the generic
	/// TargetLowering expansion is used.
	SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
	                                                  SDValue Value, SDValue Addr,
	                                                  SelectionDAG &DAG) const {
	  const Module *Mod = DAG.getMachineFunction().getMMI().getModule();

	  // No control-flow branch protection requested: nothing special to do.
	  if (!Mod->getModuleFlag("cf-protection-branch"))
	    return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);

	  // Branch protection is enabled, so the indirect branch needs the notrack
	  // prefix; NT_BRIND is the node that carries that requirement.
	  return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
	}

	/// Decide whether \p Op should be promoted to a wider type.
	///
	/// Only i16 operations and i8 multiplies by a constant are candidates.
	/// Promotion is declined when it could block folding a load into the
	/// operation, or folding a load/op/store (plain or atomic) sequence into a
	/// single read-modify-write.
	///
	/// \param Op  The operation being considered.
	/// \param PVT Set to MVT::i32 when the function returns true; left untouched
	///            otherwise.
	/// \returns true if the operation should be promoted.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	  EVT VT = Op.getValueType();
	  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
	                             isa<ConstantSDNode>(Op.getOperand(1));

	  // i16 is legal, but undesirable since i16 instruction encodings are longer
	  // and some i16 instructions are slow.
	  // 8-bit multiply-by-constant can usually be expanded to something cheaper
	  // using LEA and/or other ALU ops.
	  if (VT != MVT::i16 && !Is8BitMulByConstant)
	    return false;

	  // True when Op's only user is a normal store to the same base pointer that
	  // Load reads from.
	  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
	    if (!Op.hasOneUse())
	      return false;
	    SDNode *User = *Op->use_begin();
	    if (!ISD::isNormalStore(User))
	      return false;
	    auto *Ld = cast<LoadSDNode>(Load);
	    auto *St = cast<StoreSDNode>(User);
	    return Ld->getBasePtr() == St->getBasePtr();
	  };

	  // Same idea for a one-use ATOMIC_LOAD whose result feeds Op, with Op's only
	  // user being an ATOMIC_STORE to the same base pointer.
	  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
	    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
	      return false;
	    if (!Op.hasOneUse())
	      return false;
	    SDNode *User = *Op->use_begin();
	    if (User->getOpcode() != ISD::ATOMIC_STORE)
	      return false;
	    auto *Ld = cast<AtomicSDNode>(Load);
	    auto *St = cast<AtomicSDNode>(User);
	    return Ld->getBasePtr() == St->getBasePtr();
	  };

	  bool Commute = false;
	  switch (Op.getOpcode()) {
	  default: return false;
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    break;
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL: {
	    SDValue N0 = Op.getOperand(0);
	    // Look out for (store (shl (load), x)).
	    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
	      return false;
	    break;
	  }
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    Commute = true;
	    LLVM_FALLTHROUGH;
	  case ISD::SUB: {
	    SDValue N0 = Op.getOperand(0);
	    SDValue N1 = Op.getOperand(1);
	    // Avoid disabling potential load folding opportunities.
	    if (MayFoldLoad(N1) &&
	        (!Commute || !isa<ConstantSDNode>(N0) ||
	         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
	      return false;
	    if (MayFoldLoad(N0) &&
	        ((Commute && !isa<ConstantSDNode>(N1)) ||
	         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
	      return false;
	    if (IsFoldableAtomicRMW(N0, Op) ||
	        (Commute && IsFoldableAtomicRMW(N1, Op)))
	      return false;
	  }
	  }

	  PVT = MVT::i32;
	  return true;
	}

	/// Return true if a build_vector should be combined into a shuffle followed
	/// by a truncate.
	///
	/// The combine is rejected for 32-bit source elements, when AVX2 is not
	/// available, and when \p ShuffleMask crosses 128-bit lanes.
	///
	/// \param ShuffleMask Mask with one entry per element of \p SrcVT (asserted).
	/// \param SrcVT       Vector type being shuffled.
	/// \param TruncVT     Not consulted by this implementation.
	bool X86TargetLowering::isDesirableToCombineBuildVectorToShuffleTruncate(
	    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

	  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
	         "Element count mismatch");
	  assert(
	      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
	      "Shuffle Mask expected to be legal");

	  // For 32-bit elements VPERMD is better than shuffle+truncate.
	  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
	  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
	    return false;

	  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
	    return false;

	  return true;
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// Match \p S against \p Pieces, where consecutive pieces must be separated by
	/// at least one space or tab.
	///
	/// Leading blanks in \p S are ignored. After each piece, either the string
	/// ends or a blank must follow; a piece that merely prefixes a longer token is
	/// a mismatch.
	///
	/// \returns true if every piece matched and nothing but blanks remained.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  const char *const Blanks = " \t";

	  // Skip leading whitespace.
	  S = S.substr(S.find_first_not_of(Blanks));

	  for (StringRef Piece : Pieces) {
	    // The piece has to appear right here; strip it off when it does.
	    if (!S.consume_front(Piece))
	      return false;

	    // A non-blank character directly after the piece means we only matched a
	    // prefix of a longer token.
	    StringRef::size_type NextToken = S.find_first_not_of(Blanks);
	    if (NextToken == 0)
	      return false;

	    S = S.substr(NextToken);
	  }

	  return S.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Recognize inline-asm byte swap idioms and replace the call \p CI with a
	/// byte swap via IntrinsicLowering::LowerToByteSwap.
	///
	/// Only calls returning an integer whose width is a multiple of 16 are
	/// considered. The asm string is split on ';' and newlines; one-statement and
	/// three-statement forms are matched.
	///
	/// \returns the result of LowerToByteSwap when a pattern matched, false
	/// otherwise.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

	  const std::string &AsmStr = IA->getAsmString();

	  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	  if (!Ty || Ty->getBitWidth() % 16 != 0)
	    return false;

	  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
	  SmallVector<StringRef, 4> AsmPieces;
	  SplitString(AsmStr, AsmPieces, ";\n");

	  // True when the constraints following the leading "=r,0," (5 characters)
	  // are exactly the flag-register clobbers. Callers must have checked that
	  // prefix already.
	  auto ConstraintsClobberFlags = [IA]() {
	    StringRef ConstraintsStr = IA->getConstraintString();
	    SmallVector<StringRef, 4> Clobbers;
	    SplitString(ConstraintsStr.substr(5), Clobbers, ",");
	    array_pod_sort(Clobbers.begin(), Clobbers.end());
	    return clobbersFlagRegisters(Clobbers);
	  };

	  switch (AsmPieces.size()) {
	  default: return false;
	  case 1:
	    // FIXME: this should verify that we are targeting a 486 or better. If not,
	    // we will turn this bswap into something that will be lowered to logical
	    // ops instead of emitting the bswap asm. For now, we don't support 486 or
	    // lower so don't worry about this.
	    // bswap $0
	    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
	        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
	        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
	        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
	      // No need to check constraints, nothing other than the equivalent of
	      // "=r,0" would be valid here.
	      return IntrinsicLowering::LowerToByteSwap(CI);
	    }

	    // rorw $$8, ${0:w} --> llvm.bswap.i16
	    if (CI->getType()->isIntegerTy(16) &&
	        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
	         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
	      if (ConstraintsClobberFlags())
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }
	    break;
	  case 3:
	    // rorw $$8, ${0:w} / rorl $$16, $0 / rorw $$8, ${0:w} on a 32-bit value.
	    if (CI->getType()->isIntegerTy(32) &&
	        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
	        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
	        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
	      if (ConstraintsClobberFlags())
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }

	    if (CI->getType()->isIntegerTy(64)) {
	      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	      if (Constraints.size() >= 2 &&
	          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
	          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
	        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
	            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
	            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
	          return IntrinsicLowering::LowerToByteSwap(CI);
	      }
	    }
	    break;
	  }
	  return false;
	}

	/// Map a flag-output constraint of the form "{@cc<cond>}" to the X86
	/// condition code it names.
	///
	/// Synonymous spellings map to the same code (e.g. "c", "b" and "nae" all
	/// yield COND_B; "e" and "z" both yield COND_E).
	///
	/// \returns the condition code, or X86::COND_INVALID when \p Constraint is
	/// not one of the recognized strings.
	static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
	  X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
	                           .Case("{@cca}", X86::COND_A)
	                           .Case("{@ccae}", X86::COND_AE)
	                           .Case("{@ccb}", X86::COND_B)
	                           .Case("{@ccbe}", X86::COND_BE)
	                           .Case("{@ccc}", X86::COND_B)
	                           .Case("{@cce}", X86::COND_E)
	                           .Case("{@ccz}", X86::COND_E)
	                           .Case("{@ccg}", X86::COND_G)
	                           .Case("{@ccge}", X86::COND_GE)
	                           .Case("{@ccl}", X86::COND_L)
	                           .Case("{@ccle}", X86::COND_LE)
	                           .Case("{@ccna}", X86::COND_BE)
	                           .Case("{@ccnae}", X86::COND_B)
	                           .Case("{@ccnb}", X86::COND_AE)
	                           .Case("{@ccnbe}", X86::COND_A)
	                           .Case("{@ccnc}", X86::COND_AE)
	                           .Case("{@ccne}", X86::COND_NE)
	                           .Case("{@ccnz}", X86::COND_NE)
	                           .Case("{@ccng}", X86::COND_LE)
	                           .Case("{@ccnge}", X86::COND_L)
	                           .Case("{@ccnl}", X86::COND_GE)
	                           .Case("{@ccnle}", X86::COND_G)
	                           .Case("{@ccno}", X86::COND_NO)
	                           // "np" is "no parity": the negation of "p", so it
	                           // must not share COND_P with "{@ccp}" below.
	                           .Case("{@ccnp}", X86::COND_NP)
	                           .Case("{@ccns}", X86::COND_NS)
	                           .Case("{@cco}", X86::COND_O)
	                           .Case("{@ccp}", X86::COND_P)
	                           .Case("{@ccs}", X86::COND_S)
	                           .Default(X86::COND_INVALID);
	  return Cond;
	}

	/// Given a constraint letter, return the type of constraint for this target.
	///
	/// One-character constraints are classified by the first switch and
	/// two-character "Y?" constraints by the nested switch. Constraints of any
	/// other length are reported as C_Other when parseConstraintCode() recognizes
	/// them as a flag-output condition. Anything left unclassified is passed to
	/// TargetLowering::getConstraintType().
	///
	/// NOTE(review): the lines below that begin with '-' or '+' appear to be
	/// patch markers from the enclosing diff rather than C++ source. Read with
	/// the '-' lines removed and the '+' lines added, 'I', 'J', 'K', 'N', 'G',
	/// 'L' and 'M' return C_Immediate while 'C', 'e' and 'Z' return C_Other.
	/// Confirm against the applied revision.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'R':
	case 'q':
	case 'Q':
	case 'f':
	case 't':
	case 'u':
	case 'y':
	case 'x':
	case 'v':
	case 'Y':
	case 'l':
	case 'k': // AVX512 masking registers.
	return C_RegisterClass;
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'S':
	case 'D':
	case 'A':
	return C_Register;
	case 'I':
	case 'J':
	case 'K':
	- case 'L':
	- case 'M':
	case 'N':
	case 'G':
	+ case 'L':
	+ case 'M':
	+ return C_Immediate;
	case 'C':
	case 'e':
	case 'Z':
	return C_Other;
	default:
	break;
	}
	}
	else if (Constraint.size() == 2) {
	switch (Constraint[0]) {
	default:
	break;
	case 'Y':
	// Second character selects between single registers and register classes.
	switch (Constraint[1]) {
	default:
	break;
	case 'z':
	case '0':
	return C_Register;
	case 'i':
	case 'm':
	case 'k':
	case 't':
	case '2':
	return C_RegisterClass;
	}
	}
	} else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
	return C_Other;
	return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \param info       Operand description; info.CallOperandVal is the value
	///                   being matched.
	/// \param constraint Constraint string; dispatch is on its first character,
	///                   and for 'Y' on an optional second character.
	/// \returns CW_Default when there is no operand value, otherwise the weight
	///          selected for the constraint (CW_Invalid when the operand does
	///          not fit it).
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    // Falls into the integer-register check below, which may raise the
	    // generic weight to CW_SpecificReg for integer operands.
	    LLVM_FALLTHROUGH;
	  case 'R':
	  case 'q':
	  case 'Q':
	  case 'a':
	  case 'b':
	  case 'c':
	  case 'd':
	  case 'S':
	  case 'D':
	  case 'A':
	    if (type->isIntegerTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'f':
	  case 't':
	  case 'u':
	    if (type->isFloatingPointTy())
	      weight = CW_SpecificReg;
	    break;
	  case 'y':
	    if (type->isX86_MMXTy() && Subtarget.hasMMX())
	      weight = CW_SpecificReg;
	    break;
	  case 'Y': {
	    unsigned Size = StringRef(constraint).size();
	    // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
	    char NextChar = Size == 2 ? constraint[1] : 'i';
	    if (Size > 2)
	      break;
	    switch (NextChar) {
	    default:
	      return CW_Invalid;
	    // XMM0
	    case 'z':
	    case '0':
	      if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
	        return CW_SpecificReg;
	      return CW_Invalid;
	    // Conditional OpMask regs (AVX512)
	    case 'k':
	      if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	        return CW_Register;
	      return CW_Invalid;
	    // Any MMX reg
	    case 'm':
	      // NOTE(review): 'weight' still holds its initial CW_Invalid here, so
	      // both paths return CW_Invalid - confirm whether that is intended.
	      if (type->isX86_MMXTy() && Subtarget.hasMMX())
	        return weight;
	      return CW_Invalid;
	    // Any SSE reg when ISA >= SSE2, same as 'Y'
	    case 'i':
	    case 't':
	    case '2':
	      if (!Subtarget.hasSSE2())
	        return CW_Invalid;
	      break;
	    }
	    // Fall through (handle "Y" constraint).
	    LLVM_FALLTHROUGH;
	  }
	  case 'v':
	    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    LLVM_FALLTHROUGH;
	  case 'x':
	    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
	        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
	      weight = CW_Register;
	    break;
	  case 'k':
	    // Enable conditional vector operations using %k<#> registers.
	    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	      weight = CW_Register;
	    break;
	  case 'I': // Constant in [0, 31].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 31)
	        weight = CW_Constant;
	    }
	    break;
	  case 'J': // Constant in [0, 63].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 63)
	        weight = CW_Constant;
	    }
	    break;
	  case 'K': // Signed 8-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	        weight = CW_Constant;
	    }
	    break;
	  case 'L': // 0xff or 0xffff.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
	        weight = CW_Constant;
	    }
	    break;
	  case 'M': // Constant in [0, 3].
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 3)
	        weight = CW_Constant;
	    }
	    break;
	  case 'N': // Unsigned 8-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xff)
	        weight = CW_Constant;
	    }
	    break;
	  case 'G':
	  case 'C': // Any floating point constant.
	    if (isa<ConstantFP>(CallOperandVal)) {
	      weight = CW_Constant;
	    }
	    break;
	  case 'e': // Signed 32-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if ((C->getSExtValue() >= -0x80000000LL) &&
	          (C->getSExtValue() <= 0x7fffffffLL))
	        weight = CW_Constant;
	    }
	    break;
	  case 'Z': // Unsigned 32-bit constant.
	    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	      if (C->getZExtValue() <= 0xffffffff)
	        weight = CW_Constant;
	    }
	    break;
	  }
	  return weight;
	}

	/// Replace the match-anything 'X' constraint with a more specific one chosen
	/// from the type of the operand it applies to.
	///
	/// Floating point operands get "Y" when SSE2 is available, else "x" when
	/// SSE1 is available. Everything else is left to the generic TargetLowering
	/// implementation.
	const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  const bool IsFP = ConstraintVT.isFloatingPoint();

	  // Prefer the SSE2 register constraint, then the SSE1 one.
	  if (IsFP && Subtarget.hasSSE2())
	    return "Y";
	  if (IsFP && Subtarget.hasSSE1())
	    return "x";

	  // Non-FP, or FP without SSE: 'f' like normal targets.
	  return TargetLowering::LowerXConstraint(ConstraintVT);
	}

	/// Lower @cc targets via setcc.
	///
	/// \param Chain  Updated to the chain result of the EFLAGS copy, but only
	///               when an incoming glue value was present.
	/// \param Flag   Incoming glue, if any; replaced by the EFLAGS copy node.
	/// \param DL     Debug location for the created nodes.
	/// \param OpInfo Operand whose ConstraintCode names the condition and whose
	///               ConstraintVT is the result type.
	/// \returns the condition value zero-extended to OpInfo.ConstraintVT, or an
	///          empty SDValue when the constraint is not a flag-output code.
	///          Reports a fatal error when the result type is a vector, is not
	///          an integer, or is narrower than 8 bits.
	SDValue X86TargetLowering::LowerAsmOutputForConstraint(
	    SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
	    SelectionDAG &DAG) const {
	  X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
	  if (Cond == X86::COND_INVALID)
	    return SDValue();
	  // Check that return type is valid.
	  if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
	      OpInfo.ConstraintVT.getSizeInBits() < 8)
	    report_fatal_error("Flag output operand is of invalid type");

	  // Get EFLAGS register. Only update chain when copyfrom is glued.
	  if (Flag.getNode()) {
	    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
	    Chain = Flag.getValue(1);
	  } else
	    Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
	  // Extract CC code.
	  SDValue CC = getSETCC(Cond, Flag, DL, DAG);
	  // Zero-extend to the type of the output operand.
	  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);

	  return Result;
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Only single-character constraints are handled here. For the immediate
	/// constraints an operand that is not a constant in the accepted range is
	/// rejected by returning without touching \p Ops. Constraints with no case
	/// below, and 'i' operands that are not literal constants, are passed on to
	/// the generic TargetLowering implementation.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	                                                     std::string &Constraint,
	                                                     std::vector<SDValue>&Ops,
	                                                     SelectionDAG &DAG) const {
	  SDValue Result;

	  // Only support length 1 constraints for now.
	  if (Constraint.length() > 1) return;

	  // Lower Op as a target constant of its own type when it is a constant whose
	  // zero-extended value is at most Max. Returns false, leaving Result
	  // untouched, otherwise.
	  auto LowerUnsignedImmUpTo = [&](uint64_t Max) {
	    auto *C = dyn_cast<ConstantSDNode>(Op);
	    if (!C || C->getZExtValue() > Max)
	      return false;
	    Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                   Op.getValueType());
	    return true;
	  };

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default: break;
	  case 'I':
	    if (!LowerUnsignedImmUpTo(31))
	      return;
	    break;
	  case 'J':
	    if (!LowerUnsignedImmUpTo(63))
	      return;
	    break;
	  case 'K':
	    // Signed 8-bit value.
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (isInt<8>(C->getSExtValue())) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'L':
	    // 0xff, 0xffff, and on 64-bit subtargets also 0xffffffff.
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
	          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    return;
	  case 'M':
	    if (!LowerUnsignedImmUpTo(3))
	      return;
	    break;
	  case 'N':
	    if (!LowerUnsignedImmUpTo(255))
	      return;
	    break;
	  case 'O':
	    if (!LowerUnsignedImmUpTo(127))
	      return;
	    break;
	  case 'e': {
	    // 32-bit signed value
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	                                           C->getSExtValue())) {
	        // Widen to 64 bits here to get it sign extended.
	        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	        break;
	      }
	      // FIXME gcc accepts some relocatable values here too, but only in certain
	      // memory models; it's complicated.
	    }
	    return;
	  }
	  case 'Z': {
	    // 32-bit unsigned value
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	                                           C->getZExtValue())) {
	        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	                                       Op.getValueType());
	        break;
	      }
	    }
	    // FIXME gcc accepts some relocatable values here too, but only in certain
	    // memory models; it's complicated.
	    return;
	  }
	  case 'i': {
	    // Literal immediates are always ok.
	    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	      // i1 constants are extended according to the target's boolean contents;
	      // all other widths are sign extended.
	      bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
	      BooleanContent BCont = getBooleanContents(MVT::i64);
	      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
	                                    : ISD::SIGN_EXTEND;
	      int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
	                                                  : CST->getSExtValue();
	      Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
	      break;
	    }

	    // In any sort of PIC mode addresses need to be computed at runtime by
	    // adding in a register or some sort of table lookup. These can't
	    // be used as immediates.
	    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
	      return;

	    // If we are in non-pic codegen mode, we allow the address of a global (with
	    // an optional displacement) to be used with 'i'.
	    if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
	      // If we require an extra load to get this address, as in PIC mode, we
	      // can't accept it.
	      if (isGlobalStubReference(
	              Subtarget.classifyGlobalReference(GA->getGlobal())))
	        return;
	    }
	    break;
	  }
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }
	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variants (including LOW32_ADDR_ACCESS_RBP).
	static bool isGRClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
	         RC.hasSuperClassEq(&X86::GR16RegClass) ||
	         RC.hasSuperClassEq(&X86::GR32RegClass) ||
	         RC.hasSuperClassEq(&X86::GR64RegClass) ||
	         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variants.
	static bool isFRClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
	         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
	         RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	/// Check if \p RC is a mask register class.
	/// I.e., VK* or one of their variants.
	///
	/// \returns true when hasSuperClassEq() succeeds on \p RC for any of VK1,
	/// VK2, VK4, VK8, VK16, VK32 or VK64.
	static bool isVKClass(const TargetRegisterClass &RC) {
	  return RC.hasSuperClassEq(&X86::VK1RegClass) ||
	         RC.hasSuperClassEq(&X86::VK2RegClass) ||
	         RC.hasSuperClassEq(&X86::VK4RegClass) ||
	         RC.hasSuperClassEq(&X86::VK8RegClass) ||
	         RC.hasSuperClassEq(&X86::VK16RegClass) ||
	         RC.hasSuperClassEq(&X86::VK32RegClass) ||
	         RC.hasSuperClassEq(&X86::VK64RegClass);
	}

	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// 'A' means [ER]AX + [ER]DX.
	case 'A':
	if (Subtarget.is64Bit())
	return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);

	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	if (Subtarget.hasAVX512()) {
	if (VT == MVT::i1)
	return std::make_pair(0U, &X86::VK1RegClass);
	if (VT == MVT::i8)
	return std::make_pair(0U, &X86::VK8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::VK16RegClass);
	}
	if (Subtarget.hasBWI()) {
	if (VT == MVT::i32)
	return std::make_pair(0U, &X86::VK32RegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i64 \|\| VT == MVT::f64)
	return std::make_pair(0U, &X86::GR64RegClass);
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	return std::make_pair(0U, &X86::GR64RegClass);
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	if (VT == MVT::i32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	return std::make_pair(0U, &X86::RFP80RegClass);
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'Y': // SSE_REGS if SSE2 allowed
	if (!Subtarget.hasSSE2()) break;
	LLVM_FALLTHROUGH;
	case 'v':
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	// Vector types.
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	if (Subtarget.hasAVX())
	return std::make_pair(0U, &X86::VR256RegClass);
	break;
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	if (!Subtarget.hasAVX512()) break;
	if (VConstraint)
	return std::make_pair(0U, &X86::VR512RegClass);
	return std::make_pair(0U, &X86::VR512_0_15RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	switch (Constraint[1]) {
	default:
	break;
	case 'i':
	case 't':
	case '2':
	return getRegForInlineAsmConstraint(TRI, "Y", VT);
	case 'm':
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'z':
	case '0':
	if (!Subtarget.hasSSE1()) break;
	return std::make_pair(X86::XMM0, &X86::VR128RegClass);
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) {
	if (VT == MVT::i1)
	return std::make_pair(0U, &X86::VK1WMRegClass);
	if (VT == MVT::i8)
	return std::make_pair(0U, &X86::VK8WMRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::VK16WMRegClass);
	}
	if (Subtarget.hasBWI()) {
	if (VT == MVT::i32)
	return std::make_pair(0U, &X86::VK32WMRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	break;
	}
	}

	if (parseConstraintCode(Constraint) != X86::COND_INVALID)
	return std::make_pair(0U, &X86::GR32RegClass);

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map st(0) -> st(7) -> ST0
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' && Constraint[6] == '}') {
	// st(7) is not allocatable and thus not a member of RFP80. Return
	// singleton class in cases where we have a reference to it.
	if (Constraint[4] == '7')
	return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
	return std::make_pair(X86::FP0 + Constraint[4] - '0',
	&X86::RFP80RegClass);
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint))
	return std::make_pair(X86::FP0, &X86::RFP80RegClass);

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint))
	return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);

	// dirflag -> DF
	if (StringRef("{dirflag}").equals_lower(Constraint))
	return std::make_pair(X86::DF, &X86::DFCCRRegClass);

	// fpsr -> FPSW
	if (StringRef("{fpsr}").equals_lower(Constraint))
	return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);

	return Res;
	}

	// Make sure it isn't a register that requires 64-bit mode.
	if (!Subtarget.is64Bit() &&
	(isFRClass(Res.second) \|\| isGRClass(Res.second)) &&
	TRI->getEncodingValue(Res.first) >= 8) {
	// Register requires REX prefix, but we're in 32-bit mode.
	return std::make_pair(0, nullptr);
	}

	// Make sure it isn't a register that requires AVX512.
	if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
	TRI->getEncodingValue(Res.first) & 0x10) {
	// Register requires EVEX prefix.
	return std::make_pair(0, nullptr);
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	if (Size == 1) Size = 8;
	unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	bool is64Bit = Subtarget.is64Bit();
	const TargetRegisterClass *RC =
	Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
	: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
	: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
	: Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
	: nullptr;
	if (Size == 64 && !is64Bit) {
	// Model GCC's behavior here and select a fixed pair of 32-bit
	// registers.
	switch (DestReg) {
	case X86::RAX:
	return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
	case X86::RDX:
	return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
	case X86::RCX:
	return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
	case X86::RBX:
	return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
	case X86::RSI:
	return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
	case X86::RDI:
	return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
	case X86::RBP:
	return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
	default:
	return std::make_pair(0, nullptr);
	}
	}
	if (RC && RC->contains(DestReg))
	return std::make_pair(DestReg, RC);
	return Res;
	}
	// No register found/type mismatch.
	return std::make_pair(0, nullptr);
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32XRegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
	Res.second = &X86::VR128XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
	Res.second = &X86::VR256XRegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isVKClass(*Class)) {
	if (VT == MVT::i1)
	Res.second = &X86::VK1RegClass;
	else if (VT == MVT::i8)
	Res.second = &X86::VK8RegClass;
	else if (VT == MVT::i16)
	Res.second = &X86::VK16RegClass;
	else if (VT == MVT::i32)
	Res.second = &X86::VK32RegClass;
	else if (VT == MVT::i64)
	Res.second = &X86::VK64RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	return Res;
	}

	/// Estimate the extra cost of using the scaled addressing mode \p AM for an
	/// access of type \p Ty in address space \p AS.
	///
	/// \returns -1 if the addressing mode is not legal, 1 if it is legal and uses
	/// a non-zero scale, and 0 if it is legal with no scale.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                            const AddrMode &AM, Type *Ty,
	                                            unsigned AS) const {
	  // An illegal addressing mode has no meaningful cost.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // A scaling factor is never free. A folded indexed access of the form
	  //   inst (reg1, reg2, scale)
	  // occupies two allocations in the out-of-order engine, whereas the plain
	  //   inst (reg1)
	  // form needs only one. For example
	  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // needs one allocation for the load and one for the computation, while
	  //   vaddps (%rsi), %ymm0, %ymm1
	  // needs a single allocation, leaving more room for other operations and
	  // fewer micro operations to execute.
	  //
	  // Some X86 architectures are penalized further: for stores, the complex
	  // addressing mode forces the instruction onto the "load" ports rather
	  // than the dedicated "store" port. E.g., on Haswell:
	  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
	  //
	  // Scale stands for reg2 * scale, so charge 1 as soon as a second register
	  // is involved.
	  return AM.Scale != 0 ? 1 : 0;
	}

	/// Decide whether an integer division of type \p VT should be treated as
	/// cheap for a function carrying the attributes \p Attr.
	///
	/// \returns true only for non-vector types in functions marked MinSize.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // x86 has no vector integer division: a vector divide would have to be
	  // scalarized, so keeping it is a loss even for code size, while the
	  // replacement sequence can stay in vector form.
	  if (VT.isVector())
	    return false;

	  // Scalar integer division is expensive on x86, but the div instruction is
	  // usually smaller than the alternative sequence, so it is preferred when
	  // aggressively optimizing for size.
	  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	}

	/// Mark the function containing \p Entry as using split callee-saved
	/// registers. Does nothing unless the subtarget is 64-bit.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    // Record IsSplitCSR in the function's X86MachineFunctionInfo.
	    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
	    FuncInfo->setIsSplitCSR(true);
	  }
	}

	/// Insert explicit copies for the callee-saved registers that are handled
	/// via copy rather than via spill/reload.
	///
	/// For each register in the zero-terminated list returned by
	/// getCalleeSavedRegsViaCopy, a COPY into a fresh virtual register is emitted
	/// at the start of \p Entry, and a COPY back into the physical register is
	/// emitted before the first terminator of every block in \p Exits.
	///
	/// \param Entry Entry block of the function; the register is added as live-in.
	/// \param Exits Blocks that receive the copy-back instructions.
	void X86TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  // No list means there is nothing to copy.
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list through a pointer; a zero entry terminates it.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(
	        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
	        "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// \returns true when the subtarget is 64-bit, which is the only condition
	/// this hook checks before reporting swifterror support.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64BitTarget = Subtarget.is64Bit();
	  return Is64BitTarget;
	}

	/// Returns the name of the symbol used to emit stack probes or the empty
	/// string if not applicable.
	///
	/// An explicit "probe-stack" function attribute always wins and its string
	/// value is returned as-is. Otherwise a probe symbol is only chosen for
	/// Windows, non-MachO targets whose function is not marked
	/// "no-stack-arg-probe".
	StringRef
	X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // If the function specifically requests stack probes, emit them.
	  if (MF.getFunction().hasFnAttribute("probe-stack"))
	    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

	  // Generally, if we aren't on Windows, the platform ABI does not include
	  // support for stack probes, so don't emit them.
	  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
	      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
	    return "";

	  // We need a stack probe to conform to the Windows ABI. Choose the right
	  // symbol.
	  if (Subtarget.is64Bit())
	    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
	  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
	}
	Index: vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 351303)
	@@ -1,5743 +1,5752 @@
	//===- InstCombineCompares.cpp --------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the visitICmp and visitFCmp functions.
	//
	//===----------------------------------------------------------------------===//

	#include "InstCombineInternal.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/KnownBits.h"

	using namespace llvm;
	using namespace PatternMatch;

	#define DEBUG_TYPE "instcombine"

	// How many times is a select replaced by one of its operands?
	STATISTIC(NumSel, "Number of select opts");


	/// Store In1 + In2 into \p Result and report whether the addition overflowed.
	///
	/// \param IsSigned Selects sadd_ov (signed) over uadd_ov (unsigned, default).
	/// \returns true if the chosen addition overflowed for this type.
	static bool addWithOverflow(APInt &Result, const APInt &In1,
	                            const APInt &In2, bool IsSigned = false) {
	  bool DidOverflow;
	  Result = IsSigned ? In1.sadd_ov(In2, DidOverflow)
	                    : In1.uadd_ov(In2, DidOverflow);
	  return DidOverflow;
	}

	/// Store In1 - In2 into \p Result and report whether the subtraction
	/// overflowed.
	///
	/// \param IsSigned Selects ssub_ov (signed) over usub_ov (unsigned, default).
	/// \returns true if the chosen subtraction overflowed for this type.
	static bool subWithOverflow(APInt &Result, const APInt &In1,
	                            const APInt &In2, bool IsSigned = false) {
	  bool DidOverflow;
	  Result = IsSigned ? In1.ssub_ov(In2, DidOverflow)
	                    : In1.usub_ov(In2, DidOverflow);
	  return DidOverflow;
	}

	/// Given an icmp instruction, report whether at least one of its users is a
	/// BranchInst.
	static bool hasBranchUse(ICmpInst &I) {
	  bool FoundBranch = false;
	  for (auto *Usr : I.users()) {
	    if (isa<BranchInst>(Usr)) {
	      FoundBranch = true;
	      break;
	    }
	  }
	  return FoundBranch;
	}

	/// Given an exploded icmp (predicate \p Pred and constant \p RHS), decide
	/// whether the comparison only tests the sign bit of its left-hand side.
	///
	/// \p TrueIfSigned is written for every predicate handled below, even when
	/// the function returns false; it is left untouched for any other predicate.
	/// On a true return it tells whether the comparison yields true when the
	/// sign bit is set.
	static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
	                           bool &TrueIfSigned) {
	  bool OnlyChecksSign;
	  switch (Pred) {
	  default:
	    return false;
	  case ICmpInst::ICMP_SGT:
	    // LHS s> -1 holds exactly when the sign bit is clear.
	    TrueIfSigned = false;
	    return RHS.isAllOnesValue();
	  case ICmpInst::ICMP_SLT:
	    // LHS s< 0.
	    OnlyChecksSign = RHS.isNullValue();
	    break;
	  case ICmpInst::ICMP_SLE:
	    // LHS s<= -1.
	    OnlyChecksSign = RHS.isAllOnesValue();
	    break;
	  case ICmpInst::ICMP_UGT:
	    // LHS u> high-bit-mask - 1.
	    OnlyChecksSign = RHS.isMaxSignedValue();
	    break;
	  case ICmpInst::ICMP_UGE:
	    // LHS u>= high-bit-mask (2^7, 2^15, 2^31, etc).
	    OnlyChecksSign = RHS.isSignMask();
	    break;
	  }

	  // All remaining predicates are true when the sign bit is set.
	  TrueIfSigned = true;
	  return OnlyChecksSign;
	}

	/// Returns true if the exploded icmp (\p Pred, \p C) can be rewritten as a
	/// signed comparison against zero, updating \p Pred to the predicate of that
	/// rewritten form. The signedness of the comparison is preserved.
	///
	/// Handled forms: any relational signed predicate against 0 (unchanged),
	/// "s< 1" (becomes "s<= 0") and "s> -1" (becomes "s>= 0").
	/// TODO: Refactor with decomposeBitTestICmp()?
	static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
	  // Only signed predicates can qualify.
	  if (!ICmpInst::isSigned(Pred))
	    return false;

	  // Already a comparison with zero.
	  if (C.isNullValue())
	    return ICmpInst::isRelational(Pred);

	  // The one-value test takes precedence over the all-ones test, so a constant
	  // that satisfies both is only ever considered for the "s< 1" rewrite.
	  const bool IsOne = C.isOneValue();
	  if (IsOne && Pred == ICmpInst::ICMP_SLT) {
	    Pred = ICmpInst::ICMP_SLE;
	    return true;
	  }
	  if (!IsOne && C.isAllOnesValue() && Pred == ICmpInst::ICMP_SGT) {
	    Pred = ICmpInst::ICMP_SGE;
	    return true;
	  }

	  return false;
	}

	/// Given a signed integer type and a set of known zero and one bits, compute
	/// the maximum and minimum values that could have the specified known zero and
	/// known one bits, returning them in Min/Max.
	///
	/// \p Min and \p Max must already have the same bit width as \p Known.
	/// TODO: Move to method on KnownBits struct?
	static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known,
	                                                   APInt &Min, APInt &Max) {
	  assert(Known.getBitWidth() == Min.getBitWidth() &&
	         Known.getBitWidth() == Max.getBitWidth() &&
	         "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
	  APInt UnknownBits = ~(Known.Zero|Known.One);

	  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
	  // bit if it is unknown.
	  Min = Known.One;
	  // The maximum value is when all unknown bits are ones, EXCEPT for the sign
	  // bit if it is unknown.
	  Max = Known.One|UnknownBits;

	  if (UnknownBits.isNegative()) { // Sign bit is unknown
	    Min.setSignBit();
	    Max.clearSignBit();
	  }
	}

	/// Given an unsigned integer type and a set of known zero and one bits, compute
	/// the maximum and minimum values that could have the specified known zero and
	/// known one bits, returning them in Min/Max.
	///
	/// \p Min and \p Max must already have the same bit width as \p Known.
	/// TODO: Move to method on KnownBits struct?
	static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known,
	                                                     APInt &Min, APInt &Max) {
	  assert(Known.getBitWidth() == Min.getBitWidth() &&
	         Known.getBitWidth() == Max.getBitWidth() &&
	         "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
	  APInt UnknownBits = ~(Known.Zero|Known.One);

	  // The minimum value is when the unknown bits are all zeros.
	  Min = Known.One;
	  // The maximum value is when the unknown bits are all ones.
	  Max = Known.One|UnknownBits;
	}

	/// This is called when we see this pattern:
	///   cmp pred (load (gep GV, ...)), cmpcst
	/// where GV is a global variable with a constant initializer. Try to simplify
	/// this into some simple computation that does not need the load. For example
	/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
	///
	/// If AndCst is non-null, then the loaded value is masked with that constant
	/// before doing the comparison. This handles cases like "A[i]&4 == 0".
	///
	/// \param GEP    The address computation feeding the load; must have the form
	///               "GEP GV, 0, i {, constant indices}" with a variable i.
	/// \param GV     The global whose constant array initializer is scanned.
	/// \param ICI    The comparison being folded; operand 1 is the constant RHS.
	/// \param AndCst Optional mask applied to each element before comparing.
	/// \returns The replacement instruction, or nullptr if no fold applies.
	Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
	                                                        GlobalVariable *GV,
	                                                        CmpInst &ICI,
	                                                        ConstantInt *AndCst) {
	  Constant *Init = GV->getInitializer();
	  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
	    return nullptr;

	  uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
	  // Don't blow up on huge arrays.
	  if (ArrayElementCount > MaxArraySizeForCombine)
	    return nullptr;

	  // There are many forms of this optimization we can handle, for now, just do
	  // the simple index into a single-dimensional array.
	  //
	  // Require: GEP GV, 0, i {{, constant indices}}
	  if (GEP->getNumOperands() < 3 ||
	      !isa<ConstantInt>(GEP->getOperand(1)) ||
	      !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
	      isa<Constant>(GEP->getOperand(2)))
	    return nullptr;

	  // Check that indices after the variable are constants and in-range for the
	  // type they index.  Collect the indices.  This is typically for arrays of
	  // structs.
	  SmallVector<unsigned, 4> LaterIndices;

	  Type *EltTy = Init->getType()->getArrayElementType();
	  for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
	    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
	    if (!Idx) return nullptr;  // Variable index.

	    uint64_t IdxVal = Idx->getZExtValue();
	    if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.

	    if (StructType *STy = dyn_cast<StructType>(EltTy))
	      EltTy = STy->getElementType(IdxVal);
	    else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
	      if (IdxVal >= ATy->getNumElements()) return nullptr;
	      EltTy = ATy->getElementType();
	    } else {
	      return nullptr; // Unknown type.
	    }

	    LaterIndices.push_back(IdxVal);
	  }

	  enum { Overdefined = -3, Undefined = -2 };

	  // Variables for our state machines.

	  // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
	  // "i == 47 | i == 87", where 47 is the first index the condition is true for,
	  // and 87 is the second (and last) index.  FirstTrueElement is -2 when
	  // undefined, otherwise set to the first true element.  SecondTrueElement is
	  // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
	  int FirstTrueElement = Undefined, SecondTrueElement = Undefined;

	  // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
	  // form "i != 47 & i != 87".  Same state transitions as for true elements.
	  int FirstFalseElement = Undefined, SecondFalseElement = Undefined;

	  /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
	  /// define a state machine that triggers for ranges of values that the index
	  /// is true or false for.  This triggers on things like "abbbbc"[i] == 'b'.
	  /// This is -2 when undefined, -3 when overdefined, and otherwise the last
	  /// index in the range (inclusive).  We use -2 for undefined here because we
	  /// use relative comparisons and don't want 0-1 to match -1.
	  int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;

	  // MagicBitvector - This is a magic bitvector where we set a bit if the
	  // comparison is true for element 'i'.  If there are 64 elements or less in
	  // the array, this will fully represent all the comparison results.
	  uint64_t MagicBitvector = 0;

	  // Scan the array and see if one of our patterns matches.
	  Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
	  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
	    Constant *Elt = Init->getAggregateElement(i);
	    if (!Elt) return nullptr;

	    // If this is indexing an array of structures, get the structure element.
	    if (!LaterIndices.empty())
	      Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);

	    // If the element is masked, handle it.
	    if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);

	    // Find out if the comparison would be true or false for the i'th element.
	    Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
	                                                  CompareRHS, DL, &TLI);
	    // If the result is undef for this element, ignore it.
	    if (isa<UndefValue>(C)) {
	      // Extend range state machines to cover this element in case there is an
	      // undef in the middle of the range.
	      if (TrueRangeEnd == (int)i-1)
	        TrueRangeEnd = i;
	      if (FalseRangeEnd == (int)i-1)
	        FalseRangeEnd = i;
	      continue;
	    }

	    // If we can't compute the result for any of the elements, we have to give
	    // up evaluating the entire conditional.
	    if (!isa<ConstantInt>(C)) return nullptr;

	    // Otherwise, we know if the comparison is true or false for this element,
	    // update our state machines.
	    bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();

	    // State machine for single/double/range index comparison.
	    if (IsTrueForElt) {
	      // Update the TrueElement state machine.
	      if (FirstTrueElement == Undefined)
	        FirstTrueElement = TrueRangeEnd = i;  // First true element.
	      else {
	        // Update double-compare state machine.
	        if (SecondTrueElement == Undefined)
	          SecondTrueElement = i;
	        else
	          SecondTrueElement = Overdefined;

	        // Update range state machine.
	        if (TrueRangeEnd == (int)i-1)
	          TrueRangeEnd = i;
	        else
	          TrueRangeEnd = Overdefined;
	      }
	    } else {
	      // Update the FalseElement state machine.
	      if (FirstFalseElement == Undefined)
	        FirstFalseElement = FalseRangeEnd = i; // First false element.
	      else {
	        // Update double-compare state machine.
	        if (SecondFalseElement == Undefined)
	          SecondFalseElement = i;
	        else
	          SecondFalseElement = Overdefined;

	        // Update range state machine.
	        if (FalseRangeEnd == (int)i-1)
	          FalseRangeEnd = i;
	        else
	          FalseRangeEnd = Overdefined;
	      }
	    }

	    // If this element is in range, update our magic bitvector.
	    if (i < 64 && IsTrueForElt)
	      MagicBitvector |= 1ULL << i;

	    // If all of our states become overdefined, bail out early.  Since the
	    // predicate is expensive, only check it every 8 elements.  This is only
	    // really useful for really huge arrays.
	    // NOTE(review): (i & 8) == 0 holds for eight consecutive indices out of
	    // every sixteen rather than once every eight elements - confirm intent
	    // before changing, since it affects when the bail-out fires.
	    if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
	        SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
	        FalseRangeEnd == Overdefined)
	      return nullptr;
	  }

	  // Now that we've scanned the entire array, emit our new comparison(s).  We
	  // order the state machines in complexity of the generated code.
	  Value *Idx = GEP->getOperand(2);

	  // If the index is larger than the pointer size of the target, truncate the
	  // index down like the GEP would do implicitly.  We don't have to do this for
	  // an inbounds GEP because the index can't be out of range.
	  if (!GEP->isInBounds()) {
	    Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
	    unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
	    if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
	      Idx = Builder.CreateTrunc(Idx, IntPtrTy);
	  }

	  // If the comparison is only true for one or two elements, emit direct
	  // comparisons.
	  if (SecondTrueElement != Overdefined) {
	    // None true -> false.
	    if (FirstTrueElement == Undefined)
	      return replaceInstUsesWith(ICI, Builder.getFalse());

	    Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);

	    // True for one element -> 'i == 47'.
	    if (SecondTrueElement == Undefined)
	      return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);

	    // True for two elements -> 'i == 47 | i == 72'.
	    Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
	    Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
	    Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
	    return BinaryOperator::CreateOr(C1, C2);
	  }

	  // If the comparison is only false for one or two elements, emit direct
	  // comparisons.
	  if (SecondFalseElement != Overdefined) {
	    // None false -> true.
	    if (FirstFalseElement == Undefined)
	      return replaceInstUsesWith(ICI, Builder.getTrue());

	    Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);

	    // False for one element -> 'i != 47'.
	    if (SecondFalseElement == Undefined)
	      return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);

	    // False for two elements -> 'i != 47 & i != 72'.
	    Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
	    Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
	    Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
	    return BinaryOperator::CreateAnd(C1, C2);
	  }

	  // If the comparison can be replaced with a range comparison for the elements
	  // where it is true, emit the range check.
	  if (TrueRangeEnd != Overdefined) {
	    assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");

	    // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
	    if (FirstTrueElement) {
	      Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
	      Idx = Builder.CreateAdd(Idx, Offs);
	    }

	    Value *End = ConstantInt::get(Idx->getType(),
	                                  TrueRangeEnd-FirstTrueElement+1);
	    return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
	  }

	  // False range check.
	  if (FalseRangeEnd != Overdefined) {
	    assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
	    // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
	    if (FirstFalseElement) {
	      Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
	      Idx = Builder.CreateAdd(Idx, Offs);
	    }

	    Value *End = ConstantInt::get(Idx->getType(),
	                                  FalseRangeEnd-FirstFalseElement);
	    return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
	  }

	  // If a magic bitvector captures the entire comparison state
	  // of this load, replace it with computation that does:
	  //   ((magic_cst >> i) & 1) != 0
	  {
	    Type *Ty = nullptr;

	    // Look for an appropriate type:
	    // - The type of Idx if the magic fits
	    // - The smallest fitting legal type
	    if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
	      Ty = Idx->getType();
	    else
	      Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);

	    if (Ty) {
	      Value *V = Builder.CreateIntCast(Idx, Ty, false);
	      V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
	      V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
	      return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
	    }
	  }

	  return nullptr;
	}

	/// Return a value that can be used to compare the offset implied by a GEP to
	/// zero. For example, if we have &A[i], we want to return 'i' for
	/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
	/// are involved. The above expression would also be legal to codegen as
	/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
	/// This latter form is less amenable to optimization though, and we are allowed
	/// to generate the first by knowing that pointer arithmetic doesn't overflow.
	///
	/// Only GEPs with exactly one non-constant index are handled, and only when
	/// the accumulated constant byte offset is an exact multiple of the scale of
	/// that index.
	///
	/// \param GEP the GEP whose implied offset is being compared against zero.
	/// \param IC  combiner whose Builder is used to emit any trunc/cast/add.
	/// \param DL  data layout used for struct field offsets and alloc sizes.
	///
	/// If we can't emit an optimized form for this expression, this returns null.
	///
	// NOTE(review): the '*' declarators on the return type and on GEP look to
	// have been dropped from this declaration (the body uses GEP-> and returns
	// nullptr) -- confirm against the upstream source.
	static Value evaluateGEPOffsetExpression(User GEP, InstCombiner &IC,
	const DataLayout &DL) {
	gep_type_iterator GTI = gep_type_begin(GEP);

	// Check to see if this gep only has a single variable index. If so, and if
	// any constant indices are a multiple of its scale, then we can compute this
	// in terms of the scale of the variable index. For example, if the GEP
	// implies an offset of "12 + i*4", then we can codegen this as "3 + i",
	// because the expression will cross zero at the same point.
	unsigned i, e = GEP->getNumOperands();
	int64_t Offset = 0;
	// First pass: accumulate constant indices until the first variable index.
	// Operand 0 is the base pointer, so indices start at operand 1.
	for (i = 1; i != e; ++i, ++GTI) {
	if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
	// Compute the aggregate offset of constant indices.
	if (CI->isZero()) continue;

	// Handle a struct index, which adds its field offset to the pointer.
	if (StructType *STy = GTI.getStructTypeOrNull()) {
	Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
	} else {
	// Non-struct index: scale the (signed) index by the element alloc size.
	uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
	Offset += Size*CI->getSExtValue();
	}
	} else {
	// Found our variable index.
	break;
	}
	}

	// If there are no variable indices, we must have a constant offset, just
	// evaluate it the general way.
	if (i == e) return nullptr;

	Value *VariableIdx = GEP->getOperand(i);
	// Determine the scale factor of the variable element. For example, this is
	// 4 if the variable index is into an array of i32.
	uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());

	// Verify that there are no other variable indices. If so, emit the hard way.
	// Second pass: every remaining index must be constant; keep accumulating
	// their contribution into Offset.
	for (++i, ++GTI; i != e; ++i, ++GTI) {
	ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
	if (!CI) return nullptr;

	// Compute the aggregate offset of constant indices.
	if (CI->isZero()) continue;

	// Handle a struct index, which adds its field offset to the pointer.
	if (StructType *STy = GTI.getStructTypeOrNull()) {
	Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
	} else {
	uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
	Offset += Size*CI->getSExtValue();
	}
	}

	// Okay, we know we have a single variable index, which must be a
	// pointer/array/vector index. If there is no offset, life is simple, return
	// the index.
	Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
	unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
	if (Offset == 0) {
	// Cast to intptrty in case a truncation occurs. If an extension is needed,
	// we don't need to bother extending: the extension won't affect where the
	// computation crosses zero.
	if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
	VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
	}
	return VariableIdx;
	}

	// Otherwise, there is a non-zero constant offset. The computation we will
	// do will be modulo the pointer size, so reduce both quantities to the
	// pointer width first.
	Offset = SignExtend64(Offset, IntPtrWidth);
	VariableScale = SignExtend64(VariableScale, IntPtrWidth);

	// To do this transformation, any constant index must be a multiple of the
	// variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
	// but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
	// multiple of the variable scale.
	int64_t NewOffs = Offset / (int64_t)VariableScale;
	if (Offset != NewOffs*(int64_t)VariableScale)
	return nullptr;

	// Okay, we can do this evaluation. Start by converting the index to intptr.
	if (VariableIdx->getType() != IntPtrTy)
	VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
	true /Signed/);
	// Emit "VariableIdx + NewOffs"; this crosses zero at the same point as the
	// original byte offset.
	Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
	return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
	}

	/// Returns true if we can rewrite Start as a GEP with pointer Base
	/// and some integer offset. The nodes that need to be re-written
	/// for this transformation will be added to Explored.
	///
	/// The walk starts at Start and follows the pointer operand of
	/// inttoptr/ptrtoint casts and GEPs, and the incoming values of PHI nodes.
	/// It gives up (returns false) when it reaches any other kind of value that
	/// is not already in Explored, when a cast is not a no-op, when a GEP is not
	/// a single-index inbounds GEP of the same type as Start, or once Explored
	/// holds 100 entries.
	///
	/// \param Start    the value to be re-expressed relative to Base.
	/// \param Base     the common base; inserted into Explored up front so the
	///                 walk stops there.
	/// \param DL       data layout used for the no-op cast check.
	/// \param Explored in/out set of visited nodes, in visitation order.
	// NOTE(review): the '*' declarators on Start and Base look to have been
	// dropped from this declaration (both are used as pointers below) --
	// confirm against the upstream source.
	static bool canRewriteGEPAsOffset(Value Start, Value Base,
	const DataLayout &DL,
	SetVector<Value *> &Explored) {
	SmallVector<Value *, 16> WorkList(1, Start);
	Explored.insert(Base);

	// The following traversal gives us an order which can be used
	// when doing the final transformation. Since in the final
	// transformation we create the PHI replacement instructions first,
	// we don't have to get them in any particular order.
	//
	// However, for other instructions we will have to traverse the
	// operands of an instruction first, which means that we have to
	// do a post-order traversal.
	while (!WorkList.empty()) {
	// PHIs discovered during this round; their incoming values are queued only
	// after the inner loop has drained the work list.
	SetVector<PHINode *> PHIs;

	while (!WorkList.empty()) {
	// Bound the amount of work done by this query.
	if (Explored.size() >= 100)
	return false;

	// Peek rather than pop: V stays on the stack until its operand has been
	// visited (post-order).
	Value *V = WorkList.back();

	if (Explored.count(V) != 0) {
	WorkList.pop_back();
	continue;
	}

	if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
	!isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
	// We've found some value that we can't explore which is different from
	// the base. Therefore we can't do this transformation.
	return false;

	if (isa<IntToPtrInst>(V) \|\| isa<PtrToIntInst>(V)) {
	// NOTE(review): the result of dyn_cast is dereferenced without a null
	// check; the isa<> test above presumably makes a plain cast<> the
	// intended form -- confirm.
	auto *CI = dyn_cast<CastInst>(V);
	if (!CI->isNoopCast(DL))
	return false;

	if (Explored.count(CI->getOperand(0)) == 0)
	WorkList.push_back(CI->getOperand(0));
	}

	if (auto *GEP = dyn_cast<GEPOperator>(V)) {
	// We're limiting the GEP to having one index. This will preserve
	// the original pointer type. We could handle more cases in the
	// future.
	if (GEP->getNumIndices() != 1 \|\| !GEP->isInBounds() \|\|
	GEP->getType() != Start->getType())
	return false;

	if (Explored.count(GEP->getOperand(0)) == 0)
	WorkList.push_back(GEP->getOperand(0));
	}

	// If nothing was pushed on top of V, its operand is already explored (or
	// V is a PHI), so V itself can be retired now.
	if (WorkList.back() == V) {
	WorkList.pop_back();
	// We've finished visiting this node, mark it as such.
	Explored.insert(V);
	}

	if (auto *PN = dyn_cast<PHINode>(V)) {
	// We cannot transform PHIs on unsplittable basic blocks.
	if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
	return false;
	Explored.insert(PN);
	PHIs.insert(PN);
	}
	}

	// Explore the PHI nodes further.
	for (auto *PN : PHIs)
	for (Value *Op : PN->incoming_values())
	if (Explored.count(Op) == 0)
	WorkList.push_back(Op);
	}

	// Make sure that we can do this. Since we can't insert GEPs in a basic
	// block before a PHI node, we can't easily do this transformation if
	// we have PHI node users of transformed instructions.
	// NOTE(review): this iterates Val->uses() but binds each element to a
	// Value *; presumably the intent is to inspect the *users* of Val. Verify
	// what the Use-to-Value conversion yields here, since if it is Val itself
	// the "Inst == PHI" test below would always skip the check -- confirm.
	for (Value *Val : Explored) {
	for (Value *Use : Val->uses()) {

	auto *PHI = dyn_cast<PHINode>(Use);
	auto *Inst = dyn_cast<Instruction>(Val);

	// Only an explored PHI paired with a distinct, non-Base instruction is of
	// interest.
	if (Inst == Base \|\| Inst == PHI \|\| !Inst \|\| !PHI \|\|
	Explored.count(PHI) == 0)
	continue;

	if (PHI->getParent() == Inst->getParent())
	return false;
	}
	}
	return true;
	}

	// Positions Builder at a place where a replacement for V may be emitted.
	//  - PHI node:    the first insertion point of the PHI's block.
	//  - Instruction: at V itself, or at the instruction following V when
	//                 Before is false.
	//  - Argument:    the first insertion point of the function's entry block.
	//  - Anything else is expected to be a Constant; Builder is left untouched.
	static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
	                              bool Before = true) {
	  if (auto *Phi = dyn_cast<PHINode>(V)) {
	    // The PHI test must come first: a PHI is also an Instruction.
	    auto *PhiBlock = Phi->getParent();
	    Builder.SetInsertPoint(&*PhiBlock->getFirstInsertionPt());
	  } else if (auto *Inst = dyn_cast<Instruction>(V)) {
	    Instruction *Where = Before ? Inst : &*std::next(Inst->getIterator());
	    Builder.SetInsertPoint(Where);
	  } else if (auto *Arg = dyn_cast<Argument>(V)) {
	    // Arguments have no position of their own; use the entry block.
	    BasicBlock &EntryBB = Arg->getParent()->getEntryBlock();
	    Builder.SetInsertPoint(&*EntryBB.getFirstInsertionPt());
	  } else {
	    // A constant needs no new insertion point.
	    assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
	  }
	}

	/// Returns a re-written value of Start as an indexed GEP using Base as a
	/// pointer.
	///
	/// For every node in Explored (other than Base) an integer "index" value is
	/// built and recorded in a map; then a replacement inbounds GEP of Base by
	/// that index is emitted and all uses of the node are redirected to it.
	///
	/// \param Start    the value whose index relative to Base is returned.
	/// \param Base     the common base pointer; its index is the zero constant.
	/// \param DL       data layout used to pick the index integer width.
	/// \param Explored the node set to rewrite (presumably the set filled in by
	///                 canRewriteGEPAsOffset -- the caller in this file passes
	///                 the same set to both).
	/// \returns the index value recorded for Start.
	// NOTE(review): the '*' declarators on the return type and on Start look to
	// have been dropped from this declaration, as do those in the DenseMap and
	// NewPhi declarations below -- confirm against the upstream source.
	static Value rewriteGEPAsOffset(Value Start, Value *Base,
	const DataLayout &DL,
	SetVector<Value *> &Explored) {
	// Perform all the substitutions. This is a bit tricky because we can
	// have cycles in our use-def chains.
	// 1. Create the PHI nodes without any incoming values.
	// 2. Create all the other values.
	// 3. Add the edges for the PHI nodes.
	// 4. Emit GEPs to get the original pointers.
	// 5. Remove the original instructions.
	Type *IndexType = IntegerType::get(
	Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));

	// Maps each original node to the integer value that replaces it.
	DenseMap<Value , Value > NewInsts;
	NewInsts[Base] = ConstantInt::getNullValue(IndexType);

	// Create the new PHI nodes, without adding any incoming values.
	for (Value *Val : Explored) {
	if (Val == Base)
	continue;
	// Create empty phi nodes. This avoids cyclic dependencies when creating
	// the remaining instructions.
	if (auto *PHI = dyn_cast<PHINode>(Val))
	NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
	PHI->getName() + ".idx", PHI);
	}
	IRBuilder<> Builder(Base->getContext());

	// Create all the other instructions.
	for (Value *Val : Explored) {

	// Base and the PHIs already have an entry.
	if (NewInsts.find(Val) != NewInsts.end())
	continue;

	if (auto *CI = dyn_cast<CastInst>(Val)) {
	// Don't get rid of the intermediate variable here; the store can grow
	// the map which will invalidate the reference to the input value.
	Value *V = NewInsts[CI->getOperand(0)];
	NewInsts[CI] = V;
	continue;
	}
	if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
	// Use the rewritten index if the index operand was itself rewritten,
	// otherwise the original index operand.
	// NOTE(review): operator[] is used for the lookup here, which
	// presumably default-inserts an entry for an index that is not in the
	// map -- confirm that is harmless for the find() calls further down.
	Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
	: GEP->getOperand(1);
	setInsertionPoint(Builder, GEP);
	// Indices might need to be sign extended. GEPs will magically do
	// this, but we need to do it ourselves here.
	if (Index->getType()->getScalarSizeInBits() !=
	NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
	Index = Builder.CreateSExtOrTrunc(
	Index, NewInsts[GEP->getOperand(0)]->getType(),
	GEP->getOperand(0)->getName() + ".sext");
	}

	// Adding to a zero base index is just the index itself; otherwise emit
	// "base index + index" flagged nsw.
	auto *Op = NewInsts[GEP->getOperand(0)];
	if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
	NewInsts[GEP] = Index;
	else
	NewInsts[GEP] = Builder.CreateNSWAdd(
	Op, Index, GEP->getOperand(0)->getName() + ".add");
	continue;
	}
	if (isa<PHINode>(Val))
	continue;

	llvm_unreachable("Unexpected instruction type");
	}

	// Add the incoming values to the PHI nodes.
	for (Value *Val : Explored) {
	if (Val == Base)
	continue;
	// All the instructions have been created, we can now add edges to the
	// phi nodes.
	if (auto *PHI = dyn_cast<PHINode>(Val)) {
	PHINode NewPhi = static_cast<PHINode >(NewInsts[PHI]);
	for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
	Value *NewIncoming = PHI->getIncomingValue(I);

	// Substitute the rewritten value when the incoming value has one.
	if (NewInsts.find(NewIncoming) != NewInsts.end())
	NewIncoming = NewInsts[NewIncoming];

	NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
	}
	}
	}

	// Re-materialize each original node as "GEP Base, index" and redirect its
	// uses to that GEP.
	for (Value *Val : Explored) {
	if (Val == Base)
	continue;

	// Depending on the type, for external users we have to emit
	// a GEP or a GEP + ptrtoint.
	// Before == false: insert after Val (for a non-PHI instruction).
	setInsertionPoint(Builder, Val, false);

	// If required, create an inttoptr instruction for Base.
	Value *NewBase = Base;
	if (!Base->getType()->isPointerTy())
	NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
	Start->getName() + "to.ptr");

	Value *GEP = Builder.CreateInBoundsGEP(
	Start->getType()->getPointerElementType(), NewBase,
	makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");

	// A non-pointer node (one of the casts) needs its replacement converted
	// back to the node's own type.
	if (!Val->getType()->isPointerTy()) {
	Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
	Val->getName() + ".conv");
	GEP = Cast;
	}
	Val->replaceAllUsesWith(GEP);
	}

	return NewInsts[Start];
	}

	/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
	/// the input Value as a constant indexed GEP. Returns a pair containing
	/// the GEPs Pointer and Index.
	///
	/// Starting from V, repeatedly peels off inbounds single-constant-index GEPs
	/// (accumulating their index) and no-op inttoptr/ptrtoint casts, and stops at
	/// the first value that is none of these.
	///
	/// \param V  the address to decompose.
	/// \param DL data layout used for the index width and the no-op cast check.
	/// \returns {remaining base value, accumulated constant index}.
	// NOTE(review): the '*' declarators inside the std::pair return type and on
	// GEPIndex below look to have been dropped -- confirm against the upstream
	// source.
	static std::pair<Value , Value >
	getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
	Type *IndexType = IntegerType::get(V->getContext(),
	DL.getIndexTypeSizeInBits(V->getType()));

	Constant *Index = ConstantInt::getNullValue(IndexType);
	while (true) {
	if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
	// We accept only inbounds GEPs here to exclude the possibility of
	// overflow.
	if (!GEP->isInBounds())
	break;
	// NOTE(review): V still refers to this same GEP at this point, so the
	// type comparison below compares the GEP's type with itself; presumably
	// the pointer operand's type was meant -- confirm.
	if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
	GEP->getType() == V->getType()) {
	V = GEP->getOperand(0);
	Constant GEPIndex = static_cast<Constant >(GEP->getOperand(1));
	// Fold this GEP's index into the running total, widened to IndexType.
	Index = ConstantExpr::getAdd(
	Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
	continue;
	}
	break;
	}
	// Look through inttoptr / ptrtoint only when they are no-ops under DL.
	if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
	if (!CI->isNoopCast(DL))
	break;
	V = CI->getOperand(0);
	continue;
	}
	if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
	if (!CI->isNoopCast(DL))
	break;
	V = CI->getOperand(0);
	continue;
	}
	break;
	}
	return {V, Index};
	}

	/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
	/// We can look through PHIs, GEPs and casts in order to determine a common base
	/// between GEPLHS and RHS.
	///
	/// \param GEPLHS left-hand side of the compare; must have all-constant
	///               indices and the same type as RHS.
	/// \param RHS    right-hand side of the compare.
	/// \param Cond   the original predicate; the emitted compare uses its signed
	///               form.
	/// \param DL     data layout forwarded to the helpers.
	/// \returns a new, not yet inserted, integer compare of the two offsets, or
	///          null when the rewrite is not possible.
	// NOTE(review): the '*' declarators on the return type, on GEPLHS and on the
	// PtrBase/Index locals look to have been dropped -- confirm against the
	// upstream source.
	static Instruction transformToIndexedCompare(GEPOperator GEPLHS, Value *RHS,
	ICmpInst::Predicate Cond,
	const DataLayout &DL) {
	if (!GEPLHS->hasAllConstantIndices())
	return nullptr;

	// Make sure the pointers have the same type.
	if (GEPLHS->getType() != RHS->getType())
	return nullptr;

	// Split the LHS into its base and a constant index relative to that base.
	Value PtrBase, Index;
	std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);

	// The set of nodes that will take part in this transformation.
	SetVector<Value *> Nodes;

	if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
	return nullptr;

	// We know we can re-write this as
	// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)
	// Since we've only looked through inbounds GEPs we know that we
	// can't have overflow on either side. We can therefore re-write
	// this as:
	// OFFSET1 cmp OFFSET2
	Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);

	// RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
	// GEP having PtrBase as the pointer base, and has returned in NewRHS the
	// offset. Since Index is the offset of LHS to the base pointer, we will now
	// compare the offsets instead of comparing the pointers.
	return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
	}

	/// Fold comparisons between a GEP instruction and something else. At this point
	/// we know that the GEP is on the LHS of the comparison.
	///
	/// The cases tried, in order:
	///  - RHS is the GEP's own base pointer (inbounds): compare the offset to 0.
	///  - RHS is a GEP off a different base with identical indices: compare the
	///    bases.
	///  - RHS is a GEP off a base that differs only by pointer casts: compare the
	///    two emitted offsets.
	///  - RHS is a GEP off the same base: recurse when one side has all-zero
	///    indices, compare the single differing index, or compare the two
	///    emitted offsets.
	///  - Otherwise fall back to transformToIndexedCompare.
	///
	/// \param GEPLHS the GEP on the left of the compare.
	/// \param RHS    the other operand.
	/// \param Cond   the compare predicate; signed predicates are not handled.
	/// \param I      the compare instruction being folded.
	/// \returns a replacement instruction, or null if no fold applies.
	// NOTE(review): the '*' declarators on the return type and on GEPLHS, and on
	// the Offset local / "this" argument below, look to have been dropped --
	// confirm against the upstream source.
	Instruction InstCombiner::foldGEPICmp(GEPOperator GEPLHS, Value *RHS,
	ICmpInst::Predicate Cond,
	Instruction &I) {
	// Don't transform signed compares of GEPs into index compares. Even if the
	// GEP is inbounds, the final add of the base pointer can have signed overflow
	// and would change the result of the icmp.
	// e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
	// the maximum signed value for the pointer type.
	if (ICmpInst::isSigned(Cond))
	return nullptr;

	// Look through bitcasts and addrspacecasts. We do not however want to remove
	// 0 GEPs.
	if (!isa<GetElementPtrInst>(RHS))
	RHS = RHS->stripPointerCasts();

	Value *PtrBase = GEPLHS->getOperand(0);
	if (PtrBase == RHS && GEPLHS->isInBounds()) {
	// ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
	// This transformation (ignoring the base and scales) is valid because we
	// know pointers can't overflow since the gep is inbounds. See if we can
	// output an optimized form.
	Value Offset = evaluateGEPOffsetExpression(GEPLHS, this, DL);

	// If not, synthesize the offset the hard way.
	if (!Offset)
	Offset = EmitGEPOffset(GEPLHS);
	return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
	Constant::getNullValue(Offset->getType()));
	} else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
	// If the base pointers are different, but the indices are the same, just
	// compare the base pointer.
	if (PtrBase != GEPRHS->getOperand(0)) {
	// Indices are "the same" only if the operand counts match, the base
	// pointer types match, and every index operand is the identical Value.
	bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
	IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
	GEPRHS->getOperand(0)->getType();
	if (IndicesTheSame)
	for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
	if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
	IndicesTheSame = false;
	break;
	}

	// If all indices are the same, just compare the base pointers.
	// The result type of comparing the bases must match the type of I.
	Type *BaseType = GEPLHS->getOperand(0)->getType();
	if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
	return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));

	// If we're comparing GEPs with two base pointers that only differ in type
	// and both GEPs have only constant indices or just one use, then fold
	// the compare with the adjusted indices.
	if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
	(GEPLHS->hasAllConstantIndices() \|\| GEPLHS->hasOneUse()) &&
	(GEPRHS->hasAllConstantIndices() \|\| GEPRHS->hasOneUse()) &&
	PtrBase->stripPointerCasts() ==
	GEPRHS->getOperand(0)->stripPointerCasts()) {
	Value *LOffset = EmitGEPOffset(GEPLHS);
	Value *ROffset = EmitGEPOffset(GEPRHS);

	// If we looked through an addrspacecast between different sized address
	// spaces, the LHS and RHS pointers are different sized
	// integers. Truncate to the smaller one.
	Type *LHSIndexTy = LOffset->getType();
	Type *RHSIndexTy = ROffset->getType();
	if (LHSIndexTy != RHSIndexTy) {
	if (LHSIndexTy->getPrimitiveSizeInBits() <
	RHSIndexTy->getPrimitiveSizeInBits()) {
	ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
	} else
	LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
	}

	Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
	LOffset, ROffset);
	return replaceInstUsesWith(I, Cmp);
	}

	// Otherwise, the base pointers are different and the indices are
	// different. Try convert this to an indexed compare by looking through
	// PHIs/casts.
	return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
	}

	// From here on both GEPs share the same base pointer.

	// If one of the GEPs has all zero indices, recurse.
	// The operands trade places, hence the swapped predicate.
	if (GEPLHS->hasAllZeroIndices())
	return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
	ICmpInst::getSwappedPredicate(Cond), I);

	// If the other GEP has all zero indices, recurse.
	if (GEPRHS->hasAllZeroIndices())
	return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);

	bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
	if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
	// If the GEPs only differ by one index, compare it.
	unsigned NumDifferences = 0; // Keep track of # differences.
	unsigned DiffOperand = 0; // The operand that differs.
	for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
	if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
	if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
	GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
	// Irreconcilable differences.
	NumDifferences = 2;
	break;
	} else {
	// Stop counting once a second difference is seen.
	if (NumDifferences++) break;
	DiffOperand = i;
	}
	}

	if (NumDifferences == 0) // SAME GEP?
	return replaceInstUsesWith(I, // No comparison is needed here.
	ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));

	else if (NumDifferences == 1 && GEPsInBounds) {
	Value *LHSV = GEPLHS->getOperand(DiffOperand);
	Value *RHSV = GEPRHS->getOperand(DiffOperand);
	// Make sure we do a signed comparison here.
	return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
	}
	}

	// Only lower this if the icmp is the only user of the GEP or if we expect
	// the result to fold to a constant!
	if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) \|\| GEPLHS->hasOneUse()) &&
	(isa<ConstantExpr>(GEPRHS) \|\| GEPRHS->hasOneUse())) {
	// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2)
	Value *L = EmitGEPOffset(GEPLHS);
	Value *R = EmitGEPOffset(GEPRHS);
	return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
	}
	}

	// Try convert this to an indexed compare by looking through PHIs/casts as a
	// last resort.
	return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
	}

	/// Fold an equality comparison that has an alloca as one operand to a
	/// constant, provided the alloca does not escape and is compared only once.
	///
	/// \param ICI    the equality compare being folded.
	/// \param Alloca the alloca whose (transitive) uses are inspected.
	/// \param Other  the other compare operand; only its type is used, to build
	///               the result type.
	/// \returns the result of replacing ICI with "not equal" (false for eq, true
	///          for ne), or null if the alloca may escape, is compared more than
	///          once, or the use walk exceeds its budget.
	Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
	                                         const AllocaInst *Alloca,
	                                         const Value *Other) {
	  assert(ICI.isEquality() && "Cannot fold non-equality comparison.");

	  // It would be tempting to fold away comparisons between allocas and any
	  // pointer not based on that alloca (e.g. an argument). However, even
	  // though such pointers cannot alias, they can still compare equal.
	  //
	  // But LLVM doesn't specify where allocas get their memory, so if the alloca
	  // doesn't escape we can argue that it's impossible to guess its value, and we
	  // can therefore act as if any such guesses are wrong.
	  //
	  // The code below checks that the alloca doesn't escape, and that it's only
	  // used in a comparison once (the current instruction). The
	  // single-comparison-use condition ensures that we're trivially folding all
	  // comparisons against the alloca consistently, and avoids the risk of
	  // erroneously folding a comparison of the pointer with itself.

	  unsigned MaxIter = 32; // Break cycles and bound to constant-time.

	  SmallVector<const Use *, 32> Worklist;
	  for (const Use &U : Alloca->uses()) {
	    if (Worklist.size() >= MaxIter)
	      return nullptr;
	    Worklist.push_back(&U);
	  }

	  unsigned NumCmps = 0;
	  while (!Worklist.empty()) {
	    assert(Worklist.size() <= MaxIter);
	    const Use *U = Worklist.pop_back_val();
	    const Value *V = U->getUser();
	    // Each visited use consumes one unit of the remaining budget.
	    --MaxIter;

	    if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
	        isa<SelectInst>(V)) {
	      // Track the uses.
	    } else if (isa<LoadInst>(V)) {
	      // Loading from the pointer doesn't escape it.
	      continue;
	    } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
	      // Storing to the pointer is fine, but storing the pointer escapes it.
	      if (SI->getValueOperand() == U->get())
	        return nullptr;
	      continue;
	    } else if (isa<ICmpInst>(V)) {
	      if (NumCmps++)
	        return nullptr; // Found more than one cmp.
	      continue;
	    } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
	      switch (Intrin->getIntrinsicID()) {
	      // These intrinsics don't escape or compare the pointer. Memset is safe
	      // because we don't allow ptrtoint. Memcpy and memmove are safe because
	      // we don't allow stores, so src cannot point to V.
	      case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
	      case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
	        continue;
	      default:
	        return nullptr;
	      }
	    } else {
	      return nullptr;
	    }

	    // V forwards the pointer (bitcast/GEP/phi/select): follow its uses too.
	    for (const Use &DerivedUse : V->uses()) {
	      if (Worklist.size() >= MaxIter)
	        return nullptr;
	      Worklist.push_back(&DerivedUse);
	    }
	  }

	  // Every use was benign: treat the pointers as unequal.
	  Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
	  return replaceInstUsesWith(
	      ICI,
	      ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
	}

	/// Fold "icmp pred (X+C), X".
	///
	/// Rewrites the comparison as a compare of X against a constant derived from
	/// C. Handles the eight ordering predicates (ult/ule/ugt/uge/slt/sle/sgt/sge);
	/// the worked examples in the comments below use 8-bit values.
	///
	/// \param X    the variable operand.
	/// \param C    the added constant; must be non-zero.
	/// \param Pred the predicate of the original compare.
	/// \returns a new, not yet inserted, compare instruction.
	// NOTE(review): the '*' declarators on the return type and on X look to have
	// been dropped from this declaration -- confirm against the upstream source.
	Instruction InstCombiner::foldICmpAddOpConst(Value X, const APInt &C,
	ICmpInst::Predicate Pred) {
	// From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
	// so the values can never be equal. Similarly for all other "or equals"
	// operators.
	assert(!!C && "C should not be zero!");

	// (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
	// (X+2) <u X --> X >u (MAXUINT-2) --> X >u 253
	// (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
	if (Pred == ICmpInst::ICMP_ULT \|\| Pred == ICmpInst::ICMP_ULE) {
	Constant *R = ConstantInt::get(X->getType(),
	APInt::getMaxValue(C.getBitWidth()) - C);
	return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
	}

	// (X+1) >u X --> X <u (0-1) --> X != 255
	// (X+2) >u X --> X <u (0-2) --> X <u 254
	// (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
	if (Pred == ICmpInst::ICMP_UGT \|\| Pred == ICmpInst::ICMP_UGE)
	return new ICmpInst(ICmpInst::ICMP_ULT, X,
	ConstantInt::get(X->getType(), -C));

	APInt SMax = APInt::getSignedMaxValue(C.getBitWidth());

	// (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
	// (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
	// (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
	// (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
	// (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s -127
	// (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != -128
	if (Pred == ICmpInst::ICMP_SLT \|\| Pred == ICmpInst::ICMP_SLE)
	return new ICmpInst(ICmpInst::ICMP_SGT, X,
	ConstantInt::get(X->getType(), SMax - C));

	// (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
	// (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
	// (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
	// (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s 0
	// (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
	// (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128

	assert(Pred == ICmpInst::ICMP_SGT \|\| Pred == ICmpInst::ICMP_SGE);
	return new ICmpInst(ICmpInst::ICMP_SLT, X,
	ConstantInt::get(X->getType(), SMax - (C - 1)));
	}

	/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
	/// (icmp eq/ne A, Log2(AP2/AP1)) ->
	/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
	///
	/// \param I   the equality compare; its operand 0 is the shift.
	/// \param A   the (variable) shift amount.
	/// \param AP1 the constant the shift result is compared against.
	/// \param AP2 the constant being shifted.
	/// \returns a compare on A, the result of replacing I with a constant when
	///          no shift amount can satisfy the equality, or null for cases this
	///          function declines to handle.
	// NOTE(review): the '*' declarators on the return type, on A and on the
	// lambda's LHS/RHS parameters look to have been dropped -- confirm against
	// the upstream source.
	Instruction InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value A,
	const APInt &AP1,
	const APInt &AP2) {
	assert(I.isEquality() && "Cannot fold icmp gt/lt");

	// Builds the replacement compare. Callers pass the predicate for the "eq"
	// form; it is inverted here when I is an "ne" compare.
	auto getICmp = [&I](CmpInst::Predicate Pred, Value LHS, Value RHS) {
	if (I.getPredicate() == I.ICMP_NE)
	Pred = CmpInst::getInversePredicate(Pred);
	return new ICmpInst(Pred, LHS, RHS);
	};

	// Don't bother doing any work for cases which InstSimplify handles.
	if (AP2.isNullValue())
	return nullptr;

	bool IsAShr = isa<AShrOperator>(I.getOperand(0));
	if (IsAShr) {
	// Arithmetic-shift cases that are not handled here: shifting all-ones,
	// constants of opposite sign, and AP2 signed-greater-than AP1.
	if (AP2.isAllOnesValue())
	return nullptr;
	if (AP2.isNegative() != AP1.isNegative())
	return nullptr;
	if (AP2.sgt(AP1))
	return nullptr;
	}

	if (!AP1)
	// 'A' must be large enough to shift out the highest set bit.
	return getICmp(I.ICMP_UGT, A,
	ConstantInt::get(A->getType(), AP2.logBase2()));

	// Equal constants: the shift amount must be zero.
	if (AP1 == AP2)
	return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));

	// Candidate shift amount: the difference in leading sign bits (ones for a
	// negative ashr, zeros otherwise) between the two constants.
	int Shift;
	if (IsAShr && AP1.isNegative())
	Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
	else
	Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();

	if (Shift > 0) {
	// Verify that shifting AP2 by the candidate amount really yields AP1.
	if (IsAShr && AP1 == AP2.ashr(Shift)) {
	// There are multiple solutions if we are comparing against -1 and the LHS
	// of the ashr is not a power of two.
	if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
	return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
	return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
	} else if (AP1 == AP2.lshr(Shift)) {
	return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
	}
	}

	// Shifting const2 will never be equal to const1.
	// FIXME: This should always be handled by InstSimplify?
	// "eq" folds to false and "ne" folds to true.
	auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
	return replaceInstUsesWith(I, TorF);
	}

	/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
	/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
	///
	/// \param I   the equality compare being folded.
	/// \param A   the (variable) shift amount.
	/// \param AP1 the constant the shift result is compared against.
	/// \param AP2 the constant being shifted left.
	/// \returns a compare on A, the result of replacing I with a constant when
	///          no shift amount can satisfy the equality, or null when AP2 is
	///          zero.
	// NOTE(review): the '*' declarators on the return type, on A and on the
	// lambda's LHS/RHS parameters look to have been dropped -- confirm against
	// the upstream source.
	Instruction InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value A,
	const APInt &AP1,
	const APInt &AP2) {
	assert(I.isEquality() && "Cannot fold icmp gt/lt");

	// Builds the replacement compare. Callers pass the predicate for the "eq"
	// form; it is inverted here when I is an "ne" compare.
	auto getICmp = [&I](CmpInst::Predicate Pred, Value LHS, Value RHS) {
	if (I.getPredicate() == I.ICMP_NE)
	Pred = CmpInst::getInversePredicate(Pred);
	return new ICmpInst(Pred, LHS, RHS);
	};

	// Don't bother doing any work for cases which InstSimplify handles.
	if (AP2.isNullValue())
	return nullptr;

	unsigned AP2TrailingZeros = AP2.countTrailingZeros();

	// Comparing against zero: A must be at least (BitWidth - TrailingZeros(AP2)),
	// i.e. large enough to shift every set bit of AP2 out.
	if (!AP1 && AP2TrailingZeros != 0)
	return getICmp(
	I.ICMP_UGE, A,
	ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));

	// Equal constants: the shift amount must be zero.
	if (AP1 == AP2)
	return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));

	// Get the distance between the lowest bits that are set.
	int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;

	// Accept the candidate only if shifting AP2 by it reproduces AP1 exactly.
	if (Shift > 0 && AP2.shl(Shift) == AP1)
	return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));

	// Shifting const2 will never be equal to const1.
	// FIXME: This should always be handled by InstSimplify?
	// "eq" folds to false and "ne" folds to true.
	auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
	return replaceInstUsesWith(I, TorF);
	}

	/// The caller has matched a pattern of the form:
	/// I = icmp ugt (add (add A, B), CI2), CI1
	/// If this is of the form:
	/// sum = a + b
	/// if (sum+128 >u 255)
	/// Then replace it with llvm.sadd.with.overflow.i8.
	///
	/// \param I   the matched compare.
	/// \param A   first operand of the inner add.
	/// \param B   second operand of the inner add.
	/// \param CI2 the bias constant added to the sum (2^7, 2^15 or 2^31).
	/// \param CI1 the compare constant (all-ones in the low NewWidth bits).
	/// \param IC  the combiner; supplies the builder and sign-bit analysis.
	/// \returns an extractvalue of the intrinsic's overflow bit to replace I, or
	///          null if the pattern does not qualify.
	///
	// NOTE(review): the '*' declarators on the return type and on A, CI2 and CI1
	// look to have been dropped from this declaration -- confirm against the
	// upstream source.
	static Instruction processUGT_ADDCST_ADD(ICmpInst &I, Value A, Value *B,
	ConstantInt CI2, ConstantInt CI1,
	InstCombiner &IC) {
	// The transformation we're trying to do here is to transform this into an
	// llvm.sadd.with.overflow. To do this, we have to replace the original add
	// with a narrower add, and discard the add-with-constant that is part of the
	// range check (if we can't eliminate it, this isn't profitable).

	// In order to eliminate the add-with-constant, the compare can be its only
	// use.
	Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
	if (!AddWithCst->hasOneUse())
	return nullptr;

	// If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
	if (!CI2->getValue().isPowerOf2())
	return nullptr;
	unsigned NewWidth = CI2->getValue().countTrailingZeros();
	if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
	return nullptr;

	// The width of the new add formed is 1 more than the bias.
	++NewWidth;

	// Check to see that CI1 is an all-ones value with NewWidth bits.
	// Also bail out when the compare is already exactly NewWidth bits wide.
	if (CI1->getBitWidth() == NewWidth \|\|
	CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
	return nullptr;

	// This is only really a signed overflow check if the inputs have been
	// sign-extended; check for that condition. For example, if CI2 is 2^31 and
	// the operands of the add are 64 bits wide, we need at least 33 sign bits.
	unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
	if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits \|\|
	IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
	return nullptr;

	// In order to replace the original add with a narrower
	// llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
	// and truncates that discard the high bits of the add. Verify that this is
	// the case.
	Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
	for (User *U : OrigAdd->users()) {
	if (U == AddWithCst)
	continue;

	// Only accept truncates for now. We would really like a nice recursive
	// predicate like SimplifyDemandedBits, but which goes downwards the use-def
	// chain to see which bits of a value are actually demanded. If the
	// original add had another add which was then immediately truncated, we
	// could still do the transformation.
	TruncInst *TI = dyn_cast<TruncInst>(U);
	if (!TI \|\| TI->getType()->getPrimitiveSizeInBits() > NewWidth)
	return nullptr;
	}

	// If the pattern matches, truncate the inputs to the narrower type and
	// use the sadd_with_overflow intrinsic to efficiently compute both the
	// result and the overflow bit.
	Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
	Function *F = Intrinsic::getDeclaration(
	I.getModule(), Intrinsic::sadd_with_overflow, NewType);

	InstCombiner::BuilderTy &Builder = IC.Builder;

	// Put the new code above the original add, in case there are any uses of the
	// add between the add and the compare.
	Builder.SetInsertPoint(OrigAdd);

	Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
	Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
	CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
	// Element 0 of the intrinsic's result is the sum; element 1 (extracted at
	// the return below) is the overflow bit.
	Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
	Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());

	// The inner add was the result of the narrow add, zero extended to the
	// wider type. Replace it with the result computed by the intrinsic.
	IC.replaceInstUsesWith(*OrigAdd, ZExt);

	// The original icmp gets replaced with the overflow value.
	return ExtractValueInst::Create(Call, 1, "sadd.overflow");
	}

	// Handle icmp pred X, 0
	//
	// Returns a replacement compare when one of the patterns below matches, or
	// nullptr when the right-hand operand is not zero or no pattern applies.
	Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) {
	  CmpInst::Predicate Pred = Cmp.getPredicate();
	  if (!match(Cmp.getOperand(1), m_Zero()))
	    return nullptr;

	  // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
	  if (Pred == ICmpInst::ICMP_SGT) {
	    // Filled in by matchSelectPattern() with the two select operands.
	    Value *A, *B;
	    SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
	    if (SPR.Flavor == SPF_SMIN) {
	      if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
	        return new ICmpInst(Pred, B, Cmp.getOperand(1));
	      if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
	        return new ICmpInst(Pred, A, Cmp.getOperand(1));
	    }
	  }

	  // Given:
	  //   icmp eq/ne (urem %x, %y), 0
	  // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
	  //   icmp eq/ne %x, 0
	  Value *X, *Y;
	  if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) &&
	      ICmpInst::isEquality(Pred)) {
	    KnownBits XKnown = computeKnownBits(X, 0, &Cmp);
	    KnownBits YKnown = computeKnownBits(Y, 0, &Cmp);
	    if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
	      return new ICmpInst(Pred, X, Cmp.getOperand(1));
	  }

	  return nullptr;
	}

	/// Fold icmp Pred X, C.
	/// TODO: This code structure does not make sense. The saturating add fold
	/// should be moved to some other helper and extended as noted below (it is also
	/// possible that code has been made unnecessary - do we canonicalize IR to
	/// overflow/saturating intrinsics or not?).
	///
	/// Returns the instruction produced by processUGT_ADDCST_ADD() when the
	/// pattern below matches and that helper succeeds; otherwise nullptr.
	Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
	  // Match the following pattern, which is a common idiom when writing
	  // overflow-safe integer arithmetic functions. The source performs an addition
	  // in wider type and explicitly checks for overflow using comparisons against
	  // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
	  //
	  // TODO: This could probably be generalized to handle other overflow-safe
	  // operations if we worked out the formulas to compute the appropriate magic
	  // constants.
	  //
	  // sum = a + b
	  // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
	  CmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
	  Value *A, *B;
	  ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
	  if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
	      match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
	    if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
	      return Res;

	  return nullptr;
	}

	/// Canonicalize icmp instructions based on dominating conditions.
	///
	/// Only the single-predecessor / conditional-branch case is examined. Returns
	/// a replacement (or the result of replaceInstUsesWith) when the dominating
	/// condition decides or narrows \p Cmp, otherwise nullptr.
	Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
	  // This is a cheap/incomplete check for dominance - just match a single
	  // predecessor with a conditional branch.
	  BasicBlock *CmpBB = Cmp.getParent();
	  BasicBlock *DomBB = CmpBB->getSinglePredecessor();
	  if (!DomBB)
	    return nullptr;

	  Value *DomCond;
	  BasicBlock *TrueBB, *FalseBB;
	  if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
	    return nullptr;

	  assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
	         "Predecessor block does not point to successor?");

	  // The branch should get simplified. Don't bother simplifying this condition.
	  if (TrueBB == FalseBB)
	    return nullptr;

	  // Try to simplify this compare to T/F based on the dominating condition.
	  Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
	  if (Imp)
	    return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));

	  CmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
	  ICmpInst::Predicate DomPred;
	  const APInt *C, *DomC;
	  if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
	      match(Y, m_APInt(C))) {
	    // We have 2 compares of a variable with constants. Calculate the constant
	    // ranges of those compares to see if we can transform the 2nd compare:
	    // DomBB:
	    //   DomCond = icmp DomPred X, DomC
	    //   br DomCond, CmpBB, FalseBB
	    // CmpBB:
	    //   Cmp = icmp Pred X, C
	    ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
	    // When CmpBB is reached through the false edge, the dominating condition
	    // is known to be false, so use the inverse predicate.
	    ConstantRange DominatingCR =
	        (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
	                          : ConstantRange::makeExactICmpRegion(
	                                CmpInst::getInversePredicate(DomPred), *DomC);
	    ConstantRange Intersection = DominatingCR.intersectWith(CR);
	    ConstantRange Difference = DominatingCR.difference(CR);
	    if (Intersection.isEmptySet())
	      return replaceInstUsesWith(Cmp, Builder.getFalse());
	    if (Difference.isEmptySet())
	      return replaceInstUsesWith(Cmp, Builder.getTrue());

	    // Canonicalizing a sign bit comparison that gets used in a branch,
	    // pessimizes codegen by generating branch on zero instruction instead
	    // of a test and branch. So we avoid canonicalizing in such situations
	    // because test and branch instruction has better branch displacement
	    // than compare and branch instruction.
	    bool UnusedBit;
	    bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
	    if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
	      return nullptr;

	    if (const APInt *EqC = Intersection.getSingleElement())
	      return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
	    if (const APInt *NeC = Difference.getSingleElement())
	      return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
	  }

	  return nullptr;
	}

	/// Fold icmp (trunc X, Y), C.
	///
	/// Returns a new compare on the untruncated value when one of the folds below
	/// applies, otherwise nullptr.
	Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
	                                                 TruncInst *Trunc,
	                                                 const APInt &C) {
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *X = Trunc->getOperand(0);
	  if (C.isOneValue() && C.getBitWidth() > 1) {
	    // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
	    Value *V = nullptr;
	    if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
	      return new ICmpInst(ICmpInst::ICMP_SLT, V,
	                          ConstantInt::get(V->getType(), 1));
	  }

	  if (Cmp.isEquality() && Trunc->hasOneUse()) {
	    // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
	    // of the high bits truncated out of x are known.
	    unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
	             SrcBits = X->getType()->getScalarSizeInBits();
	    KnownBits Known = computeKnownBits(X, 0, &Cmp);

	    // If all the high bits are known, we can do this xform.
	    if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
	      // Pull in the high bits from known-ones set.
	      APInt NewRHS = C.zext(SrcBits);
	      NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
	      return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
	    }
	  }

	  return nullptr;
	}

	/// Fold icmp (xor X, Y), C.
	///
	/// Only handles an xor whose second operand matches a constant (m_APInt).
	/// Returns either \p Cmp itself (after rewriting its operand in place), a
	/// freshly created compare, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Xor,
	                                               const APInt &C) {
	  Value *X = Xor->getOperand(0);
	  Value *Y = Xor->getOperand(1);
	  const APInt *XorC;
	  if (!match(Y, m_APInt(XorC)))
	    return nullptr;

	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  Type *XTy = X->getType();

	  // Sign-bit tests, i.e. (X < 0) or (X > -1): the xor can be folded away.
	  bool TrueIfSigned = false;
	  if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) {
	    if (!XorC->isNegative()) {
	      // The xor constant leaves the sign bit alone, so the compare can look
	      // straight through the xor.
	      Cmp.setOperand(0, X);
	      Worklist.Add(Xor);
	      return &Cmp;
	    }

	    // The xor flips the sign bit: emit the opposite comparison.
	    if (TrueIfSigned)
	      return new ICmpInst(ICmpInst::ICMP_SGT, X,
	                          ConstantInt::getAllOnesValue(XTy));
	    return new ICmpInst(ICmpInst::ICMP_SLT, X,
	                        ConstantInt::getNullValue(XTy));
	  }

	  if (Xor->hasOneUse() && !Cmp.isEquality()) {
	    // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
	    // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
	    // The second form additionally swaps the predicate.
	    const bool IsSignMask = XorC->isSignMask();
	    if (IsSignMask || XorC->isMaxSignedValue()) {
	      ICmpInst::Predicate FlippedPred = Cmp.isSigned()
	                                            ? Cmp.getUnsignedPredicate()
	                                            : Cmp.getSignedPredicate();
	      if (!IsSignMask)
	        FlippedPred = Cmp.getSwappedPredicate(FlippedPred);
	      return new ICmpInst(FlippedPred, X, ConstantInt::get(XTy, C ^ *XorC));
	    }
	  }

	  // Mask constant magic can eliminate an 'xor' with unsigned compares.
	  if (Pred == ICmpInst::ICMP_UGT) {
	    const bool CIsLowMask = (C + 1).isPowerOf2();
	    // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
	    if (*XorC == ~C && CIsLowMask)
	      return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
	    // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
	    if (*XorC == C && CIsLowMask)
	      return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
	    return nullptr;
	  }

	  if (Pred == ICmpInst::ICMP_ULT) {
	    // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
	    // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
	    const bool NegCase = *XorC == -C && C.isPowerOf2();
	    if (NegCase || (*XorC == C && (-C).isPowerOf2()))
	      return new ICmpInst(ICmpInst::ICMP_UGT, X, ConstantInt::get(XTy, ~C));
	  }

	  return nullptr;
	}

	/// Fold icmp (and (sh X, Y), C2), C1.
	///
	/// \p And is the 'and' feeding \p Cmp; \p C1 is the compare constant and
	/// \p C2 the mask constant. Returns \p Cmp (rewritten in place), the result of
	/// replaceInstUsesWith, or nullptr when operand 0 of the 'and' is not a shift
	/// or no fold applies.
	Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
	                                            const APInt &C1, const APInt &C2) {
	  BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
	  if (!Shift || !Shift->isShift())
	    return nullptr;

	  // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
	  // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
	  // code produced by the clang front-end, for bitfield access.
	  // This seemingly simple opportunity to fold away a shift turns out to be
	  // rather complicated. See PR17827 for details.
	  unsigned ShiftOpcode = Shift->getOpcode();
	  bool IsShl = ShiftOpcode == Instruction::Shl;
	  const APInt *C3;
	  if (match(Shift->getOperand(1), m_APInt(C3))) {
	    bool CanFold = false;
	    if (ShiftOpcode == Instruction::Shl) {
	      // For a left shift, we can fold if the comparison is not signed. We can
	      // also fold a signed comparison if the mask value and comparison value
	      // are not negative. These constraints may not be obvious, but we can
	      // prove that they are correct using an SMT solver.
	      if (!Cmp.isSigned() || (!C2.isNegative() && !C1.isNegative()))
	        CanFold = true;
	    } else {
	      bool IsAshr = ShiftOpcode == Instruction::AShr;
	      // For a logical right shift, we can fold if the comparison is not signed.
	      // We can also fold a signed comparison if the shifted mask value and the
	      // shifted comparison value are not negative. These constraints may not be
	      // obvious, but we can prove that they are correct using an SMT solver.
	      // For an arithmetic shift right we can do the same, if we ensure
	      // the And doesn't use any bits being shifted in. Normally these would
	      // be turned into lshr by SimplifyDemandedBits, but not if there is an
	      // additional user.
	      if (!IsAshr || (C2.shl(*C3).lshr(*C3) == C2)) {
	        if (!Cmp.isSigned() ||
	            (!C2.shl(*C3).isNegative() && !C1.shl(*C3).isNegative()))
	          CanFold = true;
	      }
	    }

	    if (CanFold) {
	      // Undo the shift on the compare constant, then redo it to detect
	      // whether any compared bits were lost.
	      APInt NewCst = IsShl ? C1.lshr(*C3) : C1.shl(*C3);
	      APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3);
	      // Check to see if we are shifting out any of the bits being compared.
	      if (SameAsC1 != C1) {
	        // If we shifted bits out, the fold is not going to work out. As a
	        // special case, check to see if this means that the result is always
	        // true or false now.
	        if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
	          return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
	        if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
	          return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
	      } else {
	        Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst));
	        APInt NewAndCst = IsShl ? C2.lshr(*C3) : C2.shl(*C3);
	        And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst));
	        And->setOperand(0, Shift->getOperand(0));
	        Worklist.Add(Shift); // Shift is dead.
	        return &Cmp;
	      }
	    }
	  }

	  // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
	  // preferable because it allows the C2 << Y expression to be hoisted out of a
	  // loop if Y is invariant and X is not.
	  if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
	      !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
	    // Compute C2 << Y (or C2 >> Y when the original shift was a shl).
	    Value *NewShift =
	        IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
	              : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));

	    // Compute X & (C2 << Y).
	    Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
	    Cmp.setOperand(0, NewAnd);
	    return &Cmp;
	  }

	  return nullptr;
	}

	/// Fold icmp (and X, C2), C1.
	///
	/// \p And is the 'and' feeding \p Cmp and \p C1 the compare constant. Apart
	/// from the vector trunc fold at the top, all folds require the 'and' to have
	/// a constant second operand and a single use. Returns a new instruction,
	/// \p Cmp rewritten in place, or nullptr.
	Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
	                                                 BinaryOperator *And,
	                                                 const APInt &C1) {
	  bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;

	  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
	  // TODO: We canonicalize to the longer form for scalars because we have
	  // better analysis/folds for icmp, and codegen may be better with icmp.
	  if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
	      match(And->getOperand(1), m_One()))
	    return new TruncInst(And->getOperand(0), Cmp.getType());

	  const APInt *C2;
	  Value *X;
	  if (!match(And, m_And(m_Value(X), m_APInt(C2))))
	    return nullptr;

	  // Don't perform the following transforms if the AND has multiple uses
	  if (!And->hasOneUse())
	    return nullptr;

	  if (Cmp.isEquality() && C1.isNullValue()) {
	    // Restrict this fold to single-use 'and' (PR10267).
	    // Replace (and X, (1 << size(X)-1) != 0) with X s< 0
	    if (C2->isSignMask()) {
	      Constant *Zero = Constant::getNullValue(X->getType());
	      auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
	      return new ICmpInst(NewPred, X, Zero);
	    }

	    // Restrict this fold only for single-use 'and' (PR10267).
	    // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
	    if ((~(*C2) + 1).isPowerOf2()) {
	      Constant *NegBOC =
	          ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
	      auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
	      return new ICmpInst(NewPred, X, NegBOC);
	    }
	  }

	  // If the LHS is an 'and' of a truncate and we can widen the and/compare to
	  // the input width without changing the value produced, eliminate the cast:
	  //
	  // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
	  //
	  // We can do this transformation if the constants do not have their sign bits
	  // set or if it is an equality comparison. Extending a relational comparison
	  // when we're checking the sign bit would not work.
	  Value *W;
	  if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
	      (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
	    // TODO: Is this a good transform for vectors? Wider types may reduce
	    // throughput. Should this transform be limited (even for scalars) by using
	    // shouldChangeType()?
	    if (!Cmp.getType()->isVectorTy()) {
	      Type *WideType = W->getType();
	      unsigned WideScalarBits = WideType->getScalarSizeInBits();
	      Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
	      Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
	      Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
	      return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
	    }
	  }

	  if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
	    return I;

	  // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
	  // (icmp pred (and A, (or (shl 1, B), 1), 0))
	  //
	  // iff pred isn't signed
	  if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
	      match(And->getOperand(1), m_One())) {
	    Constant *One = cast<Constant>(And->getOperand(1));
	    Value *Or = And->getOperand(0);
	    Value *A, *B, *LShr;
	    if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
	        match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
	      // Count how many of the existing instructions have a single use; the
	      // thresholds below gate the rewrite on this count.
	      unsigned UsesRemoved = 0;
	      if (And->hasOneUse())
	        ++UsesRemoved;
	      if (Or->hasOneUse())
	        ++UsesRemoved;
	      if (LShr->hasOneUse())
	        ++UsesRemoved;

	      // Compute A & ((1 << B) | 1)
	      Value *NewOr = nullptr;
	      if (auto *C = dyn_cast<Constant>(B)) {
	        if (UsesRemoved >= 1)
	          NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
	      } else {
	        if (UsesRemoved >= 3)
	          NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
	                                                     /*HasNUW=*/true),
	                                   One, Or->getName());
	      }
	      if (NewOr) {
	        Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
	        Cmp.setOperand(0, NewAnd);
	        return &Cmp;
	      }
	    }
	  }

	  return nullptr;
	}

	/// Fold icmp (and X, Y), C.
	///
	/// Tries foldICmpAndConstConst() first, then a few further folds on the
	/// operands of \p And. Returns the replacement instruction or nullptr.
	Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
	                                               BinaryOperator *And,
	                                               const APInt &C) {
	  if (Instruction *Folded = foldICmpAndConstConst(Cmp, And, C))
	    return Folded;

	  // TODO: These all require that Y is constant too, so refactor with the above.

	  Value *X = And->getOperand(0);
	  Value *Y = And->getOperand(1);

	  // Try to optimize things like "A[i] & 42 == 0" to index computations.
	  if (auto *Load = dyn_cast<LoadInst>(X)) {
	    if (auto *GEP = dyn_cast<GetElementPtrInst>(Load->getOperand(0))) {
	      if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
	        const bool Eligible = GV->isConstant() &&
	                              GV->hasDefinitiveInitializer() &&
	                              !Load->isVolatile() && isa<ConstantInt>(Y);
	        if (Eligible) {
	          ConstantInt *MaskC = cast<ConstantInt>(Y);
	          if (Instruction *Res =
	                  foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, MaskC))
	            return Res;
	        }
	      }
	    }
	  }

	  // Everything below only applies to eq/ne compares.
	  if (!Cmp.isEquality())
	    return nullptr;

	  const bool IsEq = Cmp.getPredicate() == CmpInst::ICMP_EQ;

	  // X & -C == -C -> X > u ~C
	  // X & -C != -C -> X <= u ~C
	  // iff C is a power of 2
	  if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
	    auto NewPred = IsEq ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
	    return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
	  }

	  // (X & C2) == 0 -> (trunc X) >= 0
	  // (X & C2) != 0 -> (trunc X) < 0
	  // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
	  const APInt *C2;
	  if (!And->hasOneUse() || !C.isNullValue() || !match(Y, m_APInt(C2)))
	    return nullptr;

	  int32_t BitIndex = C2->exactLogBase2();
	  if (BitIndex == -1 || !DL.isLegalInteger(BitIndex + 1))
	    return nullptr;

	  Type *NarrowTy = IntegerType::get(Cmp.getContext(), BitIndex + 1);
	  if (And->getType()->isVectorTy())
	    NarrowTy =
	        VectorType::get(NarrowTy, And->getType()->getVectorNumElements());
	  Value *Trunc = Builder.CreateTrunc(X, NarrowTy);
	  auto SignPred = IsEq ? CmpInst::ICMP_SGE : CmpInst::ICMP_SLT;
	  return new ICmpInst(SignPred, Trunc, Constant::getNullValue(NarrowTy));
	}

	/// Fold icmp (or X, Y), C.
	///
	/// Returns a replacement instruction (a compare, or an and/or of two new
	/// compares) when one of the folds below applies, otherwise nullptr.
	Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
	                                              const APInt &C) {
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  if (C.isOneValue()) {
	    // icmp slt signum(V) 1 --> icmp slt V, 1
	    Value *V = nullptr;
	    if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
	      return new ICmpInst(ICmpInst::ICMP_SLT, V,
	                          ConstantInt::get(V->getType(), 1));
	  }

	  Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
	  if (Cmp.isEquality() && Cmp.getOperand(1) == OrOp1) {
	    // X | C == C --> X <=u C
	    // X | C != C --> X >u C
	    // iff C+1 is a power of 2 (C is a bitmask of the low bits)
	    if ((C + 1).isPowerOf2()) {
	      Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
	      return new ICmpInst(Pred, OrOp0, OrOp1);
	    }
	    // More general: are all bits outside of a mask constant set or not set?
	    // X | C == C --> (X & ~C) == 0
	    // X | C != C --> (X & ~C) != 0
	    if (Or->hasOneUse()) {
	      Value *A = Builder.CreateAnd(OrOp0, ~C);
	      return new ICmpInst(Pred, A, ConstantInt::getNullValue(OrOp0->getType()));
	    }
	  }

	  // The remaining folds handle only single-use 'or' compared eq/ne with zero.
	  if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
	    return nullptr;

	  Value *P, *Q;
	  if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
	    // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
	    // -> and (icmp eq P, null), (icmp eq Q, null).
	    Value *CmpP =
	        Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
	    Value *CmpQ =
	        Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
	    auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
	    return BinaryOperator::Create(BOpc, CmpP, CmpQ);
	  }

	  // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
	  // a shorter form that has more potential to be folded even further.
	  Value *X1, *X2, *X3, *X4;
	  if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
	      match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
	    // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
	    // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
	    Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
	    Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
	    auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
	    return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
	  }

	  return nullptr;
	}

	/// Fold icmp (mul X, Y), C.
	///
	/// Handles only a multiply whose second operand matches a constant. Returns
	/// a compare of the multiply's first operand against zero when the fold
	/// applies, otherwise nullptr.
	Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Mul,
	                                               const APInt &C) {
	  const APInt *MulC;
	  if (!match(Mul->getOperand(1), m_APInt(MulC)))
	    return nullptr;

	  // A sign-bit test on a sign-preserving (nsw) multiply by a constant can be
	  // performed on the multiply's LHS operand instead.
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  if (!isSignTest(Pred, C) || !Mul->hasNoSignedWrap())
	    return nullptr;

	  // Swap the predicate when the constant multiplier is negative.
	  if (MulC->isNegative())
	    Pred = ICmpInst::getSwappedPredicate(Pred);

	  Constant *Zero = Constant::getNullValue(Mul->getType());
	  return new ICmpInst(Pred, Mul->getOperand(0), Zero);
	}

	/// Fold icmp (shl 1, Y), C.
	///
	/// Rewrites the compare as a compare of the shift amount Y. Returns nullptr
	/// when \p Shl is not a shift of the constant one or no case below applies.
	static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
	                                   const APInt &C) {
	  Value *Y;
	  if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
	    return nullptr;

	  Type *ShiftType = Shl->getType();
	  unsigned TypeBits = C.getBitWidth();
	  bool CIsPowerOf2 = C.isPowerOf2();
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  if (Cmp.isUnsigned()) {
	    // (1 << Y) pred C -> Y pred Log2(C)
	    if (!CIsPowerOf2) {
	      // (1 << Y) < 30 -> Y <= 4
	      // (1 << Y) <= 30 -> Y <= 4
	      // (1 << Y) >= 30 -> Y > 4
	      // (1 << Y) > 30 -> Y > 4
	      if (Pred == ICmpInst::ICMP_ULT)
	        Pred = ICmpInst::ICMP_ULE;
	      else if (Pred == ICmpInst::ICMP_UGE)
	        Pred = ICmpInst::ICMP_UGT;
	    }

	    // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
	    // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
	    unsigned CLog2 = C.logBase2();
	    if (CLog2 == TypeBits - 1) {
	      if (Pred == ICmpInst::ICMP_UGE)
	        Pred = ICmpInst::ICMP_EQ;
	      else if (Pred == ICmpInst::ICMP_ULT)
	        Pred = ICmpInst::ICMP_NE;
	    }
	    return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
	  } else if (Cmp.isSigned()) {
	    // The "31" in the comments below stands for TypeBits - 1.
	    Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
	    if (C.isAllOnesValue()) {
	      // (1 << Y) <= -1 -> Y == 31
	      if (Pred == ICmpInst::ICMP_SLE)
	        return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);

	      // (1 << Y) > -1 -> Y != 31
	      if (Pred == ICmpInst::ICMP_SGT)
	        return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
	    } else if (!C) {
	      // (1 << Y) < 0 -> Y == 31
	      // (1 << Y) <= 0 -> Y == 31
	      if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
	        return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);

	      // (1 << Y) >= 0 -> Y != 31
	      // (1 << Y) > 0 -> Y != 31
	      if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
	        return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
	    }
	  } else if (Cmp.isEquality() && CIsPowerOf2) {
	    return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));
	  }

	  return nullptr;
	}

	/// Fold icmp (shl X, Y), C.
	///
	/// Dispatches to foldICmpShlConstConst() when the shifted value is constant
	/// and the compare is an equality, and to foldICmpShlOne() when the shift
	/// amount is not constant. Otherwise tries the shift-by-constant folds below.
	/// Returns the replacement instruction or nullptr.
	Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Shl,
	                                               const APInt &C) {
	  const APInt *ShiftVal;
	  if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
	    return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);

	  const APInt *ShiftAmt;
	  if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
	    return foldICmpShlOne(Cmp, Shl, C);

	  // Check that the shift amount is in range. If not, don't perform undefined
	  // shifts. When the shift is visited, it will be simplified.
	  unsigned TypeBits = C.getBitWidth();
	  if (ShiftAmt->uge(TypeBits))
	    return nullptr;

	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *X = Shl->getOperand(0);
	  Type *ShType = Shl->getType();

	  // NSW guarantees that we are only shifting out sign bits from the high bits,
	  // so we can ASHR the compare constant without needing a mask and eliminate
	  // the shift.
	  if (Shl->hasNoSignedWrap()) {
	    if (Pred == ICmpInst::ICMP_SGT) {
	      // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
	      APInt ShiftedC = C.ashr(*ShiftAmt);
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	    if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
	        C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
	      APInt ShiftedC = C.ashr(*ShiftAmt);
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	    if (Pred == ICmpInst::ICMP_SLT) {
	      // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
	      // (X << S) <=s C is equiv to X <=s (C >> S) for all C
	      // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
	      // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
	      assert(!C.isMinSignedValue() && "Unexpected icmp slt");
	      APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	    // If this is a signed comparison to 0 and the shift is sign preserving,
	    // use the shift LHS operand instead; isSignTest may change 'Pred', so only
	    // do that if we're sure to not continue on in this function.
	    if (isSignTest(Pred, C))
	      return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
	  }

	  // NUW guarantees that we are only shifting out zero bits from the high bits,
	  // so we can LSHR the compare constant without needing a mask and eliminate
	  // the shift.
	  if (Shl->hasNoUnsignedWrap()) {
	    if (Pred == ICmpInst::ICMP_UGT) {
	      // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
	      APInt ShiftedC = C.lshr(*ShiftAmt);
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	    if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
	        C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
	      APInt ShiftedC = C.lshr(*ShiftAmt);
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	    if (Pred == ICmpInst::ICMP_ULT) {
	      // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
	      // (X << S) <=u C is equiv to X <=u (C >> S) for all C
	      // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
	      // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
	      assert(C.ugt(0) && "ult 0 should have been eliminated");
	      APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
	      return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
	    }
	  }

	  if (Cmp.isEquality() && Shl->hasOneUse()) {
	    // Strength-reduce the shift into an 'and'.
	    Constant *Mask = ConstantInt::get(
	        ShType,
	        APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
	    Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
	    Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
	    return new ICmpInst(Pred, And, LShrC);
	  }

	  // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
	  bool TrueIfSigned = false;
	  if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
	    // (X << 31) <s 0 --> (X & 1) != 0
	    Constant *Mask = ConstantInt::get(
	        ShType,
	        APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
	    Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
	    return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
	                        And, Constant::getNullValue(ShType));
	  }

	  // Simplify 'shl' inequality test into 'and' equality test.
	  if (Cmp.isUnsigned() && Shl->hasOneUse()) {
	    // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
	    if ((C + 1).isPowerOf2() &&
	        (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
	      Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
	      return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
	                                                     : ICmpInst::ICMP_NE,
	                          And, Constant::getNullValue(ShType));
	    }
	    // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
	    if (C.isPowerOf2() &&
	        (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
	      Value *And =
	          Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
	      return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
	                                                     : ICmpInst::ICMP_NE,
	                          And, Constant::getNullValue(ShType));
	    }
	  }

	  // Transform (icmp pred iM (shl iM %v, N), C)
	  // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N))
	  // Transform the shl to a trunc if (trunc (C>>N)) has no loss and i(M-N) is a
	  // legal integer type.
	  // This enables us to get rid of the shift in favor of a trunc that may be
	  // free on the target. It has the additional benefit of comparing to a
	  // smaller constant that may be more target-friendly.
	  unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
	  if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
	      DL.isLegalInteger(TypeBits - Amt)) {
	    Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
	    if (ShType->isVectorTy())
	      TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
	    Constant *NewC =
	        ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
	    return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
	  }

	  return nullptr;
	}

	/// Fold icmp ({al}shr X, Y), C.
	///
	/// Dispatches to foldICmpShrConstConst() when the shifted value is constant
	/// and the compare is an equality. Otherwise requires a constant, in-range,
	/// non-zero shift amount. Returns the replacement instruction or nullptr.
	Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Shr,
	                                               const APInt &C) {
	  // An exact shr only shifts out zero bits, so:
	  // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
	  Value *X = Shr->getOperand(0);
	  CmpInst::Predicate Pred = Cmp.getPredicate();
	  if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
	      C.isNullValue())
	    return new ICmpInst(Pred, X, Cmp.getOperand(1));

	  const APInt *ShiftVal;
	  if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
	    return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);

	  const APInt *ShiftAmt;
	  if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
	    return nullptr;

	  // Check that the shift amount is in range. If not, don't perform undefined
	  // shifts. When the shift is visited it will be simplified.
	  unsigned TypeBits = C.getBitWidth();
	  unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
	  if (ShAmtVal >= TypeBits || ShAmtVal == 0)
	    return nullptr;

	  bool IsAShr = Shr->getOpcode() == Instruction::AShr;
	  bool IsExact = Shr->isExact();
	  Type *ShrTy = Shr->getType();
	  // TODO: If we could guarantee that InstSimplify would handle all of the
	  // constant-value-based preconditions in the folds below, then we could assert
	  // those conditions rather than checking them. This is difficult because of
	  // undef/poison (PR34838).
	  if (IsAShr) {
	    if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
	      // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
	      // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
	      APInt ShiftedC = C.shl(ShAmtVal);
	      // Only valid if shifting C left did not lose any bits.
	      if (ShiftedC.ashr(ShAmtVal) == C)
	        return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
	    }
	    if (Pred == CmpInst::ICMP_SGT) {
	      // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
	      APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
	      if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
	          (ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
	        return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
	    }
	  } else {
	    if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
	      // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
	      // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
	      APInt ShiftedC = C.shl(ShAmtVal);
	      // Only valid if shifting C left did not lose any bits.
	      if (ShiftedC.lshr(ShAmtVal) == C)
	        return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
	    }
	    if (Pred == CmpInst::ICMP_UGT) {
	      // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
	      APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
	      if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
	        return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
	    }
	  }

	  if (!Cmp.isEquality())
	    return nullptr;

	  // Handle equality comparisons of shift-by-constant.

	  // If the comparison constant changes with the shift, the comparison cannot
	  // succeed (bits of the comparison constant cannot match the shifted value).
	  // This should be known by InstSimplify and already be folded to true/false.
	  assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
	          (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
	         "Expected icmp+shr simplify did not occur.");

	  // If the bits shifted out are known zero, compare the unshifted value:
	  // (X & 4) >> 1 == 2 --> (X & 4) == 4.
	  if (Shr->isExact())
	    return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));

	  if (Shr->hasOneUse()) {
	    // Canonicalize the shift into an 'and':
	    // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
	    APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
	    Constant *Mask = ConstantInt::get(ShrTy, Val);
	    Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
	    return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
	  }

	  return nullptr;
	}

	/// Fold icmp (udiv X, Y), C.
	///
	/// Only handles a constant dividend (X == C2) compared with an unsigned
	/// greater-than or unsigned less-than predicate; the comparison is rewritten
	/// as a comparison of the divisor Y against a constant quotient. Returns the
	/// replacement instruction, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
	                                                BinaryOperator *UDiv,
	                                                const APInt &C) {
	  // The dividend has to be a constant (or splat constant) for these folds.
	  const APInt *DividendC;
	  if (!match(UDiv->getOperand(0), m_APInt(DividendC)))
	    return nullptr;

	  assert(*DividendC != 0 && "udiv 0, X should have been simplified already.");

	  Value *Divisor = UDiv->getOperand(1);
	  Type *DivisorTy = Divisor->getType();

	  switch (Cmp.getPredicate()) {
	  case ICmpInst::ICMP_UGT: {
	    // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
	    assert(!C.isMaxValue() &&
	           "icmp ugt X, UINT_MAX should have been simplified already.");
	    Constant *Quotient = ConstantInt::get(DivisorTy, DividendC->udiv(C + 1));
	    return new ICmpInst(ICmpInst::ICMP_ULE, Divisor, Quotient);
	  }
	  case ICmpInst::ICMP_ULT: {
	    // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
	    assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
	    Constant *Quotient = ConstantInt::get(DivisorTy, DividendC->udiv(C));
	    return new ICmpInst(ICmpInst::ICMP_UGT, Divisor, Quotient);
	  }
	  default:
	    break;
	  }

	  return nullptr;
	}

	/// Fold icmp ({su}div X, Y), C.
	///
	/// When the divisor Y is a constant C2, the comparison of the quotient
	/// against C is turned into a range check (or a single comparison) on X.
	/// Returns the replacement instruction, or nullptr when the divisor is not a
	/// constant, when the signedness of the divide and a relational compare
	/// differ, or when the divisor is 0, 1, or (for sdiv) -1.
	Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Div,
	                                               const APInt &C) {
	  // Fold: icmp pred ([us]div X, C2), C -> range test
	  // Fold this div into the comparison, producing a range check.
	  // Determine, based on the divide type, what the range is being
	  // checked.  If there is an overflow on the low or high side, remember
	  // it, otherwise compute the range [low, hi) bounding the new value.
	  // See: InsertRangeTest above for the kinds of replacements possible.
	  const APInt *C2;
	  if (!match(Div->getOperand(1), m_APInt(C2)))
	    return nullptr;

	  // FIXME: If the operand types don't match the type of the divide
	  // then don't attempt this transform. The code below doesn't have the
	  // logic to deal with a signed divide and an unsigned compare (and
	  // vice versa). This is because (x /s C2) <s C  produces different
	  // results than (x /s C2) <u C or (x /u C2) <s C or even
	  // (x /u C2) <u C.  Simply casting the operands and result won't
	  // work. :(  The if statement below tests that condition and bails
	  // if it finds it.
	  bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
	  if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
	    return nullptr;

	  // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
	  // INT_MIN will also fail if the divisor is 1. Although folds of all these
	  // division-by-constant cases should be present, we can not assert that they
	  // have happened before we reach this icmp instruction.
	  if (C2->isNullValue() || C2->isOneValue() ||
	      (DivIsSigned && C2->isAllOnesValue()))
	    return nullptr;

	  // Compute Prod = C * C2. We are essentially solving an equation of
	  // form X / C2 = C. We solve for X by multiplying C2 and C.
	  // By solving for X, we can turn this into a range check instead of computing
	  // a divide.
	  APInt Prod = C * *C2;

	  // Determine if the product overflows by seeing if the product is not equal to
	  // the divide. Make sure we do the same kind of divide as in the LHS
	  // instruction that we're folding.
	  bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;

	  ICmpInst::Predicate Pred = Cmp.getPredicate();

	  // If the division is known to be exact, then there is no remainder from the
	  // divide, so the covered range size is unit, otherwise it is the divisor.
	  APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;

	  // Figure out the interval that is being checked.  For example, a comparison
	  // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
	  // Compute this interval based on the constants involved and the signedness of
	  // the compare/divide.  This computes a half-open interval, keeping track of
	  // whether either value in the interval overflows.  After analysis each
	  // overflow variable is set to 0 if its corresponding bound variable is valid,
	  // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
	  int LoOverflow = 0, HiOverflow = 0;
	  APInt LoBound, HiBound;

	  if (!DivIsSigned) { // udiv
	    // e.g. X/5 op 3  --> [15, 20)
	    LoBound = Prod;
	    HiOverflow = LoOverflow = ProdOV;
	    if (!HiOverflow) {
	      // If this is not an exact divide, then many values in the range collapse
	      // to the same result value.
	      HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
	    }
	  } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
	    if (C.isNullValue()) {               // (X / pos) op 0
	      // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2)
	      LoBound = -(RangeSize - 1);
	      HiBound = RangeSize;
	    } else if (C.isStrictlyPositive()) { // (X / pos) op pos
	      LoBound = Prod;                    // e.g.   X/5 op 3 --> [15, 20)
	      HiOverflow = LoOverflow = ProdOV;
	      if (!HiOverflow)
	        HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
	    } else { // (X / pos) op neg
	      // e.g. X/5 op -3  --> [-15-4, -15+1) --> [-19, -14)
	      HiBound = Prod + 1;
	      LoOverflow = HiOverflow = ProdOV ? -1 : 0;
	      if (!LoOverflow) {
	        APInt DivNeg = -RangeSize;
	        LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
	      }
	    }
	  } else if (C2->isNegative()) { // Divisor is < 0.
	    if (Div->isExact())
	      RangeSize.negate();
	    if (C.isNullValue()) { // (X / neg) op 0
	      // e.g. X/-5 op 0  --> [-4, 5)
	      LoBound = RangeSize + 1;
	      HiBound = -RangeSize;
	      if (HiBound == *C2) { // -INTMIN = INTMIN
	        HiOverflow = 1;     // [INTMIN+1, overflow)
	        HiBound = APInt();  // e.g. X/INTMIN = 0 --> X > INTMIN
	      }
	    } else if (C.isStrictlyPositive()) { // (X / neg) op pos
	      // e.g. X/-5 op 3  --> [-19, -14)
	      HiBound = Prod + 1;
	      HiOverflow = LoOverflow = ProdOV ? -1 : 0;
	      if (!LoOverflow)
	        LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
	    } else {          // (X / neg) op neg
	      LoBound = Prod; // e.g. X/-5 op -3  --> [15, 20)
	      LoOverflow = HiOverflow = ProdOV;
	      if (!HiOverflow)
	        HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
	    }

	    // Dividing by a negative swaps the condition.  LT <-> GT
	    Pred = ICmpInst::getSwappedPredicate(Pred);
	  }

	  // Emit the comparison(s) on X implied by [LoBound, HiBound) and the
	  // overflow flags computed above.
	  Value *X = Div->getOperand(0);
	  switch (Pred) {
	  default: llvm_unreachable("Unhandled icmp opcode!");
	  case ICmpInst::ICMP_EQ:
	    if (LoOverflow && HiOverflow)
	      return replaceInstUsesWith(Cmp, Builder.getFalse());
	    if (HiOverflow)
	      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
	                          ICmpInst::ICMP_UGE, X,
	                          ConstantInt::get(Div->getType(), LoBound));
	    if (LoOverflow)
	      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
	                          ICmpInst::ICMP_ULT, X,
	                          ConstantInt::get(Div->getType(), HiBound));
	    return replaceInstUsesWith(
	        Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
	  case ICmpInst::ICMP_NE:
	    if (LoOverflow && HiOverflow)
	      return replaceInstUsesWith(Cmp, Builder.getTrue());
	    if (HiOverflow)
	      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
	                          ICmpInst::ICMP_ULT, X,
	                          ConstantInt::get(Div->getType(), LoBound));
	    if (LoOverflow)
	      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
	                          ICmpInst::ICMP_UGE, X,
	                          ConstantInt::get(Div->getType(), HiBound));
	    return replaceInstUsesWith(Cmp,
	                               insertRangeTest(X, LoBound, HiBound,
	                                               DivIsSigned, false));
	  case ICmpInst::ICMP_ULT:
	  case ICmpInst::ICMP_SLT:
	    if (LoOverflow == +1)   // Low bound is greater than input range.
	      return replaceInstUsesWith(Cmp, Builder.getTrue());
	    if (LoOverflow == -1)   // Low bound is less than input range.
	      return replaceInstUsesWith(Cmp, Builder.getFalse());
	    return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
	  case ICmpInst::ICMP_UGT:
	  case ICmpInst::ICMP_SGT:
	    if (HiOverflow == +1)       // High bound greater than input range.
	      return replaceInstUsesWith(Cmp, Builder.getFalse());
	    if (HiOverflow == -1)       // High bound less than input range.
	      return replaceInstUsesWith(Cmp, Builder.getTrue());
	    if (Pred == ICmpInst::ICMP_UGT)
	      return new ICmpInst(ICmpInst::ICMP_UGE, X,
	                          ConstantInt::get(Div->getType(), HiBound));
	    return new ICmpInst(ICmpInst::ICMP_SGE, X,
	                        ConstantInt::get(Div->getType(), HiBound));
	  }

	  return nullptr;
	}

	/// Fold icmp (sub X, Y), C.
	///
	/// Returns the replacement instruction, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Sub,
	                                               const APInt &C) {
	  Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  const APInt *C2;
	  APInt SubResult;

	  // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
	  // Only valid when the no-wrap flag matches the signedness of the compare
	  // and computing C2-C itself does not overflow.
	  if (match(X, m_APInt(C2)) &&
	      ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
	       (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
	      !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
	    return new ICmpInst(Cmp.getSwappedPredicate(), Y,
	                        ConstantInt::get(Y->getType(), SubResult));

	  // The following transforms are only worth it if the only user of the subtract
	  // is the icmp.
	  if (!Sub->hasOneUse())
	    return nullptr;

	  if (Sub->hasNoSignedWrap()) {
	    // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
	    if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
	      return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);

	    // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
	    if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
	      return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);

	    // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
	    if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
	      return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);

	    // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
	    if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
	      return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
	  }

	  // The remaining folds need a constant minuend.
	  if (!match(X, m_APInt(C2)))
	    return nullptr;

	  // C2 - Y <u C -> (Y | (C - 1)) == C2
	  //   iff (C2 & (C - 1)) == C - 1 and C is a power of 2
	  if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() &&
	      (*C2 & (C - 1)) == (C - 1))
	    return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X);

	  // C2 - Y >u C -> (Y | C) != C2
	  //   iff C2 & C == C and C + 1 is a power of 2
	  if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
	    return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);

	  return nullptr;
	}

	/// Fold icmp (add X, Y), C.
	///
	/// Only relational (non-equality) predicates with a constant addend Y == C2
	/// and a single-use add are handled here. Returns the replacement
	/// instruction, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
	                                               BinaryOperator *Add,
	                                               const APInt &C) {
	  Value *Y = Add->getOperand(1);
	  const APInt *C2;
	  if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
	    return nullptr;

	  // Fold icmp pred (add X, C2), C.
	  Value *X = Add->getOperand(0);
	  Type *Ty = Add->getType();
	  CmpInst::Predicate Pred = Cmp.getPredicate();

	  if (!Add->hasOneUse())
	    return nullptr;

	  // If the add does not wrap, we can always adjust the compare by subtracting
	  // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
	  // are canonicalized to SGT/SLT/UGT/ULT.
	  if ((Add->hasNoSignedWrap() &&
	       (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
	      (Add->hasNoUnsignedWrap() &&
	       (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
	    bool Overflow;
	    APInt NewC =
	        Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
	    // If there is overflow, the result must be true or false.
	    // TODO: Can we assert there is no overflow because InstSimplify always
	    // handles those cases?
	    if (!Overflow)
	      // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
	      return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
	  }

	  // Shift the range of values satisfying the compare by -C2; if the shifted
	  // range starts or ends at the signed/unsigned minimum, a single compare of
	  // X against the other bound is enough.
	  auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
	  const APInt &Upper = CR.getUpper();
	  const APInt &Lower = CR.getLower();
	  if (Cmp.isSigned()) {
	    if (Lower.isSignMask())
	      return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
	    if (Upper.isSignMask())
	      return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
	  } else {
	    if (Lower.isMinValue())
	      return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
	    if (Upper.isMinValue())
	      return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
	  }

	  // X + C2 <u C -> (X & -C) == -C2
	  //   iff C2 & (C - 1) == 0
	  //       C is a power of 2
	  if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0)
	    return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C),
	                        ConstantExpr::getNeg(cast<Constant>(Y)));

	  // X + C2 >u C -> (X & ~C) != -C2
	  //   iff C2 & C == 0
	  //       C + 1 is a power of 2
	  if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0)
	    return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
	                        ConstantExpr::getNeg(cast<Constant>(Y)));

	  return nullptr;
	}

	bool InstCombiner::matchThreeWayIntCompare(SelectInst SI, Value &LHS,
	Value &RHS, ConstantInt &Less,
	ConstantInt *&Equal,
	ConstantInt *&Greater) {
	// TODO: Generalize this to work with other comparison idioms or ensure
	// they get canonicalized into this form.

	// select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32
	// Greater), where Equal, Less and Greater are placeholders for any three
	// constants.
	ICmpInst::Predicate PredA, PredB;
	if (match(SI->getTrueValue(), m_ConstantInt(Equal)) &&
	match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) &&
	PredA == ICmpInst::ICMP_EQ &&
	match(SI->getFalseValue(),
	m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)),
	m_ConstantInt(Less), m_ConstantInt(Greater))) &&
	PredB == ICmpInst::ICMP_SLT) {
	return true;
	}
	return false;
	}

	/// Fold icmp Pred (select ...), C where the select chain is a three-way
	/// integer comparison (see matchThreeWayIntCompare). The compare of the
	/// three-way result against C is rewritten as an OR of direct comparisons of
	/// the original operands. Returns the replacement, or nullptr when the
	/// pattern does not match or Cmp has more than one use.
	Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
	                                                  SelectInst *Select,
	                                                  ConstantInt *C) {

	  assert(C && "Cmp RHS should be a constant int!");
	  // If we're testing a constant value against the result of a three way
	  // comparison, the result can be expressed directly in terms of the
	  // original values being compared.  Note: We could possibly be more
	  // aggressive here and remove the hasOneUse test. The original select is
	  // really likely to simplify or sink when we remove a test of the result.
	  Value *OrigLHS, *OrigRHS;
	  ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
	  if (Cmp.hasOneUse() &&
	      matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
	                              C3GreaterThan)) {
	    assert(C1LessThan && C2Equal && C3GreaterThan);

	    // Evaluate the original predicate against each of the three possible
	    // select results to learn which orderings make the compare true.
	    bool TrueWhenLessThan =
	        ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
	            ->isAllOnesValue();
	    bool TrueWhenEqual =
	        ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
	            ->isAllOnesValue();
	    bool TrueWhenGreaterThan =
	        ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
	            ->isAllOnesValue();

	    // This generates the new instruction that will replace the original Cmp
	    // Instruction. Instead of enumerating the various combinations when
	    // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
	    // false, we rely on chaining of ORs and future passes of InstCombine to
	    // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).

	    // When none of the three constants satisfy the predicate for the RHS (C),
	    // the entire original Cmp can be simplified to a false.
	    Value *Cond = Builder.getFalse();
	    if (TrueWhenLessThan)
	      Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT,
	                                                       OrigLHS, OrigRHS));
	    if (TrueWhenEqual)
	      Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ,
	                                                       OrigLHS, OrigRHS));
	    if (TrueWhenGreaterThan)
	      Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT,
	                                                       OrigLHS, OrigRHS));

	    return replaceInstUsesWith(Cmp, Cond);
	  }
	  return nullptr;
	}

	/// Fold an icmp whose first operand is a bitcast. Handles sign/zero checks
	/// through [su]itofp + bitcast, pointer-to-pointer bitcasts, and a bitcast
	/// of a splat shufflevector compared against a splat constant. Returns the
	/// replacement instruction, or nullptr when no fold applies.
	static Instruction *foldICmpBitCast(ICmpInst &Cmp,
	                                    InstCombiner::BuilderTy &Builder) {
	  auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
	  if (!Bitcast)
	    return nullptr;

	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *Op1 = Cmp.getOperand(1);
	  Value *BCSrcOp = Bitcast->getOperand(0);

	  // Make sure the bitcast doesn't change the number of vector elements.
	  if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
	          Bitcast->getDestTy()->getScalarSizeInBits()) {
	    // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
	    Value *X;
	    if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
	      // icmp  eq (bitcast (sitofp X)), 0 --> icmp  eq X, 0
	      // icmp  ne (bitcast (sitofp X)), 0 --> icmp  ne X, 0
	      // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
	      // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
	      if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
	           Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
	          match(Op1, m_Zero()))
	        return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));

	      // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
	      if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
	        return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));

	      // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
	      if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
	        return new ICmpInst(Pred, X,
	                            ConstantInt::getAllOnesValue(X->getType()));
	    }

	    // Zero-equality checks are preserved through unsigned floating-point casts:
	    // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
	    // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
	    if (match(BCSrcOp, m_UIToFP(m_Value(X))))
	      if (Cmp.isEquality() && match(Op1, m_Zero()))
	        return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
	  }

	  // Test to see if the operands of the icmp are casted versions of other
	  // values. If the ptr->ptr cast can be stripped off both arguments, do so.
	  if (Bitcast->getType()->isPointerTy() &&
	      (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
	    // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
	    // so eliminate it as well.
	    if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
	      Op1 = BC2->getOperand(0);

	    Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
	    return new ICmpInst(Pred, BCSrcOp, Op1);
	  }

	  // Folding: icmp <pred> iN X, C
	  //  where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
	  //    and C is a splat of a K-bit pattern
	  //    and SC is a constant vector = <C', C', C', ..., C'>
	  // Into:
	  //   %E = extractelement <M x iK> %vec, i32 C'
	  //   icmp <pred> iK %E, trunc(C)
	  const APInt *C;
	  if (!match(Cmp.getOperand(1), m_APInt(C)) ||
	      !Bitcast->getType()->isIntegerTy() ||
	      !Bitcast->getSrcTy()->isIntOrIntVectorTy())
	    return nullptr;

	  Value *Vec;
	  Constant *Mask;
	  if (match(BCSrcOp,
	            m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask)))) {
	    // Check whether every element of Mask is the same constant
	    if (auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue())) {
	      auto *VecTy = cast<VectorType>(BCSrcOp->getType());
	      auto *EltTy = cast<IntegerType>(VecTy->getElementType());
	      if (C->isSplat(EltTy->getBitWidth())) {
	        // Fold the icmp based on the value of C
	        // If C is M copies of an iK sized bit pattern,
	        // then:
	        //   =>  %E = extractelement <N x iK> %vec, i32 Elem
	        //       icmp <pred> iK %SplatVal, <pattern>
	        Value *Extract = Builder.CreateExtractElement(Vec, Elem);
	        Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
	        return new ICmpInst(Pred, Extract, NewC);
	      }
	    }
	  }
	  return nullptr;
	}

	/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
	/// where X is some kind of instruction.
	///
	/// Dispatches on the kind of the left-hand operand (binary operator, select,
	/// trunc, or intrinsic call) to the matching helper. Returns the first
	/// non-null result, or nullptr when the RHS is not a constant integer (or
	/// splat) or no helper produced a fold.
	Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
	  const APInt *C;
	  if (!match(Cmp.getOperand(1), m_APInt(C)))
	    return nullptr;

	  if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
	    switch (BO->getOpcode()) {
	    case Instruction::Xor:
	      if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::And:
	      if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::Or:
	      if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::Mul:
	      if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::Shl:
	      if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::LShr:
	    case Instruction::AShr:
	      if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::UDiv:
	      if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
	        return I;
	      // A udiv that was not folded above is also tried with the generic
	      // divide fold.
	      LLVM_FALLTHROUGH;
	    case Instruction::SDiv:
	      if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::Sub:
	      if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
	        return I;
	      break;
	    case Instruction::Add:
	      if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
	        return I;
	      break;
	    default:
	      break;
	    }
	    // TODO: These folds could be refactored to be part of the above calls.
	    if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
	      return I;
	  }

	  // Match against CmpInst LHS being instructions other than binary operators.

	  if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
	    // For now, we only support constant integers while folding the
	    // ICMP(SELECT)) pattern. We can extend this to support vector of integers
	    // similar to the cases handled by binary ops above.
	    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
	      if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
	        return I;
	  }

	  if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
	    if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
	      return I;
	  }

	  if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
	    if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
	      return I;

	  return nullptr;
	}

	/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
	/// icmp eq/ne BO, C.
	///
	/// Returns the replacement instruction, or nullptr when Cmp is not an
	/// equality compare or no opcode-specific fold applies.
	Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
	                                                             BinaryOperator *BO,
	                                                             const APInt &C) {
	  // TODO: Some of these folds could work with arbitrary constants, but this
	  // function is limited to scalar and vector splat constants.
	  if (!Cmp.isEquality())
	    return nullptr;

	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
	  Constant *RHS = cast<Constant>(Cmp.getOperand(1));
	  Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);

	  switch (BO->getOpcode()) {
	  case Instruction::SRem:
	    // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
	    if (C.isNullValue() && BO->hasOneUse()) {
	      const APInt *BOC;
	      if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
	        Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
	        return new ICmpInst(Pred, NewRem,
	                            Constant::getNullValue(BO->getType()));
	      }
	    }
	    break;
	  case Instruction::Add: {
	    // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
	    const APInt *BOC;
	    if (match(BOp1, m_APInt(BOC))) {
	      if (BO->hasOneUse()) {
	        Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1));
	        return new ICmpInst(Pred, BOp0, SubC);
	      }
	    } else if (C.isNullValue()) {
	      // Replace ((add A, B) != 0) with (A != -B) if A or B is
	      // efficiently invertible, or if the add has just this one use.
	      if (Value *NegVal = dyn_castNegVal(BOp1))
	        return new ICmpInst(Pred, BOp0, NegVal);
	      if (Value *NegVal = dyn_castNegVal(BOp0))
	        return new ICmpInst(Pred, NegVal, BOp1);
	      if (BO->hasOneUse()) {
	        Value *Neg = Builder.CreateNeg(BOp1);
	        Neg->takeName(BO);
	        return new ICmpInst(Pred, BOp0, Neg);
	      }
	    }
	    break;
	  }
	  case Instruction::Xor:
	    if (BO->hasOneUse()) {
	      if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
	        // For the xor case, we can xor two constants together, eliminating
	        // the explicit xor.
	        return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
	      } else if (C.isNullValue()) {
	        // Replace ((xor A, B) != 0) with (A != B)
	        return new ICmpInst(Pred, BOp0, BOp1);
	      }
	    }
	    break;
	  case Instruction::Sub:
	    if (BO->hasOneUse()) {
	      const APInt *BOC;
	      if (match(BOp0, m_APInt(BOC))) {
	        // Replace ((sub BOC, B) != C) with (B != BOC-C).
	        Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS);
	        return new ICmpInst(Pred, BOp1, SubC);
	      } else if (C.isNullValue()) {
	        // Replace ((sub A, B) != 0) with (A != B).
	        return new ICmpInst(Pred, BOp0, BOp1);
	      }
	    }
	    break;
	  case Instruction::Or: {
	    const APInt *BOC;
	    if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
	      // Comparing if all bits outside of a constant mask are set?
	      // Replace (X | C) == -1 with (X & ~C) == ~C.
	      // This removes the -1 constant.
	      Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
	      Value *And = Builder.CreateAnd(BOp0, NotBOC);
	      return new ICmpInst(Pred, And, NotBOC);
	    }
	    break;
	  }
	  case Instruction::And: {
	    const APInt *BOC;
	    if (match(BOp1, m_APInt(BOC))) {
	      // If we have ((X & C) == C), turn it into ((X & C) != 0).
	      if (C == *BOC && C.isPowerOf2())
	        return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
	                            BO, Constant::getNullValue(RHS->getType()));
	    }
	    break;
	  }
	  case Instruction::Mul:
	    if (C.isNullValue() && BO->hasNoSignedWrap()) {
	      const APInt *BOC;
	      if (match(BOp1, m_APInt(BOC)) && !BOC->isNullValue()) {
	        // The trivial case (mul X, 0) is handled by InstSimplify.
	        // General case : (mul X, C) != 0 iff X != 0
	        //                (mul X, C) == 0 iff X == 0
	        return new ICmpInst(Pred, BOp0, Constant::getNullValue(RHS->getType()));
	      }
	    }
	    break;
	  case Instruction::UDiv:
	    if (C.isNullValue()) {
	      // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
	      auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
	      return new ICmpInst(NewPred, BOp1, BOp0);
	    }
	    break;
	  default:
	    break;
	  }
	  return nullptr;
	}

	/// Fold an equality icmp with LLVM intrinsic and constant operand.
	///
	/// Handles bswap, ctlz/cttz and ctpop. The folds here rewrite the operands
	/// of Cmp in place and return &Cmp; nullptr is returned when nothing
	/// changed.
	Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
	                                                           IntrinsicInst *II,
	                                                           const APInt &C) {
	  Type *Ty = II->getType();
	  unsigned BitWidth = C.getBitWidth();
	  switch (II->getIntrinsicID()) {
	  case Intrinsic::bswap:
	    // bswap(A) == C -> A == bswap(C)
	    Worklist.Add(II);
	    Cmp.setOperand(0, II->getArgOperand(0));
	    Cmp.setOperand(1, ConstantInt::get(Ty, C.byteSwap()));
	    return &Cmp;

	  case Intrinsic::ctlz:
	  case Intrinsic::cttz: {
	    // ctz(A) == bitwidth(A)  ->  A == 0 and likewise for !=
	    if (C == BitWidth) {
	      Worklist.Add(II);
	      Cmp.setOperand(0, II->getArgOperand(0));
	      Cmp.setOperand(1, ConstantInt::getNullValue(Ty));
	      return &Cmp;
	    }

	    // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
	    // and Mask1 has the low C+1 bits (bits 0..C) set. Similar for ctlz, but
	    // for high bits.
	    // Limit to one use to ensure we don't increase instruction count.
	    unsigned Num = C.getLimitedValue(BitWidth);
	    if (Num != BitWidth && II->hasOneUse()) {
	      bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
	      APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
	                               : APInt::getHighBitsSet(BitWidth, Num + 1);
	      APInt Mask2 = IsTrailing
	        ? APInt::getOneBitSet(BitWidth, Num)
	        : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
	      Cmp.setOperand(0, Builder.CreateAnd(II->getArgOperand(0), Mask1));
	      Cmp.setOperand(1, ConstantInt::get(Ty, Mask2));
	      Worklist.Add(II);
	      return &Cmp;
	    }
	    break;
	  }

	  case Intrinsic::ctpop: {
	    // popcount(A) == 0  ->  A == 0 and likewise for !=
	    // popcount(A) == bitwidth(A)  ->  A == -1 and likewise for !=
	    bool IsZero = C.isNullValue();
	    if (IsZero || C == BitWidth) {
	      Worklist.Add(II);
	      Cmp.setOperand(0, II->getArgOperand(0));
	      auto *NewOp =
	          IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty);
	      Cmp.setOperand(1, NewOp);
	      return &Cmp;
	    }
	    break;
	  }
	  default:
	    break;
	  }

	  return nullptr;
	}

	/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
	///
	/// Equality predicates are forwarded to foldICmpEqIntrinsicWithConstant.
	/// Otherwise unsigned greater-than / less-than compares of ctlz and cttz
	/// results are rewritten as compares on the intrinsic's argument. Returns
	/// the replacement instruction, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
	                                                         IntrinsicInst *II,
	                                                         const APInt &C) {
	  if (Cmp.isEquality())
	    return foldICmpEqIntrinsicWithConstant(Cmp, II, C);

	  Type *Ty = II->getType();
	  unsigned BitWidth = C.getBitWidth();
	  const ICmpInst::Predicate Pred = Cmp.getPredicate();

	  // Both intrinsics share the same preconditions on the predicate and on the
	  // range of the constant.
	  const bool IsUGTBelowWidth = Pred == ICmpInst::ICMP_UGT && C.ult(BitWidth);
	  const bool IsULTWithinWidth =
	      Pred == ICmpInst::ICMP_ULT && C.uge(1) && C.ule(BitWidth);

	  switch (II->getIntrinsicID()) {
	  case Intrinsic::ctlz: {
	    Value *Arg = II->getArgOperand(0);

	    if (IsUGTBelowWidth) {
	      // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
	      unsigned Num = C.getLimitedValue();
	      APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
	      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT, Arg,
	                             ConstantInt::get(Ty, Limit));
	    }

	    if (IsULTWithinWidth) {
	      // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
	      unsigned Num = C.getLimitedValue();
	      APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
	      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT, Arg,
	                             ConstantInt::get(Ty, Limit));
	    }
	    break;
	  }
	  case Intrinsic::cttz: {
	    // Limit to one use to ensure we don't increase instruction count.
	    if (!II->hasOneUse())
	      break;

	    if (IsUGTBelowWidth) {
	      // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
	      APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
	      Value *Masked = Builder.CreateAnd(II->getArgOperand(0), Mask);
	      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ, Masked,
	                             ConstantInt::getNullValue(Ty));
	    }

	    if (IsULTWithinWidth) {
	      // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
	      APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
	      Value *Masked = Builder.CreateAnd(II->getArgOperand(0), Mask);
	      return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Masked,
	                             ConstantInt::getNullValue(Ty));
	    }
	    break;
	  }
	  default:
	    break;
	  }

	  return nullptr;
	}

	/// Handle icmp with constant (but not simple integer constant) RHS.
	///
	/// The LHS must be an instruction; folds are attempted for GEP, PHI,
	/// select, inttoptr and load operands. Returns the replacement instruction,
	/// or nullptr when the operands do not qualify or no fold applies.
	Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Constant *RHSC = dyn_cast<Constant>(Op1);
	  Instruction *LHSI = dyn_cast<Instruction>(Op0);
	  if (!RHSC || !LHSI)
	    return nullptr;

	  switch (LHSI->getOpcode()) {
	  case Instruction::GetElementPtr:
	    // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
	    if (RHSC->isNullValue() &&
	        cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
	      return new ICmpInst(
	          I.getPredicate(), LHSI->getOperand(0),
	          Constant::getNullValue(LHSI->getOperand(0)->getType()));
	    break;
	  case Instruction::PHI:
	    // Only fold icmp into the PHI if the phi and icmp are in the same
	    // block.  If in the same block, we're encouraging jump threading.  If
	    // not, we are just pessimizing the code by making an i1 phi.
	    if (LHSI->getParent() == I.getParent())
	      if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
	        return NV;
	    break;
	  case Instruction::Select: {
	    // If either operand of the select is a constant, we can fold the
	    // comparison into the select arms, which will cause one to be
	    // constant folded and the select turned into a bitwise or.
	    // Note: these locals intentionally shadow the outer Op1 within this case.
	    Value *Op1 = nullptr, *Op2 = nullptr;
	    ConstantInt *CI = nullptr;
	    if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
	      Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
	      CI = dyn_cast<ConstantInt>(Op1);
	    }
	    if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
	      Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
	      CI = dyn_cast<ConstantInt>(Op2);
	    }

	    // We only want to perform this transformation if it will not lead to
	    // additional code. This is true if either both sides of the select
	    // fold to a constant (in which case the icmp is replaced with a select
	    // which will usually simplify) or this is the only user of the
	    // select (in which case we are trading a select+icmp for a simpler
	    // select+icmp) or all uses of the select can be replaced based on
	    // dominance information ("Global cases").
	    bool Transform = false;
	    if (Op1 && Op2)
	      Transform = true;
	    else if (Op1 || Op2) {
	      // Local case
	      if (LHSI->hasOneUse())
	        Transform = true;
	      // Global cases
	      else if (CI && !CI->isZero())
	        // When Op1 is constant try replacing select with second operand.
	        // Otherwise Op2 is constant and try replacing select with first
	        // operand.
	        Transform =
	            replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
	    }
	    if (Transform) {
	      // Materialize the compare for whichever arm did not constant-fold.
	      if (!Op1)
	        Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
	                                 I.getName());
	      if (!Op2)
	        Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
	                                 I.getName());
	      return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
	    }
	    break;
	  }
	  case Instruction::IntToPtr:
	    // icmp pred inttoptr(X), null -> icmp pred X, 0
	    if (RHSC->isNullValue() &&
	        DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
	      return new ICmpInst(
	          I.getPredicate(), LHSI->getOperand(0),
	          Constant::getNullValue(LHSI->getOperand(0)->getType()));
	    break;

	  case Instruction::Load:
	    // Try to optimize things like "A[i] > 4" to index computations.
	    if (GetElementPtrInst *GEP =
	            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
	      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
	        if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
	            !cast<LoadInst>(LHSI)->isVolatile())
	          if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
	            return Res;
	    }
	    break;
	  }

	  return nullptr;
	}

	/// Some comparisons can be simplified.
	/// In this case, we are looking for comparisons that look like
	/// a check for a lossy truncation.
	/// Folds:
	///   icmp SrcPred (x & Mask), x    to    icmp DstPred x, Mask
	/// Where Mask is some pattern that produces all-ones in low bits:
	///    (-1 >> y)
	///    ((-1 << y) >> y)     <- non-canonical, has extra uses
	///   ~(-1 << y)
	///    ((1 << y) + (-1))    <- non-canonical, has extra uses
	/// The Mask can be a constant, too.
	/// For some predicates, the operands are commutative.
	/// For others, x can only be on a specific side.
	///
	/// \param I        the comparison to inspect.
	/// \param Builder  used to create the replacement comparison.
	/// \returns the new comparison value, or nullptr if the pattern (or the
	///          side/constness requirements of a signed predicate) do not match.
	static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
	                                          InstCombiner::BuilderTy &Builder) {
	  ICmpInst::Predicate SrcPred;
	  Value *X, *M, *Y;
	  auto m_VariableMask = m_CombineOr(
	      m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
	                  m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
	      m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
	                  m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
	  auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
	  // Bind M to the mask and X to the value that appears both under the 'and'
	  // and as the other icmp operand.
	  if (!match(&I, m_c_ICmp(SrcPred,
	                          m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
	                          m_Deferred(X))))
	    return nullptr;

	  ICmpInst::Predicate DstPred;
	  switch (SrcPred) {
	  case ICmpInst::Predicate::ICMP_EQ:
	    //  x & (-1 >> y) == x    ->    x u<= (-1 >> y)
	    DstPred = ICmpInst::Predicate::ICMP_ULE;
	    break;
	  case ICmpInst::Predicate::ICMP_NE:
	    //  x & (-1 >> y) != x    ->    x u> (-1 >> y)
	    DstPred = ICmpInst::Predicate::ICMP_UGT;
	    break;
	  case ICmpInst::Predicate::ICMP_UGT:
	    //  x u> x & (-1 >> y)    ->    x u> (-1 >> y)
	    assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
	    DstPred = ICmpInst::Predicate::ICMP_UGT;
	    break;
	  case ICmpInst::Predicate::ICMP_UGE:
	    //  x & (-1 >> y) u>= x    ->    x u<= (-1 >> y)
	    assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
	    DstPred = ICmpInst::Predicate::ICMP_ULE;
	    break;
	  case ICmpInst::Predicate::ICMP_ULT:
	    //  x & (-1 >> y) u< x    ->    x u> (-1 >> y)
	    assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
	    DstPred = ICmpInst::Predicate::ICMP_UGT;
	    break;
	  case ICmpInst::Predicate::ICMP_ULE:
	    //  x u<= x & (-1 >> y)    ->    x u<= (-1 >> y)
	    assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
	    DstPred = ICmpInst::Predicate::ICMP_ULE;
	    break;
	  case ICmpInst::Predicate::ICMP_SGT:
	    //  x s> x & (-1 >> y)    ->    x s> (-1 >> y)
	    if (X != I.getOperand(0)) // X must be on LHS of comparison!
	      return nullptr;         // Ignore the other case.
	    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
	      return nullptr;
	    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
	      return nullptr;
	    DstPred = ICmpInst::Predicate::ICMP_SGT;
	    break;
	  case ICmpInst::Predicate::ICMP_SGE:
	    //  x & (-1 >> y) s>= x    ->    x s<= (-1 >> y)
	    if (X != I.getOperand(1)) // X must be on RHS of comparison!
	      return nullptr;         // Ignore the other case.
	    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
	      return nullptr;
	    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
	      return nullptr;
	    DstPred = ICmpInst::Predicate::ICMP_SLE;
	    break;
	  case ICmpInst::Predicate::ICMP_SLT:
	    //  x & (-1 >> y) s< x    ->    x s> (-1 >> y)
	    if (X != I.getOperand(1)) // X must be on RHS of comparison!
	      return nullptr;         // Ignore the other case.
	    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
	      return nullptr;
	    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
	      return nullptr;
	    DstPred = ICmpInst::Predicate::ICMP_SGT;
	    break;
	  case ICmpInst::Predicate::ICMP_SLE:
	    //  x s<= x & (-1 >> y)    ->    x s<= (-1 >> y)
	    if (X != I.getOperand(0)) // X must be on LHS of comparison!
	      return nullptr;         // Ignore the other case.
	    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
	      return nullptr;
	    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
	      return nullptr;
	    DstPred = ICmpInst::Predicate::ICMP_SLE;
	    break;
	  default:
	    llvm_unreachable("All possible folds are handled.");
	  }

	  return Builder.CreateICmp(DstPred, X, M);
	}

	/// Some comparisons can be simplified.
	/// In this case, we are looking for comparisons that look like
	/// a check for a lossy signed truncation.
	/// Folds:   (MaskedBits is a constant.)
	///   ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
	/// Into:
	///   (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
	/// Where  KeptBits = bitwidth(%x) - MaskedBits
	///
	/// \param I        the comparison to inspect.
	/// \param Builder  used to create the replacement add and icmp.
	/// \returns the new comparison value, or nullptr if the pattern does not
	///          match, the two shift amounts differ, or the predicate is not
	///          eq/ne.
	static Value *
	foldICmpWithTruncSignExtendedVal(ICmpInst &I,
	                                 InstCombiner::BuilderTy &Builder) {
	  ICmpInst::Predicate SrcPred;
	  Value *X;
	  const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
	  // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
	  if (!match(&I, m_c_ICmp(SrcPred,
	                          m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
	                                          m_APInt(C1))),
	                          m_Deferred(X))))
	    return nullptr;

	  // Potential handling of non-splats: for each element:
	  //  * if both are undef, replace with constant 0.
	  //    Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
	  //  * if both are not undef, and are different, bailout.
	  //  * else, only one is undef, then pick the non-undef one.

	  // The shift amount must be equal. Compare the APInt values, not the
	  // addresses of the matched constants.
	  if (*C0 != *C1)
	    return nullptr;
	  const APInt &MaskedBits = *C0;
	  assert(MaskedBits != 0 && "shift by zero should be folded away already.");

	  ICmpInst::Predicate DstPred;
	  switch (SrcPred) {
	  case ICmpInst::Predicate::ICMP_EQ:
	    // ((%x << MaskedBits) a>> MaskedBits) == %x
	    //   =>
	    // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
	    DstPred = ICmpInst::Predicate::ICMP_ULT;
	    break;
	  case ICmpInst::Predicate::ICMP_NE:
	    // ((%x << MaskedBits) a>> MaskedBits) != %x
	    //   =>
	    // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
	    DstPred = ICmpInst::Predicate::ICMP_UGE;
	    break;
	  // FIXME: are more folds possible?
	  default:
	    return nullptr;
	  }

	  auto *XType = X->getType();
	  const unsigned XBitWidth = XType->getScalarSizeInBits();
	  const APInt BitWidth = APInt(XBitWidth, XBitWidth);
	  assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");

	  // KeptBits = bitwidth(%x) - MaskedBits
	  const APInt KeptBits = BitWidth - MaskedBits;
	  assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
	  // ICmpCst = (1 << KeptBits)
	  const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
	  assert(ICmpCst.isPowerOf2());
	  // AddCst = (1 << (KeptBits-1))
	  const APInt AddCst = ICmpCst.lshr(1);
	  assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());

	  // T0 = add %x, AddCst
	  Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
	  // T1 = T0 DstPred ICmpCst
	  Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));

	  return T1;
	}

	// Given pattern:
	// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
	// we should move shifts to the same hand of 'and', i.e. rewrite as
	// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
	// We are only interested in opposite logical shifts here.
	// If we can, we want to end up creating 'lshr' shift.
	//
	// Returns the replacement comparison created through Builder, or nullptr
	// when the pattern does not match, Q+K cannot be simplified, or the
	// simplified amount is not matched as u< bitwidth(x). SQ provides the
	// simplification query used to fold Q+K, with I as the context instruction.
	//
	// NOTE(review): this span is shown in unified-diff form; lines prefixed
	// with '-' are the previous revision and lines prefixed with '+' are the
	// new revision of the same function.
	static Value *
	foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
	InstCombiner::BuilderTy &Builder) {
	// Only equality comparisons against zero, whose LHS has a single use, are
	// candidates for this fold.
	if (!I.isEquality() \|\| !match(I.getOperand(1), m_Zero()) \|\|
	!I.getOperand(0)->hasOneUse())
	return nullptr;

	auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
	auto m_AnyLShr = m_LShr(m_Value(), m_Value());

	// Look for an 'and' of two (opposite) logical shifts.
	// Pick the single-use shift as XShift.
	- Value XShift, YShift;
	+ Instruction XShift, YShift;
	if (!match(I.getOperand(0),
	- m_c_And(m_OneUse(m_CombineAnd(m_AnyLogicalShift, m_Value(XShift))),
	- m_CombineAnd(m_AnyLogicalShift, m_Value(YShift)))))
	+ m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
	+ m_CombineAnd(m_AnyLogicalShift, m_Instruction(YShift)))))
	return nullptr;

	- // If YShift is a single-use 'lshr', swap the shifts around.
	- if (match(YShift, m_OneUse(m_AnyLShr)))
	+ // If YShift is a 'lshr', swap the shifts around.
	+ if (match(YShift, m_AnyLShr))
	std::swap(XShift, YShift);

	// The shifts must be in opposite directions.
	- Instruction::BinaryOps XShiftOpcode =
	- cast<BinaryOperator>(XShift)->getOpcode();
	- if (XShiftOpcode == cast<BinaryOperator>(YShift)->getOpcode())
	+ auto XShiftOpcode = XShift->getOpcode();
	+ if (XShiftOpcode == YShift->getOpcode())
	return nullptr; // Do not care about same-direction shifts here.

	// Decompose both shifts into the shifted value and the shift amount.
	Value X, XShAmt, Y, YShAmt;
	match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt)));
	match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt)));
	+
	+ // If one of the values being shifted is a constant, then we will end with
	+ // and+icmp, and shift instr will be constant-folded. If they are not,
	+ // however, we will need to ensure that we won't increase instruction count.
	+ if (!isa<Constant>(X) && !isa<Constant>(Y)) {
	+ // At least one of the hands of the 'and' should be one-use shift.
	+ if (!match(I.getOperand(0),
	+ m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
	+ return nullptr;
	+ }

	// Can we fold (XShAmt+YShAmt) ?
	Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt,
	SQ.getWithInstruction(&I));
	if (!NewShAmt)
	return nullptr;
	// Is the new shift amount smaller than the bit width?
	// FIXME: could also rely on ConstantRange.
	unsigned BitWidth = X->getType()->getScalarSizeInBits();
	if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
	APInt(BitWidth, BitWidth))))
	return nullptr;
	// All good, we can do this fold. The shift is the same that was for X.
	Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
	? Builder.CreateLShr(X, NewShAmt)
	: Builder.CreateShl(X, NewShAmt);
	Value *T1 = Builder.CreateAnd(T0, Y);
	return Builder.CreateICmp(I.getPredicate(), T1,
	Constant::getNullValue(X->getType()));
	}

	/// Try to fold icmp (binop), X or icmp X, (binop).
	/// TODO: A large part of this logic is duplicated in InstSimplify's
	/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
	/// duplication.
	///
	/// \param I  the comparison to inspect; at least one operand must be a
	///           BinaryOperator for any fold to be attempted.
	/// \returns a replacement instruction, or nullptr when no fold applies.
	Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

	  // Special logic for binary operators.
	  BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
	  BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
	  if (!BO0 && !BO1)
	    return nullptr;

	  const CmpInst::Predicate Pred = I.getPredicate();
	  Value *X;

	  // Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
	  // (Op1 + X) <u Op1 --> ~Op1 <u X
	  // Op0 >u (Op0 + X) --> X >u ~Op0
	  if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
	      Pred == ICmpInst::ICMP_ULT)
	    return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
	  if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
	      Pred == ICmpInst::ICMP_UGT)
	    return new ICmpInst(Pred, X, Builder.CreateNot(Op0));

	  // A side has "no wrap problem" when the predicate is an equality, or when
	  // the binop carries the nuw/nsw flag matching the predicate's signedness.
	  bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
	  if (BO0 && isa<OverflowingBinaryOperator>(BO0))
	    NoOp0WrapProblem =
	        ICmpInst::isEquality(Pred) ||
	        (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
	        (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
	  if (BO1 && isa<OverflowingBinaryOperator>(BO1))
	    NoOp1WrapProblem =
	        ICmpInst::isEquality(Pred) ||
	        (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
	        (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());

	  // Analyze the case when either Op0 or Op1 is an add instruction.
	  // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
	  Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
	  if (BO0 && BO0->getOpcode() == Instruction::Add) {
	    A = BO0->getOperand(0);
	    B = BO0->getOperand(1);
	  }
	  if (BO1 && BO1->getOpcode() == Instruction::Add) {
	    C = BO1->getOperand(0);
	    D = BO1->getOperand(1);
	  }

	  // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
	  if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
	    return new ICmpInst(Pred, A == Op1 ? B : A,
	                        Constant::getNullValue(Op1->getType()));

	  // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
	  if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
	    return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
	                        C == Op0 ? D : C);

	  // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
	  if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
	      NoOp1WrapProblem &&
	      // Try not to increase register pressure.
	      BO0->hasOneUse() && BO1->hasOneUse()) {
	    // Determine Y and Z in the form icmp (X+Y), (X+Z).
	    Value *Y, *Z;
	    if (A == C) {
	      // C + B == C + D  ->  B == D
	      Y = B;
	      Z = D;
	    } else if (A == D) {
	      // D + B == C + D  ->  B == C
	      Y = B;
	      Z = C;
	    } else if (B == C) {
	      // A + C == C + D  ->  A == D
	      Y = A;
	      Z = D;
	    } else {
	      assert(B == D);
	      // A + D == C + D  ->  A == C
	      Y = A;
	      Z = C;
	    }
	    return new ICmpInst(Pred, Y, Z);
	  }

	  // icmp slt (X + -1), Y -> icmp sle X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
	      match(B, m_AllOnes()))
	    return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);

	  // icmp sge (X + -1), Y -> icmp sgt X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
	      match(B, m_AllOnes()))
	    return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);

	  // icmp sle (X + 1), Y -> icmp slt X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
	    return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);

	  // icmp sgt (X + 1), Y -> icmp sge X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
	    return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);

	  // icmp sgt X, (Y + -1) -> icmp sge X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
	      match(D, m_AllOnes()))
	    return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);

	  // icmp sle X, (Y + -1) -> icmp slt X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
	      match(D, m_AllOnes()))
	    return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);

	  // icmp sge X, (Y + 1) -> icmp sgt X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
	    return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);

	  // icmp slt X, (Y + 1) -> icmp sle X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
	    return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);

	  // TODO: The subtraction-related identities shown below also hold, but
	  // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
	  // wouldn't happen even if they were implemented.
	  //
	  // icmp ult (X - 1), Y -> icmp ule X, Y
	  // icmp uge (X - 1), Y -> icmp ugt X, Y
	  // icmp ugt X, (Y - 1) -> icmp uge X, Y
	  // icmp ule X, (Y - 1) -> icmp ult X, Y

	  // icmp ule (X + 1), Y -> icmp ult X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
	    return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);

	  // icmp ugt (X + 1), Y -> icmp uge X, Y
	  if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
	    return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);

	  // icmp uge X, (Y + 1) -> icmp ugt X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
	    return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);

	  // icmp ult X, (Y + 1) -> icmp ule X, Y
	  if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
	    return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);

	  // if C1 has greater magnitude than C2:
	  //  icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
	  //  s.t. C3 = C1 - C2
	  //
	  // if C2 has greater magnitude than C1:
	  //  icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
	  //  s.t. C3 = C2 - C1
	  if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
	      (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
	    if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
	      if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
	        const APInt &AP1 = C1->getValue();
	        const APInt &AP2 = C2->getValue();
	        // Only handled when both constants have the same sign.
	        if (AP1.isNegative() == AP2.isNegative()) {
	          APInt AP1Abs = C1->getValue().abs();
	          APInt AP2Abs = C2->getValue().abs();
	          if (AP1Abs.uge(AP2Abs)) {
	            ConstantInt *C3 = Builder.getInt(AP1 - AP2);
	            Value *NewAdd = Builder.CreateNSWAdd(A, C3);
	            return new ICmpInst(Pred, NewAdd, C);
	          } else {
	            ConstantInt *C3 = Builder.getInt(AP2 - AP1);
	            Value *NewAdd = Builder.CreateNSWAdd(C, C3);
	            return new ICmpInst(Pred, A, NewAdd);
	          }
	        }
	      }

	  // Analyze the case when either Op0 or Op1 is a sub instruction.
	  // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
	  A = nullptr;
	  B = nullptr;
	  C = nullptr;
	  D = nullptr;
	  if (BO0 && BO0->getOpcode() == Instruction::Sub) {
	    A = BO0->getOperand(0);
	    B = BO0->getOperand(1);
	  }
	  if (BO1 && BO1->getOpcode() == Instruction::Sub) {
	    C = BO1->getOperand(0);
	    D = BO1->getOperand(1);
	  }

	  // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
	  if (A == Op1 && NoOp0WrapProblem)
	    return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
	  // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
	  if (C == Op0 && NoOp1WrapProblem)
	    return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));

	  // (A - B) >u A --> A <u B
	  if (A == Op1 && Pred == ICmpInst::ICMP_UGT)
	    return new ICmpInst(ICmpInst::ICMP_ULT, A, B);
	  // C <u (C - D) --> C <u D
	  if (C == Op0 && Pred == ICmpInst::ICMP_ULT)
	    return new ICmpInst(ICmpInst::ICMP_ULT, C, D);

	  // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
	  if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
	      // Try not to increase register pressure.
	      BO0->hasOneUse() && BO1->hasOneUse())
	    return new ICmpInst(Pred, A, C);
	  // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
	  if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
	      // Try not to increase register pressure.
	      BO0->hasOneUse() && BO1->hasOneUse())
	    return new ICmpInst(Pred, D, B);

	  // icmp (0-X) < cst --> x > -cst
	  if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
	    Value *X;
	    if (match(BO0, m_Neg(m_Value(X))))
	      if (Constant *RHSC = dyn_cast<Constant>(Op1))
	        if (RHSC->isNotMinSignedValue())
	          return new ICmpInst(I.getSwappedPredicate(), X,
	                              ConstantExpr::getNeg(RHSC));
	  }

	  BinaryOperator *SRem = nullptr;
	  // icmp (srem X, Y), Y
	  if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
	    SRem = BO0;
	  // icmp Y, (srem X, Y)
	  else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
	           Op0 == BO1->getOperand(1))
	    SRem = BO1;
	  if (SRem) {
	    // We don't check hasOneUse to avoid increasing register pressure because
	    // the value we use is the same value this instruction was already using.
	    switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
	    default:
	      break;
	    case ICmpInst::ICMP_EQ:
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    case ICmpInst::ICMP_NE:
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    case ICmpInst::ICMP_SGT:
	    case ICmpInst::ICMP_SGE:
	      return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
	                          Constant::getAllOnesValue(SRem->getType()));
	    case ICmpInst::ICMP_SLT:
	    case ICmpInst::ICMP_SLE:
	      return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
	                          Constant::getNullValue(SRem->getType()));
	    }
	  }

	  // Both sides are single-use binops with the same opcode and the same
	  // second operand: try to compare the first operands directly.
	  if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
	      BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
	    switch (BO0->getOpcode()) {
	    default:
	      break;
	    case Instruction::Add:
	    case Instruction::Sub:
	    case Instruction::Xor: {
	      if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
	        return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));

	      const APInt *C;
	      if (match(BO0->getOperand(1), m_APInt(C))) {
	        // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
	        if (C->isSignMask()) {
	          ICmpInst::Predicate NewPred =
	              I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
	          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
	        }

	        // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
	        if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
	          ICmpInst::Predicate NewPred =
	              I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
	          NewPred = I.getSwappedPredicate(NewPred);
	          return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
	        }
	      }
	      break;
	    }
	    case Instruction::Mul: {
	      if (!I.isEquality())
	        break;

	      const APInt *C;
	      if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
	          !C->isOneValue()) {
	        // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
	        // Mask = -1 >> count-trailing-zeros(C).
	        if (unsigned TZs = C->countTrailingZeros()) {
	          Constant *Mask = ConstantInt::get(
	              BO0->getType(),
	              APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
	          Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
	          Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
	          return new ICmpInst(Pred, And1, And2);
	        }
	        // If there are no trailing zeros in the multiplier, just eliminate
	        // the multiplies (no masking is needed):
	        // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y
	        return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
	      }
	      break;
	    }
	    case Instruction::UDiv:
	    case Instruction::LShr:
	      if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
	        break;
	      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));

	    case Instruction::SDiv:
	      if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
	        break;
	      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));

	    case Instruction::AShr:
	      if (!BO0->isExact() || !BO1->isExact())
	        break;
	      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));

	    case Instruction::Shl: {
	      bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
	      bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
	      if (!NUW && !NSW)
	        break;
	      if (!NSW && I.isSigned())
	        break;
	      return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
	    }
	    }
	  }

	  if (BO0) {
	    // Transform  A & (L - 1) `ult` L --> L != 0
	    auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
	    auto BitwiseAnd = m_c_And(m_Value(), LSubOne);

	    if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
	      auto *Zero = Constant::getNullValue(BO0->getType());
	      return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
	    }
	  }

	  // Remaining folds build their result through Builder and return a Value,
	  // so the original icmp's uses are redirected to it.
	  if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
	    return replaceInstUsesWith(I, V);

	  if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
	    return replaceInstUsesWith(I, V);

	  if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
	    return replaceInstUsesWith(I, V);

	  return nullptr;
	}

	/// Fold icmp Pred min|max(X, Y), X.
	///
	/// \param Cmp  the comparison to inspect; the min/max may be on either side.
	/// \returns a new comparison of X and Y, or nullptr when no operand is a
	///          matching min/max of the other or the predicate is one that is
	///          left to InstSimplify.
	static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
	  ICmpInst::Predicate Pred = Cmp.getPredicate();
	  Value *Op0 = Cmp.getOperand(0);
	  Value *X = Cmp.getOperand(1);

	  // Canonicalize minimum or maximum operand to LHS of the icmp.
	  if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
	      match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
	      match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
	      match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
	    std::swap(Op0, X);
	    Pred = Cmp.getSwappedPredicate();
	  }

	  Value *Y;
	  if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
	    // smin(X, Y)  == X --> X s<= Y
	    // smin(X, Y) s>= X --> X s<= Y
	    if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
	      return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);

	    // smin(X, Y) != X --> X s> Y
	    // smin(X, Y) s< X --> X s> Y
	    if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
	      return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);

	    // These cases should be handled in InstSimplify:
	    // smin(X, Y) s<= X --> true
	    // smin(X, Y) s> X --> false
	    return nullptr;
	  }

	  if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
	    // smax(X, Y)  == X --> X s>= Y
	    // smax(X, Y) s<= X --> X s>= Y
	    if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
	      return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);

	    // smax(X, Y) != X --> X s< Y
	    // smax(X, Y) s> X --> X s< Y
	    if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
	      return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);

	    // These cases should be handled in InstSimplify:
	    // smax(X, Y) s>= X --> true
	    // smax(X, Y) s< X --> false
	    return nullptr;
	  }

	  if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
	    // umin(X, Y)  == X --> X u<= Y
	    // umin(X, Y) u>= X --> X u<= Y
	    if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
	      return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);

	    // umin(X, Y) != X --> X u> Y
	    // umin(X, Y) u< X --> X u> Y
	    if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
	      return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);

	    // These cases should be handled in InstSimplify:
	    // umin(X, Y) u<= X --> true
	    // umin(X, Y) u> X --> false
	    return nullptr;
	  }

	  if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
	    // umax(X, Y)  == X --> X u>= Y
	    // umax(X, Y) u<= X --> X u>= Y
	    if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
	      return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);

	    // umax(X, Y) != X --> X u< Y
	    // umax(X, Y) u> X --> X u< Y
	    if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
	      return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);

	    // These cases should be handled in InstSimplify:
	    // umax(X, Y) u>= X --> true
	    // umax(X, Y) u< X --> false
	    return nullptr;
	  }

	  return nullptr;
	}

	/// Try to fold an equality comparison (icmp eq/ne) whose operands share
	/// structure: xor/and with a common operand, zext vs. low-bit mask, matching
	/// constant shifts, trunc-of-lshr, bswap/bitreverse pairs, and
	/// power-of-2-or-zero checks.
	///
	/// \param I  the comparison to inspect.
	/// \returns a replacement instruction (or &I itself when its operands were
	///          rewritten in place), or nullptr when I is not an equality or no
	///          fold applies.
	Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
	  if (!I.isEquality())
	    return nullptr;

	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  const CmpInst::Predicate Pred = I.getPredicate();
	  Value *A, *B, *C, *D;
	  if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
	    if (A == Op1 || B == Op1) { // (A^B) == A  ->  B == 0
	      Value *OtherVal = A == Op1 ? B : A;
	      return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
	    }

	    if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
	      // A^c1 == C^c2 --> A == C^(c1^c2)
	      ConstantInt *C1, *C2;
	      if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
	          Op1->hasOneUse()) {
	        Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
	        Value *Xor = Builder.CreateXor(C, NC);
	        return new ICmpInst(Pred, A, Xor);
	      }

	      // A^B == A^D -> B == D
	      if (A == C)
	        return new ICmpInst(Pred, B, D);
	      if (A == D)
	        return new ICmpInst(Pred, B, C);
	      if (B == C)
	        return new ICmpInst(Pred, A, D);
	      if (B == D)
	        return new ICmpInst(Pred, A, C);
	    }
	  }

	  if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
	    // A == (A^B)  ->  B == 0
	    Value *OtherVal = A == Op0 ? B : A;
	    return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
	  }

	  // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
	  if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
	      match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
	    Value *X = nullptr, *Y = nullptr, *Z = nullptr;

	    // Find the operand shared by both 'and's (Z); X and Y are the others.
	    if (A == C) {
	      X = B;
	      Y = D;
	      Z = A;
	    } else if (A == D) {
	      X = B;
	      Y = C;
	      Z = A;
	    } else if (B == C) {
	      X = A;
	      Y = D;
	      Z = B;
	    } else if (B == D) {
	      X = A;
	      Y = C;
	      Z = B;
	    }

	    if (X) { // Build (X^Y) & Z
	      Op1 = Builder.CreateXor(X, Y);
	      Op1 = Builder.CreateAnd(Op1, Z);
	      // The comparison is updated in place rather than replaced.
	      I.setOperand(0, Op1);
	      I.setOperand(1, Constant::getNullValue(Op1->getType()));
	      return &I;
	    }
	  }

	  // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
	  // and       (B & (1<<X)-1) == (zext A) --> A == (trunc B)
	  ConstantInt *Cst1;
	  if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
	       match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
	      (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
	       match(Op1, m_ZExt(m_Value(A))))) {
	    APInt Pow2 = Cst1->getValue() + 1;
	    if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
	        Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
	      return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
	  }

	  // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
	  // For lshr and ashr pairs.
	  if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
	       match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
	      (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
	       match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
	    unsigned TypeBits = Cst1->getBitWidth();
	    unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
	    if (ShAmt < TypeBits && ShAmt != 0) {
	      ICmpInst::Predicate NewPred =
	          Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
	      Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
	      APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
	      return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
	    }
	  }

	  // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
	  if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
	      match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
	    unsigned TypeBits = Cst1->getBitWidth();
	    unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
	    if (ShAmt < TypeBits && ShAmt != 0) {
	      Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
	      APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
	      Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
	                                     I.getName() + ".mask");
	      return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
	    }
	  }

	  // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
	  // "icmp (and X, mask), cst"
	  uint64_t ShAmt = 0;
	  if (Op0->hasOneUse() &&
	      match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
	      match(Op1, m_ConstantInt(Cst1)) &&
	      // Only do this when A has multiple uses.  This is most important to do
	      // when it exposes other optimizations.
	      !A->hasOneUse()) {
	    unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();

	    if (ShAmt < ASize) {
	      // Mask selects the bits of A that survive the lshr+trunc; the compare
	      // constant is shifted into the same position.
	      APInt MaskV =
	          APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
	      MaskV <<= ShAmt;

	      APInt CmpV = Cst1->getValue().zext(ASize);
	      CmpV <<= ShAmt;

	      Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
	      return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
	    }
	  }

	  // If both operands are byte-swapped or bit-reversed, just compare the
	  // original values.
	  // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
	  // and handle more intrinsics.
	  if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
	      (match(Op0, m_BitReverse(m_Value(A))) &&
	       match(Op1, m_BitReverse(m_Value(B)))))
	    return new ICmpInst(Pred, A, B);

	  // Canonicalize checking for a power-of-2-or-zero value:
	  // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
	  // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
	  // A is reset to null when this pattern does not match, so the check
	  // below only fires for a value bound by one of these ctpop patterns.
	  if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
	                                   m_Deferred(A)))) ||
	      !match(Op1, m_ZeroInt()))
	    A = nullptr;

	  // (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
	  // (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
	  if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
	    A = Op1;
	  else if (match(Op1,
	                 m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
	    A = Op0;

	  if (A) {
	    Type *Ty = A->getType();
	    CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
	    return Pred == ICmpInst::ICMP_EQ
	               ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
	               : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
	  }

	  return nullptr;
	}

	/// Handle icmp (cast x to y), (cast/cst).
	///
	/// Operand 0 of \p ICmp must be a CastInst (enforced by cast<>). Two families
	/// of casts are handled:
	///  - ptrtoint whose integer width equals the pointer width: the compare is
	///    rewritten to compare the pointer inputs directly.
	///  - zext/sext: the compare is narrowed to the source type when the other
	///    operand is a matching extension of the same source type, or a constant
	///    that survives a truncate/re-extend round trip.
	/// All other casts are left untouched.
	///
	/// \returns the replacement instruction, or nullptr if no fold applies.
	Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
	  const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0));
	  Value *LHSCIOp = LHSCI->getOperand(0);
	  Type *SrcTy = LHSCIOp->getType();
	  Type *DestTy = LHSCI->getType();

	  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
	  // integer type is the same size as the pointer type.
	  auto CompatibleSizes = [&](Type *PtrTy, Type *IntTy) -> bool {
	    // For vectors, compare the element types.
	    if (isa<VectorType>(PtrTy)) {
	      PtrTy = cast<VectorType>(PtrTy)->getElementType();
	      IntTy = cast<VectorType>(IntTy)->getElementType();
	    }
	    return DL.getPointerTypeSizeInBits(PtrTy) == IntTy->getIntegerBitWidth();
	  };
	  if (LHSCI->getOpcode() == Instruction::PtrToInt &&
	      CompatibleSizes(SrcTy, DestTy)) {
	    Value *RHSOp = nullptr;
	    if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
	      Value *RHSCIOp = RHSC->getOperand(0);
	      // Only look through the RHS ptrtoint when both pointers live in the same
	      // address space.
	      if (RHSCIOp->getType()->getPointerAddressSpace() ==
	          LHSCIOp->getType()->getPointerAddressSpace()) {
	        RHSOp = RHSC->getOperand(0);
	        // If the pointer types don't match, insert a bitcast.
	        if (LHSCIOp->getType() != RHSOp->getType())
	          RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType());
	      }
	    } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
	      RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
	    }

	    if (RHSOp)
	      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp);
	  }

	  // The code below only handles extension cast instructions, so far.
	  // Enforce this.
	  if (LHSCI->getOpcode() != Instruction::ZExt &&
	      LHSCI->getOpcode() != Instruction::SExt)
	    return nullptr;

	  bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
	  bool isSignedCmp = ICmp.isSigned();

	  if (auto *CI = dyn_cast<CastInst>(ICmp.getOperand(1))) {
	    // Not an extension from the same type?
	    Value *RHSCIOp = CI->getOperand(0);
	    if (RHSCIOp->getType() != LHSCIOp->getType())
	      return nullptr;

	    // If the signedness of the two casts doesn't agree (i.e. one is a sext
	    // and the other is a zext), then we can't handle this.
	    if (CI->getOpcode() != LHSCI->getOpcode())
	      return nullptr;

	    // Deal with equality cases early.
	    if (ICmp.isEquality())
	      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);

	    // A signed comparison of sign extended values simplifies into a
	    // signed comparison.
	    if (isSignedCmp && isSignedExt)
	      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp);

	    // The other three cases all fold into an unsigned comparison.
	    return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
	  }

	  // If we aren't dealing with a constant on the RHS, exit early.
	  auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
	  if (!C)
	    return nullptr;

	  // Compute the constant that would happen if we truncated to SrcTy then
	  // re-extended to DestTy.
	  Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
	  Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);

	  // If the re-extended constant didn't change...
	  if (Res2 == C) {
	    // Deal with equality cases early.
	    if (ICmp.isEquality())
	      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);

	    // A signed comparison of sign extended values simplifies into a
	    // signed comparison.
	    if (isSignedExt && isSignedCmp)
	      return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1);

	    // The other three cases all fold into an unsigned comparison.
	    return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1);
	  }

	  // The re-extended constant changed, partly changed (in the case of a vector),
	  // or could not be determined to be equal (in the case of a constant
	  // expression), so the constant cannot be represented in the shorter type.
	  // Consequently, we cannot emit a simple comparison.
	  // All the cases that fold to true or false will have already been handled
	  // by SimplifyICmpInst, so only deal with the tricky case.

	  if (isSignedCmp || !isSignedExt || !isa<ConstantInt>(C))
	    return nullptr;

	  // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
	  // should have been folded away previously and not enter in here.

	  // We're performing an unsigned comp with a sign extended value.
	  // This is true if the input is >= 0. [aka >s -1]
	  Constant *NegOne = Constant::getAllOnesValue(SrcTy);
	  Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName());

	  // Finally, return the value computed.
	  if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
	    return replaceInstUsesWith(ICmp, Result);

	  assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
	  return BinaryOperator::CreateNot(Result);
	}

	/// Return true if \p RHS matches the value tested for \p BinaryOp: zero for
	/// add and sub, one for mul. Any other opcode is unsupported and hits
	/// llvm_unreachable.
	static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
	  if (BinaryOp == Instruction::Add || BinaryOp == Instruction::Sub)
	    return match(RHS, m_Zero());

	  if (BinaryOp == Instruction::Mul)
	    return match(RHS, m_One());

	  llvm_unreachable("Unsupported binary op");
	}

	/// Dispatch to the signed or unsigned overflow query matching \p BinaryOp.
	///
	/// \param BinaryOp Add, Sub or Mul; anything else hits llvm_unreachable.
	/// \param IsSigned Selects the signed (true) or unsigned (false) query.
	/// \param LHS,RHS  Operands of the operation being analysed.
	/// \param CxtI     Context instruction forwarded to the underlying query.
	/// \returns the OverflowResult reported by the selected helper.
	OverflowResult InstCombiner::computeOverflow(
	    Instruction::BinaryOps BinaryOp, bool IsSigned,
	    Value *LHS, Value *RHS, Instruction *CxtI) const {
	  switch (BinaryOp) {
	  default:
	    llvm_unreachable("Unsupported binary op");
	  case Instruction::Add:
	    if (IsSigned)
	      return computeOverflowForSignedAdd(LHS, RHS, CxtI);
	    return computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
	  case Instruction::Sub:
	    if (IsSigned)
	      return computeOverflowForSignedSub(LHS, RHS, CxtI);
	    return computeOverflowForUnsignedSub(LHS, RHS, CxtI);
	  case Instruction::Mul:
	    if (IsSigned)
	      return computeOverflowForSignedMul(LHS, RHS, CxtI);
	    return computeOverflowForUnsignedMul(LHS, RHS, CxtI);
	  }
	}

	bool InstCombiner::OptimizeOverflowCheck(
	Instruction::BinaryOps BinaryOp, bool IsSigned, Value LHS, Value RHS,
	Instruction &OrigI, Value &Result, Constant &Overflow) {
	if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
	std::swap(LHS, RHS);

	// If the overflow check was an add followed by a compare, the insertion point
	// may be pointing to the compare. We want to insert the new instructions
	// before the add in case there are uses of the add between the add and the
	// compare.
	Builder.SetInsertPoint(&OrigI);

	if (isNeutralValue(BinaryOp, RHS)) {
	Result = LHS;
	Overflow = Builder.getFalse();
	return true;
	}

	switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
	case OverflowResult::MayOverflow:
	return false;
	case OverflowResult::AlwaysOverflowsLow:
	case OverflowResult::AlwaysOverflowsHigh:
	Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
	Result->takeName(&OrigI);
	Overflow = Builder.getTrue();
	return true;
	case OverflowResult::NeverOverflows:
	Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
	Result->takeName(&OrigI);
	Overflow = Builder.getFalse();
	if (auto *Inst = dyn_cast<Instruction>(Result)) {
	if (IsSigned)
	Inst->setHasNoSignedWrap();
	else
	Inst->setHasNoUnsignedWrap();
	}
	return true;
	}

	llvm_unreachable("Unexpected overflow result");
	}

	/// Recognize and process idiom involving test for multiplication
	/// overflow.
	///
	/// The caller has matched a pattern of the form:
	///   I = cmp u (mul(zext A, zext B), V
	/// The function checks if this is a test for overflow and if so replaces
	/// multiplication with call to 'mul.with.overflow' intrinsic.
	///
	/// \param I Compare instruction.
	/// \param MulVal Result of 'mult' instruction.  It is one of the arguments of
	///               the compare instruction.  Must be of integer type.
	/// \param OtherVal The other argument of compare instruction.
	/// \param IC InstCombiner providing the IR builder and the worklist.
	/// \returns Instruction which must replace the compare instruction, NULL if no
	///          replacement required.
	static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
	                                         Value *OtherVal, InstCombiner &IC) {
	  // Don't bother doing this transformation for pointers, don't do it for
	  // vectors.
	  if (!isa<IntegerType>(MulVal->getType()))
	    return nullptr;

	  assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
	  assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
	  auto *MulInstr = dyn_cast<Instruction>(MulVal);
	  if (!MulInstr)
	    return nullptr;
	  assert(MulInstr->getOpcode() == Instruction::Mul);

	  auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
	       *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
	  assert(LHS->getOpcode() == Instruction::ZExt);
	  assert(RHS->getOpcode() == Instruction::ZExt);
	  Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);

	  // Calculate type and width of the result produced by mul.with.overflow.
	  // The wider of the two source types is used.
	  Type *TyA = A->getType(), *TyB = B->getType();
	  unsigned WidthA = TyA->getPrimitiveSizeInBits(),
	           WidthB = TyB->getPrimitiveSizeInBits();
	  unsigned MulWidth;
	  Type *MulType;
	  if (WidthB > WidthA) {
	    MulWidth = WidthB;
	    MulType = TyB;
	  } else {
	    MulWidth = WidthA;
	    MulType = TyA;
	  }

	  // In order to replace the original mul with a narrower mul.with.overflow,
	  // all uses must ignore upper bits of the product.  The number of used low
	  // bits must be not greater than the width of mul.with.overflow.
	  if (MulVal->hasNUsesOrMore(2)) {
	    for (User *U : MulVal->users()) {
	      if (U == &I)
	        continue;
	      if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
	        // Check if truncation ignores bits above MulWidth.
	        unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
	        if (TruncWidth > MulWidth)
	          return nullptr;
	      } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
	        // Check if AND ignores bits above MulWidth.
	        if (BO->getOpcode() != Instruction::And)
	          return nullptr;
	        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
	          const APInt &CVal = CI->getValue();
	          if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
	            return nullptr;
	        } else {
	          // In this case we could have the operand of the binary operation
	          // being defined in another block, and performing the replacement
	          // could break the dominance relation.
	          return nullptr;
	        }
	      } else {
	        // Other uses prohibit this transformation.
	        return nullptr;
	      }
	    }
	  }

	  // Recognize patterns
	  switch (I.getPredicate()) {
	  case ICmpInst::ICMP_EQ:
	  case ICmpInst::ICMP_NE:
	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp eq/neq mulval, zext trunc mulval
	    if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal))
	      if (Zext->hasOneUse()) {
	        Value *ZextArg = Zext->getOperand(0);
	        if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg))
	          if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth)
	            break; // Recognized
	      }

	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
	    ConstantInt *CI;
	    Value *ValToMask;
	    if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
	      if (ValToMask != MulVal)
	        return nullptr;
	      // mask + 1 is a power of two exactly when mask is a run of low ones.
	      const APInt &CVal = CI->getValue() + 1;
	      if (CVal.isPowerOf2()) {
	        unsigned MaskWidth = CVal.logBase2();
	        if (MaskWidth == MulWidth)
	          break; // Recognized
	      }
	    }
	    return nullptr;

	  case ICmpInst::ICMP_UGT:
	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp ugt mulval, max
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
	      APInt MaxVal = APInt::getMaxValue(MulWidth);
	      MaxVal = MaxVal.zext(CI->getBitWidth());
	      if (MaxVal.eq(CI->getValue()))
	        break; // Recognized
	    }
	    return nullptr;

	  case ICmpInst::ICMP_UGE:
	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp uge mulval, max+1
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
	      APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
	      if (MaxVal.eq(CI->getValue()))
	        break; // Recognized
	    }
	    return nullptr;

	  case ICmpInst::ICMP_ULE:
	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp ule mulval, max
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
	      APInt MaxVal = APInt::getMaxValue(MulWidth);
	      MaxVal = MaxVal.zext(CI->getBitWidth());
	      if (MaxVal.eq(CI->getValue()))
	        break; // Recognized
	    }
	    return nullptr;

	  case ICmpInst::ICMP_ULT:
	    // Recognize pattern:
	    //   mulval = mul(zext A, zext B)
	    //   cmp ult mulval, max + 1
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
	      APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
	      if (MaxVal.eq(CI->getValue()))
	        break; // Recognized
	    }
	    return nullptr;

	  default:
	    return nullptr;
	  }

	  InstCombiner::BuilderTy &Builder = IC.Builder;
	  Builder.SetInsertPoint(MulInstr);

	  // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
	  // The narrower operand, if any, is first extended to MulType.
	  Value *MulA = A, *MulB = B;
	  if (WidthA < MulWidth)
	    MulA = Builder.CreateZExt(A, MulType);
	  if (WidthB < MulWidth)
	    MulB = Builder.CreateZExt(B, MulType);
	  Function *F = Intrinsic::getDeclaration(
	      I.getModule(), Intrinsic::umul_with_overflow, MulType);
	  CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
	  IC.Worklist.Add(MulInstr);

	  // If there are uses of mul result other than the comparison, we know that
	  // they are truncation or binary AND. Change them to use result of
	  // mul.with.overflow and adjust properly mask/size.
	  if (MulVal->hasNUsesOrMore(2)) {
	    Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
	    for (auto UI = MulVal->user_begin(), UE = MulVal->user_end(); UI != UE;) {
	      // Advance before touching the user: rewriting it edits the use list.
	      User *U = *UI++;
	      if (U == &I || U == OtherVal)
	        continue;
	      if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
	        if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
	          IC.replaceInstUsesWith(*TI, Mul);
	        else
	          TI->setOperand(0, Mul);
	      } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
	        assert(BO->getOpcode() == Instruction::And);
	        // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
	        ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
	        APInt ShortMask = CI->getValue().trunc(MulWidth);
	        Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
	        Instruction *Zext =
	            cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType()));
	        IC.Worklist.Add(Zext);
	        IC.replaceInstUsesWith(*BO, Zext);
	      } else {
	        llvm_unreachable("Unexpected Binary operation");
	      }
	      IC.Worklist.Add(cast<Instruction>(U));
	    }
	  }
	  if (isa<Instruction>(OtherVal))
	    IC.Worklist.Add(cast<Instruction>(OtherVal));

	  // The original icmp gets replaced with the overflow value, maybe inverted
	  // depending on predicate.
	  bool Inverse = false;
	  switch (I.getPredicate()) {
	  case ICmpInst::ICMP_NE:
	    break;
	  case ICmpInst::ICMP_EQ:
	    Inverse = true;
	    break;
	  case ICmpInst::ICMP_UGT:
	  case ICmpInst::ICMP_UGE:
	    if (I.getOperand(0) == MulVal)
	      break;
	    Inverse = true;
	    break;
	  case ICmpInst::ICMP_ULT:
	  case ICmpInst::ICMP_ULE:
	    if (I.getOperand(1) == MulVal)
	      break;
	    Inverse = true;
	    break;
	  default:
	    llvm_unreachable("Unexpected predicate");
	  }
	  if (Inverse) {
	    Value *Res = Builder.CreateExtractValue(Call, 1);
	    return BinaryOperator::CreateNot(Res);
	  }

	  return ExtractValueInst::Create(Call, 1);
	}

	/// Compute which bits of the LHS of \p I are actually demanded when the RHS
	/// is a constant; not every bit necessarily influences the result.
	///
	/// \param I        Compare instruction whose operand 1 is inspected.
	/// \param BitWidth Width of the returned mask.
	/// \returns the demanded-bits mask; all ones when the RHS does not match
	///          m_APInt or the predicate gets no special treatment.
	static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
	  const APInt *CmpC;
	  if (!match(I.getOperand(1), m_APInt(CmpC)))
	    return APInt::getAllOnesValue(BitWidth);

	  const ICmpInst::Predicate Pred = I.getPredicate();

	  // A sign bit comparison only demands the sign bit; every other comparison
	  // starts out demanding all bits.
	  bool IgnoredTrueIfSigned;
	  if (isSignBitCheck(Pred, *CmpC, IgnoredTrueIfSigned))
	    return APInt::getSignMask(BitWidth);

	  // For a UGT comparison, the bits under the trailing ones of the comparand
	  // are irrelevant: any value greater than the RHS must differ in a bit
	  // higher than these due to carry.
	  if (Pred == ICmpInst::ICMP_UGT)
	    return APInt::getBitsSetFrom(BitWidth, CmpC->countTrailingOnes());

	  // Likewise, for a ULT comparison the bits under the trailing zeros are
	  // irrelevant: any value less than the RHS must differ in a higher bit
	  // because of carries.
	  if (Pred == ICmpInst::ICMP_ULT)
	    return APInt::getBitsSetFrom(BitWidth, CmpC->countTrailingZeros());

	  return APInt::getAllOnesValue(BitWidth);
	}

	/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
	/// should be swapped.
	/// The decision is based on how many times these two operands are reused
	/// as subtract operands and their positions in those instructions.
	/// The rationale is that several architectures use the same instruction for
	/// both subtract and cmp. Thus, it is better if the order of those operands
	/// match.
	/// \return true if Op0 and Op1 should be swapped.
	static bool swapMayExposeCSEOpportunities(const Value Op0, const Value Op1) {
	// Filter out pointer values as those cannot appear directly in subtract.
	// FIXME: we may want to go through inttoptrs or bitcasts.
	if (Op0->getType()->isPointerTy())
	return false;
	// If a subtract already has the same operands as a compare, swapping would be
	// bad. If a subtract has the same operands as a compare but in reverse order,
	// then swapping is good.
	int GoodToSwap = 0;
	for (const User *U : Op0->users()) {
	if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
	GoodToSwap++;
	else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
	GoodToSwap--;
	}
	return GoodToSwap > 0;
	}

	/// Check that one use is in the same block as the definition and all
	/// other uses are in blocks dominated by a given block.
	///
	/// \param DI Definition
	/// \param UI Use
	/// \param DB Block that must dominate all uses of \p DI outside
	/// the parent block
	/// \return true when \p UI is the only use of \p DI in the parent block
	/// and all other uses of \p DI are in blocks dominated by \p DB.
	///
	bool InstCombiner::dominatesAllUses(const Instruction *DI,
	const Instruction *UI,
	const BasicBlock *DB) const {
	assert(DI && UI && "Instruction not defined\n");
	// Ignore incomplete definitions.
	if (!DI->getParent())
	return false;
	// DI and UI must be in the same block.
	if (DI->getParent() != UI->getParent())
	return false;
	// Protect from self-referencing blocks.
	if (DI->getParent() == DB)
	return false;
	for (const User *U : DI->users()) {
	auto *Usr = cast<Instruction>(U);
	if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
	return false;
	}
	return true;
	}

	/// Return true when the instruction sequence within a block is select-cmp-br.
	///
	/// Concretely: \p SI has a parent block, that block's terminator is a branch
	/// with exactly two successors, the branch condition is an icmp, and \p SI is
	/// one of that icmp's two operands.
	static bool isChainSelectCmpBranch(const SelectInst *SI) {
	  const BasicBlock *BB = SI->getParent();
	  if (!BB)
	    return false;
	  auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
	  if (!BI || BI->getNumSuccessors() != 2)
	    return false;
	  auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
	  if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
	    return false;
	  return true;
	}

	/// True when a select result is replaced by one of its operands
	/// in select-icmp sequence. This will eventually result in the elimination
	/// of the select.
	///
	/// \param SI    Select instruction
	/// \param Icmp  Compare instruction
	/// \param SIOpd Operand that replaces the select (must be 1 or 2)
	/// \return true if the uses of \p SI outside its parent block were replaced
	///         by operand \p SIOpd, false if nothing was changed.
	///
	/// Notes:
	/// - The replacement is global and requires dominator information
	/// - The caller is responsible for the actual replacement
	///
	/// Example:
	///
	/// entry:
	///  %4 = select i1 %3, %C* %0, %C* null
	///  %5 = icmp eq %C* %4, null
	///  br i1 %5, label %9, label %7
	///  ...
	///  ; <label>:7                                       ; preds = %entry
	///  %8 = getelementptr inbounds %C* %4, i64 0, i32 0
	///  ...
	///
	/// can be transformed to
	///
	///  %5 = icmp eq %C* %0, null
	///  %6 = select i1 %3, i1 %5, i1 true
	///  br i1 %6, label %9, label %7
	///  ...
	///  ; <label>:7                                       ; preds = %entry
	///  %8 = getelementptr inbounds %C* %0, i64 0, i32 0  // replace by %0!
	///
	/// Similar when the first operand of the select is a constant or/and
	/// the compare is for not equal rather than equal.
	///
	/// NOTE: The function is only called when the select and compare constants
	/// are equal, the optimization can work only for EQ predicates. This is not a
	/// major restriction since a NE compare should be 'normalized' to an equal
	/// compare, which usually happens in the combiner and test case
	/// select-cmp-br.ll checks for it.
	bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
	                                             const ICmpInst *Icmp,
	                                             const unsigned SIOpd) {
	  assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
	  if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
	    // Successor 1 is the block taken when the equality compare is false.
	    BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
	    // The check for the single predecessor is not the best that can be
	    // done. But it protects efficiently against cases like when SI's
	    // home block has two successors, Succ and Succ1, and Succ1 predecessor
	    // of Succ. Then SI can't be replaced by SIOpd because the use that gets
	    // replaced can be reached on either path. So the uniqueness check
	    // guarantees that the path all uses of SI (outside SI's parent) are on
	    // is disjoint from all other paths out of SI. But that information
	    // is more expensive to compute, and the trade-off here is in favor
	    // of compile-time. It should also be noticed that we check for a single
	    // predecessor and not only uniqueness. This to handle the situation when
	    // Succ and Succ1 points to the same basic block.
	    if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
	      NumSel++;
	      SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
	      return true;
	    }
	  }
	  return false;
	}

	/// Try to fold the comparison based on range information we can get by checking
	/// whether bits are known to be zero or one in the inputs.
	///
	/// The known bits of both operands are turned into [min, max] ranges (signed
	/// ranges for signed predicates, unsigned otherwise) and each predicate is
	/// checked against those ranges.
	///
	/// \returns &I if SimplifyDemandedBits changed an operand in place, a new or
	///          replacement instruction if the compare was folded, or nullptr if
	///          nothing could be done.
	Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  Type *Ty = Op0->getType();
	  ICmpInst::Predicate Pred = I.getPredicate();

	  // Get scalar or pointer size.
	  unsigned BitWidth = Ty->isIntOrIntVectorTy()
	                          ? Ty->getScalarSizeInBits()
	                          : DL.getIndexTypeSizeInBits(Ty->getScalarType());

	  if (!BitWidth)
	    return nullptr;

	  KnownBits Op0Known(BitWidth);
	  KnownBits Op1Known(BitWidth);

	  // Only the bits of the LHS that can influence the result are demanded.
	  if (SimplifyDemandedBits(&I, 0,
	                           getDemandedBitsLHSMask(I, BitWidth),
	                           Op0Known, 0))
	    return &I;

	  if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
	                           Op1Known, 0))
	    return &I;

	  // Given the known and unknown bits, compute a range that the LHS could be
	  // in.  Compute the Min, Max and RHS values based on the known bits. For the
	  // EQ and NE we use unsigned values.
	  APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
	  APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
	  if (I.isSigned()) {
	    computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
	    computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
	  } else {
	    computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
	    computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
	  }

	  // If Min and Max are known to be the same, then SimplifyDemandedBits figured
	  // out that the LHS or RHS is a constant. Constant fold this now, so that
	  // code below can assume that Min != Max.
	  if (!isa<Constant>(Op0) && Op0Min == Op0Max)
	    return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
	  if (!isa<Constant>(Op1) && Op1Min == Op1Max)
	    return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));

	  // Based on the range information we know about the LHS, see if we can
	  // simplify this comparison.  For example, (x&4) < 8 is always true.
	  switch (Pred) {
	  default:
	    llvm_unreachable("Unknown icmp opcode!");
	  case ICmpInst::ICMP_EQ:
	  case ICmpInst::ICMP_NE: {
	    // Disjoint ranges: the operands can never be equal.
	    if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) {
	      return Pred == CmpInst::ICMP_EQ
	                 ? replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()))
	                 : replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    }

	    // If all bits are known zero except for one, then we know at most one bit
	    // is set. If the comparison is against zero, then this is a check to see if
	    // *that* bit is set.
	    APInt Op0KnownZeroInverted = ~Op0Known.Zero;
	    if (Op1Known.isZero()) {
	      // If the LHS is an AND with the same constant, look through it.
	      Value *LHS = nullptr;
	      const APInt *LHSC;
	      if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
	          *LHSC != Op0KnownZeroInverted)
	        LHS = Op0;

	      Value *X;
	      if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
	        APInt ValToCheck = Op0KnownZeroInverted;
	        Type *XTy = X->getType();
	        if (ValToCheck.isPowerOf2()) {
	          // ((1 << X) & 8) == 0 -> X != 3
	          // ((1 << X) & 8) != 0 -> X == 3
	          auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
	          auto NewPred = ICmpInst::getInversePredicate(Pred);
	          return new ICmpInst(NewPred, X, CmpC);
	        } else if ((++ValToCheck).isPowerOf2()) {
	          // ((1 << X) & 7) == 0 -> X >= 3
	          // ((1 << X) & 7) != 0 -> X  < 3
	          auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
	          auto NewPred =
	              Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
	          return new ICmpInst(NewPred, X, CmpC);
	        }
	      }

	      // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
	      const APInt *CI;
	      if (Op0KnownZeroInverted.isOneValue() &&
	          match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
	        // ((8 >>u X) & 1) == 0 -> X != 3
	        // ((8 >>u X) & 1) != 0 -> X == 3
	        unsigned CmpVal = CI->countTrailingZeros();
	        auto NewPred = ICmpInst::getInversePredicate(Pred);
	        return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
	      }
	    }
	    break;
	  }
	  case ICmpInst::ICMP_ULT: {
	    if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
	      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);

	    const APInt *CmpC;
	    if (match(Op1, m_APInt(CmpC))) {
	      // A <u C -> A == C-1 if min(A)+1 == C
	      if (*CmpC == Op0Min + 1)
	        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
	                            ConstantInt::get(Op1->getType(), *CmpC - 1));
	      // X <u C --> X == 0, if the number of zero bits in the bottom of X
	      // exceeds the log2 of C.
	      if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
	        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
	                            Constant::getNullValue(Op1->getType()));
	    }
	    break;
	  }
	  case ICmpInst::ICMP_UGT: {
	    if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
	      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);

	    const APInt *CmpC;
	    if (match(Op1, m_APInt(CmpC))) {
	      // A >u C -> A == C+1 if max(A)-1 == C
	      if (*CmpC == Op0Max - 1)
	        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
	                            ConstantInt::get(Op1->getType(), *CmpC + 1));
	      // X >u C --> X != 0, if the number of zero bits in the bottom of X
	      // exceeds the log2 of C.
	      if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
	        return new ICmpInst(ICmpInst::ICMP_NE, Op0,
	                            Constant::getNullValue(Op1->getType()));
	    }
	    break;
	  }
	  case ICmpInst::ICMP_SLT: {
	    if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
	      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
	    const APInt *CmpC;
	    if (match(Op1, m_APInt(CmpC))) {
	      if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
	        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
	                            ConstantInt::get(Op1->getType(), *CmpC - 1));
	    }
	    break;
	  }
	  case ICmpInst::ICMP_SGT: {
	    if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
	      return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
	    const APInt *CmpC;
	    if (match(Op1, m_APInt(CmpC))) {
	      if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
	        return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
	                            ConstantInt::get(Op1->getType(), *CmpC + 1));
	    }
	    break;
	  }
	  case ICmpInst::ICMP_SGE:
	    assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
	    if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
	      return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
	    break;
	  case ICmpInst::ICMP_SLE:
	    assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
	    if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
	      return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
	    break;
	  case ICmpInst::ICMP_UGE:
	    assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
	    if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
	      return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
	    break;
	  case ICmpInst::ICMP_ULE:
	    assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
	    if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
	      return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
	    if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
	      return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
	    if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
	      return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
	    break;
	  }

	  // Turn a signed comparison into an unsigned one if both operands are known to
	  // have the same sign.
	  if (I.isSigned() &&
	      ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
	       (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
	    return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);

	  return nullptr;
	}

	/// If we have an icmp le or icmp ge instruction with a constant operand, turn
	/// it into the appropriate icmp lt or icmp gt instruction. This transform
	/// allows them to be folded in visitICmpInst.
	///
	/// \returns the new strict compare (constant adjusted by +1 for LE, -1 for
	///          GE), or nullptr if the predicate is not LE/GE, operand 1 is not a
	///          ConstantInt or a suitable constant vector, or a vector element
	///          could overflow when adjusted.
	static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
	  ICmpInst::Predicate Pred = I.getPredicate();
	  if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE &&
	      Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE)
	    return nullptr;

	  Value *Op0 = I.getOperand(0);
	  Value *Op1 = I.getOperand(1);
	  auto *Op1C = dyn_cast<Constant>(Op1);
	  if (!Op1C)
	    return nullptr;

	  // Check if the constant operand can be safely incremented/decremented without
	  // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled
	  // the edge cases for us, so we just assert on them. For vectors, we must
	  // handle the edge cases.
	  Type *Op1Type = Op1->getType();
	  bool IsSigned = I.isSigned();
	  bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE);
	  auto *CI = dyn_cast<ConstantInt>(Op1C);
	  if (CI) {
	    // A <= MAX -> TRUE ; A >= MIN -> TRUE
	    assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned));
	  } else if (Op1Type->isVectorTy()) {
	    // TODO? If the edge cases for vectors were guaranteed to be handled as they
	    // are for scalar, we could remove the min/max checks. However, to do that,
	    // we would have to use insertelement/shufflevector to replace edge values.
	    unsigned NumElts = Op1Type->getVectorNumElements();
	    for (unsigned i = 0; i != NumElts; ++i) {
	      Constant *Elt = Op1C->getAggregateElement(i);
	      if (!Elt)
	        return nullptr;

	      // Undef elements are skipped by the min/max check.
	      if (isa<UndefValue>(Elt))
	        continue;

	      // Bail out if we can't determine if this constant is min/max or if we
	      // know that this constant is min/max.
	      auto *EltCI = dyn_cast<ConstantInt>(Elt);
	      if (!EltCI ||
	          (IsLE ? EltCI->isMaxValue(IsSigned) : EltCI->isMinValue(IsSigned)))
	        return nullptr;
	    }
	  } else {
	    // ConstantExpr?
	    return nullptr;
	  }

	  // Increment or decrement the constant and set the new comparison predicate:
	  // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT
	  Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true);
	  CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGT;
	  NewPred = IsSigned ? ICmpInst::getSignedPredicate(NewPred) : NewPred;
	  return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
	}

	/// Integer compare with boolean values can always be turned into bitwise ops.
	///
	/// \param I an icmp whose operands are i1 or vectors of i1 (asserted).
	/// \param Builder used to create the intermediate not/xor values.
	/// \returns the (not yet inserted) bitwise instruction that replaces \p I.
	static Instruction *canonicalizeICmpBool(ICmpInst &I,
	                                         InstCombiner::BuilderTy &Builder) {
	  Value *A = I.getOperand(0), *B = I.getOperand(1);
	  assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");

	  // A boolean compared to true/false can be simplified to Op0/true/false in
	  // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
	  // Cases not handled by InstSimplify are always 'not' of Op0.
	  if (match(B, m_Zero())) {
	    switch (I.getPredicate()) {
	    case CmpInst::ICMP_EQ:  // A == 0 -> !A
	    case CmpInst::ICMP_ULE: // A <=u 0 -> !A
	    case CmpInst::ICMP_SGE: // A >=s 0 -> !A
	      return BinaryOperator::CreateNot(A);
	    default:
	      llvm_unreachable("ICmp i1 X, C not simplified as expected.");
	    }
	  } else if (match(B, m_One())) {
	    switch (I.getPredicate()) {
	    case CmpInst::ICMP_NE:  // A != 1 -> !A
	    case CmpInst::ICMP_ULT: // A <u 1 -> !A
	    case CmpInst::ICMP_SGT: // A >s -1 -> !A
	      return BinaryOperator::CreateNot(A);
	    default:
	      llvm_unreachable("ICmp i1 X, C not simplified as expected.");
	    }
	  }

	  // General case: neither operand is a matched constant. The "greater" forms
	  // swap the operands and fall through to the matching "less" form.
	  switch (I.getPredicate()) {
	  default:
	    llvm_unreachable("Invalid icmp instruction!");
	  case ICmpInst::ICMP_EQ:
	    // icmp eq i1 A, B -> ~(A ^ B)
	    return BinaryOperator::CreateNot(Builder.CreateXor(A, B));

	  case ICmpInst::ICMP_NE:
	    // icmp ne i1 A, B -> A ^ B
	    return BinaryOperator::CreateXor(A, B);

	  case ICmpInst::ICMP_UGT:
	    // icmp ugt -> icmp ult
	    std::swap(A, B);
	    LLVM_FALLTHROUGH;
	  case ICmpInst::ICMP_ULT:
	    // icmp ult i1 A, B -> ~A & B
	    return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);

	  case ICmpInst::ICMP_SGT:
	    // icmp sgt -> icmp slt
	    std::swap(A, B);
	    LLVM_FALLTHROUGH;
	  case ICmpInst::ICMP_SLT:
	    // icmp slt i1 A, B -> A & ~B
	    return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);

	  case ICmpInst::ICMP_UGE:
	    // icmp uge -> icmp ule
	    std::swap(A, B);
	    LLVM_FALLTHROUGH;
	  case ICmpInst::ICMP_ULE:
	    // icmp ule i1 A, B -> ~A | B
	    return BinaryOperator::CreateOr(Builder.CreateNot(A), B);

	  case ICmpInst::ICMP_SGE:
	    // icmp sge -> icmp sle
	    std::swap(A, B);
	    LLVM_FALLTHROUGH;
	  case ICmpInst::ICMP_SLE:
	    // icmp sle i1 A, B -> A | ~B
	    return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
	  }
	}

	// Transform pattern like:
	//   (1 << Y) u<= X  or  ~(-1 << Y) u<  X  or  ((1 << Y)+(-1)) u<  X
	//   (1 << Y) u>  X  or  ~(-1 << Y) u>= X  or  ((1 << Y)+(-1)) u>= X
	// Into:
	//   (X l>> Y) != 0
	//   (X l>> Y) == 0
	//
	// Returns the new (not yet inserted) compare against zero, or nullptr if Cmp
	// does not match one of the patterns above. On success a new lshr is emitted
	// through Builder.
	static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
	                                            InstCombiner::BuilderTy &Builder) {
	  ICmpInst::Predicate Pred, NewPred;
	  Value *X, *Y;
	  if (match(&Cmp,
	            m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
	    // We want X to be the icmp's second operand, so swap predicate if it isn't.
	    if (Cmp.getOperand(0) == X)
	      Pred = Cmp.getSwappedPredicate();

	    switch (Pred) {
	    case ICmpInst::ICMP_ULE:
	      NewPred = ICmpInst::ICMP_NE;
	      break;
	    case ICmpInst::ICMP_UGT:
	      NewPred = ICmpInst::ICMP_EQ;
	      break;
	    default:
	      return nullptr;
	    }
	  } else if (match(&Cmp, m_c_ICmp(Pred,
	                                  m_OneUse(m_CombineOr(
	                                      m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
	                                      m_Add(m_Shl(m_One(), m_Value(Y)),
	                                            m_AllOnes()))),
	                                  m_Value(X)))) {
	    // The variant with 'add' is not canonical (the variant with 'not' is);
	    // we only get it because it has extra uses and can't be canonicalized.

	    // We want X to be the icmp's second operand, so swap predicate if it isn't.
	    if (Cmp.getOperand(0) == X)
	      Pred = Cmp.getSwappedPredicate();

	    switch (Pred) {
	    case ICmpInst::ICMP_ULT:
	      NewPred = ICmpInst::ICMP_NE;
	      break;
	    case ICmpInst::ICMP_UGE:
	      NewPred = ICmpInst::ICMP_EQ;
	      break;
	    default:
	      return nullptr;
	    }
	  } else
	    return nullptr;

	  // NewPred is always set here: every path that did not assign it returned.
	  Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
	  Constant *Zero = Constant::getNullValue(NewX->getType());
	  return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero);
	}

	/// Try to move a shuffle past a vector compare.
	///
	/// If both arguments of the cmp are shuffles that use the same mask and
	/// shuffle within a single vector, the compare is performed on the unshuffled
	/// sources and the result is shuffled instead.
	///
	/// \param Cmp the integer or floating-point compare to inspect.
	/// \param Builder used to create the compare of the unshuffled operands.
	/// \returns the new (not yet inserted) shufflevector, or nullptr if the
	/// pattern does not match.
	static Instruction *foldVectorCmp(CmpInst &Cmp,
	                                  InstCombiner::BuilderTy &Builder) {
	  Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
	  Value *V1, *V2;
	  Constant *M;
	  // Require at least one of the shuffles to have a single use.
	  if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(M))) &&
	      match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(M))) &&
	      V1->getType() == V2->getType() &&
	      (LHS->hasOneUse() || RHS->hasOneUse())) {
	    // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
	    CmpInst::Predicate P = Cmp.getPredicate();
	    Value *NewCmp = isa<ICmpInst>(Cmp) ? Builder.CreateICmp(P, V1, V2)
	                                       : Builder.CreateFCmp(P, V1, V2);
	    return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
	  }
	  return nullptr;
	}

	/// Try the integer-compare simplifications and folds below, in order.
	///
	/// \returns a replacement instruction when a fold applies, &I when the only
	/// change was swapping the operands of \p I in place, and nullptr when
	/// nothing changed (or when folding is deliberately skipped because the
	/// compare feeds a recognized select pattern).
	Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
	  bool Changed = false;
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  unsigned Op0Cplxity = getComplexity(Op0);
	  unsigned Op1Cplxity = getComplexity(Op1);

	  // Orders the operands of the compare so that they are listed from most
	  // complex to least complex. This puts constants before unary operators,
	  // before binary operators.
	  if (Op0Cplxity < Op1Cplxity ||
	      (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
	    I.swapOperands();
	    std::swap(Op0, Op1);
	    Changed = true;
	  }

	  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1,
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  // Comparing -val or val with non-zero is the same as just comparing val
	  // ie, abs(val) != 0 -> val != 0
	  if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
	    Value *Cond, *SelectTrue, *SelectFalse;
	    if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
	                            m_Value(SelectFalse)))) {
	      if (Value *V = dyn_castNegVal(SelectTrue)) {
	        if (V == SelectFalse)
	          return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
	      } else if (Value *V = dyn_castNegVal(SelectFalse)) {
	        if (V == SelectTrue)
	          return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
	      }
	    }
	  }

	  if (Op0->getType()->isIntOrIntVectorTy(1))
	    if (Instruction *Res = canonicalizeICmpBool(I, Builder))
	      return Res;

	  if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
	    return NewICmp;

	  if (Instruction *Res = foldICmpWithConstant(I))
	    return Res;

	  if (Instruction *Res = foldICmpWithDominatingICmp(I))
	    return Res;

	  if (Instruction *Res = foldICmpUsingKnownBits(I))
	    return Res;

	  // Test if the ICmpInst instruction is used exclusively by a select as
	  // part of a minimum or maximum operation. If so, refrain from doing
	  // any other folding. This helps out other analyses which understand
	  // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
	  // and CodeGen. And in this case, at least one of the comparison
	  // operands has at least one user besides the compare (the select),
	  // which would often largely negate the benefit of folding anyway.
	  //
	  // Do the same for the other patterns recognized by matchSelectPattern.
	  if (I.hasOneUse())
	    if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
	      Value *A, *B;
	      SelectPatternResult SPR = matchSelectPattern(SI, A, B);
	      if (SPR.Flavor != SPF_UNKNOWN)
	        return nullptr;
	    }

	  // Do this after checking for min/max to prevent infinite looping.
	  if (Instruction *Res = foldICmpWithZero(I))
	    return Res;

	  // FIXME: We only do this after checking for min/max to prevent infinite
	  // looping caused by a reverse canonicalization of these patterns for min/max.
	  // FIXME: The organization of folds is a mess. These would naturally go into
	  // canonicalizeCmpWithConstant(), but we can't move all of the above folds
	  // down here after the min/max restriction.
	  ICmpInst::Predicate Pred = I.getPredicate();
	  const APInt *C;
	  if (match(Op1, m_APInt(C))) {
	    // For i32: x >u 2147483647 -> x <s 0  -> true if sign bit set
	    if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
	      Constant *Zero = Constant::getNullValue(Op0->getType());
	      return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
	    }

	    // For i32: x <u 2147483648 -> x >s -1  -> true if sign bit clear
	    if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
	      Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
	      return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
	    }
	  }

	  if (Instruction *Res = foldICmpInstWithConstant(I))
	    return Res;

	  if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
	    return Res;

	  // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
	  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
	    if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
	      return NI;
	  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
	    if (Instruction *NI = foldGEPICmp(GEP, Op0,
	                           ICmpInst::getSwappedPredicate(I.getPredicate()), I))
	      return NI;

	  // Try to optimize equality comparisons against alloca-based pointers.
	  if (Op0->getType()->isPointerTy() && I.isEquality()) {
	    assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
	    if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL)))
	      if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
	        return New;
	    if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL)))
	      if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
	        return New;
	  }

	  if (Instruction *Res = foldICmpBitCast(I, Builder))
	    return Res;

	  if (isa<CastInst>(Op0)) {
	    // Handle the special case of: icmp (cast bool to X), <cst>
	    // This comes up when you have code like
	    //   int X = A < B;
	    //   if (X) ...
	    // For generality, we handle any zero-extension of any operand comparison
	    // with a constant or another cast from the same type.
	    if (isa<Constant>(Op1) || isa<CastInst>(Op1))
	      if (Instruction *R = foldICmpWithCastAndCast(I))
	        return R;
	  }

	  if (Instruction *Res = foldICmpBinOp(I))
	    return Res;

	  if (Instruction *Res = foldICmpWithMinMax(I))
	    return Res;

	  {
	    Value *A, *B;
	    // Transform (A & ~B) == 0 --> (A & B) != 0
	    // and       (A & ~B) != 0 --> (A & B) == 0
	    // if A is a power of 2.
	    if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
	        match(Op1, m_Zero()) &&
	        isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
	      return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
	                          Op1);

	    // ~X < ~Y --> Y < X
	    // ~X < C -->  X > ~C
	    if (match(Op0, m_Not(m_Value(A)))) {
	      if (match(Op1, m_Not(m_Value(B))))
	        return new ICmpInst(I.getPredicate(), B, A);

	      const APInt *C;
	      if (match(Op1, m_APInt(C)))
	        return new ICmpInst(I.getSwappedPredicate(), A,
	                            ConstantInt::get(Op1->getType(), ~(*C)));
	    }

	    Instruction *AddI = nullptr;
	    if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
	                                     m_Instruction(AddI))) &&
	        isa<IntegerType>(A->getType())) {
	      Value *Result;
	      Constant *Overflow;
	      if (OptimizeOverflowCheck(Instruction::Add, /*Signed*/false, A, B,
	                                *AddI, Result, Overflow)) {
	        replaceInstUsesWith(*AddI, Result);
	        return replaceInstUsesWith(I, Overflow);
	      }
	    }

	    // (zext a) * (zext b)  --> llvm.umul.with.overflow.
	    if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
	      if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
	        return R;
	    }
	    if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
	      if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
	        return R;
	    }
	  }

	  if (Instruction *Res = foldICmpEquality(I))
	    return Res;

	  // The 'cmpxchg' instruction returns an aggregate containing the old value and
	  // an i1 which indicates whether or not we successfully did the swap.
	  //
	  // Replace comparisons between the old value and the expected value with the
	  // indicator that 'cmpxchg' returns.
	  //
	  // N.B.  This transform is only valid when the 'cmpxchg' is not permitted to
	  // spuriously fail.  In those cases, the old value may equal the expected
	  // value but it is possible for the swap to not occur.
	  if (I.getPredicate() == ICmpInst::ICMP_EQ)
	    if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
	      if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
	        if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
	            !ACXI->isWeak())
	          return ExtractValueInst::Create(ACXI, 1);

	  {
	    Value *X;
	    const APInt *C;
	    // icmp X+Cst, X
	    if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
	      return foldICmpAddOpConst(X, *C, I.getPredicate());

	    // icmp X, X+Cst
	    if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
	      return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
	  }

	  if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
	    return Res;

	  if (I.getType()->isVectorTy())
	    if (Instruction *Res = foldVectorCmp(I, Builder))
	      return Res;

	  // Report the in-place operand swap (if any) so the caller knows I changed.
	  return Changed ? &I : nullptr;
	}

	/// Fold fcmp ([us]itofp x, cst) if possible.
	///
	/// \param I the floating-point compare being visited.
	/// \param LHSI the sitofp/uitofp instruction feeding operand 0 of \p I.
	/// \param RHSC the constant on the right-hand side of \p I.
	/// \returns an integer compare of the original integer against the converted
	/// constant, a constant true/false replacement of \p I, or nullptr when the
	/// fold is not known to be safe.
	Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
	                                                Constant *RHSC) {
	  if (!isa<ConstantFP>(RHSC)) return nullptr;
	  const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();

	  // Get the width of the mantissa.  We don't want to hack on conversions that
	  // might lose information from the integer, e.g. "i64 -> float"
	  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
	  if (MantissaWidth == -1) return nullptr;  // Unknown.

	  IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());

	  bool LHSUnsigned = isa<UIToFPInst>(LHSI);

	  if (I.isEquality()) {
	    FCmpInst::Predicate P = I.getPredicate();
	    bool IsExact = false;
	    APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
	    RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);

	    // If the floating point constant isn't an integer value, we know if we will
	    // ever compare equal / not equal to it.
	    if (!IsExact) {
	      // TODO: Can never be -0.0 and other non-representable values
	      APFloat RHSRoundInt(RHS);
	      RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
	      if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
	        if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
	          return replaceInstUsesWith(I, Builder.getFalse());

	        assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
	        return replaceInstUsesWith(I, Builder.getTrue());
	      }
	    }

	    // TODO: If the constant is exactly representable, is it always OK to do
	    // equality compares as integer?
	  }

	  // Check to see that the input is converted from an integer type that is small
	  // enough that preserves all bits.  TODO: check here for "known" sign bits.
	  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
	  unsigned InputSize = IntTy->getScalarSizeInBits();

	  // Following test does NOT adjust InputSize downwards for signed inputs,
	  // because the most negative value still requires all the mantissa bits
	  // to distinguish it from one less than that value.
	  if ((int)InputSize > MantissaWidth) {
	    // Conversion would lose accuracy. Check if loss can impact comparison.
	    int Exp = ilogb(RHS);
	    if (Exp == APFloat::IEK_Inf) {
	      int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
	      if (MaxExponent < (int)InputSize - !LHSUnsigned)
	        // Conversion could create infinity.
	        return nullptr;
	    } else {
	      // Note that if RHS is zero or NaN, then Exp is negative
	      // and first condition is trivially false.
	      if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
	        // Conversion could affect comparison.
	        return nullptr;
	    }
	  }

	  // Otherwise, we can potentially simplify the comparison.  We know that it
	  // will always come through as an integer value and we know the constant is
	  // not a NAN (it would have been previously simplified).
	  assert(!RHS.isNaN() && "NaN comparison not already folded!");

	  // Map the FP predicate onto the integer predicate of matching signedness.
	  ICmpInst::Predicate Pred;
	  switch (I.getPredicate()) {
	  default: llvm_unreachable("Unexpected predicate!");
	  case FCmpInst::FCMP_UEQ:
	  case FCmpInst::FCMP_OEQ:
	    Pred = ICmpInst::ICMP_EQ;
	    break;
	  case FCmpInst::FCMP_UGT:
	  case FCmpInst::FCMP_OGT:
	    Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
	    break;
	  case FCmpInst::FCMP_UGE:
	  case FCmpInst::FCMP_OGE:
	    Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
	    break;
	  case FCmpInst::FCMP_ULT:
	  case FCmpInst::FCMP_OLT:
	    Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
	    break;
	  case FCmpInst::FCMP_ULE:
	  case FCmpInst::FCMP_OLE:
	    Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
	    break;
	  case FCmpInst::FCMP_UNE:
	  case FCmpInst::FCMP_ONE:
	    Pred = ICmpInst::ICMP_NE;
	    break;
	  case FCmpInst::FCMP_ORD:
	    return replaceInstUsesWith(I, Builder.getTrue());
	  case FCmpInst::FCMP_UNO:
	    return replaceInstUsesWith(I, Builder.getFalse());
	  }

	  // Now we know that the APFloat is a normal number, zero or inf.

	  // See if the FP constant is too large for the integer.  For example,
	  // comparing an i8 to 300.0.
	  unsigned IntWidth = IntTy->getScalarSizeInBits();

	  if (!LHSUnsigned) {
	    // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
	    // and large values.
	    APFloat SMax(RHS.getSemantics());
	    SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
	                          APFloat::rmNearestTiesToEven);
	    if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
	      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
	          Pred == ICmpInst::ICMP_SLE)
	        return replaceInstUsesWith(I, Builder.getTrue());
	      return replaceInstUsesWith(I, Builder.getFalse());
	    }
	  } else {
	    // If the RHS value is > UnsignedMax, fold the comparison. This handles
	    // +INF and large values.
	    APFloat UMax(RHS.getSemantics());
	    UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
	                          APFloat::rmNearestTiesToEven);
	    if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
	      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
	          Pred == ICmpInst::ICMP_ULE)
	        return replaceInstUsesWith(I, Builder.getTrue());
	      return replaceInstUsesWith(I, Builder.getFalse());
	    }
	  }

	  if (!LHSUnsigned) {
	    // See if the RHS value is < SignedMin.
	    APFloat SMin(RHS.getSemantics());
	    SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
	                          APFloat::rmNearestTiesToEven);
	    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
	      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
	          Pred == ICmpInst::ICMP_SGE)
	        return replaceInstUsesWith(I, Builder.getTrue());
	      return replaceInstUsesWith(I, Builder.getFalse());
	    }
	  } else {
	    // See if the RHS value is < UnsignedMin.
	    // The unsigned minimum is 0, so the signedness flag passed to
	    // convertFromAPInt does not affect the result; use the unsigned form to
	    // match the UMax computation above.
	    APFloat UMin(RHS.getSemantics());
	    UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
	                          APFloat::rmNearestTiesToEven);
	    if (UMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
	      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
	          Pred == ICmpInst::ICMP_UGE)
	        return replaceInstUsesWith(I, Builder.getTrue());
	      return replaceInstUsesWith(I, Builder.getFalse());
	    }
	  }

	  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
	  // [0, UMAX], but it may still be fractional.  See if it is fractional by
	  // casting the FP value to the integer value and back, checking for equality.
	  // Don't do this for zero, because -0.0 is not fractional.
	  Constant *RHSInt = LHSUnsigned
	    ? ConstantExpr::getFPToUI(RHSC, IntTy)
	    : ConstantExpr::getFPToSI(RHSC, IntTy);
	  if (!RHS.isZero()) {
	    bool Equal = LHSUnsigned
	      ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
	      : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
	    if (!Equal) {
	      // If we had a comparison against a fractional value, we have to adjust
	      // the compare predicate and sometimes the value.  RHSC is rounded towards
	      // zero at this point.
	      switch (Pred) {
	      default: llvm_unreachable("Unexpected integer comparison!");
	      case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
	        return replaceInstUsesWith(I, Builder.getTrue());
	      case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
	        return replaceInstUsesWith(I, Builder.getFalse());
	      case ICmpInst::ICMP_ULE:
	        // (float)int <= 4.4   --> int <= 4
	        // (float)int <= -4.4  --> false
	        if (RHS.isNegative())
	          return replaceInstUsesWith(I, Builder.getFalse());
	        break;
	      case ICmpInst::ICMP_SLE:
	        // (float)int <= 4.4   --> int <= 4
	        // (float)int <= -4.4  --> int < -4
	        if (RHS.isNegative())
	          Pred = ICmpInst::ICMP_SLT;
	        break;
	      case ICmpInst::ICMP_ULT:
	        // (float)int < -4.4   --> false
	        // (float)int < 4.4    --> int <= 4
	        if (RHS.isNegative())
	          return replaceInstUsesWith(I, Builder.getFalse());
	        Pred = ICmpInst::ICMP_ULE;
	        break;
	      case ICmpInst::ICMP_SLT:
	        // (float)int < -4.4   --> int < -4
	        // (float)int < 4.4    --> int <= 4
	        if (!RHS.isNegative())
	          Pred = ICmpInst::ICMP_SLE;
	        break;
	      case ICmpInst::ICMP_UGT:
	        // (float)int > 4.4    --> int > 4
	        // (float)int > -4.4   --> true
	        if (RHS.isNegative())
	          return replaceInstUsesWith(I, Builder.getTrue());
	        break;
	      case ICmpInst::ICMP_SGT:
	        // (float)int > 4.4    --> int > 4
	        // (float)int > -4.4   --> int >= -4
	        if (RHS.isNegative())
	          Pred = ICmpInst::ICMP_SGE;
	        break;
	      case ICmpInst::ICMP_UGE:
	        // (float)int >= -4.4   --> true
	        // (float)int >= 4.4    --> int > 4
	        if (RHS.isNegative())
	          return replaceInstUsesWith(I, Builder.getTrue());
	        Pred = ICmpInst::ICMP_UGT;
	        break;
	      case ICmpInst::ICMP_SGE:
	        // (float)int >= -4.4   --> int >= -4
	        // (float)int >= 4.4    --> int > 4
	        if (!RHS.isNegative())
	          Pred = ICmpInst::ICMP_SGT;
	        break;
	      }
	    }
	  }

	  // Lower this FP comparison into an appropriate integer version of the
	  // comparison.
	  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
	}

	/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
	///
	/// \param I the ordered relational fcmp being visited.
	/// \param LHSI the instruction on the left-hand side of \p I; its operand 0
	/// is the dividend C and its operand 1 is X.
	/// \param RHSC the constant on the right-hand side of \p I.
	/// \returns a new FCmpInst created with \p I passed as its insertion point,
	/// or nullptr if any precondition below fails.
	static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
	                                              Constant *RHSC) {
	  // When C is not 0.0 and infinities are not allowed:
	  // (C / X) < 0.0 is a sign-bit test of X
	  // (C / X) < 0.0 --> X < 0.0 (if C is positive)
	  // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate)
	  //
	  // Proof:
	  // Multiply (C / X) < 0.0 by X * X / C.
	  // - X is non zero, if it is the flag 'ninf' is violated.
	  // - C defines the sign of X * X * C. Thus it also defines whether to swap
	  //   the predicate. C is also non zero by definition.
	  //
	  // Thus X * X / C is non zero and the transformation is valid. [qed]

	  FCmpInst::Predicate Pred = I.getPredicate();

	  // Check that predicates are valid.
	  if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) &&
	      (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE))
	    return nullptr;

	  // Check that RHS operand is zero.
	  if (!match(RHSC, m_AnyZeroFP()))
	    return nullptr;

	  // Check fastmath flags ('ninf').
	  if (!LHSI->hasNoInfs() || !I.hasNoInfs())
	    return nullptr;

	  // Check the properties of the dividend. It must not be zero to avoid a
	  // division by zero (see Proof).
	  const APFloat *C;
	  if (!match(LHSI->getOperand(0), m_APFloat(C)))
	    return nullptr;

	  if (C->isZero())
	    return nullptr;

	  // Get swapped predicate if necessary.
	  if (C->isNegative())
	    Pred = I.getSwappedPredicate();

	  return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
	}

	/// Optimize fabs(X) compared with zero.
	///
	/// \param I an fcmp; the fold applies only when operand 0 is a call to the
	/// fabs intrinsic and operand 1 matches positive zero.
	/// \returns &I after rewriting its predicate and first operand in place, or
	/// nullptr if the pattern does not match or the predicate is not handled.
	static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
	  Value *X;
	  if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) ||
	      !match(I.getOperand(1), m_PosZeroFP()))
	    return nullptr;

	  // Mutates the compare in place: new predicate, and X instead of fabs(X).
	  auto replacePredAndOp0 = [](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
	    I->setPredicate(P);
	    I->setOperand(0, X);
	    return I;
	  };

	  switch (I.getPredicate()) {
	  case FCmpInst::FCMP_UGE:
	  case FCmpInst::FCMP_OLT:
	    // fabs(X) >= 0.0 --> true
	    // fabs(X) <  0.0 --> false
	    llvm_unreachable("fcmp should have simplified");

	  case FCmpInst::FCMP_OGT:
	    // fabs(X) > 0.0 --> X != 0.0
	    return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);

	  case FCmpInst::FCMP_UGT:
	    // fabs(X) u> 0.0 --> X u!= 0.0
	    return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);

	  case FCmpInst::FCMP_OLE:
	    // fabs(X) <= 0.0 --> X == 0.0
	    return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);

	  case FCmpInst::FCMP_ULE:
	    // fabs(X) u<= 0.0 --> X u== 0.0
	    return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);

	  case FCmpInst::FCMP_OGE:
	    // fabs(X) >= 0.0 --> !isnan(X)
	    assert(!I.hasNoNaNs() && "fcmp should have simplified");
	    return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);

	  case FCmpInst::FCMP_ULT:
	    // fabs(X) u< 0.0 --> isnan(X)
	    assert(!I.hasNoNaNs() && "fcmp should have simplified");
	    return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);

	  case FCmpInst::FCMP_OEQ:
	  case FCmpInst::FCMP_UEQ:
	  case FCmpInst::FCMP_ONE:
	  case FCmpInst::FCMP_UNE:
	  case FCmpInst::FCMP_ORD:
	  case FCmpInst::FCMP_UNO:
	    // Look through the fabs() because it doesn't change anything but the sign.
	    // fabs(X) == 0.0 --> X == 0.0,
	    // fabs(X) != 0.0 --> X != 0.0
	    // isnan(fabs(X)) --> isnan(X)
	    // !isnan(fabs(X)) --> !isnan(X)
	    return replacePredAndOp0(&I, I.getPredicate(), X);

	  default:
	    return nullptr;
	  }
	}

	/// Try the floating-point-compare simplifications and folds below, in order.
	///
	/// \returns a replacement instruction when a fold applies, &I when \p I was
	/// modified in place (operands swapped, predicate or operand canonicalized),
	/// and nullptr when nothing changed (or when folding is deliberately skipped
	/// because the compare feeds a recognized select pattern).
	Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
	  bool Changed = false;

	  // Orders the operands of the compare so that they are listed from most
	  // complex to least complex.  This puts constants before unary operators,
	  // before binary operators.
	  if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
	    I.swapOperands();
	    Changed = true;
	  }

	  const CmpInst::Predicate Pred = I.getPredicate();
	  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
	  if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
	                                  SQ.getWithInstruction(&I)))
	    return replaceInstUsesWith(I, V);

	  // Simplify 'fcmp pred X, X'
	  Type *OpType = Op0->getType();
	  assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
	  if (Op0 == Op1) {
	    switch (Pred) {
	      default: break;
	    case FCmpInst::FCMP_UNO:    // True if unordered: isnan(X) | isnan(Y)
	    case FCmpInst::FCMP_ULT:    // True if unordered or less than
	    case FCmpInst::FCMP_UGT:    // True if unordered or greater than
	    case FCmpInst::FCMP_UNE:    // True if unordered or not equal
	      // Canonicalize these to be 'fcmp uno %X, 0.0'.
	      I.setPredicate(FCmpInst::FCMP_UNO);
	      I.setOperand(1, Constant::getNullValue(OpType));
	      return &I;

	    case FCmpInst::FCMP_ORD:    // True if ordered (no nans)
	    case FCmpInst::FCMP_OEQ:    // True if ordered and equal
	    case FCmpInst::FCMP_OGE:    // True if ordered and greater than or equal
	    case FCmpInst::FCMP_OLE:    // True if ordered and less than or equal
	      // Canonicalize these to be 'fcmp ord %X, 0.0'.
	      I.setPredicate(FCmpInst::FCMP_ORD);
	      I.setOperand(1, Constant::getNullValue(OpType));
	      return &I;
	    }
	  }

	  // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
	  // then canonicalize the operand to 0.0.
	  if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
	    if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI)) {
	      I.setOperand(0, ConstantFP::getNullValue(OpType));
	      return &I;
	    }
	    if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI)) {
	      I.setOperand(1, ConstantFP::getNullValue(OpType));
	      return &I;
	    }
	  }

	  // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
	  Value *X, *Y;
	  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
	    return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);

	  // Test if the FCmpInst instruction is used exclusively by a select as
	  // part of a minimum or maximum operation. If so, refrain from doing
	  // any other folding. This helps out other analyses which understand
	  // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
	  // and CodeGen. And in this case, at least one of the comparison
	  // operands has at least one user besides the compare (the select),
	  // which would often largely negate the benefit of folding anyway.
	  if (I.hasOneUse())
	    if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
	      Value *A, *B;
	      SelectPatternResult SPR = matchSelectPattern(SI, A, B);
	      if (SPR.Flavor != SPF_UNKNOWN)
	        return nullptr;
	    }

	  // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
	  // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
	  if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) {
	    I.setOperand(1, ConstantFP::getNullValue(OpType));
	    return &I;
	  }

	  // Handle fcmp with instruction LHS and constant RHS.
	  Instruction *LHSI;
	  Constant *RHSC;
	  if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
	    switch (LHSI->getOpcode()) {
	    case Instruction::PHI:
	      // Only fold fcmp into the PHI if the phi and fcmp are in the same
	      // block.  If in the same block, we're encouraging jump threading.  If
	      // not, we are just pessimizing the code by making an i1 phi.
	      if (LHSI->getParent() == I.getParent())
	        if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
	          return NV;
	      break;
	    case Instruction::SIToFP:
	    case Instruction::UIToFP:
	      if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
	        return NV;
	      break;
	    case Instruction::FDiv:
	      if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
	        return NV;
	      break;
	    case Instruction::Load:
	      // Only a non-volatile load through a GEP of a constant global with a
	      // definitive initializer is handed to foldCmpLoadFromIndexedGlobal.
	      if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
	        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
	          if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
	              !cast<LoadInst>(LHSI)->isVolatile())
	            if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
	              return Res;
	      break;
	    }
	  }

	  if (Instruction *R = foldFabsWithFcmpZero(I))
	    return R;

	  if (match(Op0, m_FNeg(m_Value(X)))) {
	    // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
	    Constant *C;
	    if (match(Op1, m_Constant(C))) {
	      Constant *NegC = ConstantExpr::getFNeg(C);
	      return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
	    }
	  }

	  if (match(Op0, m_FPExt(m_Value(X)))) {
	    // fcmp (fpext X), (fpext Y) -> fcmp X, Y
	    if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
	      return new FCmpInst(Pred, X, Y, "", &I);

	    // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
	    const APFloat *C;
	    if (match(Op1, m_APFloat(C))) {
	      const fltSemantics &FPSem =
	          X->getType()->getScalarType()->getFltSemantics();
	      bool Lossy;
	      APFloat TruncC = *C;
	      TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);

	      // Avoid lossy conversions and denormals.
	      // Zero is a special case that's OK to convert.
	      APFloat Fabs = TruncC;
	      Fabs.clearSign();
	      if (!Lossy &&
	          ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
	            APFloat::cmpLessThan) || Fabs.isZero())) {
	        Constant *NewC = ConstantFP::get(X->getType(), TruncC);
	        return new FCmpInst(Pred, X, NewC, "", &I);
	      }
	    }
	  }

	  if (I.getType()->isVectorTy())
	    if (Instruction *Res = foldVectorCmp(I, Builder))
	      return Res;

	  // Report the in-place operand swap (if any) so the caller knows I changed.
	  return Changed ? &I : nullptr;
	}
	Index: vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Transforms/Scalar/DivRemPairs.cpp (revision 351303)
	@@ -1,216 +1,278 @@
	//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass hoists and/or decomposes integer division and remainder
	// instructions to enable CFG improvements and better codegen.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Scalar/DivRemPairs.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/DebugCounter.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/BypassSlowDivision.h"
	+
	using namespace llvm;

	#define DEBUG_TYPE "div-rem-pairs"
	STATISTIC(NumPairs, "Number of div/rem pairs");
	STATISTIC(NumHoisted, "Number of instructions hoisted");
	STATISTIC(NumDecomposed, "Number of instructions decomposed");
	DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
	"Controls transformations in div-rem-pairs pass");

	-/// Find matching pairs of integer div/rem ops (they have the same numerator,
	-/// denominator, and signedness). If they exist in different basic blocks, bring
	-/// them together by hoisting or replace the common division operation that is
	-/// implicit in the remainder:
	-/// X % Y <--> X - ((X / Y) * Y).
	-///
	-/// We can largely ignore the normal safety and cost constraints on speculation
	-/// of these ops when we find a matching pair. This is because we are already
	-/// guaranteed that any exceptions and most cost are already incurred by the
	-/// first member of the pair.
	-///
	-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
	-/// SimplifyCFG, but it's split off on its own because it's different enough
	-/// that it doesn't quite match the stated objectives of those passes.
	-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
	- const DominatorTree &DT) {
	- bool Changed = false;
	+/// A thin wrapper to store two values that we matched as div-rem pair.
	+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
	+struct DivRemPairWorklistEntry {
	+  /// The actual udiv/sdiv instruction. Source of truth.
	+  AssertingVH<Instruction> DivInst;

	+  /// The instruction that we have matched as a remainder instruction.
	+  /// Should only be used as Value, don't introspect it.
	+  AssertingVH<Instruction> RemInst;
	+
	+  DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
	+      : DivInst(DivInst_), RemInst(RemInst_) {
	+    assert((DivInst->getOpcode() == Instruction::UDiv ||
	+            DivInst->getOpcode() == Instruction::SDiv) &&
	+           "Not a division.");
	+    assert(DivInst->getType() == RemInst->getType() && "Types should match.");
	+    // We can't check anything else about remainder instruction,
	+    // it's not strictly required to be a urem/srem.
	+  }
	+
	+  /// The type for this pair, identical for both the div and rem.
	+  Type *getType() const { return DivInst->getType(); }
	+
	+  /// Is this pair signed or unsigned?
	+  bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
	+
	+  /// In this pair, what are the dividend and divisor?
	+  Value *getDividend() const { return DivInst->getOperand(0); }
	+  Value *getDivisor() const { return DivInst->getOperand(1); }
	+};
	+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
	+
	+/// Find matching pairs of integer div/rem ops (they have the same numerator,
	+/// denominator, and signedness). Place those pairs into a worklist for further
	+/// processing. This indirection is needed because we will be doing RAUW, and
	+/// if a rem we change is an input to another div/rem keyed in the maps, we'd
	+/// have problems; the worklist holds AssertingVH<> handles instead of keys.
	+static DivRemWorklistTy getWorklist(Function &F) {
	// Insert all divide and remainder instructions into maps keyed by their
	// operands and opcode (signed or unsigned).
	DenseMap<DivRemMapKey, Instruction *> DivMap;
	// Use a MapVector for RemMap so that instructions are moved/inserted in a
	// deterministic order.
	MapVector<DivRemMapKey, Instruction *> RemMap;
	for (auto &BB : F) {
	for (auto &I : BB) {
	if (I.getOpcode() == Instruction::SDiv)
	DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
	else if (I.getOpcode() == Instruction::UDiv)
	DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
	else if (I.getOpcode() == Instruction::SRem)
	RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
	else if (I.getOpcode() == Instruction::URem)
	RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
	}
	}

	+ // We'll accumulate the matching pairs of div-rem instructions here.
	+ DivRemWorklistTy Worklist;
	+
	// We can iterate over either map because we are only looking for matched
	// pairs. Choose remainders for efficiency because they are usually even more
	// rare than division.
	for (auto &RemPair : RemMap) {
	// Find the matching division instruction from the division map.
	Instruction *DivInst = DivMap[RemPair.first];
	if (!DivInst)
	continue;

	- // We have a matching pair of div/rem instructions. If one dominates the
	- // other, hoist and/or replace one.
	+ // We have a matching pair of div/rem instructions.
	NumPairs++;
	Instruction *RemInst = RemPair.second;
	- bool IsSigned = DivInst->getOpcode() == Instruction::SDiv;
	- bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned);

	+ // Place it in the worklist.
	+ Worklist.emplace_back(DivInst, RemInst);
	+ }
	+
	+ return Worklist;
	+}
	+
	+/// Find matching pairs of integer div/rem ops (they have the same numerator,
	+/// denominator, and signedness). If they exist in different basic blocks, bring
	+/// them together by hoisting or replace the common division operation that is
	+/// implicit in the remainder:
	+/// X % Y <--> X - ((X / Y) * Y).
	+///
	+/// We can largely ignore the normal safety and cost constraints on speculation
	+/// of these ops when we find a matching pair. This is because we are already
	+/// guaranteed that any exceptions and most cost are already incurred by the
	+/// first member of the pair.
	+///
	+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
	+/// SimplifyCFG, but it's split off on its own because it's different enough
	+/// that it doesn't quite match the stated objectives of those passes.
	+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
	+ const DominatorTree &DT) {
	+ bool Changed = false;
	+
	+ // Get the matching pairs of div-rem instructions. We want this extra
	+ // indirection to avoid dealing with having to RAUW the keys of the maps.
	+ DivRemWorklistTy Worklist = getWorklist(F);
	+
	+ // Process each entry in the worklist.
	+ for (DivRemPairWorklistEntry &E : Worklist) {
	+ bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
	+
	+ auto &DivInst = E.DivInst;
	+ auto &RemInst = E.RemInst;
	+
	// If the target supports div+rem and the instructions are in the same block
	// already, there's nothing to do. The backend should handle this. If the
	// target does not support div+rem, then we will decompose the rem.
	if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
	continue;

	bool DivDominates = DT.dominates(DivInst, RemInst);
	if (!DivDominates && !DT.dominates(RemInst, DivInst))
	continue;

	if (!DebugCounter::shouldExecute(DRPCounter))
	continue;

	if (HasDivRemOp) {
	// The target has a single div/rem operation. Hoist the lower instruction
	// to make the matched pair visible to the backend.
	if (DivDominates)
	RemInst->moveAfter(DivInst);
	else
	DivInst->moveAfter(RemInst);
	NumHoisted++;
	} else {
	// The target does not have a single div/rem operation. Decompose the
	// remainder calculation as:
	// X % Y --> X - ((X / Y) * Y).
	- Value *X = RemInst->getOperand(0);
	- Value *Y = RemInst->getOperand(1);
	+ Value *X = E.getDividend();
	+ Value *Y = E.getDivisor();
	Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
	Instruction *Sub = BinaryOperator::CreateSub(X, Mul);

	// If the remainder dominates, then hoist the division up to that block:
	//
	// bb1:
	// %rem = srem %x, %y
	// bb2:
	// %div = sdiv %x, %y
	// -->
	// bb1:
	// %div = sdiv %x, %y
	// %mul = mul %div, %y
	// %rem = sub %x, %mul
	//
	// If the division dominates, it's already in the right place. The mul+sub
	// will be in a different block because we don't assume that they are
	// cheap to speculatively execute:
	//
	// bb1:
	// %div = sdiv %x, %y
	// bb2:
	// %rem = srem %x, %y
	// -->
	// bb1:
	// %div = sdiv %x, %y
	// bb2:
	// %mul = mul %div, %y
	// %rem = sub %x, %mul
	//
	// If the div and rem are in the same block, we do the same transform,
	// but any code movement would be within the same block.

	if (!DivDominates)
	DivInst->moveBefore(RemInst);
	Mul->insertAfter(RemInst);
	Sub->insertAfter(Mul);

	// Now kill the explicit remainder. We have replaced it with:
	// (sub X, (mul (div X, Y), Y))
	- RemInst->replaceAllUsesWith(Sub);
	- RemInst->eraseFromParent();
	+ Sub->setName(RemInst->getName() + ".decomposed");
	+ Instruction *OrigRemInst = RemInst;
	+ // Update AssertingVH<> with new instruction so it doesn't assert.
	+ RemInst = Sub;
	+ // And replace the original instruction with the new one.
	+ OrigRemInst->replaceAllUsesWith(Sub);
	+ OrigRemInst->eraseFromParent();
	NumDecomposed++;
	}
	Changed = true;
	}

	return Changed;
	}

	// Pass manager boilerplate below here.

	namespace {
	// Legacy pass-manager wrapper: a FunctionPass that runs optimizeDivRem().
	struct DivRemPairsLegacyPass : public FunctionPass {
	static char ID;
	DivRemPairsLegacyPass() : FunctionPass(ID) {
	initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
	}

	// Requires the dominator tree and TTI; declares the CFG, the dominator tree
	// and GlobalsAA as preserved.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<TargetTransformInfoWrapperPass>();
	AU.setPreservesCFG();
	AU.addPreserved<DominatorTreeWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	FunctionPass::getAnalysisUsage(AU);
	}

	// Returns false without touching F when skipFunction(F) is true; otherwise
	// fetches TTI and the dominator tree and delegates to optimizeDivRem().
	bool runOnFunction(Function &F) override {
	if (skipFunction(F))
	return false;
	auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	return optimizeDivRem(F, TTI, DT);
	}
	};
	-}
	+} // namespace

	char DivRemPairsLegacyPass::ID = 0;
	INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
	"Hoist/decompose integer division and remainder", false,
	false)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
	"Hoist/decompose integer division and remainder", false,
	false)
	// Factory returning a newly allocated DivRemPairsLegacyPass.
	FunctionPass *llvm::createDivRemPairsPass() {
	return new DivRemPairsLegacyPass();
	}

	// New pass-manager entry point: runs optimizeDivRem() on F and reports which
	// analyses survive.
	PreservedAnalyses DivRemPairsPass::run(Function &F,
	                                       FunctionAnalysisManager &FAM) {
	  TargetTransformInfo &TargetInfo = FAM.getResult<TargetIRAnalysis>(F);
	  DominatorTree &DomTree = FAM.getResult<DominatorTreeAnalysis>(F);

	  const bool MadeChange = optimizeDivRem(F, TargetInfo, DomTree);
	  if (MadeChange) {
	    // TODO: This pass just hoists/replaces math ops - all analyses are
	    // preserved?
	    PreservedAnalyses Preserved;
	    Preserved.preserveSet<CFGAnalyses>();
	    Preserved.preserve<GlobalsAA>();
	    return Preserved;
	  }

	  // Nothing was modified, so every analysis is still valid.
	  return PreservedAnalyses::all();
	}
	Index: vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
	===================================================================
	--- vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp (revision 351302)
	+++ vendor/llvm/dist-release_90/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp (revision 351303)
	@@ -1,827 +1,829 @@
	//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/Sequence.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"

	using namespace llvm;

	#define DEBUG_TYPE "spec-phis"

	STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
	STATISTIC(NumEdgesSplit,
	"Number of critical edges which were split for speculation");
	STATISTIC(NumSpeculatedInstructions,
	"Number of instructions we speculated around the PHI nodes");
	STATISTIC(NumNewRedundantInstructions,
	"Number of new, redundant instructions inserted");

	/// Check whether speculating the users of a PHI node around the PHI
	/// will be safe.
	///
	/// This checks both that all of the users are safe and also that all of their
	/// operands are either recursively safe or already available along an incoming
	/// edge to the PHI.
	///
	/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
	/// and the chain of nodes that definitively reach any unsafe node in
	/// `UnsafeSet`. By preserving these between repeated calls to this routine for
	/// PHIs in the same basic block, the exploration here can be reused. However,
	/// these caches must not be reused for PHIs in a different basic block as they
	/// reflect what is available along incoming edges.
	static bool
	isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
	                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
	                          SmallPtrSetImpl<Instruction *> &UnsafeSet) {
	  auto *PhiBB = PN.getParent();
	  SmallPtrSet<Instruction *, 4> Visited;
	  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;

	  // Walk each user of the PHI node.
	  for (Use &U : PN.uses()) {
	    auto *UI = cast<Instruction>(U.getUser());

	    // Ensure the use post-dominates the PHI node. This ensures that, in the
	    // absence of unwinding, the use will actually be reached.
	    // FIXME: We use a blunt hammer of requiring them to be in the same basic
	    // block. We should consider using actual post-dominance here in the
	    // future.
	    if (UI->getParent() != PhiBB) {
	      LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
	      return false;
	    }

	    if (auto CS = ImmutableCallSite(UI)) {
	      if (CS.isConvergent() || CS.cannotDuplicate()) {
	        LLVM_DEBUG(dbgs() << " Unsafe: convergent "
	                   "callsite cannot de duplicated: " << *UI << '\n');
	        return false;
	      }
	    }

	    // FIXME: This check is much too conservative. We're not going to move these
	    // instructions onto new dynamic paths through the program unless there is
	    // a call instruction between the use and the PHI node. And memory isn't
	    // changing unless there is a store in that same sequence. We should
	    // probably change this to do at least a limited scan of the intervening
	    // instructions and allow handling stores in easily proven safe cases.
	    if (mayBeMemoryDependent(*UI)) {
	      LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
	      return false;
	    }

	    // Now do a depth-first search of everything these users depend on to make
	    // sure they are transitively safe. This is a depth-first search, but we
	    // check nodes in preorder to minimize the amount of checking.
	    Visited.insert(UI);
	    DFSStack.push_back({UI, UI->value_op_begin()});
	    do {
	      User::value_op_iterator OpIt;
	      std::tie(UI, OpIt) = DFSStack.pop_back_val();

	      while (OpIt != UI->value_op_end()) {
	        // Dereference the operand iterator to get the operand value itself.
	        auto *OpI = dyn_cast<Instruction>(*OpIt);
	        // Increment to the next operand for whenever we continue.
	        ++OpIt;
	        // No need to visit non-instructions, which can't form dependencies.
	        if (!OpI)
	          continue;

	        // Now do the main pre-order checks that this operand is a viable
	        // dependency of something we want to speculate.

	        // First do a few checks for instructions that won't require
	        // speculation at all because they are trivially available on the
	        // incoming edge (either through dominance or through an incoming value
	        // to a PHI).
	        //
	        // The cases in the current block will be trivially dominated by the
	        // edge.
	        auto *ParentBB = OpI->getParent();
	        if (ParentBB == PhiBB) {
	          if (isa<PHINode>(OpI)) {
	            // We can trivially map through phi nodes in the same block.
	            continue;
	          }
	        } else if (DT.dominates(ParentBB, PhiBB)) {
	          // Instructions from dominating blocks are already available.
	          continue;
	        }

	        // Once we know that we're considering speculating the operand, check
	        // if we've already explored this subgraph and found it to be safe.
	        if (PotentialSpecSet.count(OpI))
	          continue;

	        // If we've already explored this subgraph and found it unsafe, bail.
	        // If when we directly test whether this is safe it fails, bail.
	        if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
	            mayBeMemoryDependent(*OpI)) {
	          LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
	                            << *OpI << "\n");
	          // Record the stack of instructions which reach this node as unsafe
	          // so we prune subsequent searches.
	          UnsafeSet.insert(OpI);
	          for (auto &StackPair : DFSStack) {
	            Instruction *I = StackPair.first;
	            UnsafeSet.insert(I);
	          }
	          return false;
	        }

	        // Skip any operands we're already recursively checking.
	        if (!Visited.insert(OpI).second)
	          continue;

	        // Push onto the stack and descend. We can directly continue this
	        // loop when ascending.
	        DFSStack.push_back({UI, OpIt});
	        UI = OpI;
	        OpIt = OpI->value_op_begin();
	      }

	      // This node and all its operands are safe. Go ahead and cache that for
	      // reuse later.
	      PotentialSpecSet.insert(UI);

	      // Continue with the next node on the stack.
	    } while (!DFSStack.empty());
	  }

	#ifndef NDEBUG
	  // Every visited operand should have been marked as safe for speculation at
	  // this point. Verify this and return success.
	  for (auto *I : Visited)
	    assert(PotentialSpecSet.count(I) &&
	           "Failed to mark a visited instruction as safe!");
	#endif
	  return true;
	}

	/// Check whether, in isolation, a given PHI node is both safe and profitable
	/// to speculate users around.
	///
	/// This handles checking whether there are any constant operands to a PHI
	/// which could represent a useful speculation candidate, whether the users of
	/// the PHI are safe to speculate including all their transitive dependencies,
	/// and whether after speculation there will be some cost savings (profit) to
	/// folding the operands into the users of the PHI node. Returns true if both
	/// safe and profitable with relevant cost savings updated in the map and with
	/// an update to the `PotentialSpecSet`. Returns false if either safety or
	/// profitability are absent. Some new entries may be made to the
	/// `PotentialSpecSet` even when this routine returns false, but they remain
	/// conservatively correct.
	///
	/// The profitability check here is a local one, but it checks this in an
	/// interesting way. Beyond checking that the total cost of materializing the
	/// constants will be less than the cost of folding them into their users, it
	/// also checks that no one incoming constant will have a higher cost when
	/// folded into its users rather than materialized. This higher cost could
	/// result in a dynamic path that is more expensive even when the total cost
	/// is lower. Currently, all of the interesting cases where this optimization
	/// should fire are ones where it is a no-loss operation in this sense. If we
	/// ever want to be more aggressive here, we would need to balance the
	/// different incoming edges' cost by looking at their respective
	/// probabilities.
	static bool isSafeAndProfitableToSpeculateAroundPHI(
	    PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
	    SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
	    SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
	    TargetTransformInfo &TTI) {
	  // First see whether there is any cost savings to speculating around this
	  // PHI, and build up a map of the constant inputs to how many times they
	  // occur.
	  bool NonFreeMat = false;
	  struct CostsAndCount {
	    int MatCost = TargetTransformInfo::TCC_Free;
	    int FoldedCost = TargetTransformInfo::TCC_Free;
	    int Count = 0;
	  };
	  SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
	  SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
	  for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
	    auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
	    if (!IncomingC)
	      continue;

	    // Only visit each incoming edge with a constant input once.
	    if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
	      continue;

	    auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
	    // Count how many edges share a given incoming constant.
	    ++InsertResult.first->second.Count;
	    // Only compute the cost the first time we see a particular constant.
	    if (!InsertResult.second)
	      continue;

	    int &MatCost = InsertResult.first->second.MatCost;
	    MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
	    NonFreeMat |= MatCost != TTI.TCC_Free;
	  }
	  if (!NonFreeMat) {
	    LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
	    // No profit in free materialization.
	    return false;
	  }

	  // Now check that the uses of this PHI can actually be speculated,
	  // otherwise we'll still have to materialize the PHI value.
	  if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
	    LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
	    return false;
	  }

	  // Compute how much (if any) savings are available by speculating around this
	  // PHI.
	  for (Use &U : PN.uses()) {
	    auto *UserI = cast<Instruction>(U.getUser());
	    // Now check whether there is any savings to folding the incoming constants
	    // into this use.
	    unsigned Idx = U.getOperandNo();

	    // If we have a binary operator that is commutative, an actual constant
	    // operand would end up on the RHS, so pretend the use of the PHI is on the
	    // RHS.
	    //
	    // Technically, this is a bit weird if both operands are PHIs we're
	    // speculating. But if that is the case, giving an "optimistic" cost isn't
	    // a bad thing because after speculation it will constant fold. And
	    // moreover, such cases should likely have been constant folded already by
	    // some other pass, so we shouldn't worry about "modeling" them terribly
	    // accurately here. Similarly, if the other operand is a constant, it still
	    // seems fine to be "optimistic" in our cost modeling, because when the
	    // incoming operand from the PHI node is also a constant, we will end up
	    // constant folding.
	    if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
	      // Assume we will commute the constant to the RHS to be canonical.
	      Idx = 1;

	    // Get the intrinsic ID if this user is an intrinsic.
	    Intrinsic::ID IID = Intrinsic::not_intrinsic;
	    if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
	      IID = UserII->getIntrinsicID();

	    for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
	      ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
	      int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
	      int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
	      if (IID)
	        FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
	                                        IncomingC->getType());
	      else
	        FoldedCost +=
	            TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
	                              IncomingC->getType());

	      // If we accumulate more folded cost for this incoming constant than
	      // materialized cost, then we'll regress any edge with this constant so
	      // just bail. We're only interested in cases where folding the incoming
	      // constants is at least break-even on all paths.
	      if (FoldedCost > MatCost) {
	        LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
	                          << "\n"
	                             " Materializing cost: "
	                          << MatCost
	                          << "\n"
	                             " Accumulated folded cost: "
	                          << FoldedCost << "\n");
	        return false;
	      }
	    }
	  }

	  // Compute the total cost savings afforded by this PHI node. The entries are
	  // only read here, so iterate by const reference rather than copying them.
	  int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
	  for (const auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
	    int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
	    int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
	    int Count = IncomingConstantAndCostsAndCount.second.Count;

	    TotalMatCost += MatCost * Count;
	    TotalFoldedCost += FoldedCost * Count;
	  }
	  assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
	                                            "less that its materialized cost, "
	                                            "the sum must be as well.");

	  LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
	                    << ": " << PN << "\n");
	  CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
	  return true;
	}

	/// Simple helper to walk all the users of a list of phis depth first, and call
	/// a visit function on each one in post-order.
	///
	/// All of the PHIs should be in the same basic block, and this is primarily
	/// used to make a single depth-first walk across their collective users
	/// without revisiting any subgraphs. Callers should provide a fast, idempotent
	/// callable to test whether a node has been visited and the more important
	/// callable to actually visit a particular node.
	///
	/// Depth-first and postorder here refer to the operand graph -- we start
	/// from a collection of users of PHI nodes and walk "up" the operands
	/// depth-first.
	template <typename IsVisitedT, typename VisitT>
	static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
	                                            IsVisitedT IsVisited,
	                                            VisitT Visit) {
	  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
	  for (auto *PN : PNs)
	    for (Use &U : PN->uses()) {
	      auto *UI = cast<Instruction>(U.getUser());
	      if (IsVisited(UI))
	        // Already visited this user, continue across the roots.
	        continue;

	      // Otherwise, walk the operand graph depth-first and visit each
	      // dependency in postorder.
	      DFSStack.push_back({UI, UI->value_op_begin()});
	      do {
	        User::value_op_iterator OpIt;
	        std::tie(UI, OpIt) = DFSStack.pop_back_val();
	        while (OpIt != UI->value_op_end()) {
	          // Dereference the operand iterator to get the operand value itself.
	          auto *OpI = dyn_cast<Instruction>(*OpIt);
	          // Increment to the next operand for whenever we continue.
	          ++OpIt;
	          // No need to visit non-instructions, which can't form dependencies,
	          // or instructions outside of our potential dependency set that we
	          // were given. Finally, if we've already visited the node, continue
	          // to the next.
	          if (!OpI || IsVisited(OpI))
	            continue;

	          // Push onto the stack and descend. We can directly continue this
	          // loop when ascending.
	          DFSStack.push_back({UI, OpIt});
	          UI = OpI;
	          OpIt = OpI->value_op_begin();
	        }

	        // Finished visiting children, visit this node.
	        assert(!IsVisited(UI) && "Should not have already visited a node!");
	        Visit(UI);
	      } while (!DFSStack.empty());
	    }
	}

	/// Find profitable PHIs to speculate.
	///
	/// For a PHI node to be profitable, we need the cost of speculating its users
	/// (and their dependencies) to not exceed the savings of folding the PHI's
	/// constant operands into the speculated users.
	///
	/// Computing this is surprisingly challenging. Because users of two different
	/// PHI nodes can depend on each other or on common other instructions, it may
	/// be profitable to speculate two PHI nodes together even though neither one
	/// in isolation is profitable. The straightforward way to find all the
	/// profitable PHIs would be to check each combination of PHIs' cost, but this
	/// is exponential in complexity.
	///
	/// Even if we assume that we only care about cases where we can consider each
	/// PHI node in isolation (rather than considering cases where none are
	/// profitable in isolation but some subset are profitable as a set), we still
	/// have a challenge. The obvious way to find all individually profitable PHIs
	/// is to iterate until reaching a fixed point, but this will be quadratic in
	/// complexity. =/
	///
	/// This code currently uses a linear-to-compute order for a greedy approach.
	/// It won't find cases where a set of PHIs must be considered together, but it
	/// handles most cases of order dependence without quadratic iteration. The
	/// specific order used is the post-order across the operand DAG. When the last
	/// user of a PHI is visited in this postorder walk, we check it for
	/// profitability.
	///
	/// There is an orthogonal extra complexity to all of this: computing the cost
	/// itself can easily become a linear computation making everything again (at
	/// best) quadratic. Using a postorder over the operand graph makes it
	/// particularly easy to avoid this through dynamic programming. As we do the
	/// postorder walk, we build the transitive cost of that subgraph. It is also
	/// straightforward to then update these costs when we mark a PHI for
	/// speculation so that subsequent PHIs don't re-pay the cost of already
	/// speculated instructions.
	static SmallVector<PHINode *, 16>
	findProfitablePHIs(ArrayRef<PHINode *> PNs,
	                   const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
	                   const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
	                   int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
	  SmallVector<PHINode *, 16> SpecPNs;

	  // First, establish a reverse mapping from immediate users of the PHI nodes
	  // to the nodes themselves, and count how many users each PHI node has in
	  // a way we can update while processing them.
	  SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
	  SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
	  SmallPtrSet<Instruction *, 16> UserSet;
	  for (auto *PN : PNs) {
	    assert(UserSet.empty() && "Must start with an empty user set!");
	    for (Use &U : PN->uses())
	      UserSet.insert(cast<Instruction>(U.getUser()));
	    PNUserCountMap[PN] = UserSet.size();
	    for (auto *UI : UserSet)
	      UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
	    UserSet.clear();
	  }

	  // Now do a DFS across the operand graph of the users, computing cost as we
	  // go and when all costs for a given PHI are known, checking that PHI for
	  // profitability.
	  SmallDenseMap<Instruction *, int, 16> SpecCostMap;
	  visitPHIUsersAndDepsInPostOrder(
	      PNs,
	      /*IsVisited*/
	      [&](Instruction *I) {
	        // We consider anything that isn't potentially speculated to be
	        // "visited" as it is already handled. Similarly, anything that is
	        // potentially speculated but for which we have an entry in our cost
	        // map, we're done.
	        return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
	      },
	      /*Visit*/
	      [&](Instruction *I) {
	        // We've fully visited the operands, so sum their cost with this node
	        // and update the cost map.
	        int Cost = TTI.TCC_Free;
	        for (Value *OpV : I->operand_values())
	          if (auto *OpI = dyn_cast<Instruction>(OpV)) {
	            auto CostMapIt = SpecCostMap.find(OpI);
	            if (CostMapIt != SpecCostMap.end())
	              Cost += CostMapIt->second;
	          }
	        Cost += TTI.getUserCost(I);
	        bool Inserted = SpecCostMap.insert({I, Cost}).second;
	        (void)Inserted;
	        assert(Inserted && "Must not re-insert a cost during the DFS!");

	        // Now check if this node had a corresponding PHI node using it. If so,
	        // we need to decrement the outstanding user count for it.
	        auto UserPNsIt = UserToPNMap.find(I);
	        if (UserPNsIt == UserToPNMap.end())
	          return;
	        auto &UserPNs = UserPNsIt->second;
	        auto UserPNsSplitIt = std::stable_partition(
	            UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
	              int &PNUserCount = PNUserCountMap.find(UserPN)->second;
	              assert(
	                  PNUserCount > 0 &&
	                  "Should never re-visit a PN after its user count hits zero!");
	              --PNUserCount;
	              return PNUserCount != 0;
	            });

	        // FIXME: Rather than one at a time, we should sum the savings as the
	        // cost will be completely shared.
	        SmallVector<Instruction *, 16> SpecWorklist;
	        for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
	          int SpecCost = TTI.TCC_Free;
	          for (Use &U : PN->uses())
	            SpecCost +=
	                SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
	          SpecCost *= (NumPreds - 1);
	          // When the user count of a PHI node hits zero, we should check its
	          // profitability. If profitable, we should mark it for speculation
	          // and zero out the cost of everything it depends on.
	          int CostSavings = CostSavingsMap.find(PN)->second;
	          if (SpecCost > CostSavings) {
	            LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
	                              << "\n"
	                                 " Cost savings: "
	                              << CostSavings
	                              << "\n"
	                                 " Speculation cost: "
	                              << SpecCost << "\n");
	            continue;
	          }

	          // We're going to speculate this user-associated PHI. Copy it out and
	          // add its users to the worklist to update their cost.
	          SpecPNs.push_back(PN);
	          for (Use &U : PN->uses()) {
	            auto *UI = cast<Instruction>(U.getUser());
	            auto CostMapIt = SpecCostMap.find(UI);
	            if (CostMapIt->second == 0)
	              continue;
	            // Zero out this cost entry to avoid duplicates.
	            CostMapIt->second = 0;
	            SpecWorklist.push_back(UI);
	          }
	        }

	        // Now walk all the operands of the users in the worklist transitively
	        // to zero out all the memoized costs.
	        while (!SpecWorklist.empty()) {
	          Instruction *SpecI = SpecWorklist.pop_back_val();
	          assert(SpecCostMap.find(SpecI)->second == 0 &&
	                 "Didn't zero out a cost!");

	          // Walk the operands recursively to zero out their cost as well.
	          for (auto *OpV : SpecI->operand_values()) {
	            auto *OpI = dyn_cast<Instruction>(OpV);
	            if (!OpI)
	              continue;
	            auto CostMapIt = SpecCostMap.find(OpI);
	            if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
	              continue;
	            CostMapIt->second = 0;
	            SpecWorklist.push_back(OpI);
	          }
	        }
	      });

	  return SpecPNs;
	}

	/// Speculate users around a set of PHI nodes.
	///
	/// This routine does the actual speculation around a set of PHI nodes where we
	/// have determined this to be both safe and profitable.
	///
	/// This routine handles any splitting of critical edges necessary to create
	/// a safe block to speculate into as well as cloning the instructions and
	/// rewriting all uses.
	///
	/// \p SpecPNs must be non-empty: the parent block of its first element is the
	/// block being speculated out of, and every PHI in it is erased before
	/// returning. Only instructions contained in \p PotentialSpecSet are cloned.
	/// Each block in \p PredSet receives one clone of every speculated
	/// instruction; an edge from such a block is split first when
	/// SplitCriticalEdge reports it as critical. \p DT is handed to the critical
	/// edge splitting options.
	static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
	                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
	                          SmallSetVector<BasicBlock *, 16> &PredSet,
	                          DominatorTree &DT) {
	  LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
	  NumPHIsSpeculated += SpecPNs.size();

	  // Split any critical edges so that we have a block to hoist into.
	  auto *ParentBB = SpecPNs[0]->getParent();
	  SmallVector<BasicBlock *, 16> SpecPreds;
	  SpecPreds.reserve(PredSet.size());
	  for (auto *PredBB : PredSet) {
	    auto *NewPredBB = SplitCriticalEdge(
	        PredBB, ParentBB,
	        CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
	    if (NewPredBB) {
	      ++NumEdgesSplit;
	      LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
	                        << "\n");
	      SpecPreds.push_back(NewPredBB);
	    } else {
	      assert(PredBB->getSingleSuccessor() == ParentBB &&
	             "We need a non-critical predecessor to speculate into.");
	      assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
	             "Cannot have a non-critical invoke!");

	      // Already non-critical, use existing pred.
	      SpecPreds.push_back(PredBB);
	    }
	  }

	  // Build the list of instructions to clone. The post-order walk places
	  // every operand ahead of its user, which the cloning loop below relies on.
	  SmallPtrSet<Instruction *, 16> SpecSet;
	  SmallVector<Instruction *, 16> SpecList;
	  visitPHIUsersAndDepsInPostOrder(SpecPNs,
	                                  /*IsVisited*/
	                                  [&](Instruction *I) {
	                                    // This is visited if we don't need to
	                                    // speculate it or we already have
	                                    // speculated it.
	                                    return !PotentialSpecSet.count(I) ||
	                                           SpecSet.count(I);
	                                  },
	                                  /*Visit*/
	                                  [&](Instruction *I) {
	                                    // All operands scheduled, schedule this
	                                    // node.
	                                    SpecSet.insert(I);
	                                    SpecList.push_back(I);
	                                  });

	  int NumSpecInsts = SpecList.size() * SpecPreds.size();
	  int NumRedundantInsts = NumSpecInsts - SpecList.size();
	  LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
	                    << " speculated instructions, " << NumRedundantInsts
	                    << " redundancies\n");
	  NumSpeculatedInstructions += NumSpecInsts;
	  NumNewRedundantInstructions += NumRedundantInsts;

	  // Each predecessor is numbered by its index in `SpecPreds`, so for each
	  // instruction we speculate, the speculated instruction is stored in that
	  // index of the vector associated with the original instruction. We also
	  // store the incoming values for each predecessor from any PHIs used.
	  SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;

	  // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
	  // value. This handles both the PHIs we are speculating around and any other
	  // PHIs that happen to be used.
	  for (auto *OrigI : SpecList)
	    for (auto *OpV : OrigI->operand_values()) {
	      auto *OpPN = dyn_cast<PHINode>(OpV);
	      if (!OpPN || OpPN->getParent() != ParentBB)
	        continue;

	      // Only populate the mapping the first time a given PHI is seen.
	      auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
	      if (!InsertResult.second)
	        continue;

	      auto &SpeculatedVals = InsertResult.first->second;

	      // Populating our structure for mapping is particularly annoying because
	      // finding an incoming value for a particular predecessor block in a PHI
	      // node is a linear time operation! To avoid quadratic behavior, we build
	      // a map for this PHI node's incoming values and then translate it into
	      // the more compact representation used below.
	      SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
	      for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
	        IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);

	      for (auto *PredBB : SpecPreds)
	        SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
	    }

	  // Speculate into each predecessor.
	  for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
	    auto *PredBB = SpecPreds[PredIdx];
	    assert(PredBB->getSingleSuccessor() == ParentBB &&
	           "We need a non-critical predecessor to speculate into.");

	    for (auto *OrigI : SpecList) {
	      auto *NewI = OrigI->clone();
	      NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
	      NewI->insertBefore(PredBB->getTerminator());

	      // Rewrite all the operands to the previously speculated instructions.
	      // Because we're walking in-order, the defs must precede the uses and we
	      // should already have these mappings.
	      for (Use &U : NewI->operands()) {
	        auto *OpI = dyn_cast<Instruction>(U.get());
	        if (!OpI)
	          continue;
	        auto MapIt = SpeculatedValueMap.find(OpI);
	        if (MapIt == SpeculatedValueMap.end())
	          continue;
	        const auto &SpeculatedVals = MapIt->second;
	        assert(SpeculatedVals[PredIdx] &&
	               "Must have a speculated value for this predecessor!");
	        assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
	               "Speculated value has the wrong type!");

	        // Rewrite the use to this predecessor's speculated instruction.
	        U.set(SpeculatedVals[PredIdx]);
	      }

	      // Commute instructions which now have a constant in the LHS but not the
	      // RHS.
	      if (NewI->isBinaryOp() && NewI->isCommutative() &&
	          isa<Constant>(NewI->getOperand(0)) &&
	          !isa<Constant>(NewI->getOperand(1)))
	        NewI->getOperandUse(0).swap(NewI->getOperandUse(1));

	      SpeculatedValueMap[OrigI].push_back(NewI);
	      assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
	             "Mismatched speculated instruction index!");
	    }
	  }

	  // Walk the speculated instruction list and if they have uses, insert a PHI
	  // for them from the speculated versions, and replace the uses with the PHI.
	  // Then erase the instructions as they have been fully speculated. The walk
	  // needs to be in reverse so that we don't think there are users when we'll
	  // actually eventually remove them later.
	  IRBuilder<> IRB(SpecPNs[0]);
	  for (auto *OrigI : llvm::reverse(SpecList)) {
	    // Check if we need a PHI for any remaining users and if so, insert it.
	    if (!OrigI->use_empty()) {
	      auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
	                                    Twine(OrigI->getName()) + ".phi");
	      // Add the incoming values we speculated.
	      auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
	      for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
	        SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);

	      // And replace the uses with the PHI node.
	      OrigI->replaceAllUsesWith(SpecIPN);
	    }

	    // It is important to immediately erase this so that it stops using other
	    // instructions. This avoids inserting needless PHIs of them.
	    OrigI->eraseFromParent();
	  }

	  // All of the uses of the speculated phi nodes should be removed at this
	  // point, so erase them.
	  for (auto *SpecPN : SpecPNs) {
	    assert(SpecPN->use_empty() && "All users should have been speculated!");
	    SpecPN->eraseFromParent();
	  }
	}

	/// Try to speculate around a series of PHIs from a single basic block.
	///
	/// This routine checks whether any of these PHIs are profitable to speculate
	/// users around. If safe and profitable, it does the speculation. It returns
	/// true when at least some speculation occurs.
	///
	/// \p PNs is filtered in place: PHIs rejected by
	/// isSafeAndProfitableToSpeculateAroundPHI are removed from it. The routine
	/// returns false without changing the IR when no PHI survives that filter,
	/// when a predecessor's terminator is one of the kinds rejected below, when
	/// there are fewer than two distinct predecessors, or when findProfitablePHIs
	/// returns an empty list.
	static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
	DominatorTree &DT, TargetTransformInfo &TTI) {
	LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");

	// Savings in cost from speculating around a PHI node.
	SmallDenseMap<PHINode *, int, 16> CostSavingsMap;

	// Remember the set of instructions that are candidates for speculation so
	// that we can quickly walk things within that space. This prunes out
	// instructions already available along edges, etc.
	SmallPtrSet<Instruction *, 16> PotentialSpecSet;

	// Remember the set of instructions that are (transitively) unsafe to
	// speculate into the incoming edges of this basic block. This avoids
	// recomputing them for each PHI node we check. This set is specific to this
	// block though as things are pruned out of it based on what is available
	// along incoming edges.
	SmallPtrSet<Instruction *, 16> UnsafeSet;

	// For each PHI node in this block, check whether there are immediate folding
	// opportunities from speculation, and whether that speculation will be
	// valid. This determines the set of safe PHIs to speculate.
	PNs.erase(llvm::remove_if(PNs,
	[&](PHINode *PN) {
	return !isSafeAndProfitableToSpeculateAroundPHI(
	*PN, CostSavingsMap, PotentialSpecSet,
	UnsafeSet, DT, TTI);
	}),
	PNs.end());
	// If no PHIs were profitable, skip.
	if (PNs.empty()) {
	LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
	return false;
	}

	// We need to know how much speculation will cost which is determined by how
	// many incoming edges will need a copy of each speculated instruction.
	// Only the first PHI's incoming blocks are walked, and a block that was
	// already inserted into the set is skipped.
	SmallSetVector<BasicBlock *, 16> PredSet;
	for (auto *PredBB : PNs[0]->blocks()) {
	if (!PredSet.insert(PredBB))
	continue;

	// We cannot speculate when a predecessor is an indirect branch.
	// FIXME: We also can't reliably create a non-critical edge block for
	// speculation if the predecessor is an invoke. This doesn't seem
	// fundamental and we should probably be splitting critical edges
	// differently.
	- if (isa<IndirectBrInst>(PredBB->getTerminator()) \|\|
	- isa<InvokeInst>(PredBB->getTerminator())) {
	+ const auto *TermInst = PredBB->getTerminator();
	+ if (isa<IndirectBrInst>(TermInst) \|\|
	+ isa<InvokeInst>(TermInst) \|\|
	+ isa<CallBrInst>(TermInst)) {
	LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
	<< PredBB->getName() << "\n");
	return false;
	}
	}
	// With fewer than two distinct predecessors the attempt is abandoned.
	if (PredSet.size() < 2) {
	LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
	return false;
	}

	// Narrow the safe candidates down to the PHIs that findProfitablePHIs
	// selects, given the number of predecessors computed above.
	SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
	PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
	if (SpecPNs.empty())
	// Nothing to do.
	return false;

	speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
	return true;
	}

	/// Run PHI speculation over every basic block of \p F.
	///
	/// Blocks are visited in reverse post-order. For each block the leading run
	/// of PHI nodes is collected and passed to tryToSpeculatePHIs together with
	/// the dominator tree and target transform info obtained from \p AM.
	///
	/// Returns PreservedAnalyses::all() when no block was changed; otherwise
	/// returns a default-constructed PreservedAnalyses.
	PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
	                                               FunctionAnalysisManager &AM) {
	  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
	  auto &TTI = AM.getResult<TargetIRAnalysis>(F);

	  bool Changed = false;
	  for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
	    // Gather the PHI nodes at the head of the block; the walk stops at the
	    // first instruction that is not a PHI.
	    SmallVector<PHINode *, 16> PNs;
	    auto BBI = BB->begin();
	    while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
	      PNs.push_back(PN);
	      ++BBI;
	    }

	    if (PNs.empty())
	      continue;

	    Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
	  }

	  if (!Changed)
	    return PreservedAnalyses::all();

	  PreservedAnalyses PA;
	  return PA;
	}

File Metadata

Mime Type: application/octet-stream
Expires: Tue, May 21, 10:57 AM (1 d, 23 h)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: ZjXWA_agS6ZF
Default Alt Text: (7 MB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions